From 43d6b38fac7297a0525f2ec42aa40305367336f4 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 11 Mar 2026 18:45:04 -0600 Subject: [PATCH 001/221] reorganize the repo to acutally make it maintainable --- ben/src/bin/ben.rs | 10 +- ben/src/bin/pben.rs | 4 +- ben/src/bin/reben.rs | 4 +- ben/src/codec/decode/ben.rs | 83 ++ ben/src/codec/decode/ben32.rs | 75 + ben/src/codec/decode/mod.rs | 12 + .../decode_tests.rs => codec/decode/tests.rs} | 19 +- ben/src/codec/decode/xz.rs | 168 +++ ben/src/codec/encode/ben.rs | 99 ++ ben/src/codec/encode/jsonl.rs | 70 + ben/src/codec/encode/mod.rs | 13 + .../encode_tests.rs => codec/encode/tests.rs} | 8 +- ben/src/codec/encode/xz.rs | 86 ++ ben/src/codec/mod.rs | 3 + .../translate.rs => codec/translate/mod.rs} | 92 +- .../translate/tests.rs} | 8 +- ben/src/decode/mod.rs | 1305 ----------------- ben/src/encode/mod.rs | 727 --------- ben/src/encode/relabel.rs | 588 -------- ben/src/io/mod.rs | 2 + ben/src/io/reader.rs | 699 +++++++++ ben/src/io/writer.rs | 203 +++ ben/src/json/graph/mod.rs | 75 + ben/src/json/graph/tests.rs | 184 +++ ben/src/json/mod.rs | 1 + ben/src/lib.rs | 8 +- .../{decode/read.rs => ops/extract/mod.rs} | 123 +- .../read_tests.rs => ops/extract/tests.rs} | 6 - ben/src/ops/mod.rs | 2 + ben/src/ops/relabel/mod.rs | 174 +++ ben/src/ops/relabel/tests.rs | 330 +++++ ben/src/util/mod.rs | 1 + ben/src/util/rle/mod.rs | 44 + ben/src/util/rle/tests.rs | 19 + ben/src/utils.rs | 368 ----- ben/tests/test_impls_pipeline.rs | 9 +- ben/tests/test_pipeline.rs | 6 +- pyben/src/decode/mod.rs | 7 +- pyben/src/encode/mod.rs | 3 +- 39 files changed, 2409 insertions(+), 3229 deletions(-) create mode 100644 ben/src/codec/decode/ben.rs create mode 100644 ben/src/codec/decode/ben32.rs create mode 100644 ben/src/codec/decode/mod.rs rename ben/src/{decode/tests/decode_tests.rs => codec/decode/tests.rs} (98%) mode change 100755 => 100644 create mode 100644 ben/src/codec/decode/xz.rs create mode 100644 ben/src/codec/encode/ben.rs create mode 100644 ben/src/codec/encode/jsonl.rs create mode 100644 ben/src/codec/encode/mod.rs rename ben/src/{encode/tests/encode_tests.rs => codec/encode/tests.rs} (98%) mode change 100755 => 100644 create mode 100644 ben/src/codec/encode/xz.rs create mode 100644 ben/src/codec/mod.rs rename ben/src/{encode/translate.rs => codec/translate/mod.rs} (53%) mode change 100755 => 100644 rename ben/src/{encode/tests/translate_tests.rs => codec/translate/tests.rs} (96%) mode change 100755 => 100644 delete mode 100755 ben/src/decode/mod.rs delete mode 100755 ben/src/encode/mod.rs delete mode 100755 ben/src/encode/relabel.rs create mode 100644 ben/src/io/mod.rs create mode 100644 ben/src/io/reader.rs create mode 100644 ben/src/io/writer.rs create mode 100644 ben/src/json/graph/mod.rs create mode 100644 ben/src/json/graph/tests.rs create mode 100644 ben/src/json/mod.rs rename ben/src/{decode/read.rs => ops/extract/mod.rs} (50%) mode change 100755 => 100644 rename ben/src/{decode/tests/read_tests.rs => ops/extract/tests.rs} (91%) mode change 100755 => 100644 create mode 100644 ben/src/ops/mod.rs create mode 100644 ben/src/ops/relabel/mod.rs create mode 100644 ben/src/ops/relabel/tests.rs create mode 100644 ben/src/util/mod.rs create mode 100644 ben/src/util/rle/mod.rs create mode 100644 ben/src/util/rle/tests.rs delete mode 100755 ben/src/utils.rs diff --git a/ben/src/bin/ben.rs b/ben/src/bin/ben.rs index bfaf2b7..ec8b5f3 100755 --- a/ben/src/bin/ben.rs +++ b/ben/src/bin/ben.rs @@ -1,6 +1,10 @@ -use ben::decode::read::extract_assignment_ben; -use ben::decode::*; -use ben::encode::*; +use ben::codec::decode::{ + decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, +}; +use ben::codec::encode::{ + encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, +}; +use ben::ops::extract::extract_assignment_ben; use ben::{logln, BenVariant}; use clap::{Parser, ValueEnum}; use std::{ diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs index 73530c3..94eb108 100755 --- a/ben/src/bin/pben.rs +++ b/ben/src/bin/pben.rs @@ -1,5 +1,5 @@ -use ben::decode::*; -use ben::encode::*; +use ben::io::reader::BenDecoder; +use ben::io::writer::{BenEncoder, XBenEncoder}; use ben::{logln, BenVariant}; use clap::{Parser, ValueEnum}; use pcompress; diff --git a/ben/src/bin/reben.rs b/ben/src/bin/reben.rs index 4dd2cac..682f638 100755 --- a/ben/src/bin/reben.rs +++ b/ben/src/bin/reben.rs @@ -1,7 +1,7 @@ use ben::{ - encode::relabel::{relabel_ben_file, relabel_ben_file_with_map}, + json::graph::sort_json_file_by_key, logln, - utils::*, + ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, }; use clap::{Parser, ValueEnum}; use serde_json::{json, Value}; diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs new file mode 100644 index 0000000..39fee83 --- /dev/null +++ b/ben/src/codec/decode/ben.rs @@ -0,0 +1,83 @@ +use crate::io::reader::BenDecoder; +use std::io::{self, Read, Write}; + +pub fn decode_ben_line( + mut reader: R, + max_val_bits: u8, + max_len_bits: u8, + n_bytes: u32, +) -> io::Result> { + let mut assign_bits: Vec = vec![0; n_bytes as usize]; + reader.read_exact(&mut assign_bits)?; + + let n_assignments: usize = + (n_bytes as f64 / ((max_val_bits + max_len_bits) as f64 / 8.0)) as usize; + let mut output_rle: Vec<(u16, u16)> = Vec::with_capacity(n_assignments); + + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + + let mut val = 0; + let mut val_set = false; + let mut len = 0; + let mut len_set = false; + + for &byte in &assign_bits { + buffer |= (byte as u32).to_be() >> n_bits_in_buff; + n_bits_in_buff += 8; + + if n_bits_in_buff >= max_val_bits as u16 && !val_set { + val = (buffer >> (32 - max_val_bits)) as u16; + + buffer <<= max_val_bits; + n_bits_in_buff -= max_val_bits as u16; + val_set = true; + } + + if n_bits_in_buff >= max_len_bits as u16 && val_set && !len_set { + len = (buffer >> (32 - max_len_bits)) as u16; + buffer <<= max_len_bits; + n_bits_in_buff -= max_len_bits as u16; + len_set = true; + } + + if val_set && len_set { + if len > 0 { + output_rle.push((val, len)); + } + val_set = false; + len_set = false; + } + + while n_bits_in_buff >= max_val_bits as u16 + max_len_bits as u16 { + if n_bits_in_buff >= max_val_bits as u16 && !val_set { + val = (buffer >> (32 - max_val_bits)) as u16; + buffer <<= max_val_bits; + n_bits_in_buff -= max_val_bits as u16; + val_set = true; + } + + if n_bits_in_buff >= max_len_bits as u16 && val_set && !len_set { + len = (buffer >> (32 - max_len_bits)) as u16; + buffer <<= max_len_bits; + n_bits_in_buff -= max_len_bits as u16; + len_set = true; + } + + if val_set && len_set { + if len > 0 { + output_rle.push((val, len)); + } + val_set = false; + len_set = false; + } + } + } + + Ok(output_rle) +} + +pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { + let mut ben_decoder = BenDecoder::new(reader)?; + ben_decoder.write_all_jsonl(writer) +} diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs new file mode 100644 index 0000000..0245efe --- /dev/null +++ b/ben/src/codec/decode/ben32.rs @@ -0,0 +1,75 @@ +use crate::BenVariant; +use byteorder::{BigEndian, ReadBytesExt}; +use serde_json::json; +use std::io::{self, BufRead, Write}; + +pub(crate) fn decode_ben32_line( + mut reader: R, + variant: BenVariant, +) -> io::Result<(Vec, u16)> { + let mut buffer = [0u8; 4]; + let mut output_vec: Vec = Vec::new(); + + loop { + match reader.read_exact(&mut buffer) { + Ok(()) => { + let encoded = u32::from_be_bytes(buffer); + if encoded == 0 { + break; + } + + let value = (encoded >> 16) as u16; + let count = (encoded & 0xFFFF) as u16; + + for _ in 0..count { + output_vec.push(value); + } + } + Err(e) => { + return Err(e); + } + } + } + + let count = if variant == BenVariant::MkvChain { + reader + .read_u16::() + .expect("Error when reading sample.") + } else { + 1 + }; + + Ok((output_vec, count)) +} + +pub(crate) fn jsonl_decode_ben32( + mut reader: R, + mut writer: W, + starting_sample: usize, + variant: BenVariant, +) -> io::Result<()> { + let mut sample_number = 1; + loop { + let result = decode_ben32_line(&mut reader, variant); + if let Err(e) = result { + if e.kind() == io::ErrorKind::UnexpectedEof { + return Ok(()); + } + return Err(e); + } + + let (output_vec, count) = result.unwrap(); + + for _ in 0..count { + let line = json!({ + "assignment": output_vec, + "sample": sample_number + starting_sample, + }) + .to_string() + + "\n"; + + writer.write_all(line.as_bytes())?; + sample_number += 1; + } + } +} diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs new file mode 100644 index 0000000..2036898 --- /dev/null +++ b/ben/src/codec/decode/mod.rs @@ -0,0 +1,12 @@ +//! Decoding routines for BEN and XBEN formats. + +mod ben; +mod ben32; +mod xz; + +pub use ben::{decode_ben_line, decode_ben_to_jsonl}; +pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; +pub use xz::{decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress}; + +#[cfg(test)] +mod tests; diff --git a/ben/src/decode/tests/decode_tests.rs b/ben/src/codec/decode/tests.rs old mode 100755 new mode 100644 similarity index 98% rename from ben/src/decode/tests/decode_tests.rs rename to ben/src/codec/decode/tests.rs index 7bf1100..5e2ba20 --- a/ben/src/decode/tests/decode_tests.rs +++ b/ben/src/codec/decode/tests.rs @@ -1,19 +1,12 @@ use super::*; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; use serde_json::{json, Value}; #[test] fn test_jsonl_decode_ben_underflow() { let mut input: Vec = b"STANDARD BEN FILE".to_vec(); - input.extend(vec![ - 2, - 3, - 0, - 0, - 0, - 2, // N Bytes - 0b01100_100, - 0b01_11011_0, - ]); + input.extend(vec![2, 3, 0, 0, 0, 2, 0b01100_100, 0b01_11011_0]); let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); @@ -234,12 +227,12 @@ fn test_jsonl_decode_ben_max_len_65535() { fn test_decode_ben_max_val_and_len_at_65535() { let mut input: Vec = b"STANDARD BEN FILE".to_vec(); input.extend(vec![ - 16, // Max Val Bits - 16, // Max Len Bits + 16, + 16, 0, 0, 0, - 12, // N Bytes + 12, 0b00000000, 0b00000001, 0b00000000, diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs new file mode 100644 index 0000000..52bf1c3 --- /dev/null +++ b/ben/src/codec/decode/xz.rs @@ -0,0 +1,168 @@ +use crate::codec::decode::jsonl_decode_ben32; +use crate::codec::translate::ben32_to_ben_lines; +use crate::{log, logln, BenVariant}; +use std::io::{self, BufRead, Error, Read, Write}; +use xz2::read::XzDecoder; + +pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io::Result<()> { + let mut decoder = XzDecoder::new(reader); + + let mut first_buffer = [0u8; 17]; + + if let Err(e) = decoder.read_exact(&mut first_buffer) { + return Err(e); + } + + let variant = match &first_buffer { + b"STANDARD BEN FILE" => { + writer.write_all(b"STANDARD BEN FILE")?; + BenVariant::Standard + } + b"MKVCHAIN BEN FILE" => { + writer.write_all(b"MKVCHAIN BEN FILE")?; + BenVariant::MkvChain + } + _ => { + return Err(Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )); + } + }; + + let mut buffer = [0u8; 1048576]; + let mut overflow: Vec = Vec::new(); + + let mut line_count: usize = 0; + while let Ok(count) = decoder.read(&mut buffer) { + if count == 0 { + break; + } + + overflow.extend(&buffer[..count]); + + let mut last_valid_assignment = 0; + + match variant { + BenVariant::Standard => { + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 1; + line_count += 1; + log!("Decoding sample: {}\r", line_count); + } + } + } + BenVariant::MkvChain => { + for i in (3..overflow.len() - 2).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 3; + let lines = &overflow[i + 1..i + 3]; + let n_lines = u16::from_be_bytes([lines[0], lines[1]]); + line_count += n_lines as usize; + log!("Decoding sample: {}\r", line_count); + } + } + } + } + + if last_valid_assignment == 0 { + continue; + } + + ben32_to_ben_lines(&overflow[0..last_valid_assignment], &mut writer, variant)?; + overflow = overflow[last_valid_assignment..].to_vec(); + } + logln!(); + logln!("Done!"); + Ok(()) +} + +pub fn xz_decompress(reader: R, mut writer: W) -> io::Result<()> { + let mut decoder = XzDecoder::new(reader); + let mut buffer = [0u8; 4096]; + + while let Ok(count) = decoder.read(&mut buffer) { + if count == 0 { + break; + } + writer.write_all(&buffer[..count])?; + } + + Ok(()) +} + +pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { + let mut decoder = XzDecoder::new(reader); + + let mut first_buffer = [0u8; 17]; + + if let Err(e) = decoder.read_exact(&mut first_buffer) { + return Err(e); + } + + let variant = match &first_buffer { + b"STANDARD BEN FILE" => BenVariant::Standard, + b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + _ => { + return Err(Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )); + } + }; + + let mut buffer = [0u8; 1 << 20]; + let mut overflow: Vec = Vec::new(); + + let mut line_count: usize = 0; + let mut starting_sample: usize = 0; + while let Ok(count) = decoder.read(&mut buffer) { + if count == 0 { + break; + } + + overflow.extend(&buffer[..count]); + + let mut last_valid_assignment = 0; + + match variant { + BenVariant::Standard => { + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 1; + line_count += 1; + log!("Decoding sample: {}\r", line_count); + } + } + } + BenVariant::MkvChain => { + for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 3; + let lines = &overflow[i + 1..i + 3]; + let n_lines = u16::from_be_bytes([lines[0], lines[1]]); + line_count += n_lines as usize; + log!("Decoding sample: {}\r", line_count); + } + } + } + } + + if last_valid_assignment == 0 { + continue; + } + + jsonl_decode_ben32( + &overflow[0..last_valid_assignment], + &mut writer, + starting_sample, + variant, + )?; + overflow.drain(..last_valid_assignment); + starting_sample = line_count; + } + logln!(); + logln!("Done!"); + Ok(()) +} diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs new file mode 100644 index 0000000..0dfe18e --- /dev/null +++ b/ben/src/codec/encode/ben.rs @@ -0,0 +1,99 @@ +use crate::util::rle::assign_to_rle; +use serde_json::Value; + +pub(crate) fn encode_ben32_line(data: Value) -> Vec { + let assign_vec = data["assignment"].as_array().unwrap(); + let mut prev_assign: u16 = 0; + let mut count: u16 = 0; + let mut first = true; + + let mut ret = Vec::new(); + + for assignment in assign_vec { + let assign = assignment.as_u64().unwrap() as u16; + if first { + prev_assign = assign; + count = 1; + first = false; + continue; + } + if assign == prev_assign { + count += 1; + } else { + let encoded = (prev_assign as u32) << 16 | count as u32; + ret.extend(&encoded.to_be_bytes()); + prev_assign = assign; + count = 1; + } + } + + if count > 0 { + let encoded = (prev_assign as u32) << 16 | count as u32; + ret.extend(&encoded.to_be_bytes()); + } + + ret.extend([0, 0, 0, 0]); + ret +} + +pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { + let rle_vec: Vec<(u16, u16)> = assign_to_rle(assign_vec); + encode_ben_vec_from_rle(rle_vec) +} + +pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> Vec { + let mut output_vec: Vec = Vec::new(); + + let max_val: u16 = rle_vec.iter().max_by_key(|x| x.0).unwrap().0; + let max_len: u16 = rle_vec.iter().max_by_key(|x| x.1).unwrap().1; + let max_val_bits: u8 = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bits: u8 = 16 - max_len.leading_zeros() as u8; + let assign_bits: u32 = (max_val_bits + max_len_bits) as u32; + let n_bytes: u32 = if (assign_bits * rle_vec.len() as u32).is_multiple_of(8) { + (assign_bits * rle_vec.len() as u32) / 8 + } else { + (assign_bits * rle_vec.len() as u32) / 8 + 1 + }; + + output_vec.push(max_val_bits); + output_vec.push(max_len_bits); + output_vec.extend(n_bytes.to_be_bytes().as_slice()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for (val, len) in rle_vec { + let mut new_val: u32 = (remainder << max_val_bits) | (val as u32); + + let mut buff: u8; + + let mut n_bits_left: u8 = remainder_bits + max_val_bits; + + while n_bits_left >= 8 { + n_bits_left -= 8; + buff = (new_val >> n_bits_left) as u8; + output_vec.push(buff); + new_val &= !((0xFFFFFFFF as u32) << n_bits_left); + } + + new_val = (new_val << max_len_bits) | (len as u32); + n_bits_left += max_len_bits; + + while n_bits_left >= 8 { + n_bits_left -= 8; + buff = (new_val >> n_bits_left) as u8; + output_vec.push(buff); + new_val &= !((0xFFFFFFFF as u32) << n_bits_left); + } + + remainder_bits = n_bits_left; + remainder = new_val; + } + + if remainder_bits > 0 { + let buff = (remainder << (8 - remainder_bits)) as u8; + output_vec.push(buff); + } + + output_vec +} diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs new file mode 100644 index 0000000..5f493e0 --- /dev/null +++ b/ben/src/codec/encode/jsonl.rs @@ -0,0 +1,70 @@ +use crate::io::writer::{BenEncoder, XBenEncoder}; +use crate::{log, logln, BenVariant}; +use serde_json::Value; +use std::io::{BufRead, Result, Write}; +use xz2::stream::MtStreamBuilder; +use xz2::write::XzEncoder; + +pub fn encode_jsonl_to_xben( + reader: R, + writer: W, + variant: BenVariant, + n_threads: Option, + compression_level: Option, +) -> Result<()> { + let mut n_cpus: u32 = n_threads.unwrap_or(1); + n_cpus = n_cpus + .min( + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) as u32, + ) + .max(1); + + let level = compression_level.unwrap_or(9).clamp(0, 9); + + let mt = MtStreamBuilder::new() + .threads(n_cpus) + .preset(level) + .block_size(0) + .encoder() + .expect("init MT encoder"); + let encoder = XzEncoder::new_stream(writer, mt); + let mut ben_encoder = XBenEncoder::new(encoder, variant); + + let mut line_num = 1; + + for line_result in reader.lines() { + log!("Encoding line: {}\r", line_num); + line_num += 1; + let line = line_result?; + let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); + + ben_encoder.write_json_value(data)?; + } + + logln!(); + logln!("Done!"); + + Ok(()) +} + +pub fn encode_jsonl_to_ben( + reader: R, + writer: W, + variant: BenVariant, +) -> Result<()> { + let mut line_num = 1; + let mut ben_encoder = BenEncoder::new(writer, variant); + for line_result in reader.lines() { + log!("Encoding line: {}\r", line_num); + line_num += 1; + let line = line_result?; + let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); + + ben_encoder.write_json_value(data)?; + } + logln!(); + logln!("Done!"); + Ok(()) +} diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs new file mode 100644 index 0000000..65f34d8 --- /dev/null +++ b/ben/src/codec/encode/mod.rs @@ -0,0 +1,13 @@ +//! Encoding routines for BEN and XBEN formats. + +mod ben; +mod jsonl; +mod xz; + +pub(crate) use ben::encode_ben32_line; +pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle}; +pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +pub use xz::{encode_ben_to_xben, xz_compress}; + +#[cfg(test)] +mod tests; diff --git a/ben/src/encode/tests/encode_tests.rs b/ben/src/codec/encode/tests.rs old mode 100755 new mode 100644 similarity index 98% rename from ben/src/encode/tests/encode_tests.rs rename to ben/src/codec/encode/tests.rs index a508aa6..ab288e0 --- a/ben/src/encode/tests/encode_tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,5 +1,9 @@ use super::*; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; use serde_json::json; +use serde_json::Value; +use std::io::{BufRead, Write}; #[test] fn test_encode_jsonl_to_ben_underflow() { @@ -458,12 +462,12 @@ fn encode_jsonl_to_ben32(reader: R, mut writer: W) -> std: for line_result in reader.lines() { eprint!("Encoding line: {}\r", line_num); line_num += 1; - let line = line_result?; // Handle potential I/O errors for each line + let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); writer.write_all(&encode_ben32_line(data))?; } - eprintln!("Done!"); // Print newline after progress bar + eprintln!("Done!"); Ok(()) } diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs new file mode 100644 index 0000000..5e3ec3e --- /dev/null +++ b/ben/src/codec/encode/xz.rs @@ -0,0 +1,86 @@ +use crate::io::writer::XBenEncoder; +use crate::BenVariant; +use std::io::{self, BufRead, Result, Write}; +use xz2::stream::MtStreamBuilder; +use xz2::write::XzEncoder; + +pub fn xz_compress( + mut reader: R, + writer: W, + n_threads: Option, + compression_level: Option, +) -> Result<()> { + let mut buff = [0; 4096]; + + let mut n_cpus: u32 = n_threads.unwrap_or(1); + n_cpus = n_cpus + .min( + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) as u32, + ) + .max(1); + + let level = compression_level.unwrap_or(9).clamp(0, 9); + + let mt = MtStreamBuilder::new() + .threads(n_cpus) + .preset(level) + .block_size(0) + .encoder() + .expect("init MT encoder"); + let mut encoder = XzEncoder::new_stream(writer, mt); + + while let Ok(count) = reader.read(&mut buff) { + if count == 0 { + break; + } + encoder.write_all(&buff[..count])?; + } + drop(encoder); + Ok(()) +} + +pub fn encode_ben_to_xben( + mut reader: R, + writer: W, + n_threads: Option, + compression_level: Option, +) -> Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + + let mut n_cpus: u32 = n_threads.unwrap_or(1); + n_cpus = n_cpus + .min( + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) as u32, + ) + .max(1); + + let level = compression_level.unwrap_or(9).clamp(0, 9); + + let mt = MtStreamBuilder::new() + .threads(n_cpus) + .preset(level) + .block_size(0) + .encoder() + .expect("init MT encoder"); + let encoder = XzEncoder::new_stream(writer, mt); + + let mut ben_encoder = match &check_buffer { + b"STANDARD BEN FILE" => XBenEncoder::new(encoder, BenVariant::Standard), + b"MKVCHAIN BEN FILE" => XBenEncoder::new(encoder, BenVariant::MkvChain), + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )); + } + }; + + ben_encoder.write_ben_file(reader)?; + + Ok(()) +} diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs new file mode 100644 index 0000000..905e898 --- /dev/null +++ b/ben/src/codec/mod.rs @@ -0,0 +1,3 @@ +pub mod decode; +pub mod encode; +pub mod translate; diff --git a/ben/src/encode/translate.rs b/ben/src/codec/translate/mod.rs old mode 100755 new mode 100644 similarity index 53% rename from ben/src/encode/translate.rs rename to ben/src/codec/translate/mod.rs index c7b17e3..2899085 --- a/ben/src/encode/translate.rs +++ b/ben/src/codec/translate/mod.rs @@ -1,41 +1,19 @@ -//! This module contains the main functions that are used for translating -//! between the ben32 and BEN formats. The ben32 format is a simple run-length -//! encoding of an assignment vector done at the byte level and for which every -//! 32 bits of data encodes a one (assignment, count) pair. The BEN format is -//! a bit-packed version of the ben32 format along with some extra headers. +//! Translation helpers between BEN and ben32 representations. + use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Error, Read, Write}; -use super::{log, logln, BenVariant}; -use crate::decode::decode_ben_line; -use crate::encode::encode_ben_vec_from_rle; - -/// This function takes a ben32 encoded assignment vector and -/// transforms into a ben encoded assignment vector. -/// -/// # Arguments -/// -/// * `ben32_vec` - A vector of bytes containing the ben32 encoded assignment vector -/// -/// # Returns -/// -/// A vector of bytes containing the ben encoded assignment vector -/// -/// # Errors -/// -/// This function will return an error if the input ben32 vector is not a multiple of 4 -/// bytes long or if the end of line separator (4 bytes of 0) is missing. All -/// assignment vectors are expected to be a multiple of 4 bytes long since each -/// assignment vector is an run-length encoded as a 32 bit integer (2 bytes for -/// the value and 2 bytes for the count). The end of line separator is also the -/// only way that the ben32 format has to separate assignment vectors. +use crate::codec::decode::decode_ben_line; +use crate::codec::encode::encode_ben_vec_from_rle; +use crate::{log, logln, BenVariant}; + fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { let mut buffer = [0u8; 4]; let mut ben32_rle: Vec<(u16, u16)> = Vec::new(); let mut reader = ben32_vec.as_slice(); - if ben32_vec.len() % 4 != 0 { + if !ben32_vec.len().is_multiple_of(4) { return Err(Error::new( io::ErrorKind::InvalidData, "Invalid ben32 data length", @@ -46,13 +24,12 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { reader.read_exact(&mut buffer)?; let encoded = u32::from_be_bytes(buffer); - let value = (encoded >> 16) as u16; // High 16 bits - let count = (encoded & 0xFFFF) as u16; // Low 16 bits + let value = (encoded >> 16) as u16; + let count = (encoded & 0xFFFF) as u16; ben32_rle.push((value, count)); } - // read the last 4 bytes which should be 0 since they are a separator reader.read_exact(&mut buffer)?; if buffer != [0u8; 4] { return Err(Error::new( @@ -64,23 +41,6 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { Ok(encode_ben_vec_from_rle(ben32_rle)) } -/// This function takes a reader that contains a several ben32 encoded assignment -/// vectors and encodes them into ben encoded assignment vectors and writes them -/// to the designated writer. -/// -/// # Arguments -/// -/// * `reader` - A reader that contains ben32 encoded assignment vectors -/// * `writer` - A writer that will contain the ben encoded assignment vectors -/// -/// # Returns -/// -/// An io::Result containing the result of the operation -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid ben32 -/// data or if the writer encounters an error while writing the ben data. pub fn ben32_to_ben_lines( mut reader: R, mut writer: W, @@ -92,7 +52,6 @@ pub fn ben32_to_ben_lines( let mut n_reps = 0; - // extract the ben32 data 'inner: loop { match reader.read_exact(&mut ben32_read_buff) { Ok(()) => { @@ -123,18 +82,6 @@ pub fn ben32_to_ben_lines( Ok(()) } -/// This function takes a ben encoded assignment vector and transforms it into -/// a ben32 encoded assignment vector. -/// -/// # Arguments -/// -/// * `reader` - A reader that contains ben encoded assignment vectors -/// * `max_val_bits` - The maximum number of bits that the value of an assignment can have -/// * `max_len_bits` - The maximum number of bits that the length of an assignment can have -/// -/// # Returns -/// -/// A vector of bytes containing the ben32 encoded assignment vector fn ben_to_ben32_line( reader: R, max_val_bits: u8, @@ -145,7 +92,7 @@ fn ben_to_ben32_line( let mut ben32_vec: Vec = Vec::new(); - for (value, count) in ben_rle.into_iter() { + for (value, count) in ben_rle { let encoded = ((value as u32) << 16) | (count as u32); ben32_vec.extend(&encoded.to_be_bytes()); } @@ -155,23 +102,6 @@ fn ben_to_ben32_line( Ok(ben32_vec) } -/// This function takes a reader that contains a several ben encoded assignment -/// vectors and encodes them into ben32 encoded assignment vectors and writes them -/// to the designated writer. -/// -/// # Arguments -/// -/// * `reader` - A reader that contains ben encoded assignment vectors -/// * `writer` - A writer that will contain the ben32 encoded assignment vectors -/// -/// # Returns -/// -/// An io::Result containing the result of the operation -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid ben -/// data or if the writer encounters an error while writing the ben32 data. pub fn ben_to_ben32_lines( mut reader: R, mut writer: W, @@ -206,7 +136,6 @@ pub fn ben_to_ben32_lines( let ben32_vec = ben_to_ben32_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - // Read the number of repetitions AFTER the ben32 data let n_reps = reader.read_u16::()?; sample_number += n_reps as usize; writer.write_all(&ben32_vec)?; @@ -221,5 +150,4 @@ pub fn ben_to_ben32_lines( } #[cfg(test)] -#[path = "tests/translate_tests.rs"] mod tests; diff --git a/ben/src/encode/tests/translate_tests.rs b/ben/src/codec/translate/tests.rs old mode 100755 new mode 100644 similarity index 96% rename from ben/src/encode/tests/translate_tests.rs rename to ben/src/codec/translate/tests.rs index d2a1b26..8313f89 --- a/ben/src/encode/tests/translate_tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -1,15 +1,17 @@ use super::*; -use crate::encode::*; +use crate::codec::encode::{encode_ben32_line, encode_jsonl_to_ben}; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; use rand_distr::{Distribution, Uniform}; use serde_json::{json, Value}; -use std::io::BufRead; +use std::io::{self, BufRead, Error, Read, Write}; fn encode_jsonl_to_ben32(reader: R, mut writer: W) -> std::io::Result<()> { writer.write_all("STANDARD BEN FILE".as_bytes())?; for line_result in reader.lines() { - let line = line_result?; // Handle potential I/O errors for each line + let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); writer.write_all(&encode_ben32_line(data))?; diff --git a/ben/src/decode/mod.rs b/ben/src/decode/mod.rs deleted file mode 100755 index e86c067..0000000 --- a/ben/src/decode/mod.rs +++ /dev/null @@ -1,1305 +0,0 @@ -//! This module contains the main functions for decoding XBEN and BEN files. -//! -//! XBEN files are generally transformed back into BEN files, and BEN files -//! are transformed into a JSONL file with the formatting -//! -//! ```json -//! {"assignment": [...], "sample": #} -//! ``` -//! -//! The BEN file format is a bit-packed binary format that is used to store -//! run-length encoded assignment vectors, and is streamable. Therefore, the -//! BEN file format works well with the `read` submodule of this module -//! which is designed to extract a single assignment vector from a BEN file. -pub mod read; - -use byteorder::{BigEndian, ReadBytesExt}; -use serde_json::json; -use std::fs::File; -use std::io::{self, BufRead, Read, Write}; // trait imports -use std::io::{BufReader, Cursor, Error}; // type import -use std::iter::Peekable; -use std::path::Path; -use std::path::PathBuf; -use xz2::read::XzDecoder; - -use crate::utils::rle_to_vec; - -use super::encode::translate::*; -use super::{log, logln, BenVariant}; - -pub type MkvRecord = (Vec, u16); - -#[derive(Debug)] -pub enum DecoderInitError { - InvalidFileFormat(Vec), - Io(io::Error), -} - -/// Check if the given header matches the XZ magic number. -/// This is used to provide a more informative error message when -/// a user tries to decode a compressed .xben file with the -/// `BenDecoder` instead of the `decode_xben_to_ben` function. -fn is_xz_header(h: &[u8]) -> bool { - h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" -} - -/// Convert a byte slice to a hex string for display purposes. -/// Each byte is represented as two uppercase hex digits, separated by spaces. -fn to_hex(bytes: &[u8]) -> String { - bytes - .iter() - .map(|b| format!("{:02X}", b)) - .collect::>() - .join(" ") -} - -impl std::fmt::Display for DecoderInitError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Io(e) => write!(f, "IO error: {e}"), - Self::InvalidFileFormat(header) => { - if is_xz_header(header) { - write!( - f, - "Invalid file format: Compressed header detected (hex: {}). \ - This reader expects an uncompressed .ben file. \ - Decompress this file using the BEN cli `ben -m decode .xben` tool \ - or the `decode_xben_to_ben` function in this library.", - to_hex(header) - ) - } else { - let lossy = String::from_utf8_lossy(header); - write!( - f, - "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", - to_hex(header) - ) - } - } - } - } -} - -impl std::error::Error for DecoderInitError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - DecoderInitError::Io(e) => Some(e), - DecoderInitError::InvalidFileFormat(_) => None, - } - } -} - -impl From for DecoderInitError { - fn from(error: io::Error) -> Self { - DecoderInitError::Io(error) - } -} - -impl From for io::Error { - fn from(error: DecoderInitError) -> Self { - match error { - DecoderInitError::Io(e) => e, - DecoderInitError::InvalidFileFormat(msg) => { - io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) - } - } - } -} - -pub struct BenDecoder { - reader: R, - sample_count: usize, - variant: BenVariant, -} - -/// A single frame from a BEN file. -#[derive(Clone)] -pub struct BenFrame { - pub max_val_bits: u8, // number of bits used for each value - pub max_len_bits: u8, // number of bits used for each run-length - pub count: u16, // repetition count (1 for Standard) - pub n_bytes: u32, // number of bytes used for the raw assignment data - pub raw_data: Vec, // raw bit-compressed BEN data -} - -impl BenDecoder { - /// Create a new BenDecoder from a reader. - /// The reader must contain a valid BEN file. - /// The first 17 bytes of the file are checked to determine - /// the variant of the BEN file. - pub fn new(mut reader: R) -> Result { - let mut check_buffer = [0u8; 17]; - - if let Err(e) = reader.read_exact(&mut check_buffer) { - return Err(DecoderInitError::Io(e)); - } - - match &check_buffer { - b"STANDARD BEN FILE" => Ok(BenDecoder { - reader, - sample_count: 0, - variant: BenVariant::Standard, - }), - b"MKVCHAIN BEN FILE" => Ok(BenDecoder { - reader, - sample_count: 0, - variant: BenVariant::MkvChain, - }), - _ => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), - } - } - - /// Write all decoded assignments to a writer in JSONL format. - /// - /// Arguments: - /// - /// * `writer`: A mutable reference to a writer where the JSONL output will be written. - fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - while let Some(result_tuple) = self.next() { - match result_tuple { - Ok((assignment, count)) => { - for _ in 0..count { - self.sample_count += 1; - let line = json!({ - "assignment": assignment, - "sample": self.sample_count, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes()).unwrap(); - } - } - Err(e) => { - return Err(e); - } - } - } - Ok(()) - } - - /// Internal helper function that pops a single ben frame from the reader. - /// This frame may then either be decoded into an assignment vector - /// or returned as-is for further processing. - fn pop_frame_from_reader(&mut self) -> Option> { - let mut b1 = [0u8; 1]; - let max_val_bits = match self.reader.read_exact(&mut b1) { - Ok(()) => b1[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - // clean EOF before starting a new frame - logln!(); - logln!("Done!"); - return None; - } - return Some(Err(e)); - } - }; - - let mut b2 = [0u8; 1]; - if let Err(e) = self.reader.read_exact(&mut b2) { - return Some(Err(e)); - } - let max_len_bits = b2[0]; - - let n_bytes = match self.reader.read_u32::() { - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - - let mut raw_assignment = vec![0u8; n_bytes as usize]; - if let Err(e) = self.reader.read_exact(&mut raw_assignment) { - return Some(Err(e)); - } - - let count = if self.variant == BenVariant::MkvChain { - match self.reader.read_u16::() { - Ok(c) => c, - Err(e) => return Some(Err(e)), - } - } else { - 1 - }; - - Some(Ok(BenFrame { - max_val_bits, - max_len_bits, - n_bytes, - raw_data: raw_assignment, - count, - })) - } -} - -/// Helper function to decode a ben frame into an assignment vector. -fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { - decode_ben_line( - Cursor::new(&frame.raw_data), - frame.max_val_bits, - frame.max_len_bits, - frame.n_bytes, - ) - .map(rle_to_vec) -} - -impl Iterator for BenDecoder { - type Item = io::Result; - - fn next(&mut self) -> Option> { - let ben_frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; - let assignment = match decode_ben_frame_to_assignment(&ben_frame) { - Ok(assgn) => assgn, - Err(e) => return Some(Err(e)), - }; - log!( - "Decoding sample: {}\r", - self.sample_count + ben_frame.count as usize - ); - Some(Ok((assignment, ben_frame.count))) - } -} - -pub struct BenFrameDecoeder { - inner: BenDecoder, -} - -impl BenFrameDecoeder { - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: BenDecoder::new(reader)?, - }) - } -} - -impl Iterator for BenFrameDecoeder { - type Item = io::Result; - - fn next(&mut self) -> Option { - self.inner.pop_frame_from_reader() - } -} - -impl BenDecoder { - /// Consume this decoder and iterate raw ben frames instead of decoded assignments. - pub fn into_frames(self) -> BenFrameDecoeder { - BenFrameDecoeder { inner: self } - } -} - -impl BenDecoder { - /// Count how many samples remain in this BEN stream. - /// Consumes the decoder (fast: walks frames only). - pub fn count_samples(self) -> io::Result { - let mut total = 0usize; - for frame_res in self.into_frames() { - let f = frame_res?; // BenFrame - total += f.count as usize; // 1 for Standard; >1 for MKVCHAIN - } - Ok(total) - } -} - -/// This function takes a reader containing a single ben32 encoded assignment -/// vector and decodes it into a full assignment vector of u16s. -/// -/// # Errors -/// -/// This function will return an error if the input reader is not a multiple of 4 -/// bytes long since each assignment vector is an run-length encoded as a 32 bit -/// integer (2 bytes for the value and 2 bytes for the count). -/// -fn decode_ben32_line(mut reader: R, variant: BenVariant) -> io::Result { - let mut buffer = [0u8; 4]; - let mut output_vec: Vec = Vec::new(); - - loop { - match reader.read_exact(&mut buffer) { - Ok(()) => { - let encoded = u32::from_be_bytes(buffer); - if encoded == 0 { - // Check for separator (all 0s) - break; // Exit loop to process next sample - } - - let value = (encoded >> 16) as u16; // High 16 bits - let count = (encoded & 0xFFFF) as u16; // Low 16 bits - - // Reconstruct the original data - for _ in 0..count { - output_vec.push(value); - } - } - Err(e) => { - return Err(e); // Propagate other errors - } - } - } - - let count = if variant == BenVariant::MkvChain { - reader - .read_u16::() - .expect("Error when reading sample.") - } else { - 1 - }; - - Ok((output_vec, count)) -} - -/// This function takes a reader containing a file encoded with the -/// "ben32" format and decodes it into a JSONL file. -/// -/// The output JSONL file will have the formatting -/// -/// ```json -/// {"assignment": [...], "sample": #} -/// ``` -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid ben32 -/// data or if the the decode method encounters while trying to extract a single -/// assignment vector, that error is propagated. -fn jsonl_decode_ben32( - mut reader: R, - mut writer: W, - starting_sample: usize, - variant: BenVariant, -) -> io::Result<()> { - let mut sample_number = 1; - loop { - let result = decode_ben32_line(&mut reader, variant); - if let Err(e) = result { - if e.kind() == io::ErrorKind::UnexpectedEof { - return Ok(()); - } - return Err(e); - } - - let (output_vec, count) = result.unwrap(); - - for _ in 0..count { - // Write the reconstructed vector as JSON to the output file - let line = json!({ - "assignment": output_vec, - "sample": sample_number + starting_sample, - }) - .to_string() - + "\n"; - - writer.write_all(line.as_bytes())?; - sample_number += 1; - } - } -} - -/// This function takes a reader containing a file encoded in the XBEN format -/// and decodes it into a BEN file. -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid xben -/// data or if the the decode method encounters while trying to convert the -/// xben data to ben data. -pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - - let mut first_buffer = [0u8; 17]; - - if let Err(e) = decoder.read_exact(&mut first_buffer) { - return Err(e); - } - - let variant = match &first_buffer { - b"STANDARD BEN FILE" => { - writer.write_all(b"STANDARD BEN FILE")?; - BenVariant::Standard - } - b"MKVCHAIN BEN FILE" => { - writer.write_all(b"MKVCHAIN BEN FILE")?; - BenVariant::MkvChain - } - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; - - let mut buffer = [0u8; 1048576]; // 1MB buffer - let mut overflow: Vec = Vec::new(); - - let mut line_count: usize = 0; - while let Ok(count) = decoder.read(&mut buffer) { - if count == 0 { - break; - } - - overflow.extend(&buffer[..count]); - - let mut last_valid_assignment = 0; - - // It is technically faster to read backwards from the last - // multiple of 4 smaller than the length of the overflow buffer - // but this provides only a minute speedup in almost all cases (maybe a - // few seconds). Reading from the front is both safer from a - // maintenance perspective and allows for a better progress indicator - match variant { - BenVariant::Standard => { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - log!("Decoding sample: {}\r", line_count); - } - } - } - BenVariant::MkvChain => { - for i in (3..overflow.len() - 2).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - log!("Decoding sample: {}\r", line_count); - } - } - } - } - - if last_valid_assignment == 0 { - continue; - } - - ben32_to_ben_lines(&overflow[0..last_valid_assignment], &mut writer, variant)?; - overflow = overflow[last_valid_assignment..].to_vec(); - } - logln!(); - logln!("Done!"); - Ok(()) -} - -/// This is a convenience function that decodes a general level 9 LZMA2 compressed file. -/// -/// ``` -/// use ben::encode::xz_compress; -/// use ben::decode::xz_decompress; -/// use lipsum::lipsum; -/// use std::io::{BufReader, BufWriter}; -/// -/// let input = lipsum(100); -/// let reader = BufReader::new(input.as_bytes()); -/// let mut output_buffer = Vec::new(); -/// let writer = BufWriter::new(&mut output_buffer); -/// -/// xz_compress(reader, writer, Some(1), Some(1)).unwrap(); -/// -/// let mut recovery_buff = Vec::new(); -/// let recovery_reader = BufWriter::new(&mut recovery_buff); -/// xz_decompress(output_buffer.as_slice(), recovery_reader).unwrap(); -/// println!("{:?}", output_buffer); -/// ``` -pub fn xz_decompress(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - let mut buffer = [0u8; 4096]; - - while let Ok(count) = decoder.read(&mut buffer) { - if count == 0 { - break; - } - writer.write_all(&buffer[..count])?; - } - - Ok(()) -} - -/// This is a helper function that is designed to read in a single -/// ben encoded line and convert it to a regular run-length encoded -/// assignment vector. -pub fn decode_ben_line( - mut reader: R, - max_val_bits: u8, - max_len_bits: u8, - n_bytes: u32, -) -> io::Result> { - let mut assign_bits: Vec = vec![0; n_bytes as usize]; - reader.read_exact(&mut assign_bits)?; - - // This should be right, but it doesn't need to be exact - let n_assignments: usize = - (n_bytes as f64 / ((max_val_bits + max_len_bits) as f64 / 8.0)) as usize; - let mut output_rle: Vec<(u16, u16)> = Vec::with_capacity(n_assignments); - - let mut buffer: u32 = 0; - let mut n_bits_in_buff: u16 = 0; - - let mut val = 0; - let mut val_set = false; - let mut len = 0; - let mut len_set = false; - - for (_, &byte) in assign_bits.iter().enumerate() { - buffer = buffer | ((byte as u32).to_be() >> (n_bits_in_buff)); - n_bits_in_buff += 8; - - if n_bits_in_buff >= max_val_bits as u16 && !val_set { - val = (buffer >> (32 - max_val_bits)) as u16; - - buffer = (buffer << max_val_bits) as u32; - n_bits_in_buff -= max_val_bits as u16; - val_set = true; - } - - if n_bits_in_buff >= max_len_bits as u16 && val_set && !len_set { - len = (buffer >> (32 - max_len_bits)) as u16; - buffer = buffer << max_len_bits; - n_bits_in_buff -= max_len_bits as u16; - len_set = true; - } - - if val_set && len_set { - // If max_val_bits and max_len_bits are <= 4 - // then the rle can bet (0,0) pairs pushed to it - if len > 0 { - output_rle.push((val, len)); - } - val_set = false; - len_set = false; - } - - while n_bits_in_buff >= max_val_bits as u16 + max_len_bits as u16 { - if n_bits_in_buff >= max_val_bits as u16 && !val_set { - val = (buffer >> (32 - max_val_bits)) as u16; - buffer = (buffer << max_val_bits) as u32; - n_bits_in_buff -= max_val_bits as u16; - val_set = true; - } - - if n_bits_in_buff >= max_len_bits as u16 && val_set && !len_set { - len = (buffer >> (32 - max_len_bits)) as u16; - buffer = buffer << max_len_bits; - n_bits_in_buff -= max_len_bits as u16; - len_set = true; - } - - if val_set && len_set { - // If the max_val_bits and max_len_bits are <= 4 - // then the rle can bet (0,0) pairs pushed to it - if len > 0 { - output_rle.push((val, len)); - } - val_set = false; - len_set = false; - } - } - } - - Ok(output_rle) -} - -/// This function takes a reader containing a file encoded in the BEN format -/// and decodes it into a JSONL file. -/// -/// The output JSONL file will have the formatting -/// -/// ```json -/// {"assignment": [...], "sample": #} -/// ``` -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid ben -/// data or if the the decode method encounters while trying to extract a single -/// assignment vector, that error is then propagated. -pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { - let mut ben_decoder = BenDecoder::new(reader)?; - ben_decoder.write_all_jsonl(writer) -} - -/// This function takes a reader containing a file encoded in the XBEN format -/// and decodes it into a JSONL file. -/// -/// The output JSONL file will have the formatting -/// -/// ```json -/// {"assignment": [...], "sample": #} -/// ``` -/// -/// # Errors -/// -/// This function will return an error if the input reader contains invalid xben -/// data or if the the decode method encounters while trying to extract a single -/// assignment vector, that error is then propagated. -pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - - let mut first_buffer = [0u8; 17]; - - if let Err(e) = decoder.read_exact(&mut first_buffer) { - return Err(e); - } - - let variant = match &first_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; - - let mut buffer = [0u8; 1 << 20]; // 1MB buffer - let mut overflow: Vec = Vec::new(); - - let mut line_count: usize = 0; - let mut starting_sample: usize = 0; - while let Ok(count) = decoder.read(&mut buffer) { - if count == 0 { - break; - } - - overflow.extend(&buffer[..count]); - - let mut last_valid_assignment = 0; - - // It is technically faster to read backwards from the last - // multiple of 4 smaller than the length of the overflow buffer - // but this provides only a minute speedup in almost all cases (maybe a - // few seconds). Reading from the front is both safer from a - // maintenance perspective and allows for a better progress indicator - match variant { - BenVariant::Standard => { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - log!("Decoding sample: {}\r", line_count); - } - } - } - BenVariant::MkvChain => { - // Need a different step size here because each assignment - // vector is no longer guaranteed to be a multiple of 4 bytes - // due to the 2-byte repetition count appended at the end - for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - log!("Decoding sample: {}\r", line_count); - } - } - } - } - - if last_valid_assignment == 0 { - continue; - } - - jsonl_decode_ben32( - &overflow[0..last_valid_assignment], - &mut writer, - starting_sample, - variant, - )?; - overflow.drain(..last_valid_assignment); - starting_sample = line_count; - } - logln!(); - logln!("Done!"); - Ok(()) -} - -/// Iterator over decoded assignments inside an XBEN stream. -/// Yields `(assignment, count)` where `count` is the repetition count -pub struct XBenDecoder { - xz: BufReader>, - pub variant: BenVariant, - overflow: Vec, - buf: Box<[u8]>, // reusable read buffer -} - -impl XBenDecoder { - pub fn new(reader: R) -> io::Result { - let xz = XzDecoder::new(reader); - let mut xz = BufReader::with_capacity(1 << 20, xz); - - // Read the 17-byte banner to determine variant - let mut first = [0u8; 17]; - xz.read_exact(&mut first)?; - let variant = match &first { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - _ => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Invalid .xben header (expecting STANDARD/MKVCHAIN BEN FILE)", - )); - } - }; - - Ok(Self { - xz, - variant, - overflow: Vec::with_capacity(1 << 20), - buf: vec![0u8; 1 << 20].into_boxed_slice(), - }) - } - - /// Try to pop one *complete* ben32 frame from `overflow`. - /// - /// # Arguments - /// - /// * `overflow` - A byte slice that may contain one or more complete ben32 frames. - /// - /// # Returns - /// - /// An Option containing a tuple of: - /// - /// * the complete frame as a byte slice, - /// * the number of bytes consumed from the start of `overflow` to get this frame, - fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { - match self.variant { - BenVariant::Standard => { - // Frame ends right after 4 zero bytes - // ... [payload] ... 00 00 00 00 - if overflow.len() < 4 { - return None; - } - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let end = i + 1; - let frame = &overflow[..end]; - // In STANDARD, count is always 1 - return Some((frame, end, 1)); - } - } - None - } - BenVariant::MkvChain => { - // ... [payload] ... 00 00 00 00 - if overflow.len() < 6 { - return None; - } - for i in (3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let count_hi = overflow[i + 1]; - let count_lo = overflow[i + 2]; - let count = u16::from_be_bytes([count_hi, count_lo]); - let end = i + 3; // inclusive of count bytes - let frame = &overflow[..end]; - return Some((frame, end, count)); - } - } - None - } - } - } -} - -/// Helper function to decode a ben32 frame (raw bytes) into an assignment vector. -fn decode_xben_frame_to_assignment( - frame_bytes: &[u8], - variant: BenVariant, -) -> io::Result> { - let cursor = Cursor::new(frame_bytes); - let (assignment, _) = decode_ben32_line(cursor, variant)?; - Ok(assignment) -} - -impl Iterator for XBenDecoder { - type Item = io::Result; - - fn next(&mut self) -> Option { - loop { - // If we already have a complete frame in overflow, decode and return it - if let Some((frame_bytes, consumed, count)) = - self.pop_frame_from_overflow(&self.overflow) - { - let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { - Ok(assignment) => Ok((assignment, count)), - Err(e) => Err(e), - }; - // drop the used bytes - self.overflow.drain(..consumed); - return Some(res); - } - - // Otherwise, read more from the XZ stream - let read = match self.xz.read(&mut self.buf) { - Ok(0) => { - // EOF: no more data; if there's leftover but not a full frame, report error or stop - if self.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.overflow.extend_from_slice(&self.buf[..read]); - } - } -} - -/// A frame is the raw ben32 bytes plus its repetition count (1 for Standard). -pub type Ben32Frame = (Vec, u16); - -/// Iterator over raw ben32 frames inside an XBEN stream. -/// -/// Yields `(frame_bytes, count)` where `frame_bytes` includes the 4-byte -/// 0x00_00_00_00 terminator; for `MkvChain` frames it also includes the -/// 2-byte big-endian repetition count at the end. `count` is the decoded -/// repetition count (1 for Standard). -/// -/// Mainly useful for finding an assignment quickly -pub struct XBenFrameDecoder { - inner: XBenDecoder, -} - -impl XBenFrameDecoder { - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: XBenDecoder::new(reader)?, - }) - } -} - -impl Iterator for XBenFrameDecoder { - type Item = io::Result; - - fn next(&mut self) -> Option { - loop { - if let Some((frame, consumed, count)) = - self.inner.pop_frame_from_overflow(&self.inner.overflow) - { - // copy out the frame; caller owns the bytes - let out = frame.to_vec(); - self.inner.overflow.drain(..consumed); - return Some(Ok((out, count))); - } - - // refill from xz - let read = match self.inner.xz.read(&mut self.inner.buf) { - Ok(0) => { - if self.inner.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.inner - .overflow - .extend_from_slice(&self.inner.buf[..read]); - } - } -} - -impl XBenDecoder { - /// Consumes the decoder and iterate raw ben32 frames instead of decoded assignments. - pub fn into_frames(self) -> XBenFrameDecoder { - XBenFrameDecoder { inner: self } - } -} - -impl XBenDecoder { - /// Count how many samples remain in this XBEN stream. - /// Consumes the decoder (fast: walks frames only). - pub fn count_samples(self) -> io::Result { - let mut total = 0usize; - for frame_res in self.into_frames() { - let (_bytes, cnt) = frame_res?; // raw ben32 bytes + repetition count - total += cnt as usize; - } - Ok(total) - } -} - -/// A generalized frame object that can be either a BenFrame -/// or a XBEN frame (raw bytes + variant). -#[derive(Clone)] -pub enum Frame { - Ben(BenFrame), // from BenFrameDecoeder - XBen(Vec, BenVariant), // raw ben32 bytes + variant (count is carried beside) -} - -pub enum Selection { - Indices(Peekable>), // 1-based, sorted - Every { step: usize, offset: usize }, // 1-based - Range { start: usize, end: usize }, // inclusive, 1-based -} - -/// Decode a Frame (Ben or XBen) into an assignment vector. -fn decode_frame_to_assignment(frame: &Frame) -> io::Result> { - match frame { - Frame::Ben(f) => decode_ben_frame_to_assignment(f), - Frame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), - } -} - -pub struct SubsampleFrameDecoder -where - I: Iterator>, -{ - inner: I, - selection: Selection, - sample: usize, // processed so far (1-based) -} - -impl SubsampleFrameDecoder -where - I: Iterator>, -{ - pub fn new(inner: I, selection: Selection) -> Self { - Self { - inner, - selection, - sample: 0, - } - } - - /// 1-based indices, in any order (duplicates removed internally). - pub fn by_indices(inner: I, indices: T) -> Self - where - T: IntoIterator, - { - let mut v: Vec = indices.into_iter().collect(); - v.sort_unstable(); - v.dedup(); - Self::new(inner, Selection::Indices(v.into_iter().peekable())) - } - - /// Inclusive 1-based range [start, end]. - pub fn by_range(inner: I, start: usize, end: usize) -> Self { - assert!( - start >= 1 && end >= start, - "range must be 1-based and end >= start" - ); - Self::new(inner, Selection::Range { start, end }) - } - - /// Every `step` samples starting from 1-based `offset`. - pub fn every(inner: I, step: usize, offset: usize) -> Self { - assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); - Self::new(inner, Selection::Every { step, offset }) - } - - // Helper function to count how many selected samples are in the interval [lo, hi]. - // Both lo and hi are 1-based, inclusive. - fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { - match &mut self.selection { - Selection::Indices(iter) => { - let mut taken = 0u16; - while let Some(&next) = iter.peek() { - if next < lo { - iter.next(); - continue; - } - if next > hi { - break; - } - iter.next(); - taken = taken.saturating_add(1); - } - taken - } - Selection::Every { step, offset } => { - let start = lo.max(*offset); - if start > hi { - return 0; - } - let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; - let first = start + ((*step - r) % *step); - if first > hi { - 0 - } else { - (1 + (hi - first) / *step) as u16 - } - } - Selection::Range { start, end } => { - if hi < *start || lo > *end { - 0 - } else { - let a = lo.max(*start); - let b = hi.min(*end); - (b - a + 1) as u16 - } - } - } - } -} - -impl Iterator for SubsampleFrameDecoder -where - I: Iterator>, -{ - type Item = io::Result; // (Vec, u16) - - fn next(&mut self) -> Option { - loop { - // early-exit for Range - if let Selection::Range { end, .. } = self.selection { - if self.sample >= end { - return None; - } - } - // early-exit for Indices - if let Selection::Indices(ref mut it) = self.selection { - if it.peek().is_none() { - return None; - } - } - - let (frame, count) = match self.inner.next()? { - Ok(x) => x, - Err(e) => return Some(Err(e)), - }; - - let lo = self.sample + 1; - let hi = self.sample + count as usize; - let selected = self.count_selected_in(lo, hi); - - // advance regardless - self.sample = hi; - - if selected > 0 { - match decode_frame_to_assignment(&frame) { - Ok(assignment) => return Some(Ok((assignment, selected))), - Err(e) => return Some(Err(e)), - } - } - } - } -} - -pub type FrameIter = Box> + Send>; - -/// Build a frame iterator from a file path and mode ("ben" or "xben") -/// -/// Frame iteration is useful for subsampling since you do not need to decode every frame -/// into an assignment vector. Since the BEN standard includes information about the number -/// of bytes used to encode each frame, reading through the file and extracting particular -/// frames is incredibly fast. -/// -/// # Arguments -/// -/// * `file_path` - A PathBuf pointing to the BEN or XBEN file. -/// * `mode` - A string slice indicating the file type: "ben" or "xben". -/// -/// # Returns -/// -/// An io::Result containing a boxed iterator over frames and their repetition counts. -pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { - let file = File::options().read(true).open(file_path)?; - let reader = BufReader::new(file); - - match mode { - "ben" => { - // Ben -> BenFrameDecoeder - let frames = BenFrameDecoeder::new(reader)?; // Iterator> - let mapped = frames.map(|res| { - res.map(|f| { - let cnt = f.count; - (Frame::Ben(f), cnt) - }) - }); - Ok(Box::new(mapped)) - } - "xben" => { - // XBen -> XBenFrameDecoder (need variant) - let x = XBenDecoder::new(reader)?; - let variant = x.variant; - let frames = x.into_frames(); // Iterator, u16)>> - let mapped = - frames.map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); - Ok(Box::new(mapped)) - } - _ => Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), - } -} - -impl BenDecoder { - /// Create a subsample iterator from this decoder that iterates over specific indices. - /// These indices are 1-based. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based indices to select. - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let count = f.count; - (Frame::Ben(f), count) - }) - }); - SubsampleFrameDecoder::by_indices(frames, indices) - } - - /// Create a subsample iterator from this decoder that iterates over a range of samples. - /// - /// # Arguments - /// - /// * `start` - The 1-based start index (inclusive). - /// * `end` - The 1-based end index (inclusive). - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (Frame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::by_range(frames, start, end) - } - - /// Create a subsample iterator from this decoder that iterates every `step` samples - /// starting from 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The step size (must be >= 1). - /// * `offset` - The 1-based offset to start from (must be >= 1). - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (Frame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::every(frames, step, offset) - } -} - -impl XBenDecoder { - /// Create a subsample iterator from this decoder that iterates over specific indices. - /// These indices are 1-based. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based indices to select. - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let variant = self.variant; // ensure BenVariant: Copy - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_indices(Box::new(frames), indices) - } - - /// Create a subsample iterator from this decoder that iterates over a range of samples. - /// - /// # Arguments - /// - /// * `start` - The 1-based start index (inclusive). - /// * `end` - The 1-based end index (inclusive). - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_range(Box::new(frames), start, end) - } - - /// Create a subsample iterator from this decoder that iterates every `step` samples - /// starting from 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The step size (must be >= 1). - /// * `offset` - The 1-based offset to start from (must be >= 1). - /// - /// # Returns - /// - /// An io::Result containing a SubsampleFrameDecoder that yields - /// decoded assignments and their repetition counts. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::every(Box::new(frames), step, offset) - } -} - -pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { - let iter = build_frame_iter(&path.to_path_buf(), mode)?; - let mut total = 0usize; - for item in iter { - let (_frame, cnt) = item?; - total += cnt as usize; - } - Ok(total) -} - -#[cfg(test)] -#[path = "tests/decode_tests.rs"] -mod tests; diff --git a/ben/src/encode/mod.rs b/ben/src/encode/mod.rs deleted file mode 100755 index c67ce19..0000000 --- a/ben/src/encode/mod.rs +++ /dev/null @@ -1,727 +0,0 @@ -//! This module contains the main encoding functions for turning an -//! input JSONL or BEN file into a BEN or XBEN file. -//! -//! Any input JSONL file is expected to be in the standard -//! -//! ```json -//! {"assignment": [...], "sample": #} -//! ``` -//! -//! format. -//! -//! The BEN format is -//! a simple bit-packed run-length encoded assignment vector with -//! some special headers that allow the decoder to know how many -//! bytes to read for each sample. -//! -//! -//! The XBEN format uses LZMA2 dictionary compression on -//! a byte-level decompressed version of the BEN format (known as ben32) -//! to achieve better compression ratios than we could achieve with applying -//! LZMA2 compression directly to the BEN format. - -pub mod relabel; -pub mod translate; - -use crate::utils::*; -use serde_json::Value; -use std::io::{self, BufRead, Read, Result, Write}; -use xz2::stream::MtStreamBuilder; -use xz2::write::XzEncoder; - -use self::translate::ben_to_ben32_lines; -use super::{log, logln, BenVariant}; - -/// A struct to make the writing of BEN files easier -/// and more ergonomic. -/// -/// # Example -/// -/// ``` -/// use ben::{encode::BenEncoder, BenVariant}; -/// -/// let mut buffer = Vec::new(); -/// let mut ben_encoder = BenEncoder::new(&mut buffer, BenVariant::Standard); -/// -/// ben_encoder.write_assignment(vec![1, 1, 1, 2, 2, 2]); -/// ``` -pub struct BenEncoder { - writer: W, - previous_sample: Vec, - count: u16, - variant: BenVariant, - complete: bool, -} - -impl BenEncoder { - /// Create a new BenEncoder instance and handles - /// the BEN file header. - /// - /// # Arguments - /// - /// * `writer` - A writer to write the BEN file to - /// * `variant` - The BEN variant to use (Standard or MkvChain) - /// - /// # Returns - /// - /// A new BenEncoder instance - pub fn new(mut writer: W, variant: BenVariant) -> Self { - match variant { - BenVariant::Standard => { - writer.write_all(b"STANDARD BEN FILE").unwrap(); - } - BenVariant::MkvChain => { - writer.write_all(b"MKVCHAIN BEN FILE").unwrap(); - } - } - BenEncoder { - writer, - previous_sample: Vec::new(), - count: 0, - complete: false, - variant: variant, - } - } - - /// Write a run-length encoded assignment vector to the - /// BEN file. - /// - /// # Arguments - /// - /// * `rle_vec` - A run-length encoded assignment vector to write - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - pub fn write_rle(&mut self, rle_vec: Vec<(u16, u16)>) -> Result<()> { - match self.variant { - BenVariant::Standard => { - let encoded = encode_ben_vec_from_rle(rle_vec); - self.writer.write_all(&encoded)?; - Ok(()) - } - BenVariant::MkvChain => { - let encoded = encode_ben_vec_from_rle(rle_vec); - if encoded == self.previous_sample { - self.count += 1; - } else { - if self.count > 0 { - self.writer.write_all(&self.previous_sample)?; - self.writer.write_all(&self.count.to_be_bytes())?; - } - self.previous_sample = encoded; - self.count = 1; - } - Ok(()) - } - } - } - - /// Write an assignment vector to the BEN file. - /// - /// # Arguments - /// - /// * `assign_vec` - An assignment vector to write - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - let rle_vec = assign_to_rle(assign_vec); - self.write_rle(rle_vec)?; - Ok(()) - } - - /// Write a JSON value containing an assignment vector to the BEN file. - /// - /// # Arguments - /// - /// * `data` - A JSON value containing an assignment vector - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let assign_vec = data["assignment"].as_array().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "'assignment' field either missing or is not an array of integers", - ) - })?; - let converted_vec = assign_vec - .into_iter() - .map(|x| { - let u = x.as_u64().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!( - "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", - x - ), - ) - })?; - - u16::try_from(u).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", u), - ) - }) - }) - .collect::>>()?; - - let rle_vec = assign_to_rle(converted_vec); - self.write_rle(rle_vec)?; - Ok(()) - } - - /// Cleanup function to make sure the last sample is written - /// to the BEN file if using the MkvChain variant. - /// - /// This function is automatically called when the BenEncoder - /// goes out of scope, but can be called manually if desired. - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - /// - /// # Errors - /// - /// This function will return an error if the writer encounters - /// an error while writing the last sample to the BEN file. - pub fn finish(&mut self) -> Result<()> { - if self.complete { - return Ok(()); - } - if self.variant == BenVariant::MkvChain && self.count > 0 { - self.writer - .write_all(&self.previous_sample) - .expect("Error while writing last line to file"); - self.writer - .write_all(&self.count.to_be_bytes()) - .expect("Error while writing last count to file"); - } - self.complete = true; - Ok(()) - } -} - -impl Drop for BenEncoder { - /// Make sure to finish writing the BEN file when the - /// BenEncoder goes out of scope. - fn drop(&mut self) { - let _ = self.finish(); - } -} - -/// A struct to make the writing of XBEN files easier -/// and more ergonomic. -pub struct XBenEncoder { - encoder: XzEncoder, - previous_sample: Vec, - count: u16, - variant: BenVariant, -} - -impl XBenEncoder { - /// Create a new XBenEncoder instance and handles - /// the XBEN file header. - /// - /// # Arguments - /// - /// * `encoder` - An XzEncoder to write the XBEN file to - /// * `variant` - The BEN variant to use (Standard or MkvChain) - /// - /// # Returns - /// - /// A new XBenEncoder instance - pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { - match variant { - BenVariant::Standard => { - encoder.write_all(b"STANDARD BEN FILE").unwrap(); - XBenEncoder { - encoder, - previous_sample: Vec::new(), - count: 0, - variant: BenVariant::Standard, - } - } - BenVariant::MkvChain => { - encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); - XBenEncoder { - encoder, - previous_sample: Vec::new(), - count: 0, - variant: BenVariant::MkvChain, - } - } - } - } - - /// Write a an assigment vector encoded as a JSON value - /// to the XBEN file. - /// - /// # Arguments - /// - /// * `data` - A JSON value containing an assignment vector - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let encoded = encode_ben32_line(data); - match self.variant { - BenVariant::Standard => { - self.encoder.write_all(&encoded)?; - } - BenVariant::MkvChain => { - if encoded == self.previous_sample { - self.count += 1; - } else { - if self.count > 0 { - self.encoder.write_all(&self.previous_sample)?; - self.encoder.write_all(&self.count.to_be_bytes())?; - } - self.previous_sample = encoded; - self.count = 1; - } - } - } - Ok(()) - } - - /// Converts a raw BEN assignment file into to an XBEN file. - /// This function will check to see if the header is there and then - /// handle it accordingly. - /// - /// # Arguments - /// - /// * `reader` - A buffered reader for the input BEN file - /// - /// # Returns - /// - /// A Result type that contains the result of the operation - pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { - let peek = reader.fill_buf()?; - let has_banner = peek.len() >= 17 - && (peek.starts_with(b"STANDARD BEN FILE") || peek.starts_with(b"MKVCHAIN BEN FILE")); - - if has_banner { - reader.consume(17); - } - - ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) - } -} - -impl Drop for XBenEncoder { - /// Make sure to finish writing the XBEN file when the - /// XBenEncoder goes out of scope. - fn drop(&mut self) { - if self.variant == BenVariant::MkvChain && self.count > 0 { - self.encoder - .write_all(&self.previous_sample) - .expect("Error writing last line to file"); - self.encoder - .write_all(&self.count.to_be_bytes()) - .expect("Error writing last line count to file"); - } - } -} - -/// This function takes a json encoded line containing an assignment -/// vector and a sample number and encodes the assignment vector -/// into a binary format known as "ben32". The ben32 format serves -/// as an intermediate format that allows for efficient compression -/// of BEN files using LZMA2 compression methods. -/// -/// # Arguments -/// -/// * `data` - A JSON object containing an assignment vector and a sample number -/// -/// # Returns -/// -/// A vector of bytes containing the ben32 encoded assignment vector -fn encode_ben32_line(data: Value) -> Vec { - let assign_vec = data["assignment"].as_array().unwrap(); - let mut prev_assign: u16 = 0; - let mut count: u16 = 0; - let mut first = true; - - let mut ret = Vec::new(); - - for assignment in assign_vec { - let assign = assignment.as_u64().unwrap() as u16; - if first { - prev_assign = assign; - count = 1; - first = false; - continue; - } - if assign == prev_assign { - count += 1; - } else { - let encoded = (prev_assign as u32) << 16 | count as u32; - ret.extend(&encoded.to_be_bytes()); - // Reset for next run - prev_assign = assign; - count = 1; - } - } - - // Handle the last run - if count > 0 { - let encoded = (prev_assign as u32) << 16 | count as u32; - ret.extend(&encoded.to_be_bytes()); - } - - ret.extend([0, 0, 0, 0]); - ret -} - -/// This function takes a JSONL file and compresses it to the -/// XBEN format. -/// -/// The JSONL file is assumed to be formatted in the standard -/// -/// ```json -/// {"assignment": [...], "sample": #} -/// ``` -/// -/// format. While the BEN format is -/// a simple bit-packed (streamable!) run-length encoded assignment -/// vector, the XBEN format uses LZMA2 dictionary compression on -/// the byte level to achieve better compression ratios. In order -/// to use XBEN files, the `decode_xben_to_ben` function must be -/// used to decode the file back into a BEN format. -/// -/// # Arguments -/// -/// * `reader` - A buffered reader for the input file -/// * `writer` - A writer for the output file -/// * `variant` - The BEN variant to use (Standard or MkvChain) -/// * `n_threads` - The number of threads to use for compression (optional) -/// * `compression_level` - The compression level to use (0-9, optional) -/// -/// # Returns -/// -/// A Result type that contains the result of the operation -pub fn encode_jsonl_to_xben( - reader: R, - writer: W, - variant: BenVariant, - n_threads: Option, - compression_level: Option, -) -> Result<()> { - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - - let level = compression_level.unwrap_or(9).min(9).max(0); - - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(0) - .encoder() - .expect("init MT encoder"); - let encoder = XzEncoder::new_stream(writer, mt); - let mut ben_encoder = XBenEncoder::new(encoder, variant); - - let mut line_num = 1; - - for line_result in reader.lines() { - log!("Encoding line: {}\r", line_num); - line_num += 1; - let line = line_result?; - let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); - - ben_encoder.write_json_value(data)?; - } - - logln!(); - logln!("Done!"); - - Ok(()) -} - -/// This is a convenience function that applies level 9 LZMA2 compression -/// to a general file. -/// -/// # Arguments -/// -/// * `reader` - A buffered reader for the input file -/// * `writer` - A writer for the output file -/// -/// # Returns -/// -/// A Result type that contains the result of the operation -/// -/// # Example -/// -/// ``` -/// use ben::encode::xz_compress; -/// use lipsum::lipsum; -/// use std::io::{BufReader, BufWriter}; -/// -/// let input = lipsum(100); -/// let reader = BufReader::new(input.as_bytes()); -/// -/// let mut output_buffer = Vec::new(); -/// let writer = BufWriter::new(&mut output_buffer); -/// -/// xz_compress(reader, writer, Some(1), Some(1)).unwrap(); -/// -/// println!("{:?}", output_buffer); -/// ``` -pub fn xz_compress( - mut reader: R, - writer: W, - n_threads: Option, - compression_level: Option, -) -> Result<()> { - let mut buff = [0; 4096]; - // let mut encoder = XzEncoder::new(writer, 1); - - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - - let level = compression_level.unwrap_or(9).min(9).max(0); - - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(0) - .encoder() - .expect("init MT encoder"); - let mut encoder = XzEncoder::new_stream(writer, mt); - - while let Ok(count) = reader.read(&mut buff) { - if count == 0 { - break; - } - encoder.write_all(&buff[..count])?; - } - drop(encoder); // Make sure to flush and finish compression - Ok(()) -} - -/// This function takes in a standard assignment vector and encodes -/// it into a bit-packed ben version. -/// -/// # Arguments -/// -/// * `assign_vec` - A vector of u16 values representing the assignment vector -/// -/// # Returns -/// -/// A vector of bytes containing the bit-packed ben encoded assignment vector -pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { - let rle_vec: Vec<(u16, u16)> = assign_to_rle(assign_vec); - encode_ben_vec_from_rle(rle_vec) -} - -/// This function takes a run-length encoded assignment vector and -/// encodes into a bit-packed ben version -/// -/// # Arguments -/// -/// * `rle_vec` - A vector of tuples containing the value and length of each run -/// -/// # Returns -/// -/// A vector of bytes containing the bit-packed ben encoded assignment vector -pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> Vec { - let mut output_vec: Vec = Vec::new(); - - let max_val: u16 = rle_vec.iter().max_by_key(|x| x.0).unwrap().0; - let max_len: u16 = rle_vec.iter().max_by_key(|x| x.1).unwrap().1; - let max_val_bits: u8 = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bits: u8 = 16 - max_len.leading_zeros() as u8; - let assign_bits: u32 = (max_val_bits + max_len_bits) as u32; - let n_bytes: u32 = if (assign_bits * rle_vec.len() as u32) % 8 == 0 { - (assign_bits * rle_vec.len() as u32) / 8 - } else { - (assign_bits * rle_vec.len() as u32) / 8 + 1 - }; - - output_vec.push(max_val_bits); - output_vec.push(max_len_bits); - output_vec.extend(n_bytes.to_be_bytes().as_slice()); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for (val, len) in rle_vec { - let mut new_val: u32 = (remainder << max_val_bits) | (val as u32); - - let mut buff: u8; - - let mut n_bits_left: u8 = remainder_bits + max_val_bits; - - while n_bits_left >= 8 { - n_bits_left -= 8; - buff = (new_val >> n_bits_left) as u8; - output_vec.push(buff); - new_val = new_val & (!((0xFFFFFFFF as u32) << n_bits_left)); - } - - new_val = (new_val << max_len_bits) | (len as u32); - n_bits_left += max_len_bits; - - while n_bits_left >= 8 { - n_bits_left -= 8; - buff = (new_val >> n_bits_left) as u8; - output_vec.push(buff); - new_val = new_val & (!((0xFFFFFFFF as u32) << n_bits_left)); - } - - remainder_bits = n_bits_left; - remainder = new_val; - } - - if remainder_bits > 0 { - let buff = (remainder << (8 - remainder_bits)) as u8; - output_vec.push(buff); - } - - output_vec -} - -/// This function takes a JSONL file and compresses it into -/// the BEN format. -/// -/// The JSONL file is assumed to be formatted in the standard -/// -/// ```json -/// {"assignment": [...], "sample": #} -/// ``` -/// -/// format. -/// -/// # Arguments -/// -/// * `reader` - A buffered reader for the input file -/// * `writer` - A writer for the output file -/// * `variant` - The BEN variant to use (Standard or MkvChain) -/// -/// # Returns -/// -/// A Result type that contains the result of the operation -/// -/// # Example -/// -/// ``` -/// use std::io::{BufReader, BufWriter}; -/// use serde_json::json; -/// use ben::{encode::encode_jsonl_to_ben, BenVariant}; -/// -/// let input = r#"{"assignment": [1,1,1,2,2,2], "sample": 1}"#.to_string() -/// + "\n" -/// + r#"{"assignment": [1,1,2,2,1,2], "sample": 2}"#; -/// -/// let reader = BufReader::new(input.as_bytes()); -/// let mut write_buffer = Vec::new(); -/// let mut writer = BufWriter::new(&mut write_buffer); -/// -/// encode_jsonl_to_ben(reader, writer, BenVariant::Standard).unwrap(); -/// -/// println!("{:?}", write_buffer); -/// // This will output -/// // [83, 84, 65, 78, 68, 65, 82, 68, 32, -/// // 66, 69, 78, 32, 70, 73, 76, 69, 2, -/// // 2, 0, 0, 0, 1, 123, 2, 2, 0, 0, 0, -/// // 2, 106, 89] -/// ``` -/// -pub fn encode_jsonl_to_ben( - reader: R, - writer: W, - variant: BenVariant, -) -> Result<()> { - let mut line_num = 1; - let mut ben_encoder = BenEncoder::new(writer, variant); - for line_result in reader.lines() { - log!("Encoding line: {}\r", line_num); - line_num += 1; - let line = line_result?; // Handle potential I/O errors for each line - let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); - - ben_encoder.write_json_value(data)?; - } - logln!(); - logln!("Done!"); // Print newline after progress bar - Ok(()) -} - -/// This function takes a BEN file and encodes it into an XBEN -/// file using bit-to-byte decompression followed by LZMA2 compression. -/// -/// # Arguments -/// -/// * `reader` - A buffered reader for the input file -/// * `writer` - A writer for the output file -/// * `n_threads` - The number of threads to use for compression (optional) -/// * `compression_level` - The compression level to use (0-9, optional) -/// -/// # Returns -/// -/// A Result type that contains the result of the operation -pub fn encode_ben_to_xben( - mut reader: R, - writer: W, - n_threads: Option, - compression_level: Option, -) -> Result<()> { - let mut check_buffer = [0u8; 17]; - reader.read_exact(&mut check_buffer)?; - - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - - let level = compression_level.unwrap_or(9).min(9).max(0); - - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(0) - .encoder() - .expect("init MT encoder"); - let encoder = XzEncoder::new_stream(writer, mt); - - let mut ben_encoder = match &check_buffer { - b"STANDARD BEN FILE" => XBenEncoder::new(encoder, BenVariant::Standard), - b"MKVCHAIN BEN FILE" => XBenEncoder::new(encoder, BenVariant::MkvChain), - _ => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; - - ben_encoder.write_ben_file(reader)?; - - Ok(()) -} - -#[cfg(test)] -#[path = "tests/encode_tests.rs"] -mod tests; diff --git a/ben/src/encode/relabel.rs b/ben/src/encode/relabel.rs deleted file mode 100755 index a5c1c10..0000000 --- a/ben/src/encode/relabel.rs +++ /dev/null @@ -1,588 +0,0 @@ -//! This module contains the main functions that are used in the `reben` binary -//! for relabeling the assignment vectors in a BEN file. The relabeling is done -//! can be doe either so that the values are in ascending order or according to -//! a mapping provided by the user in a map file. - -use crate::decode::*; -use crate::encode::*; -use byteorder::{BigEndian, ReadBytesExt}; -use std::collections::HashMap; -use std::io::Error; - -/// Relabels each of the assignment vectors in a BEN file so that the values are -/// in ascending order. -/// -/// # Arguments -/// -/// * `reader` - A reader that implements the `Read` trait containing the BEN file to -/// be relabeled. -/// * `writer` - A writer that implements the `Write` trait and which will contain the -/// relabeled BEN file. -/// -/// # Errors -/// -/// Returns an error if the file format is invalid or if there is an issue reading or writing -/// the file. -pub fn relabel_ben_lines( - mut reader: R, - mut writer: W, - variant: BenVariant, -) -> io::Result<()> { - let mut sample_number = 0; - loop { - let mut tmp_buffer = [0u8]; - let max_val_bits = match reader.read_exact(&mut tmp_buffer) { - Ok(_) => tmp_buffer[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - break; - } - return Err(e); - } - }; - - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - - let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - - // relabel the line - let mut label = 0; - let mut label_map = HashMap::new(); - for (val, _len) in ben_line.iter_mut() { - let new_val = match label_map.get(val) { - Some(v) => *v, - None => { - label += 1; - label_map.insert(*val, label); - label - } - }; - *val = new_val; - } - - let relabeled = encode_ben_vec_from_rle(ben_line); - writer.write_all(&relabeled)?; - - let count_occurrences = if variant == BenVariant::MkvChain { - let count = reader.read_u16::()?; - writer.write_all(&count.to_be_bytes())?; - count - } else { - 1 - }; - - sample_number += count_occurrences as usize; - - log!("Relabeling line: {}\r", sample_number); - } - logln!(); - logln!("Done!"); - - Ok(()) -} - -/// Relabels the values in a BEN file so that the assignment vector values are -/// in ascending order. So , if the assignment vector is [2, 3, 1, 4, 5, 5, 3, 4, 2] -/// the relabeled assignment vector will be [1, 2, 3, 4, 5, 5, 2, 4, 1]. -/// -/// # Arguments -/// -/// * `reader` - A reader that implements the `Read` trait containing the BEN file to -/// be relabeled. -/// * `writer` - A writer that implements the `Write` trait and which will contain the -/// relabeled BEN file. -/// -/// # Errors -/// -/// Returns an error if the file format is invalid or if there is an issue reading or writing -/// the file. -pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; - reader.read_exact(&mut check_buffer)?; - - let variant = match &check_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; - - writer.write_all(&check_buffer)?; - - relabel_ben_lines(&mut reader, &mut writer, variant)?; - - Ok(()) -} - -/// Relabels the values in a BEN file so that the assignment vector values are -/// sorted according to a mapping. The mapping is a HashMap where the key is the -/// position in the new assignment vector and the value is the position in the old -/// assignment vector. -/// -/// # Arguments -/// -/// * `reader` - A reader that implements the `Read` trait containing the BEN file to -/// be relabeled. -/// * `writer` - A writer that implements the `Write` trait and which will contain the -/// relabeled BEN file. -/// * `new_to_old_node_map` - A HashMap where the key is the position in the new assignment -/// vector and the value is the position in the old assignment vector. -/// -/// # Errors -/// -/// Returns an error if the file format is invalid or if there is an issue reading or writing -/// the file. -pub fn relabel_ben_lines_with_map( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, - variant: BenVariant, -) -> io::Result<()> { - let mut sample_number = 0; - loop { - let mut tmp_buffer = [0u8]; - let max_val_bits = match reader.read_exact(&mut tmp_buffer) { - Ok(_) => tmp_buffer[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - break; - } - return Err(e); - } - }; - - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - - let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - - let assignment_vec = rle_to_vec(ben_line); - let new_assignment_vec = assignment_vec - .iter() - .enumerate() - .map(|(i, _)| { - // position of the new value in the old assignment - let new_val_pos = new_to_old_node_map.get(&i).unwrap(); - // get the new value from the old assignment - let new_val = assignment_vec[*new_val_pos]; - new_val - }) - .collect::>(); - - let new_rle = assign_to_rle(new_assignment_vec); - - let relabeled = encode_ben_vec_from_rle(new_rle); - writer.write_all(&relabeled)?; - - let count_occurrences = if variant == BenVariant::MkvChain { - let count = reader.read_u16::()?; - writer.write_all(&count.to_be_bytes())?; - count - } else { - 1 - }; - - sample_number += count_occurrences as usize; - log!("Relabeling line: {}\r", sample_number); - } - logln!(); - logln!("Done!"); - - Ok(()) -} - -/// Relabels the values in a BEN file so that the assignment vector values are -/// sorted according to a mapping. The mapping is a HashMap where the key is the -/// position in the new assignment vector and the value is the position in the old -/// assignment vector. -/// -/// # Arguments -/// -/// * `reader` - A reader that implements the `Read` trait containing the BEN file to -/// be relabeled. -/// * `writer` - A writer that implements the `Write` trait and which will contain the -/// relabeled BEN file. -/// * `new_to_old_node_map` - A HashMap where the key is the position in the new assignment -/// vector and the value is the position in the old assignment vector. -/// -/// # Errors -/// -/// Returns an error if the file format is invalid or if there is an issue reading or writing -/// the file.d according to a mapping. The mapping is a HashMap where the key is the -pub fn relabel_ben_file_with_map( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, -) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; - reader.read_exact(&mut check_buffer)?; - - let variant = match &check_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; - - writer.write_all(&check_buffer)?; - - relabel_ben_lines_with_map(&mut reader, &mut writer, new_to_old_node_map, variant)?; - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use rand::seq::SliceRandom; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; - use rand_distr::{Distribution, Uniform}; - - fn shuffle_with_mapping(vec: &mut Vec) -> HashMap - where - T: Clone + std::cmp::PartialEq, - { - let mut rng = ChaCha8Rng::seed_from_u64(42); - let original_vec = vec.clone(); // Clone the original vector to preserve initial values - vec.shuffle(&mut rng); - - let mut map = HashMap::new(); - for (new_index, item) in vec.iter().enumerate() { - let original_index = original_vec.iter().position(|i| i == item).unwrap(); - map.insert(new_index, original_index); - } - map - } - - #[test] - fn test_relabel_ben_line_simple() { - let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; - - let input = encode_ben_vec_from_rle(in_rle); - - let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; - let expected = encode_ben_vec_from_rle(out_rle); - - let mut buf = Vec::new(); - relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); - - assert_eq!(buf, expected); - } - - #[test] - fn test_relabel_simple_file() { - let file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", - "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", - "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", - "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", - "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}", - "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}" - ); - - let input = file.as_bytes(); - - let mut output = Vec::new(); - let writer = io::BufWriter::new(&mut output); - - encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap(); - - let mut output2 = Vec::new(); - let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file(output.as_slice(), writer2).unwrap(); - - let mut output3 = Vec::new(); - let writer3 = io::BufWriter::new(&mut output3); - decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); - - let output_str = String::from_utf8(output3).unwrap(); - - let out_file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}", - "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}", - "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}", - "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}", - "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":6}", - "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":7}" - ); - - assert_eq!(output_str, out_file); - } - - #[test] - fn test_relabel_simple_file_mkv() { - let file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", - "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", - "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", - "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", - "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":6}", - "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":7}", - "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":8}", - "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":9}", - "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}" - ); - - let input = file.as_bytes(); - - let mut output = Vec::new(); - let writer = io::BufWriter::new(&mut output); - - encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap(); - - let mut output2 = Vec::new(); - let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file(output.as_slice(), writer2).unwrap(); - - let mut output3 = Vec::new(); - let writer3 = io::BufWriter::new(&mut output3); - decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); - - let output_str = String::from_utf8(output3).unwrap(); - - let out_file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}", - "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}", - "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}", - "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}", - "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":6}", - "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":7}", - "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":8}", - "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":9}", - "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":10}" - ); - - assert_eq!(output_str, out_file); - } - - #[test] - fn test_relabel_ben_line_with_map() { - let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; - let in_rle = assign_to_rle(in_assign); - - let input = encode_ben_vec_from_rle(in_rle); - - let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; - let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); - - let mut new_to_old_map = HashMap::new(); - new_to_old_map.insert(0, 2); - new_to_old_map.insert(1, 0); - new_to_old_map.insert(2, 8); - new_to_old_map.insert(3, 1); - new_to_old_map.insert(4, 6); - new_to_old_map.insert(5, 3); - new_to_old_map.insert(6, 7); - new_to_old_map.insert(7, 4); - new_to_old_map.insert(8, 5); - - let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), - &mut buf, - new_to_old_map, - BenVariant::Standard, - ) - .unwrap(); - - assert_eq!(buf, expected); - } - - #[test] - fn test_relabel_ben_line_with_shuffle() { - let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; - let mut out_assign = in_assign.clone(); - - let in_rle = assign_to_rle(in_assign); - let input = encode_ben_vec_from_rle(in_rle); - - let new_to_old_map = shuffle_with_mapping(&mut out_assign); - let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); - - let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), - &mut buf, - new_to_old_map, - BenVariant::Standard, - ) - .unwrap(); - - assert_eq!(buf, expected); - } - - #[test] - fn test_relabel_ben_line_with_large_shuffle() { - let seed = 129530786u64; - let mut rng = ChaCha8Rng::seed_from_u64(seed); - - let mu = Uniform::new(1, 21).expect("Could not make uniform sampler"); - - let in_assign = (0..100_000) - .map(|_| mu.sample(&mut rng) as u16) - .collect::>(); - let mut out_assign = in_assign.clone(); - - let in_rle = assign_to_rle(in_assign.to_vec()); - let input = encode_ben_vec_from_rle(in_rle); - - let new_to_old_map = shuffle_with_mapping(&mut out_assign); - let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); - - let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), - &mut buf, - new_to_old_map, - BenVariant::Standard, - ) - .unwrap(); - - assert_eq!(buf, expected); - } - - #[test] - fn test_relabel_simple_file_with_map() { - let file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", - "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", - "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", - "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", - "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}", - "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}" - ); - - let new_to_old_map: HashMap = [ - (0, 2), - (1, 3), - (2, 4), - (3, 5), - (4, 6), - (5, 7), - (6, 8), - (7, 0), - (8, 1), - ] - .iter() - .cloned() - .collect(); - - let input = file.as_bytes(); - - let mut output = Vec::new(); - let writer = io::BufWriter::new(&mut output); - - encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap(); - - let mut output2 = Vec::new(); - let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); - - let mut output3 = Vec::new(); - let writer3 = io::BufWriter::new(&mut output3); - decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); - - let output_str = String::from_utf8(output3).unwrap(); - - let out_file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}", - "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":2}", - "{\"assignment\":[1,1,2,2,3,3,4,3,3],\"sample\":3}", - "{\"assignment\":[2,1,4,3,2,1,1,4,3],\"sample\":4}", - "{\"assignment\":[2,4,1,3,1,4,3,3,2],\"sample\":5}", - "{\"assignment\":[3,3,4,4,5,5,1,2,2],\"sample\":6}", - "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":7}" - ); - - assert_eq!(output_str, out_file); - } - - #[test] - fn test_relabel_simple_file_with_map_mkv() { - let file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":2}", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":3}", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":4}", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":5}", - "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":6}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":7}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":8}", - "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":9}", - "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}", - ); - - let new_to_old_map: HashMap = [ - (0, 2), - (1, 3), - (2, 4), - (3, 5), - (4, 6), - (5, 7), - (6, 8), - (7, 0), - (8, 1), - ] - .iter() - .cloned() - .collect(); - - let input = file.as_bytes(); - - let mut output = Vec::new(); - let writer = io::BufWriter::new(&mut output); - - encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap(); - - let mut output2 = Vec::new(); - let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); - - let mut output3 = Vec::new(); - let writer3 = io::BufWriter::new(&mut output3); - decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); - - let output_str = String::from_utf8(output3).unwrap(); - - let out_file = format!( - "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":2}", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":3}", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":4}", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":5}", - "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":6}", - "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":7}", - "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":8}", - "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":9}", - "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":10}", - ); - - assert_eq!(output_str, out_file); - } -} diff --git a/ben/src/io/mod.rs b/ben/src/io/mod.rs new file mode 100644 index 0000000..c9134a0 --- /dev/null +++ b/ben/src/io/mod.rs @@ -0,0 +1,2 @@ +pub mod reader; +pub mod writer; diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs new file mode 100644 index 0000000..7006372 --- /dev/null +++ b/ben/src/io/reader.rs @@ -0,0 +1,699 @@ +use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::util::rle::rle_to_vec; +use crate::{log, logln, BenVariant}; +use byteorder::{BigEndian, ReadBytesExt}; +use serde_json::json; +use std::fs::File; +use std::io::{self, BufReader, Cursor, Read, Write}; +use std::iter::Peekable; +use std::path::{Path, PathBuf}; +use xz2::read::XzDecoder; + +pub type MkvRecord = (Vec, u16); +pub type Ben32Frame = (Vec, u16); +pub type FrameIter = Box> + Send>; + +#[derive(Debug)] +pub enum DecoderInitError { + InvalidFileFormat(Vec), + Io(io::Error), +} + +fn is_xz_header(h: &[u8]) -> bool { + h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" +} + +fn to_hex(bytes: &[u8]) -> String { + bytes + .iter() + .map(|b| format!("{:02X}", b)) + .collect::>() + .join(" ") +} + +impl std::fmt::Display for DecoderInitError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(e) => write!(f, "IO error: {e}"), + Self::InvalidFileFormat(header) => { + if is_xz_header(header) { + write!( + f, + "Invalid file format: Compressed header detected (hex: {}). \ + This reader expects an uncompressed .ben file. \ + Decompress this file using the BEN cli `ben -m decode .xben` tool \ + or the `decode_xben_to_ben` function in this library.", + to_hex(header) + ) + } else { + let lossy = String::from_utf8_lossy(header); + write!( + f, + "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", + to_hex(header) + ) + } + } + } + } +} + +impl std::error::Error for DecoderInitError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + DecoderInitError::Io(e) => Some(e), + DecoderInitError::InvalidFileFormat(_) => None, + } + } +} + +impl From for DecoderInitError { + fn from(error: io::Error) -> Self { + DecoderInitError::Io(error) + } +} + +impl From for io::Error { + fn from(error: DecoderInitError) -> Self { + match error { + DecoderInitError::Io(e) => e, + DecoderInitError::InvalidFileFormat(msg) => { + io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) + } + } + } +} + +pub struct BenDecoder { + reader: R, + sample_count: usize, + variant: BenVariant, +} + +#[derive(Clone)] +pub struct BenFrame { + pub max_val_bits: u8, + pub max_len_bits: u8, + pub count: u16, + pub n_bytes: u32, + pub raw_data: Vec, +} + +impl BenDecoder { + pub fn new(mut reader: R) -> Result { + let mut check_buffer = [0u8; 17]; + + if let Err(e) = reader.read_exact(&mut check_buffer) { + return Err(DecoderInitError::Io(e)); + } + + match &check_buffer { + b"STANDARD BEN FILE" => Ok(BenDecoder { + reader, + sample_count: 0, + variant: BenVariant::Standard, + }), + b"MKVCHAIN BEN FILE" => Ok(BenDecoder { + reader, + sample_count: 0, + variant: BenVariant::MkvChain, + }), + _ => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), + } + } + + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + while let Some(result_tuple) = self.next() { + match result_tuple { + Ok((assignment, count)) => { + for _ in 0..count { + self.sample_count += 1; + let line = json!({ + "assignment": assignment, + "sample": self.sample_count, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes()).unwrap(); + } + } + Err(e) => { + return Err(e); + } + } + } + Ok(()) + } + + fn pop_frame_from_reader(&mut self) -> Option> { + let mut b1 = [0u8; 1]; + let max_val_bits = match self.reader.read_exact(&mut b1) { + Ok(()) => b1[0], + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + logln!(); + logln!("Done!"); + return None; + } + return Some(Err(e)); + } + }; + + let mut b2 = [0u8; 1]; + if let Err(e) = self.reader.read_exact(&mut b2) { + return Some(Err(e)); + } + let max_len_bits = b2[0]; + + let n_bytes = match self.reader.read_u32::() { + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + + let mut raw_assignment = vec![0u8; n_bytes as usize]; + if let Err(e) = self.reader.read_exact(&mut raw_assignment) { + return Some(Err(e)); + } + + let count = if self.variant == BenVariant::MkvChain { + match self.reader.read_u16::() { + Ok(c) => c, + Err(e) => return Some(Err(e)), + } + } else { + 1 + }; + + Some(Ok(BenFrame { + max_val_bits, + max_len_bits, + n_bytes, + raw_data: raw_assignment, + count, + })) + } + + pub fn into_frames(self) -> BenFrameDecoeder { + BenFrameDecoeder { inner: self } + } + + pub fn count_samples(self) -> io::Result { + let mut total = 0usize; + for frame_res in self.into_frames() { + let f = frame_res?; + total += f.count as usize; + } + Ok(total) + } +} + +fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { + decode_ben_line( + Cursor::new(&frame.raw_data), + frame.max_val_bits, + frame.max_len_bits, + frame.n_bytes, + ) + .map(rle_to_vec) +} + +impl Iterator for BenDecoder { + type Item = io::Result; + + fn next(&mut self) -> Option> { + let ben_frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Some(Err(e)), + None => return None, + }; + let assignment = match decode_ben_frame_to_assignment(&ben_frame) { + Ok(assgn) => assgn, + Err(e) => return Some(Err(e)), + }; + log!( + "Decoding sample: {}\r", + self.sample_count + ben_frame.count as usize + ); + Some(Ok((assignment, ben_frame.count))) + } +} + +pub struct BenFrameDecoeder { + inner: BenDecoder, +} + +impl BenFrameDecoeder { + pub fn new(reader: R) -> io::Result { + Ok(Self { + inner: BenDecoder::new(reader)?, + }) + } +} + +impl Iterator for BenFrameDecoeder { + type Item = io::Result; + + fn next(&mut self) -> Option { + self.inner.pop_frame_from_reader() + } +} + +pub struct XBenDecoder { + xz: BufReader>, + pub variant: BenVariant, + overflow: Vec, + buf: Box<[u8]>, +} + +impl XBenDecoder { + pub fn new(reader: R) -> io::Result { + let xz = XzDecoder::new(reader); + let mut xz = BufReader::with_capacity(1 << 20, xz); + + let mut first = [0u8; 17]; + xz.read_exact(&mut first)?; + let variant = match &first { + b"STANDARD BEN FILE" => BenVariant::Standard, + b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid .xben header (expecting STANDARD/MKVCHAIN BEN FILE)", + )); + } + }; + + Ok(Self { + xz, + variant, + overflow: Vec::with_capacity(1 << 20), + buf: vec![0u8; 1 << 20].into_boxed_slice(), + }) + } + + fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { + match self.variant { + BenVariant::Standard => { + if overflow.len() < 4 { + return None; + } + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let end = i + 1; + let frame = &overflow[..end]; + return Some((frame, end, 1)); + } + } + None + } + BenVariant::MkvChain => { + if overflow.len() < 6 { + return None; + } + for i in (3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let count_hi = overflow[i + 1]; + let count_lo = overflow[i + 2]; + let count = u16::from_be_bytes([count_hi, count_lo]); + let end = i + 3; + let frame = &overflow[..end]; + return Some((frame, end, count)); + } + } + None + } + } + } + + pub fn into_frames(self) -> XBenFrameDecoder { + XBenFrameDecoder { inner: self } + } + + pub fn count_samples(self) -> io::Result { + let mut total = 0usize; + for frame_res in self.into_frames() { + let (_bytes, cnt) = frame_res?; + total += cnt as usize; + } + Ok(total) + } +} + +fn decode_xben_frame_to_assignment( + frame_bytes: &[u8], + variant: BenVariant, +) -> io::Result> { + let cursor = Cursor::new(frame_bytes); + let (assignment, _) = decode_ben32_line(cursor, variant)?; + Ok(assignment) +} + +impl Iterator for XBenDecoder { + type Item = io::Result; + + fn next(&mut self) -> Option { + loop { + if let Some((frame_bytes, consumed, count)) = + self.pop_frame_from_overflow(&self.overflow) + { + let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { + Ok(assignment) => Ok((assignment, count)), + Err(e) => Err(e), + }; + self.overflow.drain(..consumed); + return Some(res); + } + + let read = match self.xz.read(&mut self.buf) { + Ok(0) => { + if self.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated .xben stream (partial frame at EOF)", + ))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.overflow.extend_from_slice(&self.buf[..read]); + } + } +} + +pub struct XBenFrameDecoder { + inner: XBenDecoder, +} + +impl XBenFrameDecoder { + pub fn new(reader: R) -> io::Result { + Ok(Self { + inner: XBenDecoder::new(reader)?, + }) + } +} + +impl Iterator for XBenFrameDecoder { + type Item = io::Result; + + fn next(&mut self) -> Option { + loop { + if let Some((frame, consumed, count)) = + self.inner.pop_frame_from_overflow(&self.inner.overflow) + { + let out = frame.to_vec(); + self.inner.overflow.drain(..consumed); + return Some(Ok((out, count))); + } + + let read = match self.inner.xz.read(&mut self.inner.buf) { + Ok(0) => { + if self.inner.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated .xben stream (partial frame at EOF)", + ))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.inner + .overflow + .extend_from_slice(&self.inner.buf[..read]); + } + } +} + +#[derive(Clone)] +pub enum Frame { + Ben(BenFrame), + XBen(Vec, BenVariant), +} + +pub enum Selection { + Indices(Peekable>), + Every { step: usize, offset: usize }, + Range { start: usize, end: usize }, +} + +fn decode_frame_to_assignment(frame: &Frame) -> io::Result> { + match frame { + Frame::Ben(f) => decode_ben_frame_to_assignment(f), + Frame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), + } +} + +pub struct SubsampleFrameDecoder +where + I: Iterator>, +{ + inner: I, + selection: Selection, + sample: usize, +} + +impl SubsampleFrameDecoder +where + I: Iterator>, +{ + pub fn new(inner: I, selection: Selection) -> Self { + Self { + inner, + selection, + sample: 0, + } + } + + pub fn by_indices(inner: I, indices: T) -> Self + where + T: IntoIterator, + { + let mut v: Vec = indices.into_iter().collect(); + v.sort_unstable(); + v.dedup(); + Self::new(inner, Selection::Indices(v.into_iter().peekable())) + } + + pub fn by_range(inner: I, start: usize, end: usize) -> Self { + assert!( + start >= 1 && end >= start, + "range must be 1-based and end >= start" + ); + Self::new(inner, Selection::Range { start, end }) + } + + pub fn every(inner: I, step: usize, offset: usize) -> Self { + assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); + Self::new(inner, Selection::Every { step, offset }) + } + + fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { + match &mut self.selection { + Selection::Indices(iter) => { + let mut taken = 0u16; + while let Some(&next) = iter.peek() { + if next < lo { + iter.next(); + continue; + } + if next > hi { + break; + } + iter.next(); + taken = taken.saturating_add(1); + } + taken + } + Selection::Every { step, offset } => { + let start = lo.max(*offset); + if start > hi { + return 0; + } + let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; + let first = start + ((*step - r) % *step); + if first > hi { + 0 + } else { + (1 + (hi - first) / *step) as u16 + } + } + Selection::Range { start, end } => { + if hi < *start || lo > *end { + 0 + } else { + let a = lo.max(*start); + let b = hi.min(*end); + (b - a + 1) as u16 + } + } + } + } +} + +impl Iterator for SubsampleFrameDecoder +where + I: Iterator>, +{ + type Item = io::Result; + + fn next(&mut self) -> Option { + loop { + if let Selection::Range { end, .. } = self.selection { + if self.sample >= end { + return None; + } + } + if let Selection::Indices(ref mut it) = self.selection { + if it.peek().is_none() { + return None; + } + } + + let (frame, count) = match self.inner.next()? { + Ok(x) => x, + Err(e) => return Some(Err(e)), + }; + + let lo = self.sample + 1; + let hi = self.sample + count as usize; + let selected = self.count_selected_in(lo, hi); + + self.sample = hi; + + if selected > 0 { + match decode_frame_to_assignment(&frame) { + Ok(assignment) => return Some(Ok((assignment, selected))), + Err(e) => return Some(Err(e)), + } + } + } + } +} + +pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { + let file = File::options().read(true).open(file_path)?; + let reader = BufReader::new(file); + + match mode { + "ben" => { + let frames = BenFrameDecoeder::new(reader)?; + let mapped = frames.map(|res| { + res.map(|f| { + let cnt = f.count; + (Frame::Ben(f), cnt) + }) + }); + Ok(Box::new(mapped)) + } + "xben" => { + let x = XBenDecoder::new(reader)?; + let variant = x.variant; + let frames = x.into_frames(); + let mapped = + frames.map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + Ok(Box::new(mapped)) + } + _ => Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), + } +} + +impl BenDecoder { + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + Send> + where + T: IntoIterator, + { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let count = f.count; + (Frame::Ben(f), count) + }) + }); + SubsampleFrameDecoder::by_indices(frames, indices) + } + + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> + Send> { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let cnt = f.count; + (Frame::Ben(f), cnt) + }) + }); + SubsampleFrameDecoder::by_range(frames, start, end) + } + + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> + Send> { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let cnt = f.count; + (Frame::Ben(f), cnt) + }) + }); + SubsampleFrameDecoder::every(frames, step, offset) + } +} + +impl XBenDecoder { + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + Send> + where + T: IntoIterator, + { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_indices(Box::new(frames), indices) + } + + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_range(Box::new(frames), start, end) + } + + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::every(Box::new(frames), step, offset) + } +} + +pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { + let iter = build_frame_iter(&path.to_path_buf(), mode)?; + let mut total = 0usize; + for item in iter { + let (_frame, cnt) = item?; + total += cnt as usize; + } + Ok(total) +} diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs new file mode 100644 index 0000000..115ea89 --- /dev/null +++ b/ben/src/io/writer.rs @@ -0,0 +1,203 @@ +use crate::codec::encode::encode_ben32_line; +use crate::codec::encode::encode_ben_vec_from_rle; +use crate::codec::translate::ben_to_ben32_lines; +use crate::util::rle::assign_to_rle; +use crate::BenVariant; +use serde_json::Value; +use std::io::{self, BufRead, Result, Write}; +use xz2::write::XzEncoder; + +/// A struct to make the writing of BEN files easier and more ergonomic. +pub struct BenEncoder { + writer: W, + previous_sample: Vec, + count: u16, + variant: BenVariant, + complete: bool, +} + +impl BenEncoder { + pub fn new(mut writer: W, variant: BenVariant) -> Self { + match variant { + BenVariant::Standard => { + writer.write_all(b"STANDARD BEN FILE").unwrap(); + } + BenVariant::MkvChain => { + writer.write_all(b"MKVCHAIN BEN FILE").unwrap(); + } + } + BenEncoder { + writer, + previous_sample: Vec::new(), + count: 0, + complete: false, + variant, + } + } + + pub fn write_rle(&mut self, rle_vec: Vec<(u16, u16)>) -> Result<()> { + match self.variant { + BenVariant::Standard => { + let encoded = encode_ben_vec_from_rle(rle_vec); + self.writer.write_all(&encoded)?; + Ok(()) + } + BenVariant::MkvChain => { + let encoded = encode_ben_vec_from_rle(rle_vec); + if encoded == self.previous_sample { + self.count += 1; + } else { + if self.count > 0 { + self.writer.write_all(&self.previous_sample)?; + self.writer.write_all(&self.count.to_be_bytes())?; + } + self.previous_sample = encoded; + self.count = 1; + } + Ok(()) + } + } + } + + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { + let rle_vec = assign_to_rle(assign_vec); + self.write_rle(rle_vec)?; + Ok(()) + } + + pub fn write_json_value(&mut self, data: Value) -> Result<()> { + let assign_vec = data["assignment"].as_array().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "'assignment' field either missing or is not an array of integers", + ) + })?; + let converted_vec = assign_vec + .iter() + .map(|x| { + let u = x.as_u64().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + x + ), + ) + })?; + + u16::try_from(u).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", u), + ) + }) + }) + .collect::>>()?; + + let rle_vec = assign_to_rle(converted_vec); + self.write_rle(rle_vec)?; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + if self.complete { + return Ok(()); + } + if self.variant == BenVariant::MkvChain && self.count > 0 { + self.writer + .write_all(&self.previous_sample) + .expect("Error while writing last line to file"); + self.writer + .write_all(&self.count.to_be_bytes()) + .expect("Error while writing last count to file"); + } + self.complete = true; + Ok(()) + } +} + +impl Drop for BenEncoder { + fn drop(&mut self) { + let _ = self.finish(); + } +} + +/// A struct to make the writing of XBEN files easier and more ergonomic. +pub struct XBenEncoder { + encoder: XzEncoder, + previous_sample: Vec, + count: u16, + variant: BenVariant, +} + +impl XBenEncoder { + pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { + match variant { + BenVariant::Standard => { + encoder.write_all(b"STANDARD BEN FILE").unwrap(); + XBenEncoder { + encoder, + previous_sample: Vec::new(), + count: 0, + variant: BenVariant::Standard, + } + } + BenVariant::MkvChain => { + encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); + XBenEncoder { + encoder, + previous_sample: Vec::new(), + count: 0, + variant: BenVariant::MkvChain, + } + } + } + } + + pub fn write_json_value(&mut self, data: Value) -> Result<()> { + let encoded = encode_ben32_line(data); + match self.variant { + BenVariant::Standard => { + self.encoder.write_all(&encoded)?; + } + BenVariant::MkvChain => { + if encoded == self.previous_sample { + self.count += 1; + } else { + if self.count > 0 { + self.encoder.write_all(&self.previous_sample)?; + self.encoder.write_all(&self.count.to_be_bytes())?; + } + self.previous_sample = encoded; + self.count = 1; + } + } + } + Ok(()) + } + + pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { + let peek = reader.fill_buf()?; + let has_banner = peek.len() >= 17 + && (peek.starts_with(b"STANDARD BEN FILE") || peek.starts_with(b"MKVCHAIN BEN FILE")); + + if has_banner { + reader.consume(17); + } + + ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) + } +} + +impl Drop for XBenEncoder { + fn drop(&mut self) { + if self.variant == BenVariant::MkvChain && self.count > 0 { + self.encoder + .write_all(&self.previous_sample) + .expect("Error writing last line to file"); + self.encoder + .write_all(&self.count.to_be_bytes()) + .expect("Error writing last line count to file"); + } + } +} diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs new file mode 100644 index 0000000..8193778 --- /dev/null +++ b/ben/src/json/graph/mod.rs @@ -0,0 +1,75 @@ +//! JSON graph helpers used by relabeling workflows. + +use crate::{log, logln}; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::io::{Read, Result, Write}; +use std::result::Result as StdResult; + +/// Sorts a JSON-formatted NetworkX graph file by a key. +pub fn sort_json_file_by_key( + reader: R, + mut writer: W, + key: &str, +) -> Result> { + logln!("Loading JSON file..."); + let mut data: Value = serde_json::from_reader(reader).unwrap(); + + logln!("Sorting JSON file by key: {}", key); + if let Some(nodes) = data["nodes"].as_array_mut() { + nodes.sort_by(|a, b| { + let extract_value = |val: &Value| -> StdResult { + match &val[key] { + Value::String(s) => s.parse::().map_err(|_| s.clone()), + Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()), + _ => Err(val[key].to_string()), + } + }; + + match (extract_value(a), extract_value(b)) { + (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), + (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), + (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), + (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), + } + }); + } + + let mut node_map = HashMap::new(); + let mut rev_node_map = HashMap::new(); + if let Some(nodes) = data["nodes"].as_array_mut() { + for (i, node) in nodes.iter_mut().enumerate() { + log!("Relabeling node: {}\r", i + 1); + node_map.insert(node["id"].to_string().parse::().unwrap(), i); + rev_node_map.insert(i, node["id"].to_string().parse::().unwrap()); + node["id"] = json!(i); + } + } + logln!(); + + let mut edge_array = Vec::new(); + if let Some(edges) = data["adjacency"].as_array() { + for i in 0..edges.len() { + log!("Relabeling edge: {}\r", i + 1); + let edge_list_location = + rev_node_map[&data["nodes"][i]["id"].to_string().parse::().unwrap()]; + let mut new_edge_lst = edges[edge_list_location].as_array().unwrap().clone(); + for link in &mut new_edge_lst { + let new = node_map[&link["id"].to_string().parse::().unwrap()]; + link["id"] = json!(new); + } + edge_array.push(new_edge_lst); + } + } + logln!(); + + data["adjacency"] = json!(edge_array); + + logln!("Writing new json to file..."); + writer.write_all(serde_json::to_string(&data).unwrap().as_bytes())?; + + Ok(node_map) +} + +#[cfg(test)] +mod tests; diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs new file mode 100644 index 0000000..68c7dbf --- /dev/null +++ b/ben/src/json/graph/tests.rs @@ -0,0 +1,184 @@ +use super::*; +use serde_json::Value; + +#[test] +fn test_relabel_small_file() { + let input = r#"{ + "adjacency": [ + [ { "id": 3 }, { "id": 1 } ], + [ { "id": 0 }, { "id": 4 }, { "id": 2 } ], + [ { "id": 1 }, { "id": 5 } ], + [ { "id": 0 }, { "id": 6 }, { "id": 4 } ], + [ { "id": 1 }, { "id": 3 }, { "id": 7 }, { "id": 5 } ], + [ { "id": 2 }, { "id": 4 }, { "id": 8 } ], + [ { "id": 3 }, { "id": 7 } ], + [ { "id": 4 }, { "id": 6 }, { "id": 8 } ], + [ { "id": 5 }, { "id": 7 } ] + ], + "directed": false, + "graph": [], + "multigraph": false, + "nodes": [ + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288005", + "id": 0 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288004", + "id": 1 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288003", + "id": 2 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288006", + "id": 3 + }, + { + "TOTPOP": 1, + "boundary_nodes": false, + "boundary_perim": 0, + "GEOID20": "20258288001", + "id": 4 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288002", + "id": 5 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288007", + "id": 6 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288008", + "id": 7 + }, + { + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "GEOID20": "20258288009", + "id": 8 + } + ] +} +"#; + + let reader = input.as_bytes(); + + let mut output = Vec::new(); + let writer = &mut output; + + let key = "GEOID20"; + + let _ = sort_json_file_by_key(reader, writer, key).unwrap(); + + let expected_output = r#"{ + "adjacency": [ + [ { "id": 3 }, { "id": 5 }, { "id": 7 }, { "id": 1 } ], + [ { "id": 2 }, { "id": 0 }, { "id": 8 } ], [ { "id": 3 }, { "id": 1 } ], + [ { "id": 4 }, { "id": 0 }, { "id": 2 } ], + [ { "id": 5 }, { "id": 3 } ], + [ { "id": 4 }, { "id": 6 }, { "id": 0 } ], + [ { "id": 5 }, { "id": 7 } ], + [ { "id": 0 }, { "id": 6 }, { "id": 8 } ], + [ { "id": 1 }, { "id": 7 } ] + ], + "directed": false, + "graph": [], + "multigraph": false, + "nodes": [ + { + "GEOID20": "20258288001", + "TOTPOP": 1, + "boundary_nodes": false, + "boundary_perim": 0, + "id": 0 + }, + { + "GEOID20": "20258288002", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 1 + }, + { + "GEOID20": "20258288003", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 2 + }, + { + "GEOID20": "20258288004", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 3 + }, + { + "GEOID20": "20258288005", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 4 + }, + { + "GEOID20": "20258288006", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 5 + }, + { + "GEOID20": "20258288007", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 6 + }, + { + "GEOID20": "20258288008", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 7 + }, + { + "GEOID20": "20258288009", + "TOTPOP": 1, + "boundary_nodes": true, + "boundary_perim": 1, + "id": 8 + } + ] +} +"#; + + let output_json: Value = serde_json::from_slice(&output).unwrap(); + let expected_output_json: Value = serde_json::from_str(expected_output).unwrap(); + + assert_eq!(output_json, expected_output_json); +} diff --git a/ben/src/json/mod.rs b/ben/src/json/mod.rs new file mode 100644 index 0000000..6f94350 --- /dev/null +++ b/ben/src/json/mod.rs @@ -0,0 +1 @@ +pub mod graph; diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 22b7cea..799c459 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -18,9 +18,11 @@ //! - `reben`: A tool for relabeling BEN files to improve compression ratios. //! -pub mod decode; -pub mod encode; -pub mod utils; +pub mod codec; +pub mod io; +pub mod json; +pub mod ops; +pub mod util; #[macro_export] macro_rules! log { diff --git a/ben/src/decode/read.rs b/ben/src/ops/extract/mod.rs old mode 100755 new mode 100644 similarity index 50% rename from ben/src/decode/read.rs rename to ben/src/ops/extract/mod.rs index 102ca00..88861d1 --- a/ben/src/decode/read.rs +++ b/ben/src/ops/extract/mod.rs @@ -1,39 +1,27 @@ -//! Module documentation. -//! -//! This module provides functionality for extracting single assignment -//! vectors from a BEN file. +//! Sample extraction helpers for BEN and XBEN streams. + +use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::io::reader::{BenDecoder, XBenDecoder}; +use crate::util::rle::rle_to_vec; use serde_json::Error as SerdeError; -use std::fmt::{self}; +use std::fmt; use std::io::Cursor; use std::io::{self, Read}; -use super::{decode_ben32_line, decode_ben_line, rle_to_vec, BenDecoder, XBenDecoder}; - -/// Types of errors that can occur during the extraction of assignments. #[derive(Debug)] pub enum SampleErrorKind { - /// Indicates the sample number is invalid. All sample numbers must be greater than 0. InvalidSampleNumber, - /// Indicates the sample number was not found in the file. The last sample number is provided. SampleNotFound { sample_number: usize }, - /// Wrapper for IO errors. IoError(io::Error), - /// Wrapper for JSON errors. JsonError(SerdeError), } -/// Error type for the extraction of assignments. #[derive(Debug)] pub struct SampleError { pub kind: SampleErrorKind, } impl SampleError { - /// Create a new error from an IO error. - /// - /// # Arguments - /// - /// * `error` - The IO error to wrap. pub fn new_io_error(error: io::Error) -> Self { SampleError { kind: SampleErrorKind::IoError(error), @@ -53,19 +41,13 @@ impl fmt::Display for SampleError { SampleErrorKind::SampleNotFound { sample_number } => { write!( f, - "Sample number not found in file. \ - Failed to find sample '{}'. \ - Last sample seems to be '{}'", + "Sample number not found in file. Failed to find sample '{}'. Last sample seems to be '{}'", sample_number, sample_number - 1 ) } - SampleErrorKind::IoError(e) => { - write!(f, "IO Error: {}", e) - } - SampleErrorKind::JsonError(e) => { - write!(f, "JSON Error: {}", e) - } + SampleErrorKind::IoError(e) => write!(f, "IO Error: {}", e), + SampleErrorKind::JsonError(e) => write!(f, "JSON Error: {}", e), } } } @@ -95,47 +77,6 @@ impl From for SampleError { } } -/// Extracts a single assignment from a binary-encoded data stream. -/// -/// # Arguments -/// -/// * `reader` - The reader to extract the assignment from. -/// * `sample_number` - The sample number to extract. -/// -/// # Returns -/// -/// This function returns a `Result` containing a `Vec` of the assignment if successful, -/// or a `SampleError` if an error occurred. -/// -/// # Example -/// -/// ```no_run -/// use ben::decode::read::extract_assignment_ben; -/// use std::{fs::File, io::BufReader}; -/// -/// let file = File::open("data.jsonl.ben").unwrap(); -/// let reader = BufReader::new(file); -/// let sample_number = 2; -/// -/// let result = extract_assignment_ben(reader, sample_number); -/// match result { -/// Ok(assignment) => { -/// eprintln!("Extracted assignment: {:?}", assignment); -/// } -/// Err(e) => { -/// eprintln!("Error: {}", e); -/// } -/// } -/// ``` -/// -/// # Errors -/// -/// This function can return a `SampleError` if an error occurs during the extraction process. -/// The error can be one of the following: -/// * `InvalidSampleNumber` - The sample number is invalid. All sample numbers must be greater than 0. -/// * `SampleNotFound` - The sample number was not found in the file. The last sample number is provided. -/// * `IoError` - An IO error occurred during the extraction process. -/// * `JsonError` - A JSON error occurred during the extraction process. pub fn extract_assignment_ben( mut reader: R, sample_number: usize, @@ -174,47 +115,6 @@ pub fn extract_assignment_ben( }) } -/// Extracts a single assignment from a binary-encoded data stream. -/// -/// # Arguments -/// -/// * `reader` - The reader to extract the assignment from. -/// * `sample_number` - The sample number to extract. -/// -/// # Returns -/// -/// This function returns a `Result` containing a `Vec` of the assignment if successful, -/// or a `SampleError` if an error occurred. -/// -/// # Example -/// -/// ```no_run -/// use ben::decode::read::extract_assignment_xben; -/// use std::{fs::File, io::BufReader}; -/// -/// let file = File::open("data.jsonl.xben").unwrap(); -/// let reader = BufReader::new(file); -/// let sample_number = 2; -/// -/// let result = extract_assignment_xben(reader, sample_number); -/// match result { -/// Ok(assignment) => { -/// eprintln!("Extracted assignment: {:?}", assignment); -/// } -/// Err(e) => { -/// eprintln!("Error: {}", e); -/// } -/// } -/// ``` -/// -/// # Errors -/// -/// This function can return a `SampleError` if an error occurs during the extraction process. -/// The error can be one of the following: -/// * `InvalidSampleNumber` - The sample number is invalid. All sample numbers must be greater than 0. -/// * `SampleNotFound` - The sample number was not found in the file. The last sample number is provided. -/// * `IoError` - An IO error occurred during the extraction process. -/// * `JsonError` - A JSON error occurred during the extraction process. pub fn extract_assignment_xben( mut reader: R, sample_number: usize, @@ -248,10 +148,5 @@ pub fn extract_assignment_xben( }) } -// #[cfg(test)] -// mod tests { -// include!("tests/read_tests.rs"); -// } #[cfg(test)] -#[path = "tests/read_tests.rs"] mod tests; diff --git a/ben/src/decode/tests/read_tests.rs b/ben/src/ops/extract/tests.rs old mode 100755 new mode 100644 similarity index 91% rename from ben/src/decode/tests/read_tests.rs rename to ben/src/ops/extract/tests.rs index f28cbc3..6d7cb6e --- a/ben/src/decode/tests/read_tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -2,9 +2,6 @@ use super::*; #[test] fn test_extract_assignment_ben() { - // [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4], - // [2,2,3,3,3,3,3,3,3,1,2,3] - // [1,2,3,4,5,6,7,8,9,10] let mut input: Vec = b"STANDARD BEN FILE".to_vec(); input.extend(vec![ 3, @@ -63,9 +60,6 @@ fn test_extract_assignment_ben() { #[test] fn test_extract_assignment_sample_too_large() { - // [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4], - // [2,2,3,3,3,3,3,3,3,1,2,3] - // [1,2,3,4,5,6,7,8,9,10] let mut input: Vec = b"STANDARD BEN FILE".to_vec(); input.extend(vec![ 3, diff --git a/ben/src/ops/mod.rs b/ben/src/ops/mod.rs new file mode 100644 index 0000000..59eae35 --- /dev/null +++ b/ben/src/ops/mod.rs @@ -0,0 +1,2 @@ +pub mod extract; +pub mod relabel; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs new file mode 100644 index 0000000..e85db71 --- /dev/null +++ b/ben/src/ops/relabel/mod.rs @@ -0,0 +1,174 @@ +//! Relabeling operations for BEN files. + +use crate::codec::decode::decode_ben_line; +use crate::codec::encode::encode_ben_vec_from_rle; +use crate::util::rle::{assign_to_rle, rle_to_vec}; +use crate::{log, logln, BenVariant}; +use byteorder::{BigEndian, ReadBytesExt}; +use std::collections::HashMap; +use std::io::{self, Error, Read, Write}; + +pub fn relabel_ben_lines( + mut reader: R, + mut writer: W, + variant: BenVariant, +) -> io::Result<()> { + let mut sample_number = 0; + loop { + let mut tmp_buffer = [0u8]; + let max_val_bits = match reader.read_exact(&mut tmp_buffer) { + Ok(_) => tmp_buffer[0], + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + break; + } + return Err(e); + } + }; + + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + + let mut label = 0; + let mut label_map = HashMap::new(); + for (val, _len) in &mut ben_line { + let new_val = match label_map.get(val) { + Some(v) => *v, + None => { + label += 1; + label_map.insert(*val, label); + label + } + }; + *val = new_val; + } + + let relabeled = encode_ben_vec_from_rle(ben_line); + writer.write_all(&relabeled)?; + + let count_occurrences = if variant == BenVariant::MkvChain { + let count = reader.read_u16::()?; + writer.write_all(&count.to_be_bytes())?; + count + } else { + 1 + }; + + sample_number += count_occurrences as usize; + + log!("Relabeling line: {}\r", sample_number); + } + logln!(); + logln!("Done!"); + + Ok(()) +} + +pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + + let variant = match &check_buffer { + b"STANDARD BEN FILE" => BenVariant::Standard, + b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + _ => { + return Err(Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )); + } + }; + + writer.write_all(&check_buffer)?; + + relabel_ben_lines(&mut reader, &mut writer, variant)?; + + Ok(()) +} + +pub fn relabel_ben_lines_with_map( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, + variant: BenVariant, +) -> io::Result<()> { + let mut sample_number = 0; + loop { + let mut tmp_buffer = [0u8]; + let max_val_bits = match reader.read_exact(&mut tmp_buffer) { + Ok(_) => tmp_buffer[0], + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + break; + } + return Err(e); + } + }; + + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + + let assignment_vec = rle_to_vec(ben_line); + let new_assignment_vec = assignment_vec + .iter() + .enumerate() + .map(|(i, _)| { + let new_val_pos = new_to_old_node_map.get(&i).unwrap(); + assignment_vec[*new_val_pos] + }) + .collect::>(); + + let new_rle = assign_to_rle(new_assignment_vec); + + let relabeled = encode_ben_vec_from_rle(new_rle); + writer.write_all(&relabeled)?; + + let count_occurrences = if variant == BenVariant::MkvChain { + let count = reader.read_u16::()?; + writer.write_all(&count.to_be_bytes())?; + count + } else { + 1 + }; + + sample_number += count_occurrences as usize; + log!("Relabeling line: {}\r", sample_number); + } + logln!(); + logln!("Done!"); + + Ok(()) +} + +pub fn relabel_ben_file_with_map( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + + let variant = match &check_buffer { + b"STANDARD BEN FILE" => BenVariant::Standard, + b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + _ => { + return Err(Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )); + } + }; + + writer.write_all(&check_buffer)?; + + relabel_ben_lines_with_map(&mut reader, &mut writer, new_to_old_node_map, variant)?; + + Ok(()) +} + +#[cfg(test)] +mod tests; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs new file mode 100644 index 0000000..3b8ddf9 --- /dev/null +++ b/ben/src/ops/relabel/tests.rs @@ -0,0 +1,330 @@ +use super::*; +use crate::codec::decode::decode_ben_to_jsonl; +use crate::codec::encode::{encode_ben_vec_from_rle, encode_jsonl_to_ben}; +use crate::util::rle::assign_to_rle; +use rand::seq::SliceRandom; +use rand::SeedableRng; +use rand_chacha::ChaCha8Rng; +use rand_distr::{Distribution, Uniform}; +use std::collections::HashMap; +use std::io; + +fn shuffle_with_mapping(vec: &mut Vec) -> HashMap +where + T: Clone + std::cmp::PartialEq, +{ + let mut rng = ChaCha8Rng::seed_from_u64(42); + let original_vec = vec.clone(); + vec.shuffle(&mut rng); + + let mut map = HashMap::new(); + for (new_index, item) in vec.iter().enumerate() { + let original_index = original_vec.iter().position(|i| i == item).unwrap(); + map.insert(new_index, original_index); + } + map +} + +#[test] +fn test_relabel_ben_line_simple() { + let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; + + let input = encode_ben_vec_from_rle(in_rle); + + let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; + let expected = encode_ben_vec_from_rle(out_rle); + + let mut buf = Vec::new(); + relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); + + assert_eq!(buf, expected); +} + +#[test] +fn test_relabel_simple_file() { + let file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", + "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", + "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", + "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", + "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}", + "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}" + ); + + let input = file.as_bytes(); + + let mut output = Vec::new(); + let writer = io::BufWriter::new(&mut output); + + encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap(); + + let mut output2 = Vec::new(); + let writer2 = io::BufWriter::new(&mut output2); + relabel_ben_file(output.as_slice(), writer2).unwrap(); + + let mut output3 = Vec::new(); + let writer3 = io::BufWriter::new(&mut output3); + decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); + + let output_str = String::from_utf8(output3).unwrap(); + + let out_file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}", + "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}", + "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}", + "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}", + "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":6}", + "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":7}" + ); + + assert_eq!(output_str, out_file); +} + +#[test] +fn test_relabel_simple_file_mkv() { + let file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", + "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", + "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", + "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", + "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":6}", + "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":7}", + "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":8}", + "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":9}", + "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}" + ); + + let input = file.as_bytes(); + + let mut output = Vec::new(); + let writer = io::BufWriter::new(&mut output); + + encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap(); + + let mut output2 = Vec::new(); + let writer2 = io::BufWriter::new(&mut output2); + relabel_ben_file(output.as_slice(), writer2).unwrap(); + + let mut output3 = Vec::new(); + let writer3 = io::BufWriter::new(&mut output3); + decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); + + let output_str = String::from_utf8(output3).unwrap(); + + let out_file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}", + "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}", + "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}", + "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}", + "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":6}", + "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":7}", + "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":8}", + "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":9}", + "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":10}" + ); + + assert_eq!(output_str, out_file); +} + +#[test] +fn test_relabel_ben_line_with_map() { + let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; + let in_rle = assign_to_rle(in_assign); + + let input = encode_ben_vec_from_rle(in_rle); + + let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; + let out_rle = assign_to_rle(out_assign); + let expected = encode_ben_vec_from_rle(out_rle); + + let mut new_to_old_map = HashMap::new(); + new_to_old_map.insert(0, 2); + new_to_old_map.insert(1, 0); + new_to_old_map.insert(2, 8); + new_to_old_map.insert(3, 1); + new_to_old_map.insert(4, 6); + new_to_old_map.insert(5, 3); + new_to_old_map.insert(6, 7); + new_to_old_map.insert(7, 4); + new_to_old_map.insert(8, 5); + + let mut buf = Vec::new(); + relabel_ben_lines_with_map( + input.as_slice(), + &mut buf, + new_to_old_map, + BenVariant::Standard, + ) + .unwrap(); + + assert_eq!(buf, expected); +} + +#[test] +fn test_relabel_ben_line_with_shuffle() { + let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; + let mut out_assign = in_assign.clone(); + + let in_rle = assign_to_rle(in_assign); + let input = encode_ben_vec_from_rle(in_rle); + + let new_to_old_map = shuffle_with_mapping(&mut out_assign); + let out_rle = assign_to_rle(out_assign); + let expected = encode_ben_vec_from_rle(out_rle); + + let mut buf = Vec::new(); + relabel_ben_lines_with_map( + input.as_slice(), + &mut buf, + new_to_old_map, + BenVariant::Standard, + ) + .unwrap(); + + assert_eq!(buf, expected); +} + +#[test] +fn test_relabel_ben_line_with_large_shuffle() { + let seed = 129530786u64; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + + let mu = Uniform::new(1, 21).expect("Could not make uniform sampler"); + + let in_assign = (0..100_000) + .map(|_| mu.sample(&mut rng) as u16) + .collect::>(); + let mut out_assign = in_assign.clone(); + + let in_rle = assign_to_rle(in_assign.to_vec()); + let input = encode_ben_vec_from_rle(in_rle); + + let new_to_old_map = shuffle_with_mapping(&mut out_assign); + let out_rle = assign_to_rle(out_assign); + let expected = encode_ben_vec_from_rle(out_rle); + + let mut buf = Vec::new(); + relabel_ben_lines_with_map( + input.as_slice(), + &mut buf, + new_to_old_map, + BenVariant::Standard, + ) + .unwrap(); + + assert_eq!(buf, expected); +} + +#[test] +fn test_relabel_simple_file_with_map() { + let file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}", + "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}", + "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}", + "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}", + "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}", + "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}" + ); + + let new_to_old_map: HashMap = + [(0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 0), (8, 1)] + .iter() + .cloned() + .collect(); + + let input = file.as_bytes(); + + let mut output = Vec::new(); + let writer = io::BufWriter::new(&mut output); + + encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap(); + + let mut output2 = Vec::new(); + let writer2 = io::BufWriter::new(&mut output2); + relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); + + let mut output3 = Vec::new(); + let writer3 = io::BufWriter::new(&mut output3); + decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); + + let output_str = String::from_utf8(output3).unwrap(); + + let out_file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}", + "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":2}", + "{\"assignment\":[1,1,2,2,3,3,4,3,3],\"sample\":3}", + "{\"assignment\":[2,1,4,3,2,1,1,4,3],\"sample\":4}", + "{\"assignment\":[2,4,1,3,1,4,3,3,2],\"sample\":5}", + "{\"assignment\":[3,3,4,4,5,5,1,2,2],\"sample\":6}", + "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":7}" + ); + + assert_eq!(output_str, out_file); +} + +#[test] +fn test_relabel_simple_file_with_map_mkv() { + let file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":2}", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":3}", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":4}", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":5}", + "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":6}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":7}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":8}", + "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":9}", + "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}", + ); + + let new_to_old_map: HashMap = + [(0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 0), (8, 1)] + .iter() + .cloned() + .collect(); + + let input = file.as_bytes(); + + let mut output = Vec::new(); + let writer = io::BufWriter::new(&mut output); + + encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap(); + + let mut output2 = Vec::new(); + let writer2 = io::BufWriter::new(&mut output2); + relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); + + let mut output3 = Vec::new(); + let writer3 = io::BufWriter::new(&mut output3); + decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap(); + + let output_str = String::from_utf8(output3).unwrap(); + + let out_file = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":2}", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":3}", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":4}", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":5}", + "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":6}", + "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":7}", + "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":8}", + "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":9}", + "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":10}", + ); + + assert_eq!(output_str, out_file); +} diff --git a/ben/src/util/mod.rs b/ben/src/util/mod.rs new file mode 100644 index 0000000..d09d240 --- /dev/null +++ b/ben/src/util/mod.rs @@ -0,0 +1 @@ +pub mod rle; diff --git a/ben/src/util/rle/mod.rs b/ben/src/util/rle/mod.rs new file mode 100644 index 0000000..43ec5f9 --- /dev/null +++ b/ben/src/util/rle/mod.rs @@ -0,0 +1,44 @@ +//! Utility functions for run-length encoding assignment vectors. + +/// Convert a vector of assignments to a run-length encoded (RLE) vector. +pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { + let mut prev_assign: u16 = 0; + let mut count: u16 = 0; + let mut first = true; + let mut rle_vec: Vec<(u16, u16)> = Vec::new(); + + for assign in assign_vec { + if first { + prev_assign = assign; + count = 1; + first = false; + continue; + } + if assign == prev_assign { + count += 1; + } else { + rle_vec.push((prev_assign, count)); + prev_assign = assign; + count = 1; + } + } + + if count > 0 { + rle_vec.push((prev_assign, count)); + } + rle_vec +} + +/// Convert a run-length encoded (RLE) vector to a vector of assignments. +pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec { + let mut output_vec: Vec = Vec::new(); + for (val, len) in rle_vec { + for _ in 0..len { + output_vec.push(val); + } + } + output_vec +} + +#[cfg(test)] +mod tests; diff --git a/ben/src/util/rle/tests.rs b/ben/src/util/rle/tests.rs new file mode 100644 index 0000000..2d7e735 --- /dev/null +++ b/ben/src/util/rle/tests.rs @@ -0,0 +1,19 @@ +use super::*; + +#[test] +fn test_assign_to_rle() { + let assign_vec: Vec = vec![1, 1, 1, 2, 2, 3]; + + let result: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)]; + + assert_eq!(assign_to_rle(assign_vec), result); +} + +#[test] +fn test_rle_to_vec() { + let rle_vec: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)]; + + let result: Vec = vec![1, 1, 1, 2, 2, 3]; + + assert_eq!(rle_to_vec(rle_vec), result); +} diff --git a/ben/src/utils.rs b/ben/src/utils.rs deleted file mode 100755 index 897f3c6..0000000 --- a/ben/src/utils.rs +++ /dev/null @@ -1,368 +0,0 @@ -//! This module provides some utility functions for working with assignments -//! and RLE encoding. It also provides a function to sort a JSON file by a key -//! so as to make the BEN encoding more efficient. - -use super::{log, logln}; -use serde_json::{json, Value}; -use std::collections::HashMap; -use std::io::{Read, Result, Write}; -use std::result::Result as StdResult; - -/// Convert a vector of assignments to a run-length encoded (RLE) vector. -/// -/// # Arguments -/// -/// * `assign_vec` - A vector of assignments to convert to RLE. -/// -/// # Returns -/// -/// A vector of tuples where the first element is the value and the second element is -/// the length of the run. -pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { - let mut prev_assign: u16 = 0; - let mut count: u16 = 0; - let mut first = true; - let mut rle_vec: Vec<(u16, u16)> = Vec::new(); - - for assign in assign_vec { - if first { - prev_assign = assign; - count = 1; - first = false; - continue; - } - if assign == prev_assign { - count += 1; - } else { - rle_vec.push((prev_assign, count)); - // Reset for next run - prev_assign = assign; - count = 1; - } - } - - // Handle the last run - if count > 0 { - rle_vec.push((prev_assign, count)); - } - rle_vec -} - -/// Convert a run-length encoded (RLE) vector to a vector of assignments. -/// -/// # Arguments -/// -/// * `rle_vec` - A vector of tuples where the first element is the value and the second element is -/// the length of the run. -/// -/// # Returns -/// -/// A vector of assignments. -pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec { - let mut output_vec: Vec = Vec::new(); - for (val, len) in rle_vec { - for _ in 0..len { - output_vec.push(val); - } - } - output_vec -} - -/// Sorts a JSON-formatted NetworkX graph file by a key. -/// This function will sort the nodes in the graph by the key provided and -/// then relabel the nodes in the graph from 0 to n-1 where n is the number -/// of nodes in the graph. It will also relabel the edges in the graph to -/// match the new node labels. -/// -/// # Arguments -/// -/// * `reader` - A reader for the JSON file to sort. -/// * `writer` - A writer for the new JSON file. -/// * `key` - The key to sort the nodes by. -/// -/// # Returns -/// -/// A Result containing a HashMap from the old node labels to the new node labels. -pub fn sort_json_file_by_key( - reader: R, - mut writer: W, - key: &str, -) -> Result> { - logln!("Loading JSON file..."); - let mut data: Value = serde_json::from_reader(reader).unwrap(); - - logln!("Sorting JSON file by key: {}", key); - if let Some(nodes) = data["nodes"].as_array_mut() { - nodes.sort_by(|a, b| { - let extract_value = |val: &Value| -> StdResult { - match &val[key] { - Value::String(s) => s.parse::().map_err(|_| s.clone()), - Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()), - _ => Err(val[key].to_string()), - } - }; - - match (extract_value(a), extract_value(b)) { - (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), - (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), - (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), - (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), - } - }); - } - - let mut node_map = HashMap::new(); - let mut rev_node_map = HashMap::new(); - if let Some(nodes) = data["nodes"].as_array_mut() { - for (i, node) in nodes.iter_mut().enumerate() { - log!("Relabeling node: {}\r", i + 1); - node_map.insert(node["id"].to_string().parse::().unwrap(), i); - rev_node_map.insert(i, node["id"].to_string().parse::().unwrap()); - node["id"] = json!(i); - } - } - logln!(); - - let mut edge_array = Vec::new(); - if let Some(edges) = data["adjacency"].as_array() { - for i in 0..edges.len() { - log!("Relabeling edge: {}\r", i + 1); - let edge_list_location = - rev_node_map[&data["nodes"][i]["id"].to_string().parse::().unwrap()]; - let mut new_edge_lst = edges[edge_list_location].as_array().unwrap().clone(); - for link in new_edge_lst.iter_mut() { - let new = node_map[&link["id"].to_string().parse::().unwrap()]; - link["id"] = json!(new); - } - edge_array.push(new_edge_lst); - } - } - logln!(); - - data["adjacency"] = json!(edge_array); - - logln!("Writing new json to file..."); - writer.write_all(serde_json::to_string(&data).unwrap().as_bytes())?; - - Ok(node_map) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_assign_to_rle() { - let assign_vec: Vec = vec![1, 1, 1, 2, 2, 3]; - - let result: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)]; - - assert_eq!(assign_to_rle(assign_vec), result); - } - - #[test] - fn test_rle_to_vec() { - let rle_vec: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)]; - - let result: Vec = vec![1, 1, 1, 2, 2, 3]; - - assert_eq!(rle_to_vec(rle_vec), result); - } - - #[test] - fn test_relabel_small_file() { - // - // 6 -- 7 -- 8 - // | | | - // 3 -- 4 -- 5 - // | | | - // 0 -- 1 -- 2 - // - let input = r#"{ - "adjacency": [ - [ { "id": 3 }, { "id": 1 } ], - [ { "id": 0 }, { "id": 4 }, { "id": 2 } ], - [ { "id": 1 }, { "id": 5 } ], - [ { "id": 0 }, { "id": 6 }, { "id": 4 } ], - [ { "id": 1 }, { "id": 3 }, { "id": 7 }, { "id": 5 } ], - [ { "id": 2 }, { "id": 4 }, { "id": 8 } ], - [ { "id": 3 }, { "id": 7 } ], - [ { "id": 4 }, { "id": 6 }, { "id": 8 } ], - [ { "id": 5 }, { "id": 7 } ] - ], - "directed": false, - "graph": [], - "multigraph": false, - "nodes": [ - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288005", - "id": 0 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288004", - "id": 1 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288003", - "id": 2 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288006", - "id": 3 - }, - { - "TOTPOP": 1, - "boundary_nodes": false, - "boundary_perim": 0, - "GEOID20": "20258288001", - "id": 4 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288002", - "id": 5 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288007", - "id": 6 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288008", - "id": 7 - }, - { - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "GEOID20": "20258288009", - "id": 8 - } - ] -} -"#; - - let reader = input.as_bytes(); - - let mut output = Vec::new(); - let writer = &mut output; - - let key = "GEOID20"; - - let _ = sort_json_file_by_key(reader, writer, key).unwrap(); - - // - // 6 -- 7 -- 8 - // | | | - // 5 -- 0 -- 1 - // | | | - // 4 -- 3 -- 2 - // - let expected_output = r#"{ - "adjacency": [ - [ { "id": 3 }, { "id": 5 }, { "id": 7 }, { "id": 1 } ], - [ { "id": 2 }, { "id": 0 }, { "id": 8 } ], [ { "id": 3 }, { "id": 1 } ], - [ { "id": 4 }, { "id": 0 }, { "id": 2 } ], - [ { "id": 5 }, { "id": 3 } ], - [ { "id": 4 }, { "id": 6 }, { "id": 0 } ], - [ { "id": 5 }, { "id": 7 } ], - [ { "id": 0 }, { "id": 6 }, { "id": 8 } ], - [ { "id": 1 }, { "id": 7 } ] - ], - "directed": false, - "graph": [], - "multigraph": false, - "nodes": [ - { - "GEOID20": "20258288001", - "TOTPOP": 1, - "boundary_nodes": false, - "boundary_perim": 0, - "id": 0 - }, - { - "GEOID20": "20258288002", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 1 - }, - { - "GEOID20": "20258288003", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 2 - }, - { - "GEOID20": "20258288004", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 3 - }, - { - "GEOID20": "20258288005", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 4 - }, - { - "GEOID20": "20258288006", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 5 - }, - { - "GEOID20": "20258288007", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 6 - }, - { - "GEOID20": "20258288008", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 7 - }, - { - "GEOID20": "20258288009", - "TOTPOP": 1, - "boundary_nodes": true, - "boundary_perim": 1, - "id": 8 - } - ] -} -"#; - - logln!(); - let output_json: Value = serde_json::from_slice(&output).unwrap(); - let expected_output_json: Value = serde_json::from_str(expected_output).unwrap(); - - assert_eq!(output_json, expected_output_json); - } -} diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 2d2677d..e95f0d8 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1,13 +1,14 @@ #![allow(clippy::needless_collect)] -use ben::decode::{ +use ben::codec::decode::{ decode_ben_line, decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, - BenDecoder, DecoderInitError, XBenDecoder, }; -use ben::encode::{ +use ben::codec::encode::{ encode_ben_to_xben, encode_ben_vec_from_rle, encode_jsonl_to_ben, encode_jsonl_to_xben, - xz_compress, BenEncoder, + xz_compress, }; +use ben::io::reader::{BenDecoder, DecoderInitError, XBenDecoder}; +use ben::io::writer::BenEncoder; use ben::BenVariant; use proptest::prelude::*; diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 321f275..128776d 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -1,6 +1,6 @@ -use ben::decode::*; -use ben::encode::*; -use ben::utils::*; +use ben::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; +use ben::codec::encode::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +use ben::util::rle::rle_to_vec; use ben::BenVariant; use serde_json::json; use std::io::{Cursor, Read, Write}; diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index dd811c1..f7ebf55 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -1,6 +1,7 @@ -use ben::decode::{ - build_frame_iter, count_samples_from_file, decode_ben_to_jsonl, decode_xben_to_ben, - decode_xben_to_jsonl, BenDecoder, MkvRecord, Selection, SubsampleFrameDecoder, XBenDecoder, +use ben::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; +use ben::io::reader::{ + build_frame_iter, count_samples_from_file, BenDecoder, MkvRecord, Selection, + SubsampleFrameDecoder, XBenDecoder, }; use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 53fbafa..8c8a1d5 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -1,4 +1,5 @@ -use ben::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, BenEncoder}; +use ben::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; +use ben::io::writer::BenEncoder; use ben::BenVariant; use pyo3::exceptions::{PyException, PyIOError}; use pyo3::prelude::PyResult; From 8da732f885a221b6801f8c2af484069b54d1f522 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 09:17:59 -0600 Subject: [PATCH 002/221] improve testing in ben --- ben/src/codec/decode/tests.rs | 81 +++++++ ben/src/codec/encode/tests.rs | 8 + ben/src/codec/translate/tests.rs | 98 +++++++++ ben/src/json/graph/tests.rs | 85 ++++++++ ben/src/ops/extract/tests.rs | 69 ++++++ ben/src/ops/relabel/tests.rs | 76 +++++++ ben/tests/test_impls_pipeline.rs | 359 ++++++++++++++++++++++++++++++- 7 files changed, 775 insertions(+), 1 deletion(-) diff --git a/ben/src/codec/decode/tests.rs b/ben/src/codec/decode/tests.rs index 5e2ba20..fcee0dd 100644 --- a/ben/src/codec/decode/tests.rs +++ b/ben/src/codec/decode/tests.rs @@ -1,7 +1,9 @@ use super::*; +use crate::codec::encode::xz_compress; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::{json, Value}; +use std::io::{self, BufRead, BufReader}; #[test] fn test_jsonl_decode_ben_underflow() { @@ -265,6 +267,85 @@ fn test_decode_ben_max_val_and_len_at_65535() { assert_eq!(output, (expected_output.to_string() + "\n").as_bytes()); } +#[test] +fn test_jsonl_decode_ben32_propagates_non_eof_error() { + struct AlwaysErrBuf; + + impl io::Read for AlwaysErrBuf { + fn read(&mut self, _buf: &mut [u8]) -> io::Result { + Err(io::Error::other("boom")) + } + } + + impl BufRead for AlwaysErrBuf { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + Err(io::Error::other("boom")) + } + + fn consume(&mut self, _amt: usize) {} + } + + let err = jsonl_decode_ben32(AlwaysErrBuf, Vec::new(), 0, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert_eq!(err.to_string(), "boom"); +} + +#[test] +fn test_decode_xben_to_ben_rejects_invalid_inner_header() { + let mut xz = Vec::new(); + xz_compress(BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), &mut xz, Some(1), Some(0)) + .unwrap(); + + let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn test_decode_xben_to_jsonl_rejects_invalid_inner_header() { + let mut xz = Vec::new(); + xz_compress(BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), &mut xz, Some(1), Some(0)) + .unwrap(); + + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn test_decode_xben_to_ben_handles_partial_overflow_without_frame() { + let mut xz = Vec::new(); + let mut inner = b"STANDARD BEN FILE".to_vec(); + inner.extend_from_slice(&[1, 2, 3]); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + + let mut out = Vec::new(); + decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap(); + assert_eq!(out, b"STANDARD BEN FILE"); +} + +#[test] +fn test_decode_xben_to_jsonl_handles_partial_overflow_without_frame() { + let mut xz = Vec::new(); + let mut inner = b"STANDARD BEN FILE".to_vec(); + inner.extend_from_slice(&[1, 2, 3]); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + + let mut out = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); + assert!(out.is_empty()); +} + +#[test] +fn test_decode_xben_to_ben_short_xz_stream_errors() { + let err = decode_xben_to_ben(BufReader::new([].as_slice()), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn test_decode_xben_to_jsonl_short_xz_stream_errors() { + let err = decode_xben_to_jsonl(BufReader::new([].as_slice()), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + #[test] fn test_decode_ben_single_element() { let mut input: Vec = b"STANDARD BEN FILE".to_vec(); diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index ab288e0..27b214b 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -258,6 +258,14 @@ fn test_encode_jsonl_to_ben_len_65535() { assert_eq!(buffer, expected_output); } +#[test] +fn test_encode_ben_vec_from_assign_matches_rle_entrypoint() { + let assign_vec = vec![4u16, 4, 4, 1, 1, 3, 3, 3, 2]; + let direct = encode_ben_vec_from_assign(assign_vec.clone()); + let via_rle = encode_ben_vec_from_rle(crate::util::rle::assign_to_rle(assign_vec)); + assert_eq!(direct, via_rle); +} + #[test] fn encode_jsonl_to_ben_max_val_and_len_at_65535() { let rle_vec: Vec<(u16, u16)> = vec![(1, 3), (65535, 65535), (8, 4)]; diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 8313f89..5abe3b8 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -233,3 +233,101 @@ fn test_random_translation_ben_to_ben32() { assert_eq!(writer, &buffer); } + +#[test] +fn test_ben32_to_ben_line_rejects_invalid_length() { + let err = ben32_to_ben_line(vec![1, 2, 3]).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert_eq!(err.to_string(), "Invalid ben32 data length"); +} + +#[test] +fn test_ben32_to_ben_line_rejects_missing_terminator() { + let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1]).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert_eq!( + err.to_string(), + "Invalid ben32 data format. Missing end of line separator." + ); +} + +#[test] +fn test_ben32_to_ben_lines_preserves_mkv_counts() { + let input = [ + 0, 7, 0, 3, 0, 0, 0, 0, 0, 5, // one ben32 record and count=5 + ]; + + let mut output = Vec::new(); + ben32_to_ben_lines(&input[..], &mut output, BenVariant::MkvChain).unwrap(); + + let count = u16::from_be_bytes([output[output.len() - 2], output[output.len() - 1]]); + assert_eq!(count, 5); +} + +#[test] +fn test_ben_to_ben32_lines_propagates_non_eof_read_errors() { + struct FailAfterFirstByte { + data: Vec, + reads: usize, + } + + impl Read for FailAfterFirstByte { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.reads += 1; + if self.reads == 1 { + buf[0] = self.data[0]; + Ok(1) + } else { + Err(io::Error::other("boom")) + } + } + } + + let mut output = Vec::new(); + let err = ben_to_ben32_lines( + FailAfterFirstByte { + data: vec![1], + reads: 0, + }, + &mut output, + BenVariant::Standard, + ) + .unwrap_err(); + + assert_eq!(err.kind(), io::ErrorKind::Other); + assert_eq!(err.to_string(), "boom"); +} + +#[test] +fn test_ben32_to_ben_lines_propagates_non_eof_read_errors() { + struct BoomReader; + + impl Read for BoomReader { + fn read(&mut self, _buf: &mut [u8]) -> io::Result { + Err(io::Error::other("boom")) + } + } + + let err = ben32_to_ben_lines(BoomReader, Vec::new(), BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert_eq!(err.to_string(), "boom"); +} + +#[test] +fn test_ben_to_ben32_lines_mkv_roundtrip() { + let jsonl = r#"{"assignment":[4,4,4],"sample":1} +{"assignment":[4,4,4],"sample":2} +{"assignment":[7,8],"sample":3} +"#; + + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut ben32 = Vec::new(); + ben_to_ben32_lines(&ben[17..], &mut ben32, BenVariant::MkvChain).unwrap(); + + let mut round = Vec::new(); + ben32_to_ben_lines(ben32.as_slice(), &mut round, BenVariant::MkvChain).unwrap(); + + assert_eq!(round, ben[17..]); +} diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 68c7dbf..4e9e45a 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -182,3 +182,88 @@ fn test_relabel_small_file() { assert_eq!(output_json, expected_output_json); } + +#[test] +fn test_sort_json_file_by_numeric_key() { + let input = r#"{ + "nodes": [ + {"id": 0, "rank": 20}, + {"id": 1, "rank": 5}, + {"id": 2, "rank": 10} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 0}, {"id": 2}], + [{"id": 1}] + ] + }"#; + + let mut output = Vec::new(); + let mapping = sort_json_file_by_key(input.as_bytes(), &mut output, "rank").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(mapping.get(&1), Some(&0)); + assert_eq!(mapping.get(&2), Some(&1)); + assert_eq!(mapping.get(&0), Some(&2)); + assert_eq!(output_json["nodes"][0]["rank"], 5); + assert_eq!(output_json["nodes"][1]["rank"], 10); + assert_eq!(output_json["nodes"][2]["rank"], 20); +} + +#[test] +fn test_sort_json_file_by_key_with_non_numeric_values() { + let input = r#"{ + "nodes": [ + {"id": 0, "key": {"nested": true}}, + {"id": 1, "key": "abc"}, + {"id": 2, "key": 7} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 2}], + [{"id": 0}] + ] + }"#; + + let mut output = Vec::new(); + sort_json_file_by_key(input.as_bytes(), &mut output, "key").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(output_json["nodes"][0]["key"], 7); + assert_eq!(output_json["nodes"][1]["key"], "abc"); + assert_eq!(output_json["nodes"][2]["key"], serde_json::json!({"nested": true})); +} + +#[test] +fn test_sort_json_file_by_key_err_string_vs_number_branch() { + let input = r#"{ + "nodes": [ + {"id": 0, "key": "zzz"}, + {"id": 1, "key": 3} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 0}] + ] + }"#; + + let mut output = Vec::new(); + sort_json_file_by_key(input.as_bytes(), &mut output, "key").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(output_json["nodes"][0]["key"], 3); + assert_eq!(output_json["nodes"][1]["key"], "zzz"); +} + +#[test] +fn test_sort_json_file_by_key_without_nodes_or_edges() { + let input = br#"{"graph": [], "directed": false}"#; + let mut output = Vec::new(); + + let mapping = sort_json_file_by_key(&input[..], &mut output, "unused").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert!(mapping.is_empty()); + assert_eq!(output_json["graph"], serde_json::json!([])); + assert_eq!(output_json["directed"], false); +} diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 6d7cb6e..e03f487 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -1,4 +1,9 @@ use super::*; +use crate::codec::encode::encode_jsonl_to_xben; +use crate::BenVariant; +use serde_json::json; +use std::error::Error as _; +use std::io::BufReader; #[test] fn test_extract_assignment_ben() { @@ -111,3 +116,67 @@ fn test_extract_assignment_sample_too_large() { ), } } + +#[test] +fn test_extract_assignment_ben_rejects_zero_sample_number() { + let err = extract_assignment_ben([].as_slice(), 0).unwrap_err(); + assert!(matches!(err.kind, SampleErrorKind::InvalidSampleNumber)); + assert_eq!( + err.to_string(), + "Invalid sample number. Sample number must be greater than 0" + ); + assert!(err.source().is_none()); +} + +#[test] +fn test_extract_assignment_xben_roundtrip_and_errors() { + let jsonl = [ + json!({"assignment":[1,1,2], "sample": 1}).to_string(), + json!({"assignment":[3,3,4], "sample": 2}).to_string(), + json!({"assignment":[3,3,4], "sample": 3}).to_string(), + ] + .join("\n") + + "\n"; + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(0), + ) + .unwrap(); + + let assignment = extract_assignment_xben(xben.as_slice(), 3).unwrap(); + assert_eq!(assignment, vec![3, 3, 4]); + + let missing = extract_assignment_xben(xben.as_slice(), 4).unwrap_err(); + assert!(matches!( + missing.kind, + SampleErrorKind::SampleNotFound { sample_number: 4 } + )); + assert_eq!( + missing.to_string(), + "Sample number not found in file. Failed to find sample '4'. Last sample seems to be '3'" + ); + assert!(missing.source().is_none()); + + let zero = extract_assignment_xben(xben.as_slice(), 0).unwrap_err(); + assert!(matches!(zero.kind, SampleErrorKind::InvalidSampleNumber)); +} + +#[test] +fn test_sample_error_conversion_and_sources() { + let io_err = io::Error::other("boom"); + let sample_err = SampleError::from(io_err); + assert!(matches!(sample_err.kind, SampleErrorKind::IoError(_))); + assert_eq!(sample_err.to_string(), "IO Error: boom"); + assert!(sample_err.source().is_some()); + + let json_err = serde_json::from_str::("{").unwrap_err(); + let sample_err = SampleError::from(json_err); + assert!(matches!(sample_err.kind, SampleErrorKind::JsonError(_))); + assert!(sample_err.to_string().starts_with("JSON Error: ")); + assert!(sample_err.source().is_some()); +} diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 3b8ddf9..81faebb 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -328,3 +328,79 @@ fn test_relabel_simple_file_with_map_mkv() { assert_eq!(output_str, out_file); } + +#[test] +fn test_relabel_file_rejects_invalid_header() { + let err = relabel_ben_file(b"not a valid banner".as_slice(), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert_eq!(err.to_string(), "Invalid file format"); +} + +#[test] +fn test_relabel_file_with_map_rejects_invalid_header() { + let err = relabel_ben_file_with_map( + b"not a valid banner".as_slice(), + Vec::new(), + HashMap::new(), + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert_eq!(err.to_string(), "Invalid file format"); +} + +#[test] +fn test_relabel_lines_propagate_non_eof_reader_error() { + struct BoomReader { + returned_first: bool, + } + + impl io::Read for BoomReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.returned_first { + return Err(io::Error::other("boom")); + } + self.returned_first = true; + buf[0] = 1; + Ok(1) + } + } + + let err = relabel_ben_lines( + BoomReader { + returned_first: false, + }, + Vec::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); +} + +#[test] +fn test_relabel_lines_with_map_propagate_non_eof_reader_error() { + struct BoomReader { + returned_first: bool, + } + + impl io::Read for BoomReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.returned_first { + return Err(io::Error::other("boom")); + } + self.returned_first = true; + buf[0] = 1; + Ok(1) + } + } + + let err = relabel_ben_lines_with_map( + BoomReader { + returned_first: false, + }, + Vec::new(), + HashMap::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); +} diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index e95f0d8..8d77ac9 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -7,13 +7,20 @@ use ben::codec::encode::{ encode_ben_to_xben, encode_ben_vec_from_rle, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; -use ben::io::reader::{BenDecoder, DecoderInitError, XBenDecoder}; +use ben::io::reader::{ + build_frame_iter, count_samples_from_file, BenDecoder, DecoderInitError, Frame, + SubsampleFrameDecoder, XBenDecoder, +}; use ben::io::writer::BenEncoder; use ben::BenVariant; use proptest::prelude::*; use serde_json::json; +use std::error::Error as _; +use std::fs; use std::io::{BufReader, Cursor, Write}; +use std::path::PathBuf; +use std::time::{SystemTime, UNIX_EPOCH}; // ---------- Helpers ---------- @@ -66,6 +73,17 @@ where Ok(out) } +fn collect_frames(it: I) -> std::io::Result> +where + I: IntoIterator>, +{ + let mut out = Vec::new(); + for rec in it { + out.push(rec?); + } + Ok(out) +} + // ---------- proptest strategies ---------- /// Strategy for a single assignment vector: @@ -698,3 +716,342 @@ fn xz_mt_params_are_capped_and_safe() { decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); assert_eq!(out, jsonl.as_bytes()); } + +#[test] +fn ben_encoder_write_assignment_path_roundtrips() { + let mut ben = Vec::new(); + { + let mut enc = BenEncoder::new(&mut ben, BenVariant::Standard); + enc.write_assignment(vec![9u16, 9, 2, 2, 2]).unwrap(); + enc.finish().unwrap(); + } + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + assert_eq!(out, br#"{"assignment":[9,9,2,2,2],"sample":1} +"#); +} + +#[test] +fn ben_decoder_new_reports_short_header_as_io_error() { + let err = BenDecoder::new([1u8, 2, 3].as_slice()).err().unwrap(); + match err { + DecoderInitError::Io(e) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), + other => panic!("unexpected error: {other:?}"), + } +} + +#[test] +fn ben_decoder_write_all_jsonl_propagates_frame_errors() { + let mut malformed = b"STANDARD BEN FILE".to_vec(); + malformed.extend_from_slice(&[3]); // start of a frame, but truncated + + let mut decoder = BenDecoder::new(malformed.as_slice()).unwrap(); + let err = decoder.write_all_jsonl(Vec::new()).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} + +#[test] +fn ben_decoder_count_samples_propagates_frame_errors() { + let mut malformed = b"STANDARD BEN FILE".to_vec(); + malformed.extend_from_slice(&[3]); + + let err = BenDecoder::new(malformed.as_slice()) + .unwrap() + .count_samples() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} + +#[test] +fn xben_frame_decoder_new_and_truncated_iteration_paths() { + let jsonl = r#"{"assignment":[1,1,1],"sample":1} +{"assignment":[2,2],"sample":2} +"#; + let mut xz = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xz, + BenVariant::Standard, + Some(1), + Some(0), + ) + .unwrap(); + + let mut frames = ben::io::reader::XBenFrameDecoder::new(xz.as_slice()).unwrap(); + assert!(frames.next().unwrap().is_ok()); + + let trimmed = &xz[..xz.len() - 1]; + let mut frames = ben::io::reader::XBenFrameDecoder::new(trimmed).unwrap(); + loop { + match frames.next() { + Some(Err(e)) => { + assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof); + break; + } + Some(Ok(_)) => continue, + None => panic!("expected truncated-frame error"), + } + } +} + +#[test] +fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { + let mut payload_only = Vec::new(); + { + let mut enc = BenEncoder::new(&mut payload_only, BenVariant::Standard); + enc.write_assignment(vec![5u16, 5, 7]).unwrap(); + enc.finish().unwrap(); + } + let payload_only = payload_only[17..].to_vec(); + + let mut xz = Vec::new(); + { + let mt = xz2::stream::MtStreamBuilder::new() + .threads(1) + .preset(0) + .block_size(0) + .encoder() + .unwrap(); + let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); + let mut xben = ben::io::writer::XBenEncoder::new(encoder, BenVariant::Standard); + xben.write_ben_file(BufReader::new(payload_only.as_slice())).unwrap(); + } + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut ben).unwrap(); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + assert_eq!(out, br#"{"assignment":[5,5,7],"sample":1} +"#); +} + +struct FailAfterN { + data: Vec, + pos: usize, + fail_at: usize, +} + +impl std::io::Read for FailAfterN { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.pos >= self.fail_at { + return Err(std::io::Error::other("boom")); + } + if self.pos >= self.data.len() { + return Ok(0); + } + let n = buf.len().min(self.data.len() - self.pos).min(self.fail_at - self.pos); + buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]); + self.pos += n; + Ok(n) + } +} + +#[test] +fn ben_decoder_frame_read_error_paths() { + let banner = b"STANDARD BEN FILE".to_vec(); + + let err = BenDecoder::new(FailAfterN { + data: [banner.clone(), vec![3]].concat(), + pos: 0, + fail_at: 18, + }) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); + + let err = BenDecoder::new(FailAfterN { + data: [banner.clone(), vec![3, 3, 0]].concat(), + pos: 0, + fail_at: 20, + }) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); + + let err = BenDecoder::new(FailAfterN { + data: [banner.clone(), vec![3, 3, 0, 0, 0, 1]].concat(), + pos: 0, + fail_at: 23, + }) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); +} + +#[test] +fn ben_decoder_mkv_count_read_error_path() { + let mut ben = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(br#"{"assignment":[1,1],"sample":1}"#.as_slice()), + &mut ben, + BenVariant::MkvChain, + ) + .unwrap(); + let truncated = ben[..ben.len() - 1].to_vec(); + let err = BenDecoder::new(truncated.as_slice()) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} + +#[test] +fn subsample_frame_decoder_propagates_inner_and_decode_errors() { + let mut inner = SubsampleFrameDecoder::by_indices( + vec![Err(std::io::Error::other("boom"))].into_iter(), + vec![1], + ); + let err = inner.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); + + let mut malformed = SubsampleFrameDecoder::by_indices( + vec![Ok((Frame::XBen(vec![1, 2, 3], BenVariant::Standard), 1))].into_iter(), + vec![1], + ); + let err = malformed.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} + +fn unique_temp_path(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("binary-ensemble-{name}-{nonce}.tmp")) +} + +#[test] +fn decoder_init_error_display_source_and_conversion_paths() { + let io_error = DecoderInitError::from(std::io::Error::other("boom")); + assert_eq!(io_error.to_string(), "IO error: boom"); + assert!(io_error.source().is_some()); + + let xz_bytes = { + let mut buf = Vec::new(); + xz_compress(BufReader::new(b"hello".as_slice()), &mut buf, Some(1), Some(0)).unwrap(); + buf + }; + let xz_header = xz_bytes[..17].to_vec(); + let invalid = DecoderInitError::InvalidFileFormat(xz_header.clone()); + let msg = invalid.to_string(); + assert!(msg.contains("Compressed header detected")); + assert!(msg.contains("decode_xben_to_ben")); + assert!(invalid.source().is_none()); + + let generic = DecoderInitError::InvalidFileFormat(b"not a ben header!!".to_vec()); + assert!(generic.to_string().contains("utf8-lossy")); + + let io_err: std::io::Error = DecoderInitError::InvalidFileFormat(xz_header).into(); + assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn ben_decoder_and_xben_decoder_count_samples() { + let jsonl = r#"{"assignment":[1,1],"sample":1} +{"assignment":[1,1],"sample":2} +{"assignment":[2,2],"sample":3} +"#; + + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); + assert_eq!(BenDecoder::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(0), + ) + .unwrap(); + assert_eq!(XBenDecoder::new(xben.as_slice()).unwrap().count_samples().unwrap(), 3); +} + +#[test] +fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { + let jsonl = r#"{"assignment":[1,1],"sample":1} +{"assignment":[2,2],"sample":2} +{"assignment":[2,2],"sample":3} +"#; + + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); + let ben_path = unique_temp_path("sample.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(0), + ) + .unwrap(); + let xben_path = unique_temp_path("sample.xben"); + fs::write(&xben_path, &xben).unwrap(); + + let ben_iter = build_frame_iter(&ben_path, "ben").unwrap(); + assert_eq!(collect_frames(ben_iter).unwrap().len(), 2); + + let xben_iter = build_frame_iter(&xben_path, "xben").unwrap(); + assert_eq!(collect_frames(xben_iter).unwrap().len(), 2); + + assert_eq!(count_samples_from_file(&ben_path, "ben").unwrap(), 3); + assert_eq!(count_samples_from_file(&xben_path, "xben").unwrap(), 3); + + let err = build_frame_iter(&ben_path, "wat").err().unwrap(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + + fs::remove_file(ben_path).unwrap(); + fs::remove_file(xben_path).unwrap(); +} + +#[test] +fn ben_decoder_subsample_helpers_work_on_public_api() { + let jsonl = r#"{"assignment":[1],"sample":1} +{"assignment":[2],"sample":2} +{"assignment":[3],"sample":3} +{"assignment":[4],"sample":4} +"#; + + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut by_indices = BenDecoder::new(ben.as_slice()) + .unwrap() + .into_subsample_by_indices(vec![4, 1, 1, 3]); + let picked = collect_records(&mut by_indices).unwrap(); + assert_eq!( + picked.into_iter().map(|(a, _)| a[0]).collect::>(), + vec![1, 3, 4] + ); + + let mut by_range = BenDecoder::new(ben.as_slice()) + .unwrap() + .into_subsample_by_range(2, 3); + let picked = collect_records(&mut by_range).unwrap(); + assert_eq!( + picked.into_iter().map(|(a, _)| a[0]).collect::>(), + vec![2, 3] + ); + + let mut every = BenDecoder::new(ben.as_slice()) + .unwrap() + .into_subsample_every(2, 2); + let picked = collect_records(&mut every).unwrap(); + assert_eq!( + picked.into_iter().map(|(a, _)| a[0]).collect::>(), + vec![2, 4] + ); +} From a85511ea5609867e2cdca5213bc273b06e10d1cb Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 09:41:45 -0600 Subject: [PATCH 003/221] reorg the clis --- ben/src/bin/ben.rs | 582 +----------------------------------------- ben/src/bin/pben.rs | 175 +------------ ben/src/bin/reben.rs | 221 +--------------- ben/src/cli/ben.rs | 467 +++++++++++++++++++++++++++++++++ ben/src/cli/common.rs | 24 ++ ben/src/cli/mod.rs | 4 + ben/src/cli/pben.rs | 164 ++++++++++++ ben/src/cli/reben.rs | 208 +++++++++++++++ ben/src/lib.rs | 1 + 9 files changed, 872 insertions(+), 974 deletions(-) create mode 100644 ben/src/cli/ben.rs create mode 100644 ben/src/cli/common.rs create mode 100644 ben/src/cli/mod.rs create mode 100644 ben/src/cli/pben.rs create mode 100644 ben/src/cli/reben.rs diff --git a/ben/src/bin/ben.rs b/ben/src/bin/ben.rs index ec8b5f3..e78bcf0 100755 --- a/ben/src/bin/ben.rs +++ b/ben/src/bin/ben.rs @@ -1,583 +1,3 @@ -use ben::codec::decode::{ - decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, -}; -use ben::codec::encode::{ - encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, -}; -use ben::ops::extract::extract_assignment_ben; -use ben::{logln, BenVariant}; -use clap::{Parser, ValueEnum}; -use std::{ - fs::File, - io::{self, BufReader, BufWriter, Result, Write}, - path::Path, -}; -/// Defines the mode of operation. -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -enum Mode { - Encode, - XEncode, - Decode, - XDecode, - Read, - XzCompress, - XzDecompress, -} - -/// Defines the command line arguments accepted by the program. -#[derive(Parser, Debug)] -#[command( - name = "Binary Ensemble CLI Tool", - about = "This is a command line tool for encoding and decoding binary ensemble files.", - version = "0.2.0" -)] -struct Args { - /// Mode to run the program in (encode, decode, or read). - #[arg(short, long, value_enum)] - mode: Mode, - - /// Input file to read from. - #[arg()] - input_file: Option, - - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. - #[arg(short, long)] - output_file: Option, - - /// The standard behaviour is to try and derive the output file - /// name from the input file name. If this flag is set, then this - /// logic is ignored and the output is printed to stdout. - /// This flag is considered a higher priority than - /// the output_file flag, so if both are present, the output - /// will be printed to stdout. - #[arg(short, long)] - print: bool, - - /// Sample number to extract. Optional. - #[arg(short = 'n', long)] - sample_number: Option, - - /// If input and output files are not provided, - /// then this tells the x-encode, x-decode, and decode modes - /// that the expected formats are BEN and XBEN - #[arg(short = 'b', long)] - ben_and_xben: bool, - - /// If input and output files are not provided, - /// then this tells the x-encode and x-decode modes - /// that the expected formats are JSONL and XBEN - #[arg(short = 'J', long)] - jsonl_and_xben: bool, - - /// If the input and output files are not provided, - /// then this tells the decode mode that the expected - /// formats are JSONL and BEN - #[arg(short = 'j', long)] - jsonl_and_ben: bool, - - /// When saving a file in the BEN format, the deault is to have - /// an assignment vector saved followed by the number of repetitions - /// of that assignment vector (this is useful for Markov chian methods - /// like ReCom). This flag will cause the program to forgo the repetition - /// count and just save all of the assignment vectors as they are encountered. - #[arg(short = 'a', long)] - save_all: bool, - - /// If the output file already exists, this flag - /// will cause the program to overwrite it without - /// asking the user for confirmation. - #[arg(short = 'w', long)] - overwrite: bool, - - /// Enables verbose printing for the CLI. Optional. - #[arg(short, long)] - verbose: bool, - - /// When running x-encoder, this flag will determine the number of cpus to use on the - /// system. By default, all available cpus will be used. - #[arg(short = 'c', long)] - n_cpus: Option, - - /// When running x-encoder, this flag will deterimine the level of compression to use. - /// By default, the highest level of compression will be used. - /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. - #[arg(short = 'l', long)] - compression_level: Option, -} - -fn encode_setup( - mode: Mode, - input_file_name: String, - output_file_name: Option, - overwrite: bool, -) -> Result { - let extension = if mode == Mode::XEncode { - ".xben" - } else if mode == Mode::Encode { - ".ben" - } else { - ".xz" - }; - - let out_file_name = match output_file_name { - Some(name) => name.to_owned(), - None => { - if input_file_name.ends_with(".ben") && extension == ".xben" { - input_file_name.trim_end_matches(".ben").to_owned() + extension - } else { - input_file_name.to_string() + extension - } - } - }; - - if let Err(e) = check_overwrite(&out_file_name, overwrite) { - return Err(e); - } - - Ok(out_file_name) -} - -fn decode_setup( - in_file_name: String, - out_file_name: Option, - full_decode: bool, - overwrite: bool, -) -> Result { - let out_file_name = if let Some(name) = out_file_name { - name.to_owned() - } else if in_file_name.ends_with(".ben") { - in_file_name.trim_end_matches(".ben").to_owned() - } else if in_file_name.ends_with(".xben") { - if !full_decode { - in_file_name.trim_end_matches(".xben").to_owned() + ".ben" - } else { - in_file_name.trim_end_matches(".xben").to_owned() - } - } else if in_file_name.ends_with(".xz") { - eprintln!( - "Error: Unsupported file type for decode mode {:?}. Please decompress xz files with \ - either the xz command line tool or the xz-decompress mode of this tool.", - in_file_name - ); - return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); - } else { - eprintln!( - "Error: Unsupported file type for decode mode {:?}. Supported types are .ben and .xben.", - in_file_name - ); - return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); - }; - - if let Err(e) = check_overwrite(&out_file_name, overwrite) { - return Err(e); - } - - Ok(out_file_name) -} - -fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { - if Path::new(file_name).exists() && !overwrite { - eprint!( - "File {:?} already exists, do you want to overwrite it? (y/[n]): ", - file_name - ); - let mut user_input = String::new(); - std::io::stdin().read_line(&mut user_input).unwrap(); - eprintln!(); - if user_input.trim().to_lowercase() != "y" { - return Err(std::io::Error::from(std::io::ErrorKind::AlreadyExists)); - } - } - Ok(()) -} - fn main() { - let args = Args::parse(); - - if args.verbose { - std::env::set_var("RUST_LOG", "trace"); - } - - match args.mode { - Mode::Encode => { - logln!("Running in encode mode"); - - let reader: Box; - let writer: Box; - - match args.input_file { - Some(in_file) => { - reader = Box::new(BufReader::new(File::open(&in_file).unwrap())) - as Box; - - if args.print { - writer = Box::new(BufWriter::new(io::stdout())) as Box; - } else { - let out_file_name = match encode_setup( - args.mode, - in_file, - args.output_file, - args.overwrite, - ) { - Ok(name) => name, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let out_file = File::create(&out_file_name).unwrap(); - writer = Box::new(BufWriter::new(out_file)) as Box; - } - } - None => { - reader = Box::new(BufReader::new(io::stdin())) as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - match args.output_file { - Some(name) => { - if let Err(e) = check_overwrite(&name, args.overwrite) { - eprintln!("Error: {:?}", e); - return; - } - let out_file = File::create(&name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - } - None => Box::new(BufWriter::new(io::stdout())) as Box, - } - } - } - }; - - let possible_error = if args.save_all { - encode_jsonl_to_ben(reader, writer, BenVariant::Standard) - } else { - encode_jsonl_to_ben(reader, writer, BenVariant::MkvChain) - }; - - match possible_error { - Ok(_) => {} - Err(err) => { - eprintln!("Error: {:?}", err); - } - } - } - Mode::XEncode => { - logln!("Running in xencode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_xben = args.ben_and_xben; - - let reader: Box; - let writer: Box; - - match args.input_file { - Some(in_file) => { - if in_file.ends_with(".ben") { - ben_and_xben = true; - } else if in_file.ends_with(".jsonl") { - jsonl_and_xben = true; - } - - reader = Box::new(BufReader::new(File::open(&in_file).unwrap())) - as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - let out_file_name = match encode_setup( - args.mode, - in_file, - args.output_file, - args.overwrite, - ) { - Ok(name) => name, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let out_file = File::create(&out_file_name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - }; - } - None => { - reader = Box::new(BufReader::new(io::stdin())) as Box; - - writer = match args.output_file { - Some(name) => { - if let Err(e) = check_overwrite(&name, args.overwrite) { - eprintln!("Error: {:?}", e); - return; - } - let out_file = File::create(&name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - } - None => Box::new(BufWriter::new(io::stdout())) as Box, - }; - } - }; - - if ben_and_xben { - if let Err(err) = - encode_ben_to_xben(reader, writer, args.n_cpus, args.compression_level) - { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_xben { - let possible_error = if args.save_all { - encode_jsonl_to_xben( - reader, - writer, - BenVariant::Standard, - args.n_cpus, - args.compression_level, - ) - } else { - encode_jsonl_to_xben( - reader, - writer, - BenVariant::MkvChain, - args.n_cpus, - args.compression_level, - ) - }; - if let Err(e) = possible_error { - eprintln!("Error: {:?}", e); - } - } else { - eprintln!("Error: Unsupported file type(s) for xencode mode"); - } - } - Mode::Decode => { - logln!("Running in decode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_ben = args.jsonl_and_ben; - - let reader: Box; - let writer: Box; - - match args.input_file { - Some(file) => { - if file.ends_with(".ben") { - jsonl_and_ben = true; - } else if file.ends_with(".xben") { - ben_and_xben = true; - } - - reader = Box::new(BufReader::new(File::open(&file).unwrap())) - as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - let out_file_name = - match decode_setup(file, args.output_file, false, args.overwrite) { - Ok(name) => name, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let out_file = File::create(&out_file_name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - }; - } - None => { - reader = Box::new(BufReader::new(io::stdin())) as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - match args.output_file { - Some(out_name) => { - if let Err(e) = check_overwrite(&out_name, args.overwrite) { - eprintln!("Error: {:?}", e); - return; - } - let out_file = File::create(&out_name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - } - None => Box::new(BufWriter::new(io::stdout())) as Box, - } - } - } - } - - if ben_and_xben { - if let Err(err) = decode_xben_to_ben(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_ben { - if let Err(err) = decode_ben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else { - eprintln!("Error: Unsupported file type(s) for decode mode"); - } - } - Mode::XDecode => { - logln!("Running in x-decode mode"); - - let reader: Box; - let writer: Box; - - match args.input_file { - Some(file) => { - reader = Box::new(BufReader::new(File::open(&file).unwrap())) - as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - let out_file_name = - match decode_setup(file, args.output_file, true, args.overwrite) { - Ok(name) => name, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let out_file = File::create(&out_file_name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - } - } - None => { - reader = Box::new(BufReader::new(io::stdin())) as Box; - - writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - match args.output_file { - Some(out_name) => { - if let Err(e) = check_overwrite(&out_name, args.overwrite) { - eprintln!("Error: {:?}", e); - return; - } - let out_file = File::create(&out_name).unwrap(); - Box::new(BufWriter::new(out_file)) as Box - } - None => Box::new(BufWriter::new(io::stdout())) as Box, - } - } - } - } - - if let Err(err) = decode_xben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } - Mode::Read => { - logln!("Running in read mode"); - let file: File = File::open( - &args - .input_file - .expect("Must provide input file for read mode."), - ) - .unwrap(); - let reader: BufReader = BufReader::new(file); - - if args.sample_number.is_none() { - eprintln!("Error: Sample number is required in read mode"); - return; - } - - let mut writer = if args.print { - Box::new(BufWriter::new(io::stdout())) as Box - } else { - match args.output_file { - Some(name) => { - let file: File = File::create(name).unwrap(); - Box::new(BufWriter::new(file)) as Box - } - None => Box::new(BufWriter::new(io::stdout())) as Box, - } - }; - - args.sample_number - .map(|n| match extract_assignment_ben(reader, n) { - Ok(vec) => writer.write_all(format!("{:?}\n", vec).as_bytes()).unwrap(), - Err(e) => eprintln!("Error: {:?}", e), - }); - } - Mode::XzCompress => { - logln!("Running in xz compress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-compress mode."); - let in_file = File::open(&in_file_name).unwrap(); - let reader = BufReader::new(in_file); - - let out_file_name = match args.output_file { - Some(name) => name, - None => in_file_name + ".xz", - }; - - if Path::new(&out_file_name).exists() { - eprint!( - "File {:?} already exists, do you want to overwrite it? (y/[n]): ", - out_file_name - ); - let mut user_input = String::new(); - std::io::stdin().read_line(&mut user_input).unwrap(); - if user_input.trim().to_lowercase() != "y" { - return; - } - eprintln!(); - } - - let out_file = File::create(out_file_name).unwrap(); - let writer = BufWriter::new(out_file); - - if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { - eprintln!("Error: {:?}", err); - } - logln!("Done!"); - } - Mode::XzDecompress => { - logln!("Running in xz decompress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-decompress mode."); - - if !in_file_name.ends_with(".xz") { - eprintln!("Error: Unsupported file type for xz decompress mode"); - return; - } - - let output_file_name = match args.output_file { - Some(name) => name, - None => in_file_name[..in_file_name.len() - 3].to_string(), - }; - - if Path::new(&output_file_name).exists() { - eprint!( - "File {:?} already exists, do you want to overwrite it? (y/[n]): ", - output_file_name - ); - eprintln!(); - let mut user_input = String::new(); - std::io::stdin().read_line(&mut user_input).unwrap(); - if user_input.trim().to_lowercase() != "y" { - return; - } - } - - let in_file = File::open(&in_file_name).unwrap(); - let reader = BufReader::new(in_file); - - let out_file = File::create(output_file_name).unwrap(); - let writer = BufWriter::new(out_file); - - if let Err(err) = xz_decompress(reader, writer) { - eprintln!("Error: {:?}", err); - } - } - } + ben::cli::ben::run(); } diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs index 94eb108..bee6e88 100755 --- a/ben/src/bin/pben.rs +++ b/ben/src/bin/pben.rs @@ -1,174 +1,3 @@ -use ben::io::reader::BenDecoder; -use ben::io::writer::{BenEncoder, XBenEncoder}; -use ben::{logln, BenVariant}; -use clap::{Parser, ValueEnum}; -use pcompress; -use pipe::pipe; -use std::{ - fs::File, - io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}, -}; -use xz2::write::XzEncoder; - -/// Defines the mode of operation. -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -enum Mode { - BenToPc, - PcToBen, - PcToXben, -} - -#[derive(Parser, Debug)] -#[command( - name = "Conversion tool for BEN and PCOMPRESS formats", - about = "This is a CLI tool that allows for the conversion between BEN and PCOMPRESS formats.", - version = "0.2.0" -)] -struct Args { - /// Mode to run the program in - #[arg(short, long, value_enum)] - mode: Mode, - - /// Input file to read from. - #[arg(short, long)] - input_file: Option, - - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. - #[arg(short, long)] - output_file: Option, - - /// If the output file already exists, this flag - /// will cause the program to overwrite it without - /// asking the user for confirmation. - #[arg(short = 'w', long)] - overwrite: bool, - - /// Enables verbose printing for the CLI. Optional. - #[arg(short, long)] - verbose: bool, -} - -fn main() -> Result<()> { - let args = Args::parse(); - - if args.verbose { - std::env::set_var("RUST_LOG", "trace"); - } - - match args.mode { - Mode::BenToPc => { - logln!("Converting BEN to PCOMPRESS"); - - let ben_reader: Box = match args.input_file { - Some(file) => Box::new(BufReader::new(File::open(&file).unwrap())), - None => Box::new(io::stdin()), - }; - - let mut pcompress_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - - let _ = std::thread::spawn(move || -> io::Result<()> { - assignment_decode_ben(ben_reader, pipe_writer) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - - pcompress::encode::encode(&mut buf_pipe_reader, &mut pcompress_writer, false); - - Ok(()) - } - Mode::PcToBen => { - logln!("Converting PCOMPRESS to BEN"); - - let mut pcompress_reader: BufReader> = match args.input_file { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), - None => BufReader::new(Box::new(io::stdin())), - }; - - let mut ben_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - - let mut buf_pipe_writer = BufWriter::new(pipe_writer); - - let _ = std::thread::spawn(move || { - pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - - assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer) - } - Mode::PcToXben => { - logln!("Converting PCOMPRESS to XBEN"); - - let mut pcompress_reader: BufReader> = match args.input_file { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), - None => BufReader::new(Box::new(io::stdin())), - }; - - let mut ben_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - - let mut buf_pipe_writer = BufWriter::new(pipe_writer); - - let _ = std::thread::spawn(move || { - pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - - assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer) - } - } -} - -fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { - let ben_reader = BenDecoder::new(&mut reader)?; - - for result in ben_reader { - match result { - Ok(assignment) => { - write!(writer, "{}\n", serde_json::to_string(&assignment).unwrap())?; - } - Err(e) => return Err(e), - } - } - - Ok(()) -} - -fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { - let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain); - - for line in reader.lines() { - let assignment: Vec = serde_json::from_str::>(&line.unwrap()) - .unwrap() - .into_iter() - .map(|x| x as u16 + 1) - .collect(); - ben_writer.write_assignment(assignment)?; - } - Ok(()) -} - -fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { - let encoder = XzEncoder::new(writer, 9); - let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain); - - xben_writer.write_ben_file(reader)?; - Ok(()) +fn main() -> std::io::Result<()> { + ben::cli::pben::run() } diff --git a/ben/src/bin/reben.rs b/ben/src/bin/reben.rs index 682f638..8a935d3 100755 --- a/ben/src/bin/reben.rs +++ b/ben/src/bin/reben.rs @@ -1,222 +1,3 @@ -use ben::{ - json::graph::sort_json_file_by_key, - logln, - ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, -}; -use clap::{Parser, ValueEnum}; -use serde_json::{json, Value}; -use std::{ - fs::File, - io::{BufReader, BufWriter, Write}, -}; - -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -enum Mode { - Json, - Ben, -} - -/// Defines the command line arguments accepted by the program. -#[derive(Parser, Debug)] -#[command( - name = "Relabeling Binary Ensemble CLI Tool", - about = concat!( - "This is a command line tool for relabeling binary ensembles ", - "to help improve compression ratios for BEN and XBEN files." - ), - version = "0.2.0" -)] - -// TODO: Change the name of shape_file to dual_graph_file. -struct Args { - /// Input file to read from. - #[arg()] - input_file: String, - - /// Output file to write to. - #[arg(short, long)] - output_file: Option, - - /// Key to sort the JSON or BEN file by. - #[arg(short, long)] - key: Option, - - /// Shape file to use for sorting the BEN file. Only needed - /// in BEN mode when a map is not provided. - #[arg(short, long)] - shape_file: Option, - - /// Map file to use for relabeling the BEN file. - #[arg(short = 'p', long)] - map_file: Option, - - /// Mode to run the program in (either JSON or BEN). - /// The JSON mode will sort a JSON file by a given key. - /// The BEN mode will relabel a BEN file according to a map file - /// or a key (the latter also requires a dual-graph file). If no - /// map file or key is provided, the BEN mode will canonicalize - /// the assignment vectors in the BEN file. - #[arg(short, long)] - mode: Mode, - - /// Verbosity level for the program. - #[arg(short, long)] - verbose: bool, -} - fn main() { - let args = Args::parse(); - - if args.verbose { - std::env::set_var("RUST_LOG", "trace"); - } - - match &args.mode { - Mode::Json => { - // TODO: Change the input file here to the shape file. - let input_file = File::open(&args.input_file).expect("Could not open input file."); - let reader = BufReader::new(input_file); - - let key = args.key.as_ref().expect("No key provided."); - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", key).as_str() - } - }; - - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - let map = sort_json_file_by_key(reader, writer, key); - - let map_file_name = args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", key).as_str() - + "_map.json"; - let map_file = File::create(map_file_name).expect("Could not create map file."); - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": key, - "relabeling_old_to_new_nodes_map": map.unwrap() - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .expect("Could not write map file."); - } - Mode::Ben => { - let input_file = File::open(&args.input_file).expect("Could not open input file."); - let reader = BufReader::new(input_file); - - if args.map_file.is_none() && args.key.is_none() { - logln!("Canonicalizing assignment vectors in ben file."); - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + "_canonicalized_assignments.jsonl.ben" - } - }; - - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - - let writer = BufWriter::new(output_file); - - relabel_ben_file(reader, writer).unwrap(); - return; - } - - if args.map_file.is_some() && args.key.is_some() { - panic!(concat!( - "Cannot provide both a map file and a key. ", - "Please provide either the map file or the key and the ", - "(JSON formatted) dual-graph file needed to generate a map file." - )); - } - - let mut map_file_name = String::new(); - if let Some(key) = args.key { - if let Some(shape) = args.shape_file { - logln!("Creating map file for key: {}", key); - - let output_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", key).as_str(); - - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - let shape_reader = - BufReader::new(File::open(&shape).expect("Could not open shape file.")); - let map = sort_json_file_by_key(shape_reader, writer, &key); - - map_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", key).as_str() - + "_map.json"; - let map_file = - File::create(&map_file_name).expect("Could not create map file."); - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": key, - "relabeling_old_to_new_nodes_map": map.unwrap() - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .expect("Could not write map file."); - } else { - panic!( - "{}", - format!("No shape file provided to go with key {:}", key) - ); - } - } - - if map_file_name.is_empty() { - map_file_name = args.map_file.as_ref().unwrap().to_owned(); - } - let map_file = File::open(&map_file_name).expect("Could not open map file."); - let map_reader = BufReader::new(map_file); - - let data: Value = serde_json::from_reader(map_reader).unwrap(); - - let new_to_old_node_map = data["relabeling_old_to_new_nodes_map"] - .as_object() - .unwrap() - .iter() - .map(|(k, v)| (v.as_u64().unwrap() as usize, k.parse::().unwrap())) - .collect::>(); - - let key = data["key"].as_str().unwrap(); - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + format!("_sorted_by_{}.jsonl.ben", key).as_str() - } - }; - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - logln!( - "Relabeling ben file according to map file {}", - map_file_name, - ); - - relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); - } - } + ben::cli::reben::run(); } diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs new file mode 100644 index 0000000..8cfd86d --- /dev/null +++ b/ben/src/cli/ben.rs @@ -0,0 +1,467 @@ +use crate::cli::common::{check_overwrite, set_verbose}; +use crate::codec::decode::{ + decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, +}; +use crate::codec::encode::{ + encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, +}; +use crate::ops::extract::extract_assignment_ben; +use crate::{logln, BenVariant}; +use clap::{Parser, ValueEnum}; +use std::{ + fs::File, + io::{self, BufReader, BufWriter, Result, Write}, +}; + +type DynReader = Box; +type DynWriter = Box; + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +enum Mode { + /// Encode JSONL into BEN. + Encode, + /// Encode JSONL or BEN into XBEN. + XEncode, + /// Decode BEN or XBEN into its less compressed representation. + Decode, + /// Fully decode XBEN into JSONL. + XDecode, + /// Read a single sample from a BEN file. + Read, + /// Compress an arbitrary stream with XZ. + XzCompress, + /// Decompress an `.xz` file. + XzDecompress, +} + +#[derive(Parser, Debug)] +#[command( + name = "Binary Ensemble CLI Tool", + about = "This is a command line tool for encoding and decoding binary ensemble files.", + version +)] +/// Defines the command line arguments accepted by the program. +struct Args { + /// Mode to run the program in (encode, decode, or read). + #[arg(short, long, value_enum)] + mode: Mode, + /// Input file to read from. + #[arg()] + input_file: Option, + /// Output file to write to. Optional. + /// If not provided, the output file will be determined + /// based on the input file and the mode of operation. + #[arg(short, long)] + output_file: Option, + /// The standard behaviour is to try and derive the output file + /// name from the input file name. If this flag is set, then this + /// logic is ignored and the output is printed to stdout. + /// This flag is considered a higher priority than + /// the output_file flag, so if both are present, the output + /// will be printed to stdout. + #[arg(short, long)] + print: bool, + /// Sample number to extract. Optional. + #[arg(short = 'n', long)] + sample_number: Option, + /// If input and output files are not provided, + /// then this tells the x-encode, x-decode, and decode modes + /// that the expected formats are BEN and XBEN + #[arg(short = 'b', long)] + ben_and_xben: bool, + /// If input and output files are not provided, + /// then this tells the x-encode and x-decode modes + /// that the expected formats are JSONL and XBEN + #[arg(short = 'J', long)] + jsonl_and_xben: bool, + /// If the input and output files are not provided, + /// then this tells the decode mode that the expected + /// formats are JSONL and BEN + #[arg(short = 'j', long)] + jsonl_and_ben: bool, + /// When saving a file in the BEN format, the deault is to have + /// an assignment vector saved followed by the number of repetitions + /// of that assignment vector (this is useful for Markov chian methods + /// like ReCom). This flag will cause the program to forgo the repetition + /// count and just save all of the assignment vectors as they are encountered. + #[arg(short = 'a', long)] + save_all: bool, + /// If the output file already exists, this flag + /// will cause the program to overwrite it without + /// asking the user for confirmation. + #[arg(short = 'w', long)] + overwrite: bool, + /// Enables verbose printing for the CLI. Optional. + #[arg(short, long)] + verbose: bool, + /// When running x-encoder, this flag will determine the number of cpus to use on the + /// system. By default, all available cpus will be used. + #[arg(short = 'c', long)] + n_cpus: Option, + /// When running x-encoder, this flag will deterimine the level of compression to use. + /// By default, the highest level of compression will be used. + /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. + #[arg(short = 'l', long)] + compression_level: Option, +} + +fn encode_setup( + mode: Mode, + input_file_name: String, + output_file_name: Option, + overwrite: bool, +) -> Result { + let extension = if mode == Mode::XEncode { + ".xben" + } else if mode == Mode::Encode { + ".ben" + } else { + ".xz" + }; + + let out_file_name = match output_file_name { + Some(name) => name.to_owned(), + None => { + if input_file_name.ends_with(".ben") && extension == ".xben" { + input_file_name.trim_end_matches(".ben").to_owned() + extension + } else { + input_file_name.to_string() + extension + } + } + }; + + check_overwrite(&out_file_name, overwrite)?; + Ok(out_file_name) +} + +fn decode_setup( + in_file_name: String, + out_file_name: Option, + full_decode: bool, + overwrite: bool, +) -> Result { + let out_file_name = if let Some(name) = out_file_name { + name.to_owned() + } else if in_file_name.ends_with(".ben") { + in_file_name.trim_end_matches(".ben").to_owned() + } else if in_file_name.ends_with(".xben") { + if !full_decode { + in_file_name.trim_end_matches(".xben").to_owned() + ".ben" + } else { + in_file_name.trim_end_matches(".xben").to_owned() + } + } else if in_file_name.ends_with(".xz") { + eprintln!( + "Error: Unsupported file type for decode mode {:?}. Please decompress xz files with \ + either the xz command line tool or the xz-decompress mode of this tool.", + in_file_name + ); + return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); + } else { + eprintln!( + "Error: Unsupported file type for decode mode {:?}. Supported types are .ben and .xben.", + in_file_name + ); + return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); + }; + + check_overwrite(&out_file_name, overwrite)?; + Ok(out_file_name) +} + +fn open_reader(input_file: Option<&str>) -> DynReader { + match input_file { + Some(path) => Box::new(BufReader::new(File::open(path).unwrap())), + None => Box::new(BufReader::new(io::stdin())), + } +} + +fn open_writer(output_file: Option<&str>, print: bool, overwrite: bool) -> Result { + if print { + return Ok(Box::new(BufWriter::new(io::stdout()))); + } + + match output_file { + Some(path) => { + check_overwrite(path, overwrite)?; + Ok(Box::new(BufWriter::new(File::create(path).unwrap()))) + } + None => Ok(Box::new(BufWriter::new(io::stdout()))), + } +} + +fn open_derived_writer(path: String) -> DynWriter { + Box::new(BufWriter::new(File::create(path).unwrap())) +} + +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + match args.mode { + Mode::Encode => { + logln!("Running in encode mode"); + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => match encode_setup( + args.mode, + in_file.clone(), + args.output_file.clone(), + args.overwrite, + ) { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + let possible_error = if args.save_all { + encode_jsonl_to_ben(reader, writer, BenVariant::Standard) + } else { + encode_jsonl_to_ben(reader, writer, BenVariant::MkvChain) + }; + + if let Err(err) = possible_error { + eprintln!("Error: {:?}", err); + } + } + Mode::XEncode => { + logln!("Running in xencode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_xben = args.ben_and_xben; + + if let Some(in_file) = args.input_file.as_ref() { + if in_file.ends_with(".ben") { + ben_and_xben = true; + } else if in_file.ends_with(".jsonl") { + jsonl_and_xben = true; + } + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => match encode_setup( + args.mode, + in_file.clone(), + args.output_file.clone(), + args.overwrite, + ) { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if ben_and_xben { + if let Err(err) = + encode_ben_to_xben(reader, writer, args.n_cpus, args.compression_level) + { + eprintln!("Error: {:?}", err); + } + } else if jsonl_and_xben { + let possible_error = if args.save_all { + encode_jsonl_to_xben( + reader, + writer, + BenVariant::Standard, + args.n_cpus, + args.compression_level, + ) + } else { + encode_jsonl_to_xben( + reader, + writer, + BenVariant::MkvChain, + args.n_cpus, + args.compression_level, + ) + }; + if let Err(e) = possible_error { + eprintln!("Error: {:?}", e); + } + } else { + eprintln!("Error: Unsupported file type(s) for xencode mode"); + } + } + Mode::Decode => { + logln!("Running in decode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_ben = args.jsonl_and_ben; + + if let Some(file) = args.input_file.as_ref() { + if file.ends_with(".ben") { + jsonl_and_ben = true; + } else if file.ends_with(".xben") { + ben_and_xben = true; + } + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + match decode_setup(file.clone(), args.output_file.clone(), false, args.overwrite) + { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + } + } + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if ben_and_xben { + if let Err(err) = decode_xben_to_ben(reader, writer) { + eprintln!("Error: {:?}", err); + } + } else if jsonl_and_ben { + if let Err(err) = decode_ben_to_jsonl(reader, writer) { + eprintln!("Error: {:?}", err); + } + } else { + eprintln!("Error: Unsupported file type(s) for decode mode"); + } + } + Mode::XDecode => { + logln!("Running in x-decode mode"); + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + match decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite) + { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + } + } + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if let Err(err) = decode_xben_to_jsonl(reader, writer) { + eprintln!("Error: {:?}", err); + } + } + Mode::Read => { + logln!("Running in read mode"); + let reader = BufReader::new( + File::open( + &args + .input_file + .expect("Must provide input file for read mode."), + ) + .unwrap(), + ); + + if args.sample_number.is_none() { + eprintln!("Error: Sample number is required in read mode"); + return; + } + + let mut writer = match open_writer(args.output_file.as_deref(), args.print, false) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + + args.sample_number + .map(|n| match extract_assignment_ben(reader, n) { + Ok(vec) => writer.write_all(format!("{:?}\n", vec).as_bytes()).unwrap(), + Err(e) => eprintln!("Error: {:?}", e), + }); + } + Mode::XzCompress => { + logln!("Running in xz compress mode"); + + let in_file_name = args + .input_file + .expect("Must provide input file for xz-compress mode."); + let reader = BufReader::new(File::open(&in_file_name).unwrap()); + + let out_file_name = match args.output_file { + Some(name) => name, + None => in_file_name + ".xz", + }; + + if let Err(err) = check_overwrite(&out_file_name, args.overwrite) { + eprintln!("Error: {:?}", err); + return; + } + + let writer = BufWriter::new(File::create(out_file_name).unwrap()); + + if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { + eprintln!("Error: {:?}", err); + } + logln!("Done!"); + } + Mode::XzDecompress => { + logln!("Running in xz decompress mode"); + + let in_file_name = args + .input_file + .expect("Must provide input file for xz-decompress mode."); + + if !in_file_name.ends_with(".xz") { + eprintln!("Error: Unsupported file type for xz decompress mode"); + return; + } + + let output_file_name = match args.output_file { + Some(name) => name, + None => in_file_name[..in_file_name.len() - 3].to_string(), + }; + + if let Err(err) = check_overwrite(&output_file_name, args.overwrite) { + eprintln!("Error: {:?}", err); + return; + } + + let reader = BufReader::new(File::open(&in_file_name).unwrap()); + let writer = BufWriter::new(File::create(output_file_name).unwrap()); + + if let Err(err) = xz_decompress(reader, writer) { + eprintln!("Error: {:?}", err); + } + } + } +} diff --git a/ben/src/cli/common.rs b/ben/src/cli/common.rs new file mode 100644 index 0000000..5913b96 --- /dev/null +++ b/ben/src/cli/common.rs @@ -0,0 +1,24 @@ +use std::io::{self, Result}; +use std::path::Path; + +pub fn set_verbose(verbose: bool) { + if verbose { + std::env::set_var("RUST_LOG", "trace"); + } +} + +pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { + if Path::new(file_name).exists() && !overwrite { + eprint!( + "File {:?} already exists, do you want to overwrite it? (y/[n]): ", + file_name + ); + let mut user_input = String::new(); + io::stdin().read_line(&mut user_input).unwrap(); + eprintln!(); + if user_input.trim().to_lowercase() != "y" { + return Err(io::Error::from(io::ErrorKind::AlreadyExists)); + } + } + Ok(()) +} diff --git a/ben/src/cli/mod.rs b/ben/src/cli/mod.rs new file mode 100644 index 0000000..712484f --- /dev/null +++ b/ben/src/cli/mod.rs @@ -0,0 +1,4 @@ +pub mod ben; +pub mod common; +pub mod pben; +pub mod reben; diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs new file mode 100644 index 0000000..1b7df99 --- /dev/null +++ b/ben/src/cli/pben.rs @@ -0,0 +1,164 @@ +use crate::cli::common::set_verbose; +use crate::io::reader::BenDecoder; +use crate::io::writer::{BenEncoder, XBenEncoder}; +use crate::{logln, BenVariant}; +use clap::{Parser, ValueEnum}; +use pipe::pipe; +use std::{ + fs::File, + io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}, +}; +use xz2::write::XzEncoder; + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +enum Mode { + /// Convert BEN into PCOMPRESS. + BenToPc, + /// Convert PCOMPRESS into BEN. + PcToBen, + /// Convert PCOMPRESS into XBEN. + PcToXben, +} + +#[derive(Parser, Debug)] +#[command( + name = "Conversion tool for BEN and PCOMPRESS formats", + about = "This is a CLI tool that allows for the conversion between BEN and PCOMPRESS formats.", + version +)] +/// Defines the command line arguments accepted by the program. +struct Args { + /// Mode to run the program in + #[arg(short, long, value_enum)] + mode: Mode, + /// Input file to read from. + #[arg(short, long)] + input_file: Option, + /// Output file to write to. Optional. + /// If not provided, the output file will be determined + /// based on the input file and the mode of operation. + #[arg(short, long)] + output_file: Option, + /// If the output file already exists, this flag + /// will cause the program to overwrite it without + /// asking the user for confirmation. + #[arg(short = 'w', long)] + overwrite: bool, + /// Enables verbose printing for the CLI. Optional. + #[arg(short, long)] + verbose: bool, +} + +pub fn run() -> Result<()> { + let args = Args::parse(); + set_verbose(args.verbose); + + match args.mode { + Mode::BenToPc => { + logln!("Converting BEN to PCOMPRESS"); + + let ben_reader: Box = match args.input_file { + Some(file) => Box::new(BufReader::new(File::open(&file).unwrap())), + None => Box::new(io::stdin()), + }; + + let mut pcompress_writer: BufWriter> = match args.output_file { + Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + + let _ = std::thread::spawn(move || -> io::Result<()> { + assignment_decode_ben(ben_reader, pipe_writer) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + pcompress::encode::encode(&mut buf_pipe_reader, &mut pcompress_writer, false); + Ok(()) + } + Mode::PcToBen => { + logln!("Converting PCOMPRESS to BEN"); + + let mut pcompress_reader: BufReader> = match args.input_file { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), + None => BufReader::new(Box::new(io::stdin())), + }; + + let mut ben_writer: BufWriter> = match args.output_file { + Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + let mut buf_pipe_writer = BufWriter::new(pipe_writer); + + let _ = std::thread::spawn(move || { + pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer) + } + Mode::PcToXben => { + logln!("Converting PCOMPRESS to XBEN"); + + let mut pcompress_reader: BufReader> = match args.input_file { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), + None => BufReader::new(Box::new(io::stdin())), + }; + + let mut ben_writer: BufWriter> = match args.output_file { + Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + let mut buf_pipe_writer = BufWriter::new(pipe_writer); + + let _ = std::thread::spawn(move || { + pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer) + } + } +} + +fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { + let ben_reader = BenDecoder::new(&mut reader)?; + + for result in ben_reader { + match result { + Ok(assignment) => { + write!(writer, "{}\n", serde_json::to_string(&assignment).unwrap())?; + } + Err(e) => return Err(e), + } + } + + Ok(()) +} + +fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { + let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain); + + for line in reader.lines() { + let assignment: Vec = serde_json::from_str::>(&line.unwrap()) + .unwrap() + .into_iter() + .map(|x| x as u16 + 1) + .collect(); + ben_writer.write_assignment(assignment)?; + } + Ok(()) +} + +fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { + let encoder = XzEncoder::new(writer, 9); + let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain); + xben_writer.write_ben_file(reader)?; + Ok(()) +} diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs new file mode 100644 index 0000000..6d0b9f6 --- /dev/null +++ b/ben/src/cli/reben.rs @@ -0,0 +1,208 @@ +use crate::cli::common::set_verbose; +use crate::{ + json::graph::sort_json_file_by_key, + logln, + ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, +}; +use clap::{Parser, ValueEnum}; +use serde_json::{json, Value}; +use std::{ + fs::File, + io::{BufReader, BufWriter, Write}, +}; + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +enum Mode { + /// Sort a JSON dual graph by a key and emit a relabeling map. + Json, + /// Relabel or canonicalize a BEN file. + Ben, +} + +#[derive(Parser, Debug)] +#[command( + name = "Relabeling Binary Ensemble CLI Tool", + about = concat!( + "This is a command line tool for relabeling binary ensembles ", + "to help improve compression ratios for BEN and XBEN files." + ), + version +)] +/// Defines the command line arguments accepted by the program. +// TODO: Change the name of shape_file to dual_graph_file. +struct Args { + /// Input file to read from. + #[arg()] + input_file: String, + /// Output file to write to. + #[arg(short, long)] + output_file: Option, + /// Key to sort the JSON or BEN file by. + #[arg(short, long)] + key: Option, + /// Shape file to use for sorting the BEN file. Only needed + /// in BEN mode when a map is not provided. + #[arg(short, long)] + shape_file: Option, + /// Map file to use for relabeling the BEN file. + #[arg(short = 'p', long)] + map_file: Option, + /// Mode to run the program in (either JSON or BEN). + /// The JSON mode will sort a JSON file by a given key. + /// The BEN mode will relabel a BEN file according to a map file + /// or a key (the latter also requires a dual-graph file). If no + /// map file or key is provided, the BEN mode will canonicalize + /// the assignment vectors in the BEN file. + #[arg(short, long)] + mode: Mode, + /// Verbosity level for the program. + #[arg(short, long)] + verbose: bool, +} + +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + match &args.mode { + Mode::Json => { + let input_file = File::open(&args.input_file).expect("Could not open input file."); + let reader = BufReader::new(input_file); + + let key = args.key.as_ref().expect("No key provided."); + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", key).as_str() + } + }; + + let output_file = + File::create(&output_file_name).expect("Could not create output file."); + let writer = BufWriter::new(output_file); + + let map = sort_json_file_by_key(reader, writer, key); + + let map_file_name = args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", key).as_str() + + "_map.json"; + let map_file = File::create(map_file_name).expect("Could not create map file."); + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": key, + "relabeling_old_to_new_nodes_map": map.unwrap() + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .expect("Could not write map file."); + } + Mode::Ben => { + let input_file = File::open(&args.input_file).expect("Could not open input file."); + let reader = BufReader::new(input_file); + + if args.map_file.is_none() && args.key.is_none() { + logln!("Canonicalizing assignment vectors in ben file."); + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + "_canonicalized_assignments.jsonl.ben" + } + }; + + let output_file = + File::create(&output_file_name).expect("Could not create output file."); + let writer = BufWriter::new(output_file); + + relabel_ben_file(reader, writer).unwrap(); + return; + } + + if args.map_file.is_some() && args.key.is_some() { + panic!(concat!( + "Cannot provide both a map file and a key. ", + "Please provide either the map file or the key and the ", + "(JSON formatted) dual-graph file needed to generate a map file." + )); + } + + let mut map_file_name = String::new(); + if let Some(key) = args.key { + if let Some(shape) = args.shape_file { + logln!("Creating map file for key: {}", key); + + let output_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", key).as_str(); + + let output_file = + File::create(&output_file_name).expect("Could not create output file."); + let writer = BufWriter::new(output_file); + + let shape_reader = + BufReader::new(File::open(&shape).expect("Could not open shape file.")); + let map = sort_json_file_by_key(shape_reader, writer, &key); + + map_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", key).as_str() + + "_map.json"; + let map_file = + File::create(&map_file_name).expect("Could not create map file."); + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": key, + "relabeling_old_to_new_nodes_map": map.unwrap() + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .expect("Could not write map file."); + } else { + panic!("{}", format!("No shape file provided to go with key {:}", key)); + } + } + + if map_file_name.is_empty() { + map_file_name = args.map_file.as_ref().unwrap().to_owned(); + } + let map_file = File::open(&map_file_name).expect("Could not open map file."); + let map_reader = BufReader::new(map_file); + + let data: Value = serde_json::from_reader(map_reader).unwrap(); + + let new_to_old_node_map = data["relabeling_old_to_new_nodes_map"] + .as_object() + .unwrap() + .iter() + .map(|(k, v)| (v.as_u64().unwrap() as usize, k.parse::().unwrap())) + .collect::>(); + + let key = data["key"].as_str().unwrap(); + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + format!("_sorted_by_{}.jsonl.ben", key).as_str() + } + }; + let output_file = + File::create(&output_file_name).expect("Could not create output file."); + let writer = BufWriter::new(output_file); + + logln!("Relabeling ben file according to map file {}", map_file_name,); + + relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); + } + } +} diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 799c459..5c44efe 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -18,6 +18,7 @@ //! - `reben`: A tool for relabeling BEN files to improve compression ratios. //! +pub mod cli; pub mod codec; pub mod io; pub mod json; From af2586b3a648fd1ce34b8cb7860a47a116070118 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:18:49 -0600 Subject: [PATCH 004/221] add cli tests --- ben/src/cli/ben.rs | 207 +++++++- ben/src/cli/common.rs | 37 ++ ben/src/cli/pben.rs | 105 ++++- ben/src/cli/reben.rs | 39 ++ ben/tests/test_cli.rs | 1050 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1434 insertions(+), 4 deletions(-) create mode 100644 ben/tests/test_cli.rs diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 8cfd86d..6e96525 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -240,7 +240,7 @@ pub fn run() { logln!("Running in xencode mode"); let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_xben = args.ben_and_xben; + let mut jsonl_and_xben = args.jsonl_and_xben; if let Some(in_file) = args.input_file.as_ref() { if in_file.ends_with(".ben") { @@ -465,3 +465,208 @@ pub fn run() { } } } + +#[cfg(test)] +mod tests { + use super::*; + use clap::{CommandFactory, Parser}; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("ben-cli-ben-{name}-{nonce}")) + } + + #[test] + fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("Binary Ensemble CLI Tool")); + assert!(help.contains("--mode")); + assert!(help.contains("x-encode")); + } + + #[test] + fn parse_encode_args() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--output-file", + "out.ben", + "--save-all", + "--verbose", + "input.jsonl", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Encode); + assert_eq!(args.input_file.as_deref(), Some("input.jsonl")); + assert_eq!(args.output_file.as_deref(), Some("out.ben")); + assert!(args.save_all); + assert!(args.verbose); + } + + #[test] + fn parse_xencode_stream_flags() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "x-encode", + "--jsonl-and-xben", + "--ben-and-xben", + "--jsonl-and-ben", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::XEncode); + assert!(args.jsonl_and_xben); + assert!(args.ben_and_xben); + assert!(args.jsonl_and_ben); + } + + #[test] + fn encode_setup_derives_extensions() { + assert_eq!( + encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true).unwrap(), + "samples.jsonl.ben" + ); + assert_eq!( + encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true).unwrap(), + "samples.xben" + ); + assert_eq!( + encode_setup(Mode::XzCompress, "samples.jsonl".to_string(), None, true).unwrap(), + "samples.jsonl.xz" + ); + } + + #[test] + fn encode_setup_respects_explicit_output() { + assert_eq!( + encode_setup( + Mode::Encode, + "ignored.jsonl".to_string(), + Some("custom-output.ben".to_string()), + true, + ) + .unwrap(), + "custom-output.ben" + ); + } + + #[test] + fn encode_setup_checks_overwrite() { + let path = unique_path("existing.ben"); + fs::write(&path, "already here").unwrap(); + + let err = encode_setup( + Mode::Encode, + "input.jsonl".to_string(), + Some(path.to_string_lossy().into_owned()), + true, + ); + assert!(err.is_ok()); + + fs::remove_file(path).unwrap(); + } + + #[test] + fn decode_setup_derives_ben_and_xben_outputs() { + assert_eq!( + decode_setup("samples.ben".to_string(), None, false, true).unwrap(), + "samples" + ); + assert_eq!( + decode_setup("samples.xben".to_string(), None, false, true).unwrap(), + "samples.ben" + ); + assert_eq!( + decode_setup("samples.xben".to_string(), None, true, true).unwrap(), + "samples" + ); + } + + #[test] + fn decode_setup_rejects_xz_input() { + let err = decode_setup("samples.xz".to_string(), None, false, true).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn decode_setup_rejects_unknown_input() { + let err = decode_setup("samples.data".to_string(), None, false, true).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn decode_setup_respects_explicit_output() { + assert_eq!( + decode_setup( + "samples.xben".to_string(), + Some("custom.jsonl".to_string()), + true, + true, + ) + .unwrap(), + "custom.jsonl" + ); + } + + #[test] + fn open_reader_reads_file_contents() { + let path = unique_path("reader.txt"); + fs::write(&path, "hello\nworld\n").unwrap(); + + let mut reader = open_reader(Some(path.to_str().unwrap())); + let mut content = String::new(); + std::io::Read::read_to_string(&mut reader, &mut content).unwrap(); + + assert_eq!(content, "hello\nworld\n"); + fs::remove_file(path).unwrap(); + } + + #[test] + fn open_reader_accepts_stdin() { + let _reader = open_reader(None); + } + + #[test] + fn open_writer_creates_file_and_writes() { + let path = unique_path("writer.txt"); + { + let mut writer = open_writer(Some(path.to_str().unwrap()), false, true).unwrap(); + writer.write_all(b"written").unwrap(); + } + + assert_eq!(fs::read_to_string(&path).unwrap(), "written"); + fs::remove_file(path).unwrap(); + } + + #[test] + fn open_writer_supports_stdout_and_print() { + let mut stdout_writer = open_writer(None, false, true).unwrap(); + stdout_writer.write_all(b"").unwrap(); + + let mut print_writer = open_writer(Some("ignored.txt"), true, false).unwrap(); + print_writer.write_all(b"").unwrap(); + } + + #[test] + fn open_derived_writer_creates_file() { + let path = unique_path("derived.txt"); + { + let mut writer = open_derived_writer(path.to_string_lossy().into_owned()); + writer.write_all(b"derived").unwrap(); + } + + assert_eq!(fs::read_to_string(&path).unwrap(), "derived"); + fs::remove_file(path).unwrap(); + } +} diff --git a/ben/src/cli/common.rs b/ben/src/cli/common.rs index 5913b96..9540c38 100644 --- a/ben/src/cli/common.rs +++ b/ben/src/cli/common.rs @@ -22,3 +22,40 @@ pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("ben-cli-common-{name}-{nonce}")) + } + + #[test] + fn set_verbose_sets_rust_log() { + std::env::remove_var("RUST_LOG"); + set_verbose(true); + assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("trace")); + } + + #[test] + fn check_overwrite_allows_missing_file() { + let path = unique_path("missing.txt"); + assert!(!path.exists()); + check_overwrite(path.to_str().unwrap(), false).unwrap(); + } + + #[test] + fn check_overwrite_allows_existing_file_when_forced() { + let path = unique_path("existing.txt"); + fs::write(&path, "hello").unwrap(); + check_overwrite(path.to_str().unwrap(), true).unwrap(); + fs::remove_file(path).unwrap(); + } +} diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 1b7df99..315f64e 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -3,6 +3,7 @@ use crate::io::reader::BenDecoder; use crate::io::writer::{BenEncoder, XBenEncoder}; use crate::{logln, BenVariant}; use clap::{Parser, ValueEnum}; +use serde_json::json; use pipe::pipe; use std::{ fs::File, @@ -132,8 +133,15 @@ fn assignment_decode_ben(mut reader: R, mut writer: W) -> io: for result in ben_reader { match result { - Ok(assignment) => { - write!(writer, "{}\n", serde_json::to_string(&assignment).unwrap())?; + Ok((assignment, count)) => { + let assignment: Vec = assignment + .into_iter() + .map(|x| x.saturating_sub(1) as usize) + .collect(); + let line = serde_json::to_string(&assignment).unwrap(); + for _ in 0..count { + writeln!(writer, "{line}")?; + } } Err(e) => return Err(e), } @@ -159,6 +167,97 @@ fn assignment_encode_ben(reader: R, writer: W) -> i fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { let encoder = XzEncoder::new(writer, 9); let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain); - xben_writer.write_ben_file(reader)?; + + for line in reader.lines() { + let assignment: Vec = serde_json::from_str::>(&line.unwrap()) + .unwrap() + .into_iter() + .map(|x| x as u16 + 1) + .collect(); + xben_writer.write_json_value(json!({ "assignment": assignment }))?; + } + Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; + use crate::codec::encode::encode_jsonl_to_ben; + use clap::{CommandFactory, Parser}; + use std::io::{BufReader, Cursor}; + + #[test] + fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("PCOMPRESS")); + assert!(help.contains("--mode")); + } + + #[test] + fn parse_pc_to_xben_args() { + let args = Args::try_parse_from([ + "pben", + "--mode", + "pc-to-xben", + "--input-file", + "input.pc", + "--output-file", + "output.xben", + "--verbose", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::PcToXben); + assert_eq!(args.input_file.as_deref(), Some("input.pc")); + assert_eq!(args.output_file.as_deref(), Some("output.xben")); + assert!(args.verbose); + } + + #[test] + fn assignment_decode_ben_writes_json_lines() { + let jsonl = br#"{"assignment":[1,1,2],"sample":1} +{"assignment":[2,3,3],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(&jsonl[..]), &mut ben, BenVariant::Standard).unwrap(); + + let mut out = Vec::new(); + assignment_decode_ben(Cursor::new(ben), &mut out).unwrap(); + + assert_eq!(String::from_utf8(out).unwrap(), "[0,0,1]\n[1,2,2]\n"); + } + + #[test] + fn assignment_encode_ben_offsets_values_and_writes_ben() { + let input = b"[0,0,1]\n[1,1,2]\n"; + let mut ben = Vec::new(); + assignment_encode_ben(BufReader::new(&input[..]), &mut ben).unwrap(); + + let mut out = Vec::new(); + decode_ben_to_jsonl(Cursor::new(ben), &mut out).unwrap(); + + let rendered = String::from_utf8(out).unwrap(); + assert!(rendered.contains(r#""assignment":[1,1,2]"#)); + assert!(rendered.contains(r#""assignment":[2,2,3]"#)); + } + + #[test] + fn assignment_encode_xben_offsets_values_and_writes_xben() { + let input = b"[0,1,1]\n[2,2,0]\n"; + + let mut xben = Vec::new(); + assignment_encode_xben(BufReader::new(&input[..]), &mut xben).unwrap(); + + let mut out = Vec::new(); + decode_xben_to_jsonl(Cursor::new(xben), &mut out).unwrap(); + + let rendered = String::from_utf8(out).unwrap(); + assert!(rendered.contains(r#""assignment":[1,2,2]"#)); + assert!(rendered.contains(r#""assignment":[3,3,1]"#)); + } +} diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 6d0b9f6..f263e99 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -206,3 +206,42 @@ pub fn run() { } } } + +#[cfg(test)] +mod tests { + use super::*; + use clap::{CommandFactory, Parser}; + + #[test] + fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("Relabeling Binary Ensemble CLI Tool")); + assert!(help.contains("--shape-file")); + assert!(help.contains("canonicalize")); + } + + #[test] + fn parse_json_mode_args() { + let args = Args::try_parse_from([ + "reben", + "dual_graph.json", + "--mode", + "json", + "--key", + "GEOID20", + "--output-file", + "sorted.json", + "--verbose", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Json); + assert_eq!(args.input_file, "dual_graph.json"); + assert_eq!(args.key.as_deref(), Some("GEOID20")); + assert_eq!(args.output_file.as_deref(), Some("sorted.json")); + assert!(args.verbose); + } +} diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs new file mode 100644 index 0000000..08c7d06 --- /dev/null +++ b/ben/tests/test_cli.rs @@ -0,0 +1,1050 @@ +use ben::codec::decode::decode_ben_to_jsonl; +use ben::codec::encode::encode_jsonl_to_ben; +use ben::BenVariant; +use serde_json::Value; +use std::fs; +use std::io::BufReader; +use std::path::{Path, PathBuf}; +use std::process::{Command, Output, Stdio}; +use std::time::{SystemTime, UNIX_EPOCH}; + +struct TempDir { + path: PathBuf, +} + +impl TempDir { + fn new(name: &str) -> Self { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let path = std::env::temp_dir().join(format!("binary-ensemble-cli-{name}-{nonce}")); + fs::create_dir_all(&path).unwrap(); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } +} + +impl Drop for TempDir { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } +} + +fn bin_path(name: &str) -> &'static str { + match name { + "ben" => env!("CARGO_BIN_EXE_ben"), + "pben" => env!("CARGO_BIN_EXE_pben"), + "reben" => env!("CARGO_BIN_EXE_reben"), + _ => panic!("unknown binary {name}"), + } +} + +fn run(bin: &str, args: &[&str], cwd: &Path) -> Output { + Command::new(bin_path(bin)) + .current_dir(cwd) + .args(args) + .output() + .unwrap() +} + +fn run_with_stdin(bin: &str, args: &[&str], cwd: &Path, stdin: &[u8]) -> Output { + let mut child = Command::new(bin_path(bin)) + .current_dir(cwd) + .args(args) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .unwrap(); + { + let mut input = child.stdin.take().unwrap(); + use std::io::Write; + input.write_all(stdin).unwrap(); + } + child.wait_with_output().unwrap() +} + +fn run_stdin_stdout(bin: &str, args: &[&str], cwd: &Path, stdin: &[u8]) -> Output { + run_with_stdin(bin, args, cwd, stdin) +} + +fn assert_success(output: &Output) { + assert!( + output.status.success(), + "status: {:?}\nstdout:\n{}\nstderr:\n{}", + output.status, + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); +} + +fn assert_failure(output: &Output) { + assert!( + !output.status.success(), + "expected failure\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); +} + +fn sample_jsonl() -> &'static str { + r#"{"assignment":[1,1,2],"sample":1} +{"assignment":[2,2,3],"sample":2} +"# +} + +fn sample_graph() -> &'static str { + r#"{ + "nodes": [ + {"id": 2, "GEOID20": "B"}, + {"id": 0, "GEOID20": "A"}, + {"id": 1, "GEOID20": "C"} + ], + "adjacency": [ + [{"id": 0}, {"id": 1}], + [{"id": 2}], + [{"id": 2}] + ] +}"# +} + +#[test] +fn all_clis_report_help_and_package_version() { + for bin in ["ben", "pben", "reben"] { + let help = run(bin, &["--help"], Path::new(".")); + assert_success(&help); + let help_text = String::from_utf8_lossy(&help.stdout); + assert!(help_text.contains("Usage:")); + + let version = run(bin, &["--version"], Path::new(".")); + assert_success(&version); + let version_text = String::from_utf8_lossy(&version.stdout); + assert!(version_text.contains(env!("CARGO_PKG_VERSION"))); + } +} + +#[test] +fn ben_cli_encode_decode_read_and_x_modes_roundtrip() { + let temp = TempDir::new("ben-workflow"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + let decoded_path = temp.path().join("decoded.jsonl"); + let xben_path = temp.path().join("samples.xben"); + let xdecoded_path = temp.path().join("xdecoded.jsonl"); + + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + + let encode = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + ben_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + ); + assert_success(&encode); + + let decode = run( + "ben", + &[ + "--mode", + "decode", + ben_path.to_str().unwrap(), + "--output-file", + decoded_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&decode); + assert_eq!(fs::read_to_string(&decoded_path).unwrap(), sample_jsonl()); + + let read = run( + "ben", + &[ + "--mode", + "read", + ben_path.to_str().unwrap(), + "--sample-number", + "2", + "--print", + ], + temp.path(), + ); + assert_success(&read); + assert_eq!(String::from_utf8(read.stdout).unwrap(), "[2, 2, 3]\n"); + + let xencode = run( + "ben", + &[ + "--mode", + "x-encode", + jsonl_path.to_str().unwrap(), + "--output-file", + xben_path.to_str().unwrap(), + "--jsonl-and-xben", + "--save-all", + "--n-cpus", + "1", + "--compression-level", + "1", + "--overwrite", + ], + temp.path(), + ); + assert_success(&xencode); + + let xdecode = run( + "ben", + &[ + "--mode", + "x-decode", + xben_path.to_str().unwrap(), + "--output-file", + xdecoded_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&xdecode); + assert_eq!(fs::read_to_string(&xdecoded_path).unwrap(), sample_jsonl()); +} + +#[test] +fn ben_cli_supports_stdin_stdout_workflows() { + let temp = TempDir::new("ben-streams"); + + let encode = run_stdin_stdout( + "ben", + &["--mode", "encode", "--save-all"], + temp.path(), + sample_jsonl().as_bytes(), + ); + assert_success(&encode); + + let decode = run_stdin_stdout( + "ben", + &["--mode", "decode", "--jsonl-and-ben"], + temp.path(), + &encode.stdout, + ); + assert_success(&decode); + assert_eq!(String::from_utf8(decode.stdout).unwrap(), sample_jsonl()); + + let xencode_jsonl = run_stdin_stdout( + "ben", + &[ + "--mode", + "x-encode", + "--jsonl-and-xben", + "--save-all", + "--n-cpus", + "1", + "--compression-level", + "1", + ], + temp.path(), + sample_jsonl().as_bytes(), + ); + assert_success(&xencode_jsonl); + + let xdecode_jsonl = run_stdin_stdout( + "ben", + &["--mode", "x-decode"], + temp.path(), + &xencode_jsonl.stdout, + ); + assert_success(&xdecode_jsonl); + assert_eq!(String::from_utf8(xdecode_jsonl.stdout).unwrap(), sample_jsonl()); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(sample_jsonl().as_bytes()), + &mut ben_bytes, + BenVariant::MkvChain, + ) + .unwrap(); + + let xencode_ben = run_stdin_stdout( + "ben", + &[ + "--mode", + "x-encode", + "--ben-and-xben", + "--n-cpus", + "1", + "--compression-level", + "1", + ], + temp.path(), + &ben_bytes, + ); + assert_success(&xencode_ben); + + let decode_ben = run_stdin_stdout( + "ben", + &["--mode", "decode", "--ben-and-xben"], + temp.path(), + &xencode_ben.stdout, + ); + assert_success(&decode_ben); + + let mut roundtrip_jsonl = Vec::new(); + decode_ben_to_jsonl(BufReader::new(&decode_ben.stdout[..]), &mut roundtrip_jsonl).unwrap(); + let mut original_jsonl = Vec::new(); + decode_ben_to_jsonl(BufReader::new(&ben_bytes[..]), &mut original_jsonl).unwrap(); + assert_eq!(roundtrip_jsonl, original_jsonl); +} + +#[test] +fn ben_cli_xz_roundtrip_and_overwrite_prompt() { + let temp = TempDir::new("ben-xz"); + let input_path = temp.path().join("samples.jsonl"); + let xz_path = temp.path().join("samples.jsonl.xz"); + let restored_path = temp.path().join("samples.jsonl.restored"); + + fs::write(&input_path, sample_jsonl()).unwrap(); + + let compress = run( + "ben", + &[ + "--mode", + "xz-compress", + input_path.to_str().unwrap(), + "--output-file", + xz_path.to_str().unwrap(), + "--n-cpus", + "1", + "--compression-level", + "1", + "--overwrite", + ], + temp.path(), + ); + assert_success(&compress); + + fs::write(&restored_path, "stale output").unwrap(); + let decompress = run_with_stdin( + "ben", + &[ + "--mode", + "xz-decompress", + xz_path.to_str().unwrap(), + "--output-file", + restored_path.to_str().unwrap(), + ], + temp.path(), + b"y\n", + ); + assert_success(&decompress); + assert_eq!(fs::read_to_string(&restored_path).unwrap(), sample_jsonl()); +} + +#[test] +fn ben_cli_supports_ben_to_xben_and_xben_to_ben_paths() { + let temp = TempDir::new("ben-xben-paths"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + let xben_path = temp.path().join("samples.xben"); + let roundtrip_ben_path = temp.path().join("roundtrip.ben"); + + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(fs::File::open(&jsonl_path).unwrap()), + &mut ben_bytes, + BenVariant::MkvChain, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let xencode = run( + "ben", + &[ + "--mode", + "x-encode", + ben_path.to_str().unwrap(), + "--output-file", + xben_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&xencode); + + let decode = run( + "ben", + &[ + "--mode", + "decode", + xben_path.to_str().unwrap(), + "--output-file", + roundtrip_ben_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&decode); + + let mut original_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&ben_path).unwrap()), + &mut original_jsonl, + ) + .unwrap(); + let mut roundtrip_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&roundtrip_ben_path).unwrap()), + &mut roundtrip_jsonl, + ) + .unwrap(); + assert_eq!(original_jsonl, roundtrip_jsonl); +} + +#[test] +fn ben_cli_uses_default_output_names() { + let temp = TempDir::new("ben-defaults"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.jsonl.ben"); + let xz_path = temp.path().join("samples.jsonl.xz"); + + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + + let encode = run( + "ben", + &["--mode", "encode", jsonl_path.to_str().unwrap(), "--save-all"], + temp.path(), + ); + assert_success(&encode); + assert!(ben_path.exists()); + + fs::remove_file(&jsonl_path).unwrap(); + let decode = run("ben", &["--mode", "decode", ben_path.to_str().unwrap()], temp.path()); + assert_success(&decode); + assert_eq!(fs::read_to_string(&jsonl_path).unwrap(), sample_jsonl()); + + let compress = run( + "ben", + &["--mode", "xz-compress", jsonl_path.to_str().unwrap()], + temp.path(), + ); + assert_success(&compress); + assert!(xz_path.exists()); + + fs::remove_file(&jsonl_path).unwrap(); + let decompress = run("ben", &["--mode", "xz-decompress", xz_path.to_str().unwrap()], temp.path()); + assert_success(&decompress); + assert_eq!(fs::read_to_string(&jsonl_path).unwrap(), sample_jsonl()); +} + +#[test] +fn ben_cli_reports_expected_error_paths() { + let temp = TempDir::new("ben-errors"); + let bogus_jsonl = temp.path().join("bogus.jsonl"); + let bogus_txt = temp.path().join("bogus.txt"); + let bogus_xz = temp.path().join("bogus.data"); + fs::write(&bogus_jsonl, sample_jsonl()).unwrap(); + fs::write(&bogus_txt, sample_jsonl()).unwrap(); + fs::write(&bogus_xz, "not xz").unwrap(); + + let xencode = run( + "ben", + &["--mode", "x-encode", bogus_txt.to_str().unwrap()], + temp.path(), + ); + assert_success(&xencode); + assert!( + String::from_utf8_lossy(&xencode.stderr).contains("Unsupported file type(s) for xencode mode") + ); + + let decode = run( + "ben", + &["--mode", "decode", bogus_jsonl.to_str().unwrap()], + temp.path(), + ); + assert_success(&decode); + assert!( + String::from_utf8_lossy(&decode.stderr).contains("Unsupported file type for decode mode") + ); + + let read = run("ben", &["--mode", "read", bogus_jsonl.to_str().unwrap()], temp.path()); + assert_success(&read); + assert!( + String::from_utf8_lossy(&read.stderr).contains("Sample number is required in read mode") + ); + + let xz = run( + "ben", + &["--mode", "xz-decompress", bogus_xz.to_str().unwrap()], + temp.path(), + ); + assert_success(&xz); + assert!( + String::from_utf8_lossy(&xz.stderr).contains("Unsupported file type for xz decompress mode") + ); + + let bad_xben = run_stdin_stdout("ben", &["--mode", "x-decode"], temp.path(), b"not-an-xben"); + assert_success(&bad_xben); + assert!(String::from_utf8_lossy(&bad_xben.stderr).contains("Error:")); + + let bad_decode_ben = run_stdin_stdout( + "ben", + &["--mode", "decode", "--jsonl-and-ben"], + temp.path(), + b"not-a-ben", + ); + assert_success(&bad_decode_ben); + assert!(String::from_utf8_lossy(&bad_decode_ben.stderr).contains("Error:")); + + let bad_decode_xben = run_stdin_stdout( + "ben", + &["--mode", "decode", "--ben-and-xben"], + temp.path(), + b"not-an-xben", + ); + assert_success(&bad_decode_xben); + assert!(String::from_utf8_lossy(&bad_decode_xben.stderr).contains("Error:")); +} + +#[test] +fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { + let temp = TempDir::new("ben-overwrite"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + let xben_path = temp.path().join("samples.xben"); + let xz_path = temp.path().join("samples.xz"); + let occupied = temp.path().join("occupied.out"); + let invalid_ben = temp.path().join("invalid.ben"); + let invalid_xben = temp.path().join("invalid.xben"); + let invalid_xz = temp.path().join("invalid.xz"); + + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + fs::write(&occupied, "occupied").unwrap(); + fs::write(&invalid_ben, "not ben").unwrap(); + fs::write(&invalid_xben, "not xben").unwrap(); + fs::write(&invalid_xz, "not xz").unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(sample_jsonl().as_bytes()), + &mut ben_bytes, + BenVariant::MkvChain, + ) + .unwrap(); + fs::write(&ben_path, &ben_bytes).unwrap(); + + let xencode_from_ben = run( + "ben", + &[ + "--mode", + "x-encode", + ben_path.to_str().unwrap(), + "--output-file", + xben_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&xencode_from_ben); + + let xz_compress = run( + "ben", + &[ + "--mode", + "xz-compress", + jsonl_path.to_str().unwrap(), + "--output-file", + xz_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&xz_compress); + + for output in [ + run_with_stdin( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "encode", + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + sample_jsonl().as_bytes(), + ), + run_with_stdin( + "ben", + &[ + "--mode", + "x-encode", + ben_path.to_str().unwrap(), + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "x-encode", + "--jsonl-and-xben", + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + sample_jsonl().as_bytes(), + ), + run_with_stdin( + "ben", + &[ + "--mode", + "decode", + "--jsonl-and-ben", + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "x-decode", + xben_path.to_str().unwrap(), + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "x-decode", + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "read", + ben_path.to_str().unwrap(), + "--sample-number", + "1", + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "xz-compress", + jsonl_path.to_str().unwrap(), + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + run_with_stdin( + "ben", + &[ + "--mode", + "xz-decompress", + xz_path.to_str().unwrap(), + "--output-file", + occupied.to_str().unwrap(), + ], + temp.path(), + b"n\n", + ), + ] { + assert_success(&output); + assert!(String::from_utf8_lossy(&output.stderr).contains("AlreadyExists")); + } + + let invalid_ben_to_xben = run( + "ben", + &[ + "--mode", + "x-encode", + invalid_ben.to_str().unwrap(), + "--output-file", + temp.path().join("bad.xben").to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&invalid_ben_to_xben); + assert!(String::from_utf8_lossy(&invalid_ben_to_xben.stderr).contains("Error:")); + + let unsupported_decode = run_stdin_stdout("ben", &["--mode", "decode"], temp.path(), b""); + assert_success(&unsupported_decode); + assert!( + String::from_utf8_lossy(&unsupported_decode.stderr).contains("Unsupported file type(s) for decode mode") + ); + + let read_too_large = run( + "ben", + &[ + "--mode", + "read", + ben_path.to_str().unwrap(), + "--sample-number", + "99", + "--print", + ], + temp.path(), + ); + assert_success(&read_too_large); + assert!(String::from_utf8_lossy(&read_too_large.stderr).contains("Error:")); + + let invalid_decode_ben = run( + "ben", + &[ + "--mode", + "decode", + invalid_ben.to_str().unwrap(), + "--output-file", + temp.path().join("decoded.jsonl").to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&invalid_decode_ben); + assert!(String::from_utf8_lossy(&invalid_decode_ben.stderr).contains("Error:")); + + let invalid_decode_xben = run( + "ben", + &[ + "--mode", + "decode", + invalid_xben.to_str().unwrap(), + "--output-file", + temp.path().join("decoded.ben").to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&invalid_decode_xben); + assert!(String::from_utf8_lossy(&invalid_decode_xben.stderr).contains("Error:")); + + let invalid_xdecode = run( + "ben", + &[ + "--mode", + "x-decode", + invalid_xben.to_str().unwrap(), + "--output-file", + temp.path().join("decoded2.jsonl").to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&invalid_xdecode); + assert!(String::from_utf8_lossy(&invalid_xdecode.stderr).contains("Error:")); + + let invalid_xz_decompress = run( + "ben", + &[ + "--mode", + "xz-decompress", + invalid_xz.to_str().unwrap(), + "--output-file", + temp.path().join("decoded3.txt").to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&invalid_xz_decompress); +} + +#[test] +fn reben_cli_json_and_ben_modes_work() { + let temp = TempDir::new("reben-workflow"); + let graph_path = temp.path().join("dual_graph.json"); + let sorted_path = temp.path().join("sorted_graph.json"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.jsonl.ben"); + let canonical_path = temp.path().join("canonicalized.ben"); + let map_relabel_path = temp.path().join("map_relabel.ben"); + + fs::write(&graph_path, sample_graph()).unwrap(); + fs::write( + &jsonl_path, + r#"{"assignment":[9,9,4],"sample":1} +{"assignment":[4,7,7],"sample":2} +"#, + ) + .unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(fs::File::open(&jsonl_path).unwrap()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let sort_graph = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--key", + "GEOID20", + "--output-file", + sorted_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&sort_graph); + + let sorted_graph = fs::read_to_string(&sorted_path).unwrap(); + assert!(sorted_graph.contains(r#""id":0"#)); + assert!(sorted_graph.contains(r#""GEOID20":"A"#)); + + let map_path = temp.path().join("dual_graph_sorted_by_GEOID20_map.json"); + assert!(map_path.exists()); + + let canonicalize = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--output-file", + canonical_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&canonicalize); + + let relabel = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-file", + map_relabel_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&relabel); + + let mut canonical_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&canonical_path).unwrap()), + &mut canonical_jsonl, + ) + .unwrap(); + let canonical_text = String::from_utf8(canonical_jsonl).unwrap(); + assert!(canonical_text.contains(r#""assignment":[1,1,2]"#)); + assert!(canonical_text.contains(r#""assignment":[1,2,2]"#)); + + let mut relabeled_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&map_relabel_path).unwrap()), + &mut relabeled_jsonl, + ) + .unwrap(); + let relabeled_text = String::from_utf8(relabeled_jsonl).unwrap(); + assert!(relabeled_text.contains(r#""assignment":[9,4,9]"#)); +} + +#[test] +fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations() { + let temp = TempDir::new("reben-more"); + let graph_path = temp.path().join("shape.json"); + let ben_path = temp.path().join("samples.jsonl.ben"); + let relabeled_path = temp.path().join("rekeyed.ben"); + + fs::write(&graph_path, sample_graph()).unwrap(); + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(sample_jsonl().as_bytes()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let relabel = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--key", + "GEOID20", + "--shape-file", + graph_path.to_str().unwrap(), + "--output-file", + relabeled_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&relabel); + assert!(temp.path().join("shape_sorted_by_GEOID20_map.json").exists()); + + let generated_graph = temp.path().join("shape_sorted_by_GEOID20.json"); + let generated_map = temp.path().join("shape_sorted_by_GEOID20_map.json"); + let both = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--key", + "GEOID20", + "--shape-file", + graph_path.to_str().unwrap(), + "--map-file", + generated_map.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&both); + assert!(String::from_utf8_lossy(&both.stderr).contains("Cannot provide both a map file and a key")); + + let missing_shape = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--key", + "GEOID20", + ], + temp.path(), + ); + assert_failure(&missing_shape); + assert!(String::from_utf8_lossy(&missing_shape.stderr).contains("No shape file provided")); + + let sorted_json: Value = serde_json::from_str(&fs::read_to_string(generated_graph).unwrap()).unwrap(); + assert_eq!(sorted_json["nodes"][0]["GEOID20"], "A"); +} + +#[test] +fn pben_cli_converts_between_formats() { + let temp = TempDir::new("pben"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + let pc_path = temp.path().join("samples.pc"); + let roundtrip_ben_path = temp.path().join("roundtrip.ben"); + let xben_path = temp.path().join("samples.xben"); + + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(fs::File::open(&jsonl_path).unwrap()), + &mut ben_bytes, + BenVariant::MkvChain, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let ben_to_pc = run( + "pben", + &[ + "--mode", + "ben-to-pc", + "--input-file", + ben_path.to_str().unwrap(), + "--output-file", + pc_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&ben_to_pc); + assert!(pc_path.exists()); + + let pc_to_ben = run( + "pben", + &[ + "--mode", + "pc-to-ben", + "--input-file", + pc_path.to_str().unwrap(), + "--output-file", + roundtrip_ben_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&pc_to_ben); + + let pc_to_xben = run( + "pben", + &[ + "--mode", + "pc-to-xben", + "--input-file", + pc_path.to_str().unwrap(), + "--output-file", + xben_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&pc_to_xben); + + let mut roundtrip_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&roundtrip_ben_path).unwrap()), + &mut roundtrip_jsonl, + ) + .unwrap(); + assert!(String::from_utf8(roundtrip_jsonl).unwrap().contains(r#""assignment":[2,2,3]"#)); + + let xdecode = run( + "ben", + &[ + "--mode", + "x-decode", + xben_path.to_str().unwrap(), + "--print", + ], + temp.path(), + ); + assert_success(&xdecode); + let printed = String::from_utf8_lossy(&xdecode.stdout); + assert!(printed.contains(r#""assignment":[2,2,3]"#)); +} From c46d18cc82ae5b06220305afb804b1806dd592f0 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:37:34 -0600 Subject: [PATCH 005/221] modernize the logging structure in ben --- Cargo.lock | 143 +++++++++++++++++++++++++++++++++ ben/Cargo.toml | 2 + ben/src/cli/ben.rs | 18 ++--- ben/src/cli/common.rs | 27 ++++++- ben/src/cli/pben.rs | 8 +- ben/src/cli/reben.rs | 7 +- ben/src/codec/decode/xz.rs | 18 ++--- ben/src/codec/encode/jsonl.rs | 14 ++-- ben/src/codec/translate/mod.rs | 8 +- ben/src/io/reader.rs | 8 +- ben/src/json/graph/mod.rs | 16 ++-- ben/src/lib.rs | 20 +---- ben/src/logging.rs | 31 +++++++ ben/src/ops/relabel/mod.rs | 14 ++-- 14 files changed, 260 insertions(+), 74 deletions(-) create mode 100644 ben/src/logging.rs diff --git a/Cargo.lock b/Cargo.lock index e9dc5cc..45d7045 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "ansi_term" version = "0.12.1" @@ -92,6 +101,8 @@ dependencies = [ "rand_chacha 0.9.0", "rand_distr", "serde_json", + "tracing", + "tracing-subscriber", "xz2", ] @@ -336,6 +347,12 @@ dependencies = [ "rand_chacha 0.3.1", ] +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + [[package]] name = "lzma-sys" version = "0.1.20" @@ -347,6 +364,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "memchr" version = "2.7.6" @@ -362,6 +388,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -395,6 +430,12 @@ dependencies = [ "structopt", ] +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + [[package]] name = "pipe" version = "0.4.0" @@ -651,6 +692,17 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + [[package]] name = "regex-syntax" version = "0.8.6" @@ -730,12 +782,27 @@ dependencies = [ "serde_core", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "strsim" version = "0.8.0" @@ -822,6 +889,76 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unarray" version = "0.1.4" @@ -858,6 +995,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vec_map" version = "0.8.2" diff --git a/ben/Cargo.toml b/ben/Cargo.toml index a795822..996b7a3 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -20,6 +20,8 @@ clap = { version = "^4.5.2", features = ["derive"] } pcompress = "1.0.7" pipe = "0.4.0" serde_json = "^1.0.107" +tracing = "0.1.41" +tracing-subscriber = { version = "0.3.20", features = ["env-filter", "fmt"] } xz2 = "0.1.7" diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 6e96525..5cca30a 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -6,7 +6,7 @@ use crate::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; use crate::ops::extract::extract_assignment_ben; -use crate::{logln, BenVariant}; +use crate::BenVariant; use clap::{Parser, ValueEnum}; use std::{ fs::File, @@ -201,7 +201,7 @@ pub fn run() { match args.mode { Mode::Encode => { - logln!("Running in encode mode"); + tracing::trace!("Running in encode mode"); let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { @@ -237,7 +237,7 @@ pub fn run() { } } Mode::XEncode => { - logln!("Running in xencode mode"); + tracing::trace!("Running in xencode mode"); let mut ben_and_xben = args.ben_and_xben; let mut jsonl_and_xben = args.jsonl_and_xben; @@ -305,7 +305,7 @@ pub fn run() { } } Mode::Decode => { - logln!("Running in decode mode"); + tracing::trace!("Running in decode mode"); let mut ben_and_xben = args.ben_and_xben; let mut jsonl_and_ben = args.jsonl_and_ben; @@ -352,7 +352,7 @@ pub fn run() { } } Mode::XDecode => { - logln!("Running in x-decode mode"); + tracing::trace!("Running in x-decode mode"); let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { @@ -380,7 +380,7 @@ pub fn run() { } } Mode::Read => { - logln!("Running in read mode"); + tracing::trace!("Running in read mode"); let reader = BufReader::new( File::open( &args @@ -410,7 +410,7 @@ pub fn run() { }); } Mode::XzCompress => { - logln!("Running in xz compress mode"); + tracing::trace!("Running in xz compress mode"); let in_file_name = args .input_file @@ -432,10 +432,10 @@ pub fn run() { if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { eprintln!("Error: {:?}", err); } - logln!("Done!"); + tracing::trace!("Done!"); } Mode::XzDecompress => { - logln!("Running in xz decompress mode"); + tracing::trace!("Running in xz decompress mode"); let in_file_name = args .input_file diff --git a/ben/src/cli/common.rs b/ben/src/cli/common.rs index 9540c38..b8c6297 100644 --- a/ben/src/cli/common.rs +++ b/ben/src/cli/common.rs @@ -2,9 +2,10 @@ use std::io::{self, Result}; use std::path::Path; pub fn set_verbose(verbose: bool) { - if verbose { + if verbose && std::env::var_os("RUST_LOG").is_none() { std::env::set_var("RUST_LOG", "trace"); } + crate::logging::init_logging(); } pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { @@ -27,8 +28,14 @@ pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { mod tests { use super::*; use std::fs; + use std::sync::{Mutex, OnceLock}; use std::time::{SystemTime, UNIX_EPOCH}; + fn env_lock() -> &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) + } + fn unique_path(name: &str) -> std::path::PathBuf { let nonce = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -39,11 +46,29 @@ mod tests { #[test] fn set_verbose_sets_rust_log() { + let _guard = env_lock().lock().unwrap(); std::env::remove_var("RUST_LOG"); set_verbose(true); assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("trace")); } + #[test] + fn set_verbose_preserves_existing_log_level() { + let _guard = env_lock().lock().unwrap(); + std::env::set_var("RUST_LOG", "debug"); + set_verbose(true); + assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("debug")); + std::env::remove_var("RUST_LOG"); + } + + #[test] + fn set_verbose_initializes_logger_without_setting_trace() { + let _guard = env_lock().lock().unwrap(); + std::env::remove_var("RUST_LOG"); + set_verbose(false); + assert!(std::env::var("RUST_LOG").is_err()); + } + #[test] fn check_overwrite_allows_missing_file() { let path = unique_path("missing.txt"); diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 315f64e..0be66a4 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -1,7 +1,7 @@ use crate::cli::common::set_verbose; use crate::io::reader::BenDecoder; use crate::io::writer::{BenEncoder, XBenEncoder}; -use crate::{logln, BenVariant}; +use crate::BenVariant; use clap::{Parser, ValueEnum}; use serde_json::json; use pipe::pipe; @@ -57,7 +57,7 @@ pub fn run() -> Result<()> { match args.mode { Mode::BenToPc => { - logln!("Converting BEN to PCOMPRESS"); + tracing::trace!("Converting BEN to PCOMPRESS"); let ben_reader: Box = match args.input_file { Some(file) => Box::new(BufReader::new(File::open(&file).unwrap())), @@ -80,7 +80,7 @@ pub fn run() -> Result<()> { Ok(()) } Mode::PcToBen => { - logln!("Converting PCOMPRESS to BEN"); + tracing::trace!("Converting PCOMPRESS to BEN"); let mut pcompress_reader: BufReader> = match args.input_file { Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), @@ -103,7 +103,7 @@ pub fn run() -> Result<()> { assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer) } Mode::PcToXben => { - logln!("Converting PCOMPRESS to XBEN"); + tracing::trace!("Converting PCOMPRESS to XBEN"); let mut pcompress_reader: BufReader> = match args.input_file { Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index f263e99..6db69e7 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -1,7 +1,6 @@ use crate::cli::common::set_verbose; use crate::{ json::graph::sort_json_file_by_key, - logln, ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, }; use clap::{Parser, ValueEnum}; @@ -108,7 +107,7 @@ pub fn run() { let reader = BufReader::new(input_file); if args.map_file.is_none() && args.key.is_none() { - logln!("Canonicalizing assignment vectors in ben file."); + tracing::trace!("Canonicalizing assignment vectors in ben file."); let output_file_name = match args.output_file { Some(name) => name, @@ -137,7 +136,7 @@ pub fn run() { let mut map_file_name = String::new(); if let Some(key) = args.key { if let Some(shape) = args.shape_file { - logln!("Creating map file for key: {}", key); + tracing::trace!("Creating map file for key: {}", key); let output_file_name = shape.trim_end_matches(".json").to_owned() + format!("_sorted_by_{}.json", key).as_str(); @@ -200,7 +199,7 @@ pub fn run() { File::create(&output_file_name).expect("Could not create output file."); let writer = BufWriter::new(output_file); - logln!("Relabeling ben file according to map file {}", map_file_name,); + tracing::trace!("Relabeling ben file according to map file {}", map_file_name,); relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); } diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 52bf1c3..590d8a2 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,6 +1,6 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::codec::translate::ben32_to_ben_lines; -use crate::{log, logln, BenVariant}; +use crate::{progress, BenVariant}; use std::io::{self, BufRead, Error, Read, Write}; use xz2::read::XzDecoder; @@ -49,7 +49,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: if overflow[i - 3..=i] == [0, 0, 0, 0] { last_valid_assignment = i + 1; line_count += 1; - log!("Decoding sample: {}\r", line_count); + progress!("Decoding sample: {}\r", line_count); } } } @@ -60,7 +60,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let lines = &overflow[i + 1..i + 3]; let n_lines = u16::from_be_bytes([lines[0], lines[1]]); line_count += n_lines as usize; - log!("Decoding sample: {}\r", line_count); + progress!("Decoding sample: {}\r", line_count); } } } @@ -73,8 +73,8 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: ben32_to_ben_lines(&overflow[0..last_valid_assignment], &mut writer, variant)?; overflow = overflow[last_valid_assignment..].to_vec(); } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } @@ -132,7 +132,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i if overflow[i - 3..=i] == [0, 0, 0, 0] { last_valid_assignment = i + 1; line_count += 1; - log!("Decoding sample: {}\r", line_count); + progress!("Decoding sample: {}\r", line_count); } } } @@ -143,7 +143,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let lines = &overflow[i + 1..i + 3]; let n_lines = u16::from_be_bytes([lines[0], lines[1]]); line_count += n_lines as usize; - log!("Decoding sample: {}\r", line_count); + progress!("Decoding sample: {}\r", line_count); } } } @@ -162,7 +162,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i overflow.drain(..last_valid_assignment); starting_sample = line_count; } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 5f493e0..ced23dc 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,5 +1,5 @@ use crate::io::writer::{BenEncoder, XBenEncoder}; -use crate::{log, logln, BenVariant}; +use crate::{progress, BenVariant}; use serde_json::Value; use std::io::{BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; @@ -35,7 +35,7 @@ pub fn encode_jsonl_to_xben( let mut line_num = 1; for line_result in reader.lines() { - log!("Encoding line: {}\r", line_num); + progress!("Encoding line: {}\r", line_num); line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); @@ -43,8 +43,8 @@ pub fn encode_jsonl_to_xben( ben_encoder.write_json_value(data)?; } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } @@ -57,14 +57,14 @@ pub fn encode_jsonl_to_ben( let mut line_num = 1; let mut ben_encoder = BenEncoder::new(writer, variant); for line_result in reader.lines() { - log!("Encoding line: {}\r", line_num); + progress!("Encoding line: {}\r", line_num); line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); ben_encoder.write_json_value(data)?; } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 2899085..f681ee6 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -5,7 +5,7 @@ use std::io::{self, Error, Read, Write}; use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; -use crate::{log, logln, BenVariant}; +use crate::{progress, BenVariant}; fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { let mut buffer = [0u8; 4]; @@ -123,7 +123,7 @@ pub fn ben_to_ben32_lines( let max_len_bits = reader.read_u8()?; let n_bytes = reader.read_u32::()?; - log!("Encoding line: {}\r", sample_number); + progress!("Encoding line: {}\r", sample_number); match variant { BenVariant::Standard => { @@ -144,8 +144,8 @@ pub fn ben_to_ben32_lines( } } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 7006372..91cde9c 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -1,6 +1,6 @@ use crate::codec::decode::{decode_ben32_line, decode_ben_line}; use crate::util::rle::rle_to_vec; -use crate::{log, logln, BenVariant}; +use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use serde_json::json; use std::fs::File; @@ -151,8 +151,8 @@ impl BenDecoder { Ok(()) => b1[0], Err(e) => { if e.kind() == io::ErrorKind::UnexpectedEof { - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); return None; } return Some(Err(e)); @@ -230,7 +230,7 @@ impl Iterator for BenDecoder { Ok(assgn) => assgn, Err(e) => return Some(Err(e)), }; - log!( + progress!( "Decoding sample: {}\r", self.sample_count + ben_frame.count as usize ); diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 8193778..f22a8bb 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -1,6 +1,6 @@ //! JSON graph helpers used by relabeling workflows. -use crate::{log, logln}; +use crate::progress; use serde_json::{json, Value}; use std::collections::HashMap; use std::io::{Read, Result, Write}; @@ -12,10 +12,10 @@ pub fn sort_json_file_by_key( mut writer: W, key: &str, ) -> Result> { - logln!("Loading JSON file..."); + tracing::trace!("Loading JSON file..."); let mut data: Value = serde_json::from_reader(reader).unwrap(); - logln!("Sorting JSON file by key: {}", key); + tracing::trace!("Sorting JSON file by key: {}", key); if let Some(nodes) = data["nodes"].as_array_mut() { nodes.sort_by(|a, b| { let extract_value = |val: &Value| -> StdResult { @@ -39,18 +39,18 @@ pub fn sort_json_file_by_key( let mut rev_node_map = HashMap::new(); if let Some(nodes) = data["nodes"].as_array_mut() { for (i, node) in nodes.iter_mut().enumerate() { - log!("Relabeling node: {}\r", i + 1); + progress!("Relabeling node: {}\r", i + 1); node_map.insert(node["id"].to_string().parse::().unwrap(), i); rev_node_map.insert(i, node["id"].to_string().parse::().unwrap()); node["id"] = json!(i); } } - logln!(); + tracing::trace!(""); let mut edge_array = Vec::new(); if let Some(edges) = data["adjacency"].as_array() { for i in 0..edges.len() { - log!("Relabeling edge: {}\r", i + 1); + progress!("Relabeling edge: {}\r", i + 1); let edge_list_location = rev_node_map[&data["nodes"][i]["id"].to_string().parse::().unwrap()]; let mut new_edge_lst = edges[edge_list_location].as_array().unwrap().clone(); @@ -61,11 +61,11 @@ pub fn sort_json_file_by_key( edge_array.push(new_edge_lst); } } - logln!(); + tracing::trace!(""); data["adjacency"] = json!(edge_array); - logln!("Writing new json to file..."); + tracing::trace!("Writing new json to file..."); writer.write_all(serde_json::to_string(&data).unwrap().as_bytes())?; Ok(node_map) diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 5c44efe..8469303 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -22,28 +22,14 @@ pub mod cli; pub mod codec; pub mod io; pub mod json; +pub mod logging; pub mod ops; pub mod util; #[macro_export] -macro_rules! log { +macro_rules! progress { ($($arg:tt)*) => {{ - if let Ok(log_level) = std::env::var("RUST_LOG") { - if log_level.to_lowercase() == "trace" { - eprint!($($arg)*); - } - } - }} -} - -#[macro_export] -macro_rules! logln { - ($($arg:tt)*) => {{ - if let Ok(log_level) = std::env::var("RUST_LOG") { - if log_level.to_lowercase() == "trace" { - eprintln!($($arg)*); - } - } + $crate::logging::trace_progress(format_args!($($arg)*)); }} } diff --git a/ben/src/logging.rs b/ben/src/logging.rs new file mode 100644 index 0000000..2be4aeb --- /dev/null +++ b/ben/src/logging.rs @@ -0,0 +1,31 @@ +use tracing::Level; +use tracing_subscriber::EnvFilter; +use std::sync::Once; + +static INIT_LOGGER: Once = Once::new(); + +pub fn init_logging() { + INIT_LOGGER.call_once(|| { + let filter = EnvFilter::try_from_default_env() + .or_else(|_| EnvFilter::try_new("off")) + .expect("valid fallback log filter"); + + let subscriber = tracing_subscriber::fmt() + .with_env_filter(filter) + .with_writer(std::io::stderr) + .without_time() + .with_target(false) + .with_level(false) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().compact()) + .finish(); + + let _ = tracing::subscriber::set_global_default(subscriber); + }); +} + +pub fn trace_progress(args: std::fmt::Arguments<'_>) { + if tracing::enabled!(Level::TRACE) { + eprint!("{args}"); + } +} diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index e85db71..20eafb8 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -3,7 +3,7 @@ use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; use crate::util::rle::{assign_to_rle, rle_to_vec}; -use crate::{log, logln, BenVariant}; +use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; use std::io::{self, Error, Read, Write}; @@ -58,10 +58,10 @@ pub fn relabel_ben_lines( sample_number += count_occurrences as usize; - log!("Relabeling line: {}\r", sample_number); + progress!("Relabeling line: {}\r", sample_number); } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } @@ -136,10 +136,10 @@ pub fn relabel_ben_lines_with_map( }; sample_number += count_occurrences as usize; - log!("Relabeling line: {}\r", sample_number); + progress!("Relabeling line: {}\r", sample_number); } - logln!(); - logln!("Done!"); + tracing::trace!(""); + tracing::trace!("Done!"); Ok(()) } From 6fdaa1cbebafa9cecfa426652fec6c9af7e25038 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:01:37 -0600 Subject: [PATCH 006/221] small reorg and cleanup of pathologic cases in pyben --- pyben/pyben/_core.pyi | 12 +- pyben/src/common.rs | 58 ++++++++ pyben/src/decode/mod.rs | 136 ++++--------------- pyben/src/encode/mod.rs | 194 ++++----------------------- pyben/src/lib.rs | 1 + pyben/tests/test_python_pipelines.py | 44 ++++++ pyben/uv.lock | 108 ++++++++------- 7 files changed, 218 insertions(+), 335 deletions(-) create mode 100644 pyben/src/common.rs diff --git a/pyben/pyben/_core.pyi b/pyben/pyben/_core.pyi index ce32a3a..c68d5b3 100644 --- a/pyben/pyben/_core.pyi +++ b/pyben/pyben/_core.pyi @@ -121,8 +121,8 @@ class PyBenEncoder: Path to the output BEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "markov"}, optional - Select BEN variant. If None, defaults to "markov" (equivalent to "mkv_chain"). + variant : {"standard", "mkv_chain"}, optional + Select BEN variant. If None, defaults to "mkv_chain". Raises ------ @@ -232,8 +232,8 @@ def compress_jsonl_to_ben( Path to the output BEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "markov"}, optional - Select BEN variant. If None, defaults to "markov" (equivalent to "mkv_chain"). + variant : {"standard", "mkv_chain"}, optional + Select BEN variant. If None, defaults to "mkv_chain". Raises ------ @@ -262,8 +262,8 @@ def compress_jsonl_to_xben( Path to the output XBEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "markov"}, optional - Select BEN variant. If None, defaults to "markov" (equivalent to "mkv_chain"). + variant : {"standard", "mkv_chain"}, optional + Select BEN variant. If None, defaults to "mkv_chain". n_threads : Number of threads to use for compression. If None, defaults to the number of CPU cores. compression_level : diff --git a/pyben/src/common.rs b/pyben/src/common.rs new file mode 100644 index 0000000..e26cdfc --- /dev/null +++ b/pyben/src/common.rs @@ -0,0 +1,58 @@ +use ben::BenVariant; +use pyo3::exceptions::{PyIOError, PyValueError}; +use pyo3::prelude::PyResult; +use std::fs::File; +use std::io::{BufReader, BufWriter}; +use std::path::PathBuf; + +pub fn parse_variant(variant: Option<&str>) -> PyResult { + match variant { + Some("standard") => Ok(BenVariant::Standard), + Some("mkv_chain") | Some("markov") | None => Ok(BenVariant::MkvChain), + Some(other) => Err(PyValueError::new_err(format!( + "Unknown variant: {other}. Supported variants are 'standard' and 'mkv_chain'." + ))), + } +} + +pub fn validate_input_output_paths(in_file: &PathBuf, out_file: &PathBuf) -> PyResult<()> { + if in_file == out_file { + return Err(PyIOError::new_err("Input and output paths must differ.")); + } + if !in_file.exists() { + return Err(PyIOError::new_err(format!( + "Input file {} does not exist.", + in_file.display() + ))); + } + Ok(()) +} + +pub fn open_input(in_file: &PathBuf) -> PyResult> { + let infile = File::open(in_file) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; + Ok(BufReader::new(infile)) +} + +pub fn open_output(out_file: &PathBuf, overwrite: bool) -> PyResult> { + if out_file.exists() && !overwrite { + return Err(PyIOError::new_err(format!( + "Output file {} already exists (use overwrite=True to replace).", + out_file.display() + ))); + } + + let out_open = if overwrite { + File::options() + .write(true) + .create(true) + .truncate(true) + .open(out_file) + } else { + File::options().write(true).create_new(true).open(out_file) + }; + let outfile = out_open.map_err(|e| { + PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())) + })?; + Ok(BufWriter::new(outfile)) +} diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index f7ebf55..43bd4d8 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -1,3 +1,4 @@ +use crate::common::{open_input, open_output, validate_input_output_paths}; use ben::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use ben::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, MkvRecord, Selection, @@ -6,8 +7,7 @@ use ben::io::reader::{ use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; use pyo3::types::PyDict; -use std::fs::File; -use std::io::{self, BufReader, BufWriter}; +use std::io; use std::path::PathBuf; type DynIter = Box> + Send>; @@ -29,10 +29,7 @@ impl PyBenDecoder { #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { - let file = File::options().read(true).open(&file_path).map_err(|e| { - PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) - })?; - let reader = BufReader::new(file); + let reader = open_input(&file_path)?; let iter: DynIter = match mode { "ben" => { let ben = BenDecoder::new(reader).map_err(|e| { @@ -99,7 +96,11 @@ impl PyBenDecoder { } match slf.iter.next() { Some(Ok((assignment, count))) => { - assert!(count > 0, "non-positive count; data may be corrupted"); + if count == 0 { + return Err(PyException::new_err( + "Decoder yielded a zero-count record; data may be corrupted.", + )); + } slf.current_assignment = Some(assignment.clone()); slf.remaining_count = count - 1; Ok(Some(assignment)) @@ -144,6 +145,9 @@ impl PyBenDecoder { indices.sort_unstable(); indices.dedup(); + if indices.is_empty() { + return Err(PyException::new_err("indices must not be empty")); + } if indices[0] <= 0 { return Err(PyException::new_err("indices must be 1-based")); } @@ -217,6 +221,12 @@ impl PyBenDecoder { if step == 0 || offset == 0 { return Err(PyException::new_err("step and offset must be >= 1")); } + if offset > slf.base_len { + return Err(PyException::new_err(format!( + "offset must be <= number of samples in base data ({})", + slf.base_len + ))); + } let sel = Selection::Every { step, offset }; slf.len_hint = (slf.base_len + step - 1 - (offset - 1)) / step; @@ -245,39 +255,9 @@ pub fn decompress_xben_to_ben( out_file: PathBuf, overwrite: bool, ) -> PyResult<()> { - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); - - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - let writer = BufWriter::new(outfile); + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; decode_xben_to_ben(reader, writer).map_err(|e| { PyIOError::new_err(format!( @@ -298,43 +278,13 @@ pub fn decompress_xben_to_jsonl( out_file: PathBuf, overwrite: bool, ) -> PyResult<()> { - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); - - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - let writer = BufWriter::new(outfile); + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; decode_xben_to_jsonl(reader, writer).map_err(|e| { PyIOError::new_err(format!( - "Failed to convert XBEN to BEN from {} to {}: {e}", + "Failed to convert XBEN to JSONL from {} to {}: {e}", in_file.display(), out_file.display() )) @@ -351,43 +301,13 @@ pub fn decompress_ben_to_jsonl( out_file: PathBuf, overwrite: bool, ) -> PyResult<()> { - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); - - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - let writer = BufWriter::new(outfile); + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; decode_ben_to_jsonl(reader, writer).map_err(|e| { PyIOError::new_err(format!( - "Failed to convert XBEN to BEN from {} to {}: {e}", + "Failed to convert BEN to JSONL from {} to {}: {e}", in_file.display(), out_file.display() )) diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 8c8a1d5..6efcb0d 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -1,12 +1,12 @@ +use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; use ben::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; use ben::io::writer::BenEncoder; -use ben::BenVariant; -use pyo3::exceptions::{PyException, PyIOError}; +use pyo3::exceptions::PyIOError; use pyo3::prelude::PyResult; use pyo3::{pyclass, pyfunction, pymethods}; use std::fs::File; -use std::io::{BufReader, BufWriter}; -use std::path::{Path, PathBuf}; +use std::io::BufWriter; +use std::path::PathBuf; #[pyclass] pub struct PyBenEncoder { @@ -19,45 +19,10 @@ impl PyBenEncoder { #[pyo3(signature = (file_path, overwrite = false, variant = None))] #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] fn new(file_path: PathBuf, overwrite: bool, variant: Option) -> PyResult { - let ben_var = match variant.as_deref() { - Some("standard") => BenVariant::Standard, - Some("mkv_chain") => BenVariant::MkvChain, - Some(other) => { - return Err(PyException::new_err(format!( - "Unknown variant: {}. Supported variants are 'standard' and 'mkv_chain'.", - other - ))) - } - _ => BenVariant::MkvChain, - }; + let ben_var = parse_variant(variant.as_deref())?; + let writer = open_output(&file_path, overwrite)?; - let path = Path::new(&file_path); - let file = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&file_path) - .map_err(|e| { - PyIOError::new_err(format!("Failed to create file {:?}: {}", file_path, e)) - })? - } else { - if path.exists() { - return Err(PyIOError::new_err(format!( - "File {:?} already exists. Use overwrite=True to overwrite it.", - file_path - ))); - } - File::options() - .write(true) - .create_new(true) - .open(&file_path) - .map_err(|e| { - PyIOError::new_err(format!("Failed to create file {:?}: {}", file_path, e)) - })? - }; - - let encoder = BenEncoder::new(BufWriter::new(file), ben_var); + let encoder = BenEncoder::new(writer, ben_var); Ok(PyBenEncoder { encoder: Some(encoder), }) @@ -111,42 +76,9 @@ pub fn compress_ben_to_xben( n_threads: Option, compression_level: Option, ) -> PyResult<()> { - // Basic validations - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); - - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - - let writer = BufWriter::new(outfile); + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; encode_ben_to_xben(reader, writer, n_threads, compression_level).map_err(|e| { PyIOError::new_err(format!( @@ -160,59 +92,19 @@ pub fn compress_ben_to_xben( } #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="markov"))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='markov')")] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain')")] pub fn compress_jsonl_to_ben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, variant: &str, ) -> PyResult<()> { - let ben_var = match variant { - "standard" => BenVariant::Standard, - "mkv_chain" | "markov" => BenVariant::MkvChain, - other => { - eprintln!( - "Warning: Unknown variant '{}', defaulting to 'markov'", - other - ); - BenVariant::MkvChain - } - }; - - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); + let ben_var = parse_variant(Some(variant))?; + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - let writer = BufWriter::new(outfile); encode_jsonl_to_ben(reader, writer, ben_var).map_err(|e| { PyIOError::new_err(format!( "Failed to convert JSONL to BEN from {} to {}: {e}", @@ -224,9 +116,9 @@ pub fn compress_jsonl_to_ben( } #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="markov", n_threads=None, compression_level=None))] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None))] #[pyo3( - text_signature = "(in_file, out_file, overwrite=false, variant='markov', n_threads=None, compression_level=None)" + text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain', n_threads=None, compression_level=None)" )] pub fn compress_jsonl_to_xben( in_file: PathBuf, @@ -236,54 +128,14 @@ pub fn compress_jsonl_to_xben( n_threads: Option, compression_level: Option, ) -> PyResult<()> { - let ben_var = match variant { - "standard" => BenVariant::Standard, - "mkv_chain" | "markov" => BenVariant::MkvChain, - other => { - eprintln!( - "Warning: Unknown variant '{}', defaulting to 'markov'", - other - ); - BenVariant::MkvChain - } - }; + let ben_var = parse_variant(Some(variant))?; + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; - if in_file == out_file { - return Err(PyIOError::new_err("Input and output paths must differ.")); - } - if !in_file.exists() { - return Err(PyIOError::new_err(format!( - "Input file {} does not exist.", - in_file.display() - ))); - } - if out_file.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_file.display() - ))); - } - // Open input (read-only, buffered) - let infile = File::open(&in_file) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; - let reader = BufReader::new(infile); - - // Open/create output according to overwrite flag - let out_open = if overwrite { - File::options() - .write(true) - .create(true) - .truncate(true) - .open(&out_file) - } else { - File::options().write(true).create_new(true).open(&out_file) - }; - let outfile = out_open - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_file.display())))?; - let writer = BufWriter::new(outfile); encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level).map_err(|e| { PyIOError::new_err(format!( - "Failed to convert JSONL to BEN from {} to {}: {e}", + "Failed to convert JSONL to XBEN from {} to {}: {e}", in_file.display(), out_file.display() )) diff --git a/pyben/src/lib.rs b/pyben/src/lib.rs index 0df00ac..f976f1a 100755 --- a/pyben/src/lib.rs +++ b/pyben/src/lib.rs @@ -1,6 +1,7 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; // <-- needed for wrap_pyfunction! +pub mod common; pub mod decode; pub mod encode; diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 91d6625..0d7e3f4 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -332,3 +332,47 @@ def test_ben_to_xben_and_back(tmp_path: Path) -> None: decompress_ben_to_jsonl(ben2, out_jsonl, overwrite=True) assert src.read_bytes() == out_jsonl.read_bytes() + + +def test_decoder_subsample_indices_rejects_empty_input(tmp_path: Path) -> None: + rng = random.Random(123) + seq = gen_sequence_standard(rng, 10) + + src = tmp_path / "src.jsonl" + write_jsonl(seq, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + dec = PyBenDecoder(ben, mode="ben") + with pytest.raises(Exception, match="indices must not be empty"): + dec.subsample_indices([]) + + +def test_decoder_subsample_every_rejects_offset_past_end(tmp_path: Path) -> None: + rng = random.Random(456) + seq = gen_sequence_standard(rng, 10) + + src = tmp_path / "src.jsonl" + write_jsonl(seq, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + dec = PyBenDecoder(ben, mode="ben") + with pytest.raises(Exception, match="offset must be <="): + dec.subsample_every(2, 99) + + +def test_compress_helpers_reject_unknown_variants(tmp_path: Path) -> None: + rng = random.Random(789) + seq = gen_sequence_standard(rng, 5) + + src = tmp_path / "src.jsonl" + write_jsonl(seq, src) + + with pytest.raises(ValueError, match="Unknown variant"): + compress_jsonl_to_ben(src, tmp_path / "out.ben", overwrite=True, variant="weird") + + with pytest.raises(ValueError, match="Unknown variant"): + compress_jsonl_to_xben(src, tmp_path / "out.xben", overwrite=True, variant="weird") diff --git a/pyben/uv.lock b/pyben/uv.lock index b99eef5..9f985ad 100755 --- a/pyben/uv.lock +++ b/pyben/uv.lock @@ -88,6 +88,56 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" }, ] +[[package]] +name = "binary-ensemble" +version = "0.3.0" +source = { editable = "." } + +[package.optional-dependencies] +docs = [ + { name = "myst-nb" }, + { name = "myst-parser" }, + { name = "nbconvert" }, + { name = "recommonmark" }, + { name = "sphinx" }, + { name = "sphinx-autoapi" }, + { name = "sphinx-copybutton" }, + { name = "sphinx-rtd-theme" }, +] + +[package.dev-dependencies] +dev = [ + { name = "gerrychain", extra = ["geo"] }, + { name = "ipykernel" }, + { name = "ipywidgets" }, + { name = "maturin" }, + { name = "pytest" }, + { name = "tqdm" }, +] + +[package.metadata] +requires-dist = [ + { name = "myst-nb", marker = "extra == 'docs'", specifier = ">=1.3.0" }, + { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=4.0.1" }, + { name = "nbconvert", marker = "extra == 'docs'", specifier = ">=7.16.6" }, + { name = "recommonmark", marker = "extra == 'docs'", specifier = ">=0.7.1" }, + { name = "sphinx", marker = "extra == 'docs'", specifier = ">=8.2.3" }, + { name = "sphinx-autoapi", marker = "extra == 'docs'", specifier = ">=3.6.1" }, + { name = "sphinx-copybutton", marker = "extra == 'docs'", specifier = ">=0.5.2" }, + { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=3.0.2" }, +] +provides-extras = ["docs"] + +[package.metadata.requires-dev] +dev = [ + { name = "gerrychain", extras = ["geo"], specifier = ">=0.3.2" }, + { name = "ipykernel", specifier = ">=7.0.1" }, + { name = "ipywidgets", specifier = ">=8.1.7" }, + { name = "maturin", specifier = ">=1.9.6" }, + { name = "pytest", specifier = ">=8.4.2" }, + { name = "tqdm", specifier = ">=4.67.1" }, +] + [[package]] name = "bleach" version = "6.2.0" @@ -558,6 +608,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, @@ -567,6 +619,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -576,6 +630,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -583,6 +639,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1556,56 +1614,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] -[[package]] -name = "py-ben" -version = "0.3.0" -source = { editable = "." } - -[package.optional-dependencies] -docs = [ - { name = "myst-nb" }, - { name = "myst-parser" }, - { name = "nbconvert" }, - { name = "recommonmark" }, - { name = "sphinx" }, - { name = "sphinx-autoapi" }, - { name = "sphinx-copybutton" }, - { name = "sphinx-rtd-theme" }, -] - -[package.dev-dependencies] -dev = [ - { name = "gerrychain", extra = ["geo"] }, - { name = "ipykernel" }, - { name = "ipywidgets" }, - { name = "maturin" }, - { name = "pytest" }, - { name = "tqdm" }, -] - -[package.metadata] -requires-dist = [ - { name = "myst-nb", marker = "extra == 'docs'", specifier = ">=1.3.0" }, - { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=4.0.1" }, - { name = "nbconvert", marker = "extra == 'docs'", specifier = ">=7.16.6" }, - { name = "recommonmark", marker = "extra == 'docs'", specifier = ">=0.7.1" }, - { name = "sphinx", marker = "extra == 'docs'", specifier = ">=8.2.3" }, - { name = "sphinx-autoapi", marker = "extra == 'docs'", specifier = ">=3.6.1" }, - { name = "sphinx-copybutton", marker = "extra == 'docs'", specifier = ">=0.5.2" }, - { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=3.0.2" }, -] -provides-extras = ["docs"] - -[package.metadata.requires-dev] -dev = [ - { name = "gerrychain", extras = ["geo"], specifier = ">=0.3.2" }, - { name = "ipykernel", specifier = ">=7.0.1" }, - { name = "ipywidgets", specifier = ">=8.1.7" }, - { name = "maturin", specifier = ">=1.9.6" }, - { name = "pytest", specifier = ">=8.4.2" }, - { name = "tqdm", specifier = ">=4.67.1" }, -] - [[package]] name = "pycparser" version = "2.23" From 8c11fc9b0cbeb402ee63318b366bd336a38793cd Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 18:58:39 -0600 Subject: [PATCH 007/221] migrate to taskfile --- Makefile | 81 ---------------- Taskfile.yml | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 81 deletions(-) delete mode 100644 Makefile create mode 100644 Taskfile.yml diff --git a/Makefile b/Makefile deleted file mode 100644 index 3ea3c00..0000000 --- a/Makefile +++ /dev/null @@ -1,81 +0,0 @@ -# Use bash and keep each recipe in a single shell so PATH/export persists. -SHELL := /bin/bash -.ONESHELL: - -# Where installers usually drop binaries -CARGO_BIN := $(HOME)/.cargo/bin -LOCAL_BIN := $(HOME)/.local/bin - -# Ensure those common install dirs are searched first -export PATH := $(CARGO_BIN):$(LOCAL_BIN):$(PATH) - -.PHONY: all help ensure-rust ensure-uv pyben-develop - -all: pyben-develop - -help: - @echo "Targets:" - @echo " make -> install rust & uv if needed, then build pyben (uv sync; uv run maturin develop)" - @echo " make ensure-rust -> install Rust via rustup if missing" - @echo " make ensure-uv -> install uv if missing" - @echo " make pyben-develop -> run uv sync && uv run maturin develop in ./pyben" - -ensure-rust: - @if ! command -v rustc >/dev/null 2>&1; then \ - echo "[rust] Installing Rust (rustup) ..."; \ - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal; \ - echo "[rust] Installed. Version: $$($(CARGO_BIN)/rustc --version)"; \ - else \ - echo "[rust] Found: $$(rustc --version)"; \ - fi - -ensure-uv: - @if ! command -v uv >/dev/null 2>&1; then \ - echo "[uv] Installing uv ..."; \ - curl -LsSf https://astral.sh/uv/install.sh | sh; \ - echo "[uv] Installed. Version: $$($(LOCAL_BIN)/uv --version 2>/dev/null || $(CARGO_BIN)/uv --version)"; \ - else \ - echo "[uv] Found: $$(uv --version)"; \ - fi - -pyben-develop: ensure-rust ensure-uv - # Make sure freshly installed binaries are picked up in this shell - export PATH="$(CARGO_BIN):$(LOCAL_BIN):$$PATH" - cd pyben - uv sync --all-groups - uv run maturin develop - -release: ensure-rust ensure-uv - # Make sure freshly installed binaries are picked up in this shell - export PATH="$(CARGO_BIN):$(LOCAL_BIN):$$PATH" - cd pyben - uv sync --all-groups - uv run maturin build --release - -clean: - cargo clean - cd pyben - rm -rf target - rm -rf dist - rm -rf pyben.egg-info - rm -rf src/pyben.c - rm -rf pyben/*abi3.so - rm -rf pyben/pyben.*.pyd - rm -rf .venv - rm -rf ./**/__pycache__ - cd docs - rm -rf _build - cd user - rm -rf example_data - -test-rust: - export PATH="$(CARGO_BIN):$(LOCAL_BIN):$$PATH" - cargo test - -test-python: pyben-develop - export PATH="$(CARGO_BIN):$(LOCAL_BIN):$$PATH" - cd pyben - uv run pytest - - -test: test-rust test-python diff --git a/Taskfile.yml b/Taskfile.yml new file mode 100644 index 0000000..518ddc5 --- /dev/null +++ b/Taskfile.yml @@ -0,0 +1,267 @@ +# yaml-language-server: $schema=https://taskfile.dev/schema.json + +version: "3" + +vars: + CARGO_BIN: '{{.HOME}}/.cargo/bin' + LOCAL_BIN: '{{.HOME}}/.local/bin' + LLVM_BIN: '{{.HOME}}/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/bin' + COV_TARGET_DIR: '{{.ROOT_DIR}}/target/llvm-cov-target' + PYBEN_TEST_PATHS: 'tests/test_python_pipelines.py' + +tasks: + default: + desc: List available tasks + cmds: + - task: help + + help: + desc: List available tasks + silent: true + cmds: + - task --list-all + + ensure-rust-linux: &ensure-rust-unix + desc: Install Rust if it is not already available + internal: true + silent: true + status: + - command -v rustc + cmds: + - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal + + ensure-rust-darwin: *ensure-rust-unix + + ensure-rust-windows: + desc: Install Rust if it is not already available + internal: true + silent: true + status: + - where rustc + cmds: + - powershell -ExecutionPolicy ByPass -Command "irm https://win.rustup.rs/x86_64 | iex" + + ensure-uv-linux: &ensure-uv-unix + desc: Install uv if it is not already available + internal: true + silent: true + status: + - command -v uv + cmds: + - curl -LsSf https://astral.sh/uv/install.sh | sh + + ensure-uv-darwin: *ensure-uv-unix + + ensure-uv-windows: + desc: Install uv if it is not already available + internal: true + silent: true + status: + - where uv + cmds: + - powershell -ExecutionPolicy ByPass -Command "irm https://astral.sh/uv/install.ps1 | iex" + + ensure-toolchain: + desc: Ensure Rust and uv are installed + silent: true + deps: + - ensure-rust-{{OS}} + - ensure-uv-{{OS}} + + pyben-sync: + desc: Sync the pyben development environment + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: pyben + cmds: + - uv sync --all-groups + + pyben-develop: + desc: Build the editable pyben extension + silent: true + deps: + - pyben-sync + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: pyben + cmds: + - uv run maturin develop + + release: + desc: Build a release wheel for pyben + silent: true + deps: + - pyben-sync + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: pyben + cmds: + - uv run maturin build --release + + test-rust: + desc: Run Rust tests for the workspace + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo test + + test-python: + desc: Run the pyben Python tests + silent: true + deps: + - pyben-develop + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: pyben + cmds: + - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} + + test: + desc: Run Rust and Python tests + silent: true + cmds: + - task: test-rust + - task: test-python + + coverage-ben: + desc: Run Rust coverage for the ben crate + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)bin/' + + coverage-pyben: + desc: Run Python-driven Rust coverage for pyben + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo llvm-cov clean --workspace + - cargo llvm-cov -p pyben --no-report + - >- + bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; + export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; + cd "{{.ROOT_DIR}}/pyben"; + uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; + uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' + - >- + {{.LLVM_BIN}}/llvm-profdata merge -sparse + {{.ROOT_DIR}}/target/*.profraw + {{.COV_TARGET_DIR}}/*.profraw + -o /tmp/pyben.profdata + - >- + {{.LLVM_BIN}}/llvm-cov report + {{.COV_TARGET_DIR}}/debug/libpyben_core.so + -instr-profile=/tmp/pyben.profdata + --ignore-filename-regex='/.cargo/registry|/rustc/|^/mnt/.*/ben/src/' + + coverage-pyben-html: + desc: Generate an HTML Rust coverage report for pyben at /tmp/pyben-coverage.html + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo llvm-cov clean --workspace + - cargo llvm-cov -p pyben --no-report + - >- + bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; + export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; + cd "{{.ROOT_DIR}}/pyben"; + uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; + uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' + - >- + {{.LLVM_BIN}}/llvm-profdata merge -sparse + {{.ROOT_DIR}}/target/*.profraw + {{.COV_TARGET_DIR}}/*.profraw + -o /tmp/pyben.profdata + - >- + bash -lc '{{.LLVM_BIN}}/llvm-cov show {{.COV_TARGET_DIR}}/debug/libpyben_core.so + -instr-profile=/tmp/pyben.profdata + --ignore-filename-regex='\"'\"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'\"'\"' + --format=html > /tmp/pyben-coverage.html' + + coverage-summary: + desc: Run ben and pyben coverage and print both reports plus a combined summary table + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + TERM: xterm-256color + CLICOLOR_FORCE: "1" + cmds: + - >- + bash -lc ' + ben_report_file=/tmp/ben-coverage-report.txt; + pyben_report_file=/tmp/pyben-coverage-report.txt; + + cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)bin/'"'"' > "$ben_report_file"; + ben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_report_file")"; + + cargo llvm-cov clean --workspace >/dev/null; + cargo llvm-cov -p pyben --no-report >/dev/null; + eval "$(cargo llvm-cov show-env --sh)"; + export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; + cd "{{.ROOT_DIR}}/pyben"; + uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}" >/dev/null; + uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} >/dev/null; + cd "{{.ROOT_DIR}}"; + {{.LLVM_BIN}}/llvm-profdata merge -sparse target/*.profraw {{.COV_TARGET_DIR}}/*.profraw -o /tmp/pyben.profdata >/dev/null; + {{.LLVM_BIN}}/llvm-cov report {{.COV_TARGET_DIR}}/debug/libpyben_core.so -instr-profile=/tmp/pyben.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$pyben_report_file"; + pyben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$pyben_report_file")"; + + printf "\n%s\n\n" "BEN COVERAGE"; + cat "$ben_report_file"; + printf "\n%s\n\n" "PYBEN COVERAGE"; + cat "$pyben_report_file"; + printf "\n%-10s %-10s\n" "Target" "Lines"; + printf "%-10s %-10s\n" "ben" "${ben_total:-n/a}"; + printf "%-10s %-10s\n" "pyben" "${pyben_total:-n/a}"; + ' + + clean-linux: &clean-unix + desc: Clean build artifacts + internal: true + silent: true + cmds: + - cargo clean + - rm -rf pyben/target pyben/dist pyben/pyben.egg-info pyben/src/pyben.c + - rm -rf pyben/pyben/*abi3.so pyben/pyben/pyben.*.pyd + - rm -rf pyben/.venv + - find . -type d -name "__pycache__" -exec rm -rf {} + + - rm -rf docs/_build docs/user/example_data + + clean-darwin: *clean-unix + + clean-windows: + desc: Clean build artifacts + internal: true + silent: true + cmds: + - cargo clean + - cmd /c "if exist pyben\\target rmdir /s /q pyben\\target" + - cmd /c "if exist pyben\\dist rmdir /s /q pyben\\dist" + - cmd /c "if exist pyben\\pyben.egg-info rmdir /s /q pyben\\pyben.egg-info" + - cmd /c "if exist pyben\\.venv rmdir /s /q pyben\\.venv" + - powershell -NoProfile -Command "Get-ChildItem -Path . -Directory -Filter __pycache__ -Recurse | Remove-Item -Recurse -Force" + - cmd /c "if exist docs\\_build rmdir /s /q docs\\_build" + - cmd /c "if exist docs\\user\\example_data rmdir /s /q docs\\user\\example_data" + + clean: + desc: Clean build artifacts + silent: true + cmds: + - task: clean-{{OS}} From 48bcd73c4070e3c33c8873f27b613f2d106b349b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 12 Mar 2026 18:59:04 -0600 Subject: [PATCH 008/221] add tests and coverage for whole package --- ben/src/codec/encode/ben.rs | 30 ++- ben/src/codec/encode/jsonl.rs | 16 +- ben/src/codec/encode/tests.rs | 2 +- ben/src/codec/translate/tests.rs | 2 +- ben/src/io/writer.rs | 2 +- pyben/pyben/_core.pyi | 27 +++ pyben/src/decode/mod.rs | 241 +++++++++++-------- pyben/tests/test_python_pipelines.py | 347 +++++++++++++++++++++++++++ 8 files changed, 558 insertions(+), 109 deletions(-) diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 0dfe18e..6c7f1ad 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,8 +1,14 @@ use crate::util::rle::assign_to_rle; use serde_json::Value; - -pub(crate) fn encode_ben32_line(data: Value) -> Vec { - let assign_vec = data["assignment"].as_array().unwrap(); +use std::io; + +pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { + let assign_vec = data["assignment"].as_array().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "'assignment' field either missing or is not an array of integers", + ) + })?; let mut prev_assign: u16 = 0; let mut count: u16 = 0; let mut first = true; @@ -10,7 +16,21 @@ pub(crate) fn encode_ben32_line(data: Value) -> Vec { let mut ret = Vec::new(); for assignment in assign_vec { - let assign = assignment.as_u64().unwrap() as u16; + let assign_u64 = assignment.as_u64().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + assignment + ), + ) + })?; + let assign = u16::try_from(assign_u64).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", assign_u64), + ) + })?; if first { prev_assign = assign; count = 1; @@ -33,7 +53,7 @@ pub(crate) fn encode_ben32_line(data: Value) -> Vec { } ret.extend([0, 0, 0, 0]); - ret + Ok(ret) } pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index ced23dc..7432a49 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,7 +1,7 @@ use crate::io::writer::{BenEncoder, XBenEncoder}; use crate::{progress, BenVariant}; use serde_json::Value; -use std::io::{BufRead, Result, Write}; +use std::io::{self, BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; @@ -38,7 +38,12 @@ pub fn encode_jsonl_to_xben( progress!("Encoding line: {}\r", line_num); line_num += 1; let line = line_result?; - let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); + let data: Value = serde_json::from_str(&line).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Error parsing JSON from line: {e}"), + ) + })?; ben_encoder.write_json_value(data)?; } @@ -60,7 +65,12 @@ pub fn encode_jsonl_to_ben( progress!("Encoding line: {}\r", line_num); line_num += 1; let line = line_result?; - let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); + let data: Value = serde_json::from_str(&line).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Error parsing JSON from line: {e}"), + ) + })?; ben_encoder.write_json_value(data)?; } diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 27b214b..cc07a3d 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -473,7 +473,7 @@ fn encode_jsonl_to_ben32(reader: R, mut writer: W) -> std: let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); - writer.write_all(&encode_ben32_line(data))?; + writer.write_all(&encode_ben32_line(data)?)?; } eprintln!("Done!"); Ok(()) diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 5abe3b8..d396bc5 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -14,7 +14,7 @@ fn encode_jsonl_to_ben32(reader: R, mut writer: W) -> std: let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); - writer.write_all(&encode_ben32_line(data))?; + writer.write_all(&encode_ben32_line(data)?)?; } Ok(()) } diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 115ea89..0ac2143 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -155,7 +155,7 @@ impl XBenEncoder { } pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let encoded = encode_ben32_line(data); + let encoded = encode_ben32_line(data)?; match self.variant { BenVariant::Standard => { self.encoder.write_all(&encoded)?; diff --git a/pyben/pyben/_core.pyi b/pyben/pyben/_core.pyi index c68d5b3..809b5e6 100644 --- a/pyben/pyben/_core.pyi +++ b/pyben/pyben/_core.pyi @@ -5,6 +5,10 @@ class PyBenDecoder: """Iterator over assignments in a BEN or XBEN file. Open a decoder over a BEN (`.ben`) or XBEN (`.xben`) file. + Construction is lazy with respect to sample counting: opening the decoder does + not scan the whole file. The first call to :func:`len` or :meth:`count_samples` + will count samples and cache the result. + Parameters ---------- file_path : @@ -25,6 +29,23 @@ class PyBenDecoder: ) -> None: ... def __iter__(self) -> Iterator[list[int]]: ... def __next__(self) -> list[int]: ... + def __len__(self) -> int: + """Return the number of samples. + + Notes + ----- + The first call may require a full scan of the underlying file and can be + expensive for very large BEN/XBEN datasets. The result is cached after + the first successful count. + """ + ... + def count_samples(self) -> int: + """Count and cache the total number of samples in the source file. + + This is equivalent to calling :func:`len`, but is more explicit about + the fact that the first call may perform a full-file scan. + """ + ... def subsample_indices(self, indices: Iterable[int]) -> "PyBenDecoder": """Keep only the given **1-based** sample indices. @@ -47,6 +68,9 @@ class PyBenDecoder: def subsample_range(self, start: int, end: int) -> "PyBenDecoder": """Keep only samples in the inclusive **1-based** range [start, end]. + The base sample count is computed on demand if needed for bounds + validation. + Arguments --------- start : @@ -65,6 +89,9 @@ class PyBenDecoder: """Keep every `step`-th sample starting at **1-based** `offset`. Returns the same decoder (fluent API). + The base sample count is computed on demand if needed for bounds + validation. + Arguments --------- step : diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index 43bd4d8..bebd434 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -12,15 +12,45 @@ use std::path::PathBuf; type DynIter = Box> + Send>; +#[derive(Clone)] +enum DecoderMode { + Ben, + XBen, +} + +impl DecoderMode { + fn parse(mode: &str) -> PyResult { + match mode { + "ben" => Ok(Self::Ben), + "xben" => Ok(Self::XBen), + _ => Err(PyException::new_err( + "Unknown mode. Supported modes are 'ben' and 'xben'.", + )), + } + } + + fn as_str(&self) -> &'static str { + match self { + Self::Ben => "ben", + Self::XBen => "xben", + } + } +} + +#[derive(Clone)] +struct DecoderSource { + path: PathBuf, + mode: DecoderMode, +} + #[pyclass(module = "pyben", unsendable)] pub struct PyBenDecoder { + source: DecoderSource, iter: DynIter, current_assignment: Option>, remaining_count: u16, - src_path: PathBuf, - mode: String, - base_len: usize, - len_hint: usize, + base_len: Option, + len_hint: Option, } #[pymethods] @@ -29,58 +59,20 @@ impl PyBenDecoder { #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { - let reader = open_input(&file_path)?; - let iter: DynIter = match mode { - "ben" => { - let ben = BenDecoder::new(reader).map_err(|e| { - PyException::new_err(format!("Failed to create BenDecoder: {e}")) - })?; - Box::new(ben) - } - "xben" => { - let warnings = py.import("warnings")?; - let kwargs = PyDict::new(py); - // kwargs.set_item("stacklevel", 2)?; - - warnings.call_method( - "warn", - ( - "XBEN may take a second to start decoding.", - py.get_type::(), - ), - Some(&kwargs), - )?; - - let xben = XBenDecoder::new(reader).map_err(|e| { - PyException::new_err(format!("Failed to create XBenDecoder: {e}")) - })?; - Box::new(xben) - } - _ => { - return Err(PyException::new_err( - "Unknown mode. Supported modes are 'ben' and 'xben'.", - )); - } + let mode = DecoderMode::parse(mode)?; + let source = DecoderSource { + path: file_path, + mode, }; - - // Detach to get around the GIL - let base_len = py - .detach(|| count_samples_from_file(&file_path, mode)) - .map_err(|e| { - PyException::new_err(format!( - "Failed to count samples in {}: {e}", - file_path.display() - )) - })?; + let iter = build_iter(py, &source)?; Ok(Self { + source, iter, current_assignment: None, remaining_count: 0, - src_path: file_path, - mode: mode.to_string(), - base_len: base_len, - len_hint: base_len, + base_len: None, + len_hint: None, }) } @@ -113,8 +105,21 @@ impl PyBenDecoder { } // Because we want progress bars!!! - fn __len__(slf: PyRef) -> usize { - slf.len_hint + fn __len__(mut slf: PyRefMut, py: Python<'_>) -> PyResult { + if let Some(len_hint) = slf.len_hint { + return Ok(len_hint); + } + + let base_len = ensure_base_len(&mut slf, py)?; + slf.len_hint = Some(base_len); + Ok(base_len) + } + + #[pyo3(text_signature = "(self)")] + fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { + let base_len = ensure_base_len(&mut slf, py)?; + slf.len_hint = Some(base_len); + Ok(base_len) } #[pyo3(text_signature = "(self, indices, /)")] @@ -148,31 +153,20 @@ impl PyBenDecoder { if indices.is_empty() { return Err(PyException::new_err("indices must not be empty")); } + let base_len = ensure_base_len(&mut slf, py)?; if indices[0] <= 0 { return Err(PyException::new_err("indices must be 1-based")); } - if indices.last().unwrap() > &slf.base_len { + if indices.last().unwrap() > &base_len { return Err(PyException::new_err(format!( "indices must be <= number of samples in base data ({})", - slf.base_len + base_len ))); } - slf.len_hint = indices.len(); + let len_hint = indices.len(); let sel = Selection::Indices(indices.into_iter().peekable()); - - let frames = build_frame_iter(&slf.src_path, &slf.mode).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - slf.src_path.display() - )) - })?; - - let frame_decoder = SubsampleFrameDecoder::new(frames, sel); - - slf.iter = Box::new(frame_decoder); - slf.current_assignment = None; - slf.remaining_count = 0; + reset_with_selection(&mut slf, sel, len_hint)?; Ok(slf.into()) } @@ -181,34 +175,24 @@ impl PyBenDecoder { mut slf: PyRefMut<'py, Self>, start: usize, end: usize, + py: Python<'_>, ) -> PyResult> { if start == 0 || end < start { return Err(PyException::new_err( "range must be 1-based and end >= start", )); } - if end > slf.base_len { + let base_len = ensure_base_len(&mut slf, py)?; + if end > base_len { return Err(PyException::new_err(format!( "end must be <= number of samples in base data ({})", - slf.base_len + base_len ))); } let sel = Selection::Range { start, end }; - slf.len_hint = end - start + 1; - - let frames = build_frame_iter(&slf.src_path, &slf.mode).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - slf.src_path.display() - )) - })?; - - let frame_decoder = SubsampleFrameDecoder::new(frames, sel); - - slf.iter = Box::new(frame_decoder); - slf.current_assignment = None; - slf.remaining_count = 0; + let len_hint = end - start + 1; + reset_with_selection(&mut slf, sel, len_hint)?; Ok(slf.into()) } @@ -217,36 +201,97 @@ impl PyBenDecoder { mut slf: PyRefMut<'py, Self>, step: usize, offset: usize, + py: Python<'_>, ) -> PyResult> { if step == 0 || offset == 0 { return Err(PyException::new_err("step and offset must be >= 1")); } - if offset > slf.base_len { + let base_len = ensure_base_len(&mut slf, py)?; + if offset > base_len { return Err(PyException::new_err(format!( "offset must be <= number of samples in base data ({})", - slf.base_len + base_len ))); } let sel = Selection::Every { step, offset }; + let len_hint = (base_len + step - 1 - (offset - 1)) / step; + reset_with_selection(&mut slf, sel, len_hint)?; + Ok(slf.into()) + } +} - slf.len_hint = (slf.base_len + step - 1 - (offset - 1)) / step; +fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { + let warnings = py.import("warnings")?; + let kwargs = PyDict::new(py); - let frames = build_frame_iter(&slf.src_path, &slf.mode).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - slf.src_path.display() - )) - })?; + warnings.call_method( + "warn", + ( + "XBEN may take a second to start decoding.", + py.get_type::(), + ), + Some(&kwargs), + )?; - let frame_decoder = SubsampleFrameDecoder::new(frames, sel); + Ok(()) +} - slf.iter = Box::new(frame_decoder); - slf.current_assignment = None; - slf.remaining_count = 0; - Ok(slf.into()) +fn build_iter(py: Python<'_>, source: &DecoderSource) -> PyResult { + let reader = open_input(&source.path)?; + match source.mode { + DecoderMode::Ben => { + let ben = BenDecoder::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; + Ok(Box::new(ben)) + } + DecoderMode::XBen => { + warn_xben_startup(py)?; + let xben = XBenDecoder::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; + Ok(Box::new(xben)) + } } } +fn build_frames(source: &DecoderSource) -> PyResult { + build_frame_iter(&source.path, source.mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + source.path.display() + )) + }) +} + +fn reset_with_selection( + decoder: &mut PyBenDecoder, + selection: Selection, + len_hint: usize, +) -> PyResult<()> { + let frames = build_frames(&decoder.source)?; + let frame_decoder = SubsampleFrameDecoder::new(frames, selection); + decoder.iter = Box::new(frame_decoder); + decoder.current_assignment = None; + decoder.remaining_count = 0; + decoder.len_hint = Some(len_hint); + Ok(()) +} + +fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { + if let Some(base_len) = decoder.base_len { + return Ok(base_len); + } + + let path = decoder.source.path.clone(); + let mode = decoder.source.mode.as_str().to_string(); + let base_len = py + .detach(|| count_samples_from_file(&path, &mode)) + .map_err(|e| { + PyException::new_err(format!("Failed to count samples in {}: {e}", path.display())) + })?; + decoder.base_len = Some(base_len); + Ok(base_len) +} + #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 0d7e3f4..83859d0 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -5,6 +5,7 @@ import pytest +import pyben from pyben import ( PyBenDecoder, PyBenEncoder, @@ -376,3 +377,349 @@ def test_compress_helpers_reject_unknown_variants(tmp_path: Path) -> None: with pytest.raises(ValueError, match="Unknown variant"): compress_jsonl_to_xben(src, tmp_path / "out.xben", overwrite=True, variant="weird") + + +def test_module_exports_are_exposed() -> None: + expected = { + "PyBenDecoder", + "PyBenEncoder", + "compress_jsonl_to_ben", + "compress_ben_to_xben", + "compress_jsonl_to_xben", + "decompress_ben_to_jsonl", + "decompress_xben_to_jsonl", + "decompress_xben_to_ben", + } + assert expected.issubset(set(pyben.__all__)) + for name in expected: + assert hasattr(pyben, name) + assert hasattr(pyben, "_core") + + +def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: + samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3]] + + default_ben = tmp_path / "default.ben" + with PyBenEncoder(default_ben, overwrite=True) as enc: + for sample in samples: + enc.write(sample) + assert list(PyBenDecoder(default_ben, mode="ben")) == samples + + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + alias_ben = tmp_path / "alias.ben" + alias_xben = tmp_path / "alias.xben" + compress_jsonl_to_ben(src, alias_ben, overwrite=True, variant="markov") + compress_jsonl_to_xben( + src, + alias_xben, + overwrite=True, + variant="markov", + n_threads=1, + compression_level=1, + ) + assert list(PyBenDecoder(alias_ben, mode="ben")) == samples + assert list(PyBenDecoder(alias_xben, mode="xben")) == samples + + +def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: + out = tmp_path / "out.ben" + enc = PyBenEncoder(out, overwrite=True, variant="standard") + enc.write([1, 2, 3]) + enc.close() + enc.close() + with pytest.raises(OSError, match="already been closed"): + enc.write([1, 2, 3]) + + ctx_path = tmp_path / "ctx.ben" + with PyBenEncoder(ctx_path, overwrite=True, variant="standard") as ctx_enc: + ctx_enc.write([4, 5, 6]) + assert list(PyBenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] + + +def test_pybenencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> None: + out = tmp_path / "out.ben" + out.write_bytes(b"existing") + + with pytest.raises(ValueError, match="Unknown variant"): + PyBenEncoder(tmp_path / "bad.ben", overwrite=False, variant="weird") + + with pytest.raises(OSError, match="already exists"): + PyBenEncoder(out, overwrite=False, variant="standard") + + with pytest.raises(OSError, match="Failed to create"): + PyBenEncoder( + tmp_path / "missing-dir" / "out.ben", + overwrite=False, + variant="standard", + ) + + +def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: Path) -> None: + src = tmp_path / "src.jsonl" + write_jsonl([[1, 1, 2]], src) + + with pytest.raises(OSError, match="must differ"): + compress_jsonl_to_ben(src, src, overwrite=True, variant="standard") + + with pytest.raises(OSError, match="does not exist"): + compress_jsonl_to_ben( + tmp_path / "missing.jsonl", + tmp_path / "out.ben", + overwrite=True, + variant="standard", + ) + + bad_json = tmp_path / "bad.jsonl" + bad_json.write_text("not json\n", encoding="utf-8") + with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): + compress_jsonl_to_ben( + bad_json, + tmp_path / "bad.ben", + overwrite=True, + variant="standard", + ) + + bad_assign = tmp_path / "bad_assign.jsonl" + bad_assign.write_text('{"assignment":"bad","sample":1}\n', encoding="utf-8") + with pytest.raises(OSError, match="Failed to convert JSONL to XBEN"): + compress_jsonl_to_xben( + bad_assign, + tmp_path / "bad.xben", + overwrite=True, + variant="standard", + n_threads=1, + compression_level=1, + ) + + with pytest.raises(OSError, match="Failed to create"): + compress_jsonl_to_ben( + src, + tmp_path / "missing-dir" / "out.ben", + overwrite=True, + variant="standard", + ) + + +def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and_existing_output( + tmp_path: Path, +) -> None: + with pytest.raises(OSError, match="does not exist"): + compress_ben_to_xben( + tmp_path / "missing.ben", + tmp_path / "out.xben", + overwrite=True, + n_threads=1, + compression_level=1, + ) + + bad_ben = tmp_path / "bad.ben" + bad_ben.write_bytes(b"garbage") + + with pytest.raises(OSError, match="must differ"): + compress_ben_to_xben( + bad_ben, + bad_ben, + overwrite=True, + n_threads=1, + compression_level=1, + ) + + with pytest.raises(OSError, match="Failed to convert BEN to XBEN"): + compress_ben_to_xben( + bad_ben, + tmp_path / "out.xben", + overwrite=True, + n_threads=1, + compression_level=1, + ) + + src = tmp_path / "src.jsonl" + write_jsonl([[1, 2, 3]], src) + ben = tmp_path / "good.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + out = tmp_path / "exists.xben" + out.write_bytes(b"exists") + with pytest.raises(OSError, match="already exists"): + compress_ben_to_xben( + ben, + out, + overwrite=False, + n_threads=1, + compression_level=1, + ) + + +def test_decoder_constructor_and_mode_errors(tmp_path: Path) -> None: + with pytest.raises(Exception, match="Unknown mode"): + PyBenDecoder(tmp_path / "missing.ben", mode="weird") + + with pytest.raises(OSError, match="Failed to open"): + PyBenDecoder(tmp_path / "missing.ben", mode="ben") + + bad_ben = tmp_path / "bad.ben" + bad_ben.write_bytes(b"garbage") + with pytest.raises(Exception, match="Failed to create BenDecoder"): + PyBenDecoder(bad_ben, mode="ben") + + bad_xben = tmp_path / "bad.xben" + bad_xben.write_bytes(b"garbage") + with pytest.warns(UserWarning, match="XBEN may take a second"): + with pytest.raises(Exception, match="Failed to create XBenDecoder"): + PyBenDecoder(bad_xben, mode="xben") + + +def test_decoder_len_and_count_samples_are_lazy_and_cached(tmp_path: Path) -> None: + samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3], [4]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + + dec = PyBenDecoder(ben, mode="ben") + assert len(dec) == len(samples) + assert dec.count_samples() == len(samples) + assert list(dec) == samples + + gone = PyBenDecoder(ben, mode="ben") + assert len(gone) == len(samples) + ben.unlink() + with pytest.raises(Exception, match="Failed to create frame iterator"): + gone.subsample_range(1, 2) + + +def test_decoder_xben_len_count_and_warning(tmp_path: Path) -> None: + samples = [[1, 1], [1, 1], [2, 2], [3, 3]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + xben = tmp_path / "out.xben" + compress_jsonl_to_xben( + src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 + ) + + with pytest.warns(UserWarning, match="XBEN may take a second"): + dec = PyBenDecoder(xben, mode="xben") + assert len(dec) == len(samples) + assert dec.count_samples() == len(samples) + assert list(dec) == samples + + +def test_decoder_subsample_validations_and_warning_paths(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + with pytest.warns(UserWarning, match="sorted and unique"): + got = list(PyBenDecoder(ben, mode="ben").subsample_indices([5, 1, 1, 3])) + assert got == [samples[0], samples[2], samples[4]] + + with pytest.raises(Exception, match="indices must be 1-based"): + PyBenDecoder(ben, mode="ben").subsample_indices([0, 1]) + + with pytest.raises(Exception, match="indices must be <="): + PyBenDecoder(ben, mode="ben").subsample_indices([6]) + + with pytest.raises(Exception, match="range must be 1-based"): + PyBenDecoder(ben, mode="ben").subsample_range(0, 2) + + with pytest.raises(Exception, match="end must be <="): + PyBenDecoder(ben, mode="ben").subsample_range(1, 99) + + with pytest.raises(Exception, match="step and offset must be >= 1"): + PyBenDecoder(ben, mode="ben").subsample_every(0, 1) + + with pytest.raises(Exception, match="offset must be <="): + PyBenDecoder(ben, mode="ben").subsample_every(2, 99) + + assert list(PyBenDecoder(ben, mode="ben").subsample_range(2, 4)) == samples[1:4] + assert list(PyBenDecoder(ben, mode="ben").subsample_every(2, 2)) == samples[1::2] + + +def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears(tmp_path: Path) -> None: + src = tmp_path / "src.jsonl" + write_jsonl([[1], [2], [3]], src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + dec = PyBenDecoder(ben, mode="ben") + ben.unlink() + + with pytest.raises(Exception, match="Failed to count samples"): + dec.count_samples() + + +def test_decoder_reports_zero_count_and_bad_frame_errors(tmp_path: Path) -> None: + src = tmp_path / "src.jsonl" + write_jsonl([[1, 1, 2]], src) + + mkv_ben = tmp_path / "mkv.ben" + compress_jsonl_to_ben(src, mkv_ben, overwrite=True, variant="mkv_chain") + data = bytearray(mkv_ben.read_bytes()) + data[-2:] = b"\x00\x00" + mkv_ben.write_bytes(data) + with pytest.raises(Exception, match="zero-count"): + next(iter(PyBenDecoder(mkv_ben, mode="ben"))) + + standard_ben = tmp_path / "standard.ben" + compress_jsonl_to_ben(src, standard_ben, overwrite=True, variant="standard") + truncated = standard_ben.read_bytes()[:-1] + bad_ben = tmp_path / "truncated.ben" + bad_ben.write_bytes(truncated) + dec = PyBenDecoder(bad_ben, mode="ben") + with pytest.raises(Exception, match="Error decoding next item"): + next(iter(dec)) + + +def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_invalid_headers( + tmp_path: Path, +) -> None: + with pytest.raises(OSError, match="does not exist"): + decompress_ben_to_jsonl( + tmp_path / "missing.ben", + tmp_path / "out.jsonl", + overwrite=True, + ) + + bad_ben = tmp_path / "bad.ben" + bad_ben.write_bytes(b"garbage") + with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): + decompress_ben_to_jsonl( + bad_ben, + tmp_path / "out.jsonl", + overwrite=True, + ) + + bad_xben = tmp_path / "bad.xben" + bad_xben.write_bytes(b"garbage") + with pytest.raises(OSError, match="Failed to convert XBEN to BEN"): + decompress_xben_to_ben( + bad_xben, + tmp_path / "out.ben", + overwrite=True, + ) + + with pytest.raises(OSError, match="must differ"): + decompress_xben_to_jsonl( + bad_xben, + bad_xben, + overwrite=True, + ) + + src = tmp_path / "src.jsonl" + write_jsonl([[1, 2, 3]], src) + ben = tmp_path / "good.ben" + xben = tmp_path / "good.xben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + compress_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + + out = tmp_path / "exists.jsonl" + out.write_text("exists\n", encoding="utf-8") + with pytest.raises(OSError, match="already exists"): + decompress_ben_to_jsonl(ben, out, overwrite=False) From aaf413e01b65acc672894e32c5b8c9ceef07aac4 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 13 Mar 2026 08:49:36 -0600 Subject: [PATCH 009/221] rename internals so imports are use/import binary_ensemble --- Cargo.lock | 4 +- Cargo.toml | 3 + ben/Cargo.toml | 4 +- ben/src/bin/ben.rs | 2 +- ben/src/bin/pben.rs | 2 +- ben/src/bin/reben.rs | 2 +- ben/tests/test_cli.rs | 6 +- ben/tests/test_impls_pipeline.rs | 20 +- ben/tests/test_pipeline.rs | 8 +- docs/bendl-format-spec.md | 355 +++++++++++++++++++ docs/bendl-implementation-plan.md | 261 ++++++++++++++ docs/bendl-roadmap.md | 175 +++++++++ pyben/Cargo.toml | 2 +- pyben/{pyben => binary_ensemble}/__init__.py | 0 pyben/{pyben => binary_ensemble}/_core.pyi | 2 +- pyben/{pyben => binary_ensemble}/py.typed | 0 pyben/docs/conf.py | 2 +- pyben/pyproject.toml | 5 +- pyben/src/common.rs | 2 +- pyben/src/decode/mod.rs | 8 +- pyben/src/encode/mod.rs | 4 +- pyben/tests/test_python_pipelines.py | 10 +- pyben/uv.lock | 1 - 23 files changed, 835 insertions(+), 43 deletions(-) create mode 100644 docs/bendl-format-spec.md create mode 100644 docs/bendl-implementation-plan.md create mode 100644 docs/bendl-roadmap.md rename pyben/{pyben => binary_ensemble}/__init__.py (100%) rename pyben/{pyben => binary_ensemble}/_core.pyi (99%) rename pyben/{pyben => binary_ensemble}/py.typed (100%) diff --git a/Cargo.lock b/Cargo.lock index 45d7045..eaad981 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,7 +89,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "binary-ensemble" -version = "0.3.0" +version = "1.0.0" dependencies = [ "byteorder", "clap 4.5.48", @@ -521,7 +521,7 @@ dependencies = [ [[package]] name = "pyben" -version = "0.3.0" +version = "1.0.0" dependencies = [ "binary-ensemble", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index eb066ef..db34469 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,6 @@ exclude = [ "tests/*", "TODO.md", ] + +[workspace.package] +version = "1.0.0" diff --git a/ben/Cargo.toml b/ben/Cargo.toml index 996b7a3..d04cb53 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "binary-ensemble" -version = "0.3.0" +version.workspace = true edition = "2021" authors = ["Peter Rock "] exclude = ["example/"] @@ -12,7 +12,7 @@ repository = "https://github.com/peterrrock2/binary-ensemble" description = "A CLI tool for working with and compressing ensembles of districting plans" [lib] -name = "ben" +name = "binary_ensemble" [dependencies] byteorder = "1.5.0" diff --git a/ben/src/bin/ben.rs b/ben/src/bin/ben.rs index e78bcf0..b83d05f 100755 --- a/ben/src/bin/ben.rs +++ b/ben/src/bin/ben.rs @@ -1,3 +1,3 @@ fn main() { - ben::cli::ben::run(); + binary_ensemble::cli::ben::run(); } diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs index bee6e88..34c29ec 100755 --- a/ben/src/bin/pben.rs +++ b/ben/src/bin/pben.rs @@ -1,3 +1,3 @@ fn main() -> std::io::Result<()> { - ben::cli::pben::run() + binary_ensemble::cli::pben::run() } diff --git a/ben/src/bin/reben.rs b/ben/src/bin/reben.rs index 8a935d3..cedb1a2 100755 --- a/ben/src/bin/reben.rs +++ b/ben/src/bin/reben.rs @@ -1,3 +1,3 @@ fn main() { - ben::cli::reben::run(); + binary_ensemble::cli::reben::run(); } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 08c7d06..64d7c46 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1,6 +1,6 @@ -use ben::codec::decode::decode_ben_to_jsonl; -use ben::codec::encode::encode_jsonl_to_ben; -use ben::BenVariant; +use binary_ensemble::codec::decode::decode_ben_to_jsonl; +use binary_ensemble::codec::encode::encode_jsonl_to_ben; +use binary_ensemble::BenVariant; use serde_json::Value; use std::fs; use std::io::BufReader; diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 8d77ac9..08d79db 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1,18 +1,18 @@ #![allow(clippy::needless_collect)] -use ben::codec::decode::{ +use binary_ensemble::codec::decode::{ decode_ben_line, decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, }; -use ben::codec::encode::{ +use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_ben_vec_from_rle, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; -use ben::io::reader::{ +use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, DecoderInitError, Frame, SubsampleFrameDecoder, XBenDecoder, }; -use ben::io::writer::BenEncoder; -use ben::BenVariant; +use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::BenVariant; use proptest::prelude::*; use serde_json::json; @@ -73,9 +73,9 @@ where Ok(out) } -fn collect_frames(it: I) -> std::io::Result> +fn collect_frames(it: I) -> std::io::Result> where - I: IntoIterator>, + I: IntoIterator>, { let mut out = Vec::new(); for rec in it { @@ -778,11 +778,11 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { ) .unwrap(); - let mut frames = ben::io::reader::XBenFrameDecoder::new(xz.as_slice()).unwrap(); + let mut frames = binary_ensemble::io::reader::XBenFrameDecoder::new(xz.as_slice()).unwrap(); assert!(frames.next().unwrap().is_ok()); let trimmed = &xz[..xz.len() - 1]; - let mut frames = ben::io::reader::XBenFrameDecoder::new(trimmed).unwrap(); + let mut frames = binary_ensemble::io::reader::XBenFrameDecoder::new(trimmed).unwrap(); loop { match frames.next() { Some(Err(e)) => { @@ -814,7 +814,7 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { .encoder() .unwrap(); let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); - let mut xben = ben::io::writer::XBenEncoder::new(encoder, BenVariant::Standard); + let mut xben = binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard); xben.write_ben_file(BufReader::new(payload_only.as_slice())).unwrap(); } diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 128776d..1b8be71 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -1,7 +1,7 @@ -use ben::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; -use ben::codec::encode::{encode_jsonl_to_ben, encode_jsonl_to_xben}; -use ben::util::rle::rle_to_vec; -use ben::BenVariant; +use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; +use binary_ensemble::codec::encode::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +use binary_ensemble::util::rle::rle_to_vec; +use binary_ensemble::BenVariant; use serde_json::json; use std::io::{Cursor, Read, Write}; diff --git a/docs/bendl-format-spec.md b/docs/bendl-format-spec.md new file mode 100644 index 0000000..b02643a --- /dev/null +++ b/docs/bendl-format-spec.md @@ -0,0 +1,355 @@ +# BENDL Format Specification Draft + +## Status + +Draft design for a future `.bendl` file format. + +This document defines a concrete binary layout for a single-file dataset container that: + +- feels like one file to users +- keeps metadata and optional assets accessible near the front +- stores the assignment stream at the end +- supports interrupted writes +- can be finalized by patching the header + +This specification is intentionally separate from the existing `.ben` and `.xben` formats. + +## Design Goals + +- Single-file dataset container. +- Efficient access to front-loaded metadata. +- Stream-friendly assignment payloads. +- Recoverable partial files after interruption. +- Forward-compatible directory structure. +- Fast `sample_count` lookup for finalized bundles. + +## Terminology + +- `bundle`: a `.bendl` file. +- `asset`: a named front-loaded object such as a graph or relabel map. +- `assignment stream`: the trailing BEN or XBEN payload. +- `finalized bundle`: a bundle whose header has been patched to indicate successful completion. +- `incomplete bundle`: a bundle whose assignment stream may still be usable, but whose final size/count information is not authoritative. + +## File Layout + +A `.bendl` file is laid out as: + +```text +[Fixed Header] +[Directory Table] +[Asset Payloads] +[Assignment Stream] +``` + +The assignment stream is always the final data region in the file. + +## Byte Order + +All fixed-width integers are encoded as little-endian unless otherwise stated. + +## Fixed Header + +The file begins with a fixed-size 64-byte header: + +```text +offset size field +0 8 magic +8 2 major_version +10 2 minor_version +12 2 flags +14 1 complete +15 1 assignment_format +16 8 directory_offset +24 8 directory_len +32 8 stream_offset +40 8 stream_len +48 8 sample_count +56 8 reserved +``` + +### Header Fields + +- `magic` + - fixed bytes identifying the file as BENDL + - proposed value: `b"BENDL\\0\\0\\1"` +- `major_version` + - initial value: `1` +- `minor_version` + - initial value: `0` +- `flags` + - bundle-level feature flags +- `complete` + - `0` means incomplete/unfinalized + - `1` means finalized +- `assignment_format` + - `1 = BEN` + - `2 = XBEN` +- `directory_offset` + - byte offset of the directory table +- `directory_len` + - byte length of the directory table +- `stream_offset` + - byte offset where the assignment stream begins +- `stream_len` + - length in bytes of the assignment stream + - `0` if unknown/unfinalized +- `sample_count` + - number of expanded samples in the assignment stream + - `u64::MAX` if unknown/unfinalized +- `reserved` + - reserved for future extension + +## Header Flags + +Initial proposed header flags: + +- bit 0: directory contains checksums +- bit 1: bundle contains graph asset +- bit 2: bundle contains relabel map asset +- bit 3: bundle contains metadata asset + +Unrecognized flags must be ignored by readers unless a future version marks them as mandatory. + +## Directory Table + +The directory table is a compact binary table describing front-loaded assets. + +Layout: + +```text +offset size field +0 4 entry_count +4 ... repeated directory entries +``` + +Each directory entry has the following header: + +```text +offset size field +0 2 asset_type +2 2 asset_flags +4 2 name_len +6 2 reserved +8 8 payload_offset +16 8 payload_len +24 4 checksum_len +28 ... name bytes +... ... checksum bytes +``` + +### Directory Entry Fields + +- `asset_type` + - identifies the meaning of the payload +- `asset_flags` + - encoding/compression flags for that asset +- `name_len` + - UTF-8 byte length of the asset name +- `payload_offset` + - absolute file offset of the asset payload +- `payload_len` + - byte length of the asset payload +- `checksum_len` + - byte length of optional checksum bytes that follow the name +- `name bytes` + - UTF-8 asset name +- `checksum bytes` + - optional checksum payload, interpretation depends on flags + +### Asset Types + +Initial proposed asset types: + +- `1 = metadata.json` +- `2 = graph.json` +- `3 = relabel_map.json` +- `4 = custom user asset` + +### Asset Flags + +Initial proposed asset flags: + +- bit 0: payload is UTF-8 JSON +- bit 1: payload is zstd-compressed +- bit 2: checksum present + +Readers must skip unknown asset types and unknown flags when possible. + +## Asset Payload Region + +Assets are written after the directory table and before the assignment stream. + +Each asset payload is raw bytes referenced by the directory table. The bundle does not require per-asset wrapper headers in the payload region because offsets and lengths are already described by the directory entries. + +Examples of front-loaded assets: + +- graph file +- relabel map +- extra metadata JSON +- provenance/configuration info + +## Assignment Stream Region + +The assignment stream starts at `stream_offset` and occupies `stream_len` bytes if the bundle is finalized. + +The stream payload must be one of: + +- BEN byte stream +- XBEN byte stream + +The bundle does not reinterpret BEN/XBEN internals. It only stores the opaque assignment stream and records its format in `assignment_format`. + +### Incomplete Bundles + +If `complete == 0`: + +- `stream_len` may be `0` +- `sample_count` may be `u64::MAX` +- readers should treat assignment data as extending from `stream_offset` to EOF + +This allows partially written bundles to remain recoverable. + +## Finalization Rules + +Writers are expected to use this sequence: + +1. Write a provisional header with: + - `complete = 0` + - `stream_len = 0` + - `sample_count = u64::MAX` +2. Write the directory table. +3. Write all front-loaded assets. +4. Record `stream_offset`. +5. Write the assignment stream. +6. On successful completion: + - compute final `stream_len` + - compute final `sample_count` + - seek back to patch the header + - set `complete = 1` + +If writing is interrupted before step 6, the file remains an incomplete bundle. + +## Reader Rules + +Readers must: + +1. Validate `magic` and supported version. +2. Read the fixed header. +3. Read the directory table. +4. Make front-loaded assets available immediately. +5. Interpret the assignment stream according to `assignment_format`. +6. If `complete == 0`, treat the stream as running from `stream_offset` to EOF. + +Readers should expose: + +- whether the bundle is finalized +- whether `sample_count` is authoritative +- whether the assignment stream is still readable + +## Recovery Semantics + +If a bundle write is interrupted: + +- header and front-loaded assets should still be usable if fully written +- assignment data should be readable from `stream_offset` to EOF +- `sample_count` should be treated as unknown +- the bundle should be marked incomplete + +If the interruption happens before the directory or assets are fully written, the bundle may be unreadable. Writers should therefore prefer writing small front-loaded metadata first and beginning the assignment stream only after the directory is complete. + +## Metadata Conventions + +Although the directory is binary, metadata payloads should initially use JSON for ease of debugging. + +Recommended metadata file names: + +- `metadata.json` +- `graph.json` +- `relabel_map.json` + +Recommended metadata fields: + +```json +{ + "bundle_version": 1, + "assignments_format": "xben", + "variant": "mkv_chain", + "complete": false +} +``` + +## Versioning Strategy + +- incompatible structural changes require `major_version` bump +- additive backward-compatible fields may use `minor_version` bump +- unknown asset types should be ignored when possible + +## Suggested Rust Types + +Conceptual Rust representations: + +```rust +pub struct BendlHeader { + pub magic: [u8; 8], + pub major_version: u16, + pub minor_version: u16, + pub flags: u16, + pub complete: u8, + pub assignment_format: u8, + pub directory_offset: u64, + pub directory_len: u64, + pub stream_offset: u64, + pub stream_len: u64, + pub sample_count: u64, + pub reserved: u64, +} + +pub struct BendlDirectoryEntry { + pub asset_type: u16, + pub asset_flags: u16, + pub name: String, + pub payload_offset: u64, + pub payload_len: u64, + pub checksum: Option>, +} +``` + +## Suggested Module Layout + +If implemented in `ben`, the new code should likely live under: + +```text +ben/src/bundle/ + mod.rs + format.rs + reader.rs + writer.rs + manifest.rs +``` + +Responsibilities: + +- `format.rs`: binary header/directory definitions +- `reader.rs`: bundle reader +- `writer.rs`: bundle writer/finalizer +- `manifest.rs`: JSON metadata structs + +## Out of Scope for V1 + +- non-seekable `.bendl` writing +- embedding assignment count inside BEN/XBEN themselves +- random-write mutation of existing bundles +- archive-level compression beyond the assignment stream format + +## Current Recommendation + +Implement `.bendl` V1 as: + +- a seekable file container +- a fixed header plus binary directory +- front-loaded optional assets +- trailing BEN/XBEN assignment stream +- header patched on successful finalize + +This keeps the format simple, recoverable, and aligned with the current streaming requirements. diff --git a/docs/bendl-implementation-plan.md b/docs/bendl-implementation-plan.md new file mode 100644 index 0000000..5a95a4e --- /dev/null +++ b/docs/bendl-implementation-plan.md @@ -0,0 +1,261 @@ +# BENDL Implementation Plan + +## Goal + +Turn the `.bendl` roadmap and format specification into an implementation sequence that is low-risk and easy to validate incrementally. + +This plan assumes: + +- `.ben` and `.xben` remain unchanged +- `.bendl` is a new seekable container format +- the assignment stream is stored at the end of the file +- header fields are patched on successful finalization + +## Guiding Strategy + +Build `.bendl` in layers: + +1. binary format types +2. read-only support +3. write/finalize support +4. CLI integration +5. Python integration + +This keeps the early steps small and testable. + +## Phase 1: Core Format Types + +Add a new top-level module: + +```text +ben/src/bundle/ + mod.rs + format.rs + manifest.rs +``` + +### Tasks + +- Define `BendlHeader`. +- Define constants for: + - magic bytes + - version numbers + - assignment format identifiers + - asset types + - asset flags +- Define `BendlDirectoryEntry`. +- Implement binary encode/decode helpers for: + - header read/write + - directory entry read/write +- Add manifest-side serde structs for JSON metadata assets. + +### Deliverable + +Pure format layer with no I/O orchestration yet. + +### Tests + +- header round-trip tests +- directory entry round-trip tests +- invalid magic/version tests +- asset flag parsing tests + +## Phase 2: Read-Only Bundle Support + +Add: + +```text +ben/src/bundle/reader.rs +``` + +### Tasks + +- Implement `BendlReader`. +- Validate and parse the fixed header. +- Read and decode the directory table. +- Expose accessors for: + - `is_complete()` + - `sample_count() -> Option` + - `assignment_format()` + - `assets()` +- Implement helpers to: + - open asset payloads by name/type + - open the assignment stream region +- For incomplete bundles: + - treat assignment stream as `stream_offset..EOF` + +### Deliverable + +A read-only API that can inspect bundle metadata and expose the embedded assignment stream. + +### Tests + +- parse finalized bundle fixture +- parse incomplete bundle fixture +- recover front-loaded assets when `complete == 0` +- ignore unknown asset types cleanly + +## Phase 3: Bundle Writer + +Add: + +```text +ben/src/bundle/writer.rs +``` + +### Tasks + +- Implement `BendlWriter`. +- Write provisional header. +- Write directory table. +- Write front-loaded assets. +- Track `stream_offset`. +- Stream BEN or XBEN payload at the end. +- Count samples while writing. +- On `finish()`: + - compute `stream_len` + - patch header + - set `complete = 1` + +### Important Constraints + +- Writing should require `Seek`. +- `finish()` should be explicit. +- `Drop` should not silently attempt complex repair/finalization. + +### Deliverable + +A bundle writer that can produce finalized `.bendl` files and leave partially usable files behind if interrupted. + +### Tests + +- finalized bundle writes correct header fields +- incomplete writer leaves `complete = 0` +- assets remain readable after partial write +- correct `sample_count` patching + +## Phase 4: Assignment Stream Integration + +Connect bundle writing to the existing BEN/XBEN infrastructure. + +### Tasks + +- Allow writer to store: + - BEN assignment stream + - XBEN assignment stream +- Reuse existing encoders rather than reimplementing stream encoding. +- Add helper APIs such as: + - `write_ben_stream(...)` + - `write_xben_stream(...)` + - `open_assignment_reader(...)` + +### Deliverable + +The bundle layer becomes a thin container around the current assignment formats. + +### Tests + +- bundle with BEN payload decodes correctly +- bundle with XBEN payload decodes correctly +- incomplete XBEN stream remains partially readable when possible + +## Phase 5: CLI Support + +Add CLI commands after the core library is stable. + +Potential command surface: + +```text +ben bundle create +ben bundle inspect +ben bundle extract +``` + +### Tasks + +- create `.bendl` from assignment stream + optional assets +- inspect header and asset list +- extract embedded assets or assignment payload +- report completeness/finalization state + +### Deliverable + +User-facing bundle workflow in the Rust CLI. + +### Tests + +- integration tests for create/inspect/extract +- interrupted/incomplete bundle inspection +- metadata visibility before finalized stream count + +## Phase 6: Python Support + +Add optional `pyben` support once the Rust API settles. + +### Tasks + +- expose bundle inspection API +- expose `sample_count` if finalized +- expose graph/relabel-map asset loading +- optionally expose embedded assignment stream through `PyBenDecoder` + +### Deliverable + +Python can open `.bendl` as a higher-level dataset object. + +### Tests + +- open finalized bundle +- open incomplete bundle +- read graph metadata without forcing assignment scan + +## Recommended Implementation Order + +Recommended practical sequence: + +1. `format.rs` +2. `reader.rs` +3. tests + sample fixtures +4. `writer.rs` +5. CLI support +6. `pyben` support + +This order gives you inspection/debugging tools before write-path complexity. + +## Suggested Public API Shape + +Possible `ben` API surface: + +```rust +pub mod bundle; + +pub use bundle::reader::BendlReader; +pub use bundle::writer::BendlWriter; +``` + +And bundle module internals: + +```rust +bundle::format +bundle::manifest +bundle::reader +bundle::writer +``` + +## Risks + +- Header patching requires seekable outputs. +- Incomplete bundles need carefully defined recovery behavior. +- XBEN payloads may still require full scan when bundle metadata is absent or unfinalized. +- Asset directory changes should be versioned carefully to preserve forward compatibility. + +## Recommended First Milestone + +The first milestone should be: + +- parse and inspect `.bendl` files +- list bundled assets +- open assignment stream region +- expose `complete` and `sample_count` + +That gives immediate value and makes it easier to validate the spec before building the writer. diff --git a/docs/bendl-roadmap.md b/docs/bendl-roadmap.md new file mode 100644 index 0000000..f8fb8f8 --- /dev/null +++ b/docs/bendl-roadmap.md @@ -0,0 +1,175 @@ +# BENDL Roadmap + +## Goal + +Add a higher-level `.bendl` container format that feels like a single file to users while preserving the streamable nature of the underlying assignment data. + +The low-level assignment formats remain: + +- `.ben` +- `.xben` + +The new `.bendl` format is a richer file-oriented container for: + +- assignment data +- metadata +- graph data +- relabel maps +- future optional assets + +## Design Principles + +- Keep `.ben` and `.xben` streamable. +- Treat `.bendl` as a seekable container format for regular files. +- Put stable assets near the front of the file. +- Put the live assignment stream at the end of the file. +- Allow incomplete `.bendl` files to remain partially usable after interruption. +- Patch the header on successful finalization instead of requiring a footer. + +## Proposed Layout + +`.bendl` should use this high-level layout: + +```text +[Fixed Header] +[Directory / Metadata Section] +[Optional Extra Assets] +[Streaming Assignments Section] +``` + +Where: + +- the header is written first with placeholder values +- the directory and optional assets are written before streaming starts +- the assignment stream is appended at the end +- on successful completion, the writer seeks back and patches the header + +## Why This Layout + +This layout ensures: + +- graph data and relabel maps are readable even if the stream is interrupted +- the assignment stream can still be decoded up to EOF if the file is incomplete +- final facts like `sample_count` are only written once they are actually known + +## Header Concept + +The exact binary layout is still to be finalized, but the header should carry fields conceptually like: + +```rust +struct BendlHeader { + magic: [u8; 8], + version: u16, + flags: u16, + complete: u8, + reserved: [u8; 5], + directory_offset: u64, + directory_len: u64, + stream_offset: u64, + stream_len: u64, + sample_count: u64, +} +``` + +Notes: + +- `complete == 0` means the file was not finalized +- `stream_len == 0` can mean unknown or unfinalized +- `sample_count == u64::MAX` can represent unknown sample count + +## Directory / Asset Section + +The directory section should describe any front-loaded assets, such as: + +- graph +- relabel map +- metadata blob +- future extras + +This can be backed by a simple JSON or binary directory table. The important part is that these assets are discoverable without scanning the assignment stream. + +## Assignment Stream + +The assignment stream should be stored at the end of the file so writing can proceed incrementally. + +The stream payload may be: + +- BEN data +- XBEN data + +The `.bendl` container should treat this as the primary large append-only region. + +## Finalization Model + +Expected write flow: + +1. Write a provisional header. +2. Write directory data and optional assets. +3. Record `stream_offset`. +4. Stream the assignment data. +5. On successful completion, seek back and patch the header with: + - `complete = true` + - `stream_len` + - `sample_count` + - any other finalized metadata + +If writing is interrupted: + +- the header remains incomplete +- the front-loaded assets are still readable +- the assignment stream may still be readable up to EOF +- exact `sample_count` is unavailable unless the reader scans + +## Reader Semantics + +Reader behavior should be: + +- read the fixed header +- inspect `complete` +- load directory and front-loaded assets +- read assignment data starting at `stream_offset` +- if `complete == false`, treat the file as recoverable but incomplete + +This means `.bendl` readers should expose both: + +- whether the bundle is complete +- whether assignment data is still usable + +## Relationship to Existing Formats + +- `.ben` and `.xben` remain the portable stream/data formats +- `.bendl` becomes the richer container format for complete datasets + +This keeps responsibilities separated: + +- assignment encoding stays in BEN/XBEN +- dataset metadata and optional extras live in BENDL + +## PyBen Implications + +Potential future Python API support: + +- open a `.bendl` file directly +- expose `sample_count` immediately if finalized +- expose optional `graph` and `relabel_map` +- fall back to scanning assignment data if `sample_count` is unknown + +## Open Questions + +- exact binary encoding of the directory section +- whether the asset directory should be JSON or a compact binary table +- whether checksums should be included in the header +- whether assignment payload should always be XBEN inside `.bendl` +- whether `.bendl` writing should require seekable output explicitly + +## Current Recommendation + +Proceed with `.bendl` as: + +- a single-file container +- a seekable file format +- front-loaded metadata/assets +- trailing assignment stream +- header patched on finalize + +This best matches the requirements discussed so far. diff --git a/pyben/Cargo.toml b/pyben/Cargo.toml index 88c49f6..705ad06 100755 --- a/pyben/Cargo.toml +++ b/pyben/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyben" -version = "0.3.0" +version.workspace = true edition = "2021" authors = ["Peter Rock "] license = "MIT" diff --git a/pyben/pyben/__init__.py b/pyben/binary_ensemble/__init__.py similarity index 100% rename from pyben/pyben/__init__.py rename to pyben/binary_ensemble/__init__.py diff --git a/pyben/pyben/_core.pyi b/pyben/binary_ensemble/_core.pyi similarity index 99% rename from pyben/pyben/_core.pyi rename to pyben/binary_ensemble/_core.pyi index 809b5e6..adc8b2c 100644 --- a/pyben/pyben/_core.pyi +++ b/pyben/binary_ensemble/_core.pyi @@ -120,7 +120,7 @@ class PyBenEncoder: .. code-block:: python - from pyben import PyBenEncoder + from binary_ensemble import PyBenEncoder assignments = [ [1, 2, 1, 1, 2, 2], diff --git a/pyben/pyben/py.typed b/pyben/binary_ensemble/py.typed similarity index 100% rename from pyben/pyben/py.typed rename to pyben/binary_ensemble/py.typed diff --git a/pyben/docs/conf.py b/pyben/docs/conf.py index 731956f..98b6c49 100644 --- a/pyben/docs/conf.py +++ b/pyben/docs/conf.py @@ -52,7 +52,7 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] autoapi_type = "python" -autoapi_dirs = ["../pyben"] +autoapi_dirs = ["../binary_ensemble"] autoapi_clean = True autoapi_keep_files = False autoapi_ignore = [ diff --git a/pyben/pyproject.toml b/pyben/pyproject.toml index 2030d64..821d0db 100755 --- a/pyben/pyproject.toml +++ b/pyben/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "maturin" [project] name = "binary-ensemble" -version = "0.3.0" +dynamic = ["version"] description = "Python bindings for the Binary Ensemble Package." readme = "README.md" requires-python = ">=3.11" @@ -13,7 +13,7 @@ requires-python = ">=3.11" [tool.maturin] python-source = "." -module-name = "pyben._core" +module-name = "binary_ensemble._core" bindings = "pyo3" [tool.pytest.ini_options] @@ -42,4 +42,3 @@ dev = [ "pytest>=8.4.2", "tqdm>=4.67.1", ] - diff --git a/pyben/src/common.rs b/pyben/src/common.rs index e26cdfc..a3fd7fa 100644 --- a/pyben/src/common.rs +++ b/pyben/src/common.rs @@ -1,4 +1,4 @@ -use ben::BenVariant; +use binary_ensemble::BenVariant; use pyo3::exceptions::{PyIOError, PyValueError}; use pyo3::prelude::PyResult; use std::fs::File; diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index bebd434..2d71eaf 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -1,6 +1,6 @@ use crate::common::{open_input, open_output, validate_input_output_paths}; -use ben::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; -use ben::io::reader::{ +use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; +use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, MkvRecord, Selection, SubsampleFrameDecoder, XBenDecoder, }; @@ -43,7 +43,7 @@ struct DecoderSource { mode: DecoderMode, } -#[pyclass(module = "pyben", unsendable)] +#[pyclass(module = "binary_ensemble", unsendable)] pub struct PyBenDecoder { source: DecoderSource, iter: DynIter, @@ -253,7 +253,7 @@ fn build_iter(py: Python<'_>, source: &DecoderSource) -> PyResult { } } -fn build_frames(source: &DecoderSource) -> PyResult { +fn build_frames(source: &DecoderSource) -> PyResult { build_frame_iter(&source.path, source.mode.as_str()).map_err(|e| { PyException::new_err(format!( "Failed to create frame iterator from {}: {e}", diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 6efcb0d..9f017ec 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -1,6 +1,6 @@ use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; -use ben::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; -use ben::io::writer::BenEncoder; +use binary_ensemble::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; +use binary_ensemble::io::writer::BenEncoder; use pyo3::exceptions::PyIOError; use pyo3::prelude::PyResult; use pyo3::{pyclass, pyfunction, pymethods}; diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 83859d0..29bcd63 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -5,8 +5,8 @@ import pytest -import pyben -from pyben import ( +import binary_ensemble +from binary_ensemble import ( PyBenDecoder, PyBenEncoder, compress_ben_to_xben, @@ -390,10 +390,10 @@ def test_module_exports_are_exposed() -> None: "decompress_xben_to_jsonl", "decompress_xben_to_ben", } - assert expected.issubset(set(pyben.__all__)) + assert expected.issubset(set(binary_ensemble.__all__)) for name in expected: - assert hasattr(pyben, name) - assert hasattr(pyben, "_core") + assert hasattr(binary_ensemble, name) + assert hasattr(binary_ensemble, "_core") def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: diff --git a/pyben/uv.lock b/pyben/uv.lock index 9f985ad..a92bd5d 100755 --- a/pyben/uv.lock +++ b/pyben/uv.lock @@ -90,7 +90,6 @@ wheels = [ [[package]] name = "binary-ensemble" -version = "0.3.0" source = { editable = "." } [package.optional-dependencies] From 80ef93b0fd42f251deb4297a0682e5b1d90c614b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 11:36:19 -0600 Subject: [PATCH 010/221] More doc strings --- ben/src/bin/ben.rs | 1 + ben/src/bin/pben.rs | 1 + ben/src/bin/reben.rs | 1 + ben/src/cli/ben.rs | 55 ++++++ ben/src/cli/common.rs | 27 +++ ben/src/cli/mod.rs | 2 + ben/src/cli/pben.rs | 4 + ben/src/cli/reben.rs | 1 + ben/src/codec/decode/ben.rs | 31 ++++ ben/src/codec/decode/ben32.rs | 26 +++ ben/src/codec/decode/xz.rs | 34 ++++ ben/src/codec/encode/ben.rs | 33 ++++ ben/src/codec/encode/jsonl.rs | 33 ++++ ben/src/codec/encode/xz.rs | 32 ++++ ben/src/codec/mod.rs | 7 + ben/src/codec/translate/mod.rs | 51 ++++++ ben/src/io/mod.rs | 2 + ben/src/io/reader.rs | 320 +++++++++++++++++++++++++++++++++ ben/src/io/writer.rs | 82 +++++++++ ben/src/json/graph/mod.rs | 11 ++ ben/src/json/mod.rs | 2 + ben/src/lib.rs | 23 ++- ben/src/logging.rs | 23 +++ ben/src/ops/extract/mod.rs | 36 ++++ ben/src/ops/mod.rs | 2 + ben/src/ops/relabel/mod.rs | 54 ++++++ ben/src/util/mod.rs | 2 + ben/src/util/rle/mod.rs | 16 ++ 28 files changed, 909 insertions(+), 3 deletions(-) diff --git a/ben/src/bin/ben.rs b/ben/src/bin/ben.rs index b83d05f..ffe2698 100755 --- a/ben/src/bin/ben.rs +++ b/ben/src/bin/ben.rs @@ -1,3 +1,4 @@ +/// Entry point for the `ben` CLI binary. fn main() { binary_ensemble::cli::ben::run(); } diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs index 34c29ec..2409401 100755 --- a/ben/src/bin/pben.rs +++ b/ben/src/bin/pben.rs @@ -1,3 +1,4 @@ +/// Entry point for the `pben` CLI binary. fn main() -> std::io::Result<()> { binary_ensemble::cli::pben::run() } diff --git a/ben/src/bin/reben.rs b/ben/src/bin/reben.rs index cedb1a2..f94f8d2 100755 --- a/ben/src/bin/reben.rs +++ b/ben/src/bin/reben.rs @@ -1,3 +1,4 @@ +/// Entry point for the `reben` CLI binary. fn main() { binary_ensemble::cli::reben::run(); } diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 5cca30a..1dc9470 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -106,6 +106,18 @@ struct Args { compression_level: Option, } +/// Derive the output path for encode-style CLI modes. +/// +/// # Arguments +/// +/// * `mode` - The encode-oriented CLI mode being executed. +/// * `input_file_name` - The input file path supplied by the user. +/// * `output_file_name` - An optional explicit output path. +/// * `overwrite` - Whether to skip overwrite prompting. +/// +/// # Returns +/// +/// Returns the resolved output path. fn encode_setup( mode: Mode, input_file_name: String, @@ -135,6 +147,19 @@ fn encode_setup( Ok(out_file_name) } +/// Derive the output path for decode-style CLI modes. +/// +/// # Arguments +/// +/// * `in_file_name` - The input file path supplied by the user. +/// * `out_file_name` - An optional explicit output path. +/// * `full_decode` - Whether the decode should go all the way to JSONL instead +/// of stopping at BEN. +/// * `overwrite` - Whether to skip overwrite prompting. +/// +/// # Returns +/// +/// Returns the resolved output path. fn decode_setup( in_file_name: String, out_file_name: Option, @@ -170,6 +195,15 @@ fn decode_setup( Ok(out_file_name) } +/// Open either the requested input file or stdin. +/// +/// # Arguments +/// +/// * `input_file` - An optional input file path. +/// +/// # Returns +/// +/// Returns a buffered reader for the requested file or stdin. fn open_reader(input_file: Option<&str>) -> DynReader { match input_file { Some(path) => Box::new(BufReader::new(File::open(path).unwrap())), @@ -177,6 +211,17 @@ fn open_reader(input_file: Option<&str>) -> DynReader { } } +/// Open either the requested output file or stdout. +/// +/// # Arguments +/// +/// * `output_file` - An optional output file path. +/// * `print` - Whether output should be forced to stdout. +/// * `overwrite` - Whether to skip overwrite prompting for file outputs. +/// +/// # Returns +/// +/// Returns a buffered writer for the requested file or stdout. fn open_writer(output_file: Option<&str>, print: bool, overwrite: bool) -> Result { if print { return Ok(Box::new(BufWriter::new(io::stdout()))); @@ -191,10 +236,20 @@ fn open_writer(output_file: Option<&str>, print: bool, overwrite: bool) -> Resul } } +/// Open a writer for a path computed by one of the setup helpers. +/// +/// # Arguments +/// +/// * `path` - The output path to create. +/// +/// # Returns +/// +/// Returns a buffered writer for `path`. fn open_derived_writer(path: String) -> DynWriter { Box::new(BufWriter::new(File::create(path).unwrap())) } +/// Parse CLI arguments and execute the selected `ben` sub-mode. pub fn run() { let args = Args::parse(); set_verbose(args.verbose); diff --git a/ben/src/cli/common.rs b/ben/src/cli/common.rs index b8c6297..356c97f 100644 --- a/ben/src/cli/common.rs +++ b/ben/src/cli/common.rs @@ -1,6 +1,19 @@ use std::io::{self, Result}; use std::path::Path; +/// Configure tracing for CLI execution. +/// +/// When `verbose` is set and the user has not already provided `RUST_LOG`, the +/// default log filter is elevated to `trace`. The tracing subscriber is then +/// initialized exactly once for the process. +/// +/// # Arguments +/// +/// * `verbose` - Whether verbose trace logging should be enabled by default. +/// +/// # Returns +/// +/// This function does not return a value. pub fn set_verbose(verbose: bool) { if verbose && std::env::var_os("RUST_LOG").is_none() { std::env::set_var("RUST_LOG", "trace"); @@ -8,6 +21,20 @@ pub fn set_verbose(verbose: bool) { crate::logging::init_logging(); } +/// Confirm whether an existing output path may be overwritten. +/// +/// If `overwrite` is `false` and the destination already exists, the user is +/// prompted on stdin. An `AlreadyExists` error is returned when the user +/// declines. +/// +/// # Arguments +/// +/// * `file_name` - The candidate output path. +/// * `overwrite` - Whether to skip the interactive overwrite prompt. +/// +/// # Returns +/// +/// Returns `Ok(())` when the output path may be used. pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { if Path::new(file_name).exists() && !overwrite { eprint!( diff --git a/ben/src/cli/mod.rs b/ben/src/cli/mod.rs index 712484f..568f3ad 100644 --- a/ben/src/cli/mod.rs +++ b/ben/src/cli/mod.rs @@ -1,3 +1,5 @@ +//! Library-backed CLI implementations used by the `src/bin` entrypoints. + pub mod ben; pub mod common; pub mod pben; diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 0be66a4..4ae0ca2 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -51,6 +51,7 @@ struct Args { verbose: bool, } +/// Parse CLI arguments and execute the selected `pben` conversion. pub fn run() -> Result<()> { let args = Args::parse(); set_verbose(args.verbose); @@ -128,6 +129,7 @@ pub fn run() -> Result<()> { } } +/// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { let ben_reader = BenDecoder::new(&mut reader)?; @@ -150,6 +152,7 @@ fn assignment_decode_ben(mut reader: R, mut writer: W) -> io: Ok(()) } +/// Read zero-based assignment vectors and encode them as BEN. fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain); @@ -164,6 +167,7 @@ fn assignment_encode_ben(reader: R, writer: W) -> i Ok(()) } +/// Read zero-based assignment vectors and encode them as XBEN. fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { let encoder = XzEncoder::new(writer, 9); let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain); diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 6db69e7..3f86ccc 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -60,6 +60,7 @@ struct Args { verbose: bool, } +/// Parse CLI arguments and execute the selected `reben` mode. pub fn run() { let args = Args::parse(); set_verbose(args.verbose); diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 39fee83..801bfc2 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -1,6 +1,23 @@ use crate::io::reader::BenDecoder; use std::io::{self, Read, Write}; +/// Decode a single BEN frame payload into run-length encoded assignments. +/// +/// This function expects only the packed payload bytes for one BEN frame, not +/// the leading per-frame BEN header. +/// +/// # Arguments +/// +/// * `reader` - A reader positioned at the packed payload bytes for a single +/// BEN frame. +/// * `max_val_bits` - The number of bits used to encode each label value. +/// * `max_len_bits` - The number of bits used to encode each run length. +/// * `n_bytes` - The number of payload bytes to read from `reader`. +/// +/// # Returns +/// +/// Returns the decoded run-length encoded assignment vector as `(value, count)` +/// pairs. pub fn decode_ben_line( mut reader: R, max_val_bits: u8, @@ -77,6 +94,20 @@ pub fn decode_ben_line( Ok(output_rle) } +/// Decode a BEN stream into JSONL assignment records. +/// +/// Each decoded sample is written as a JSON object containing an `assignment` +/// vector and a 1-based `sample` index. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including the 17-byte BEN banner. +/// * `writer` - The destination that will receive one JSON object per decoded +/// sample. +/// +/// # Returns +/// +/// Returns `Ok(())` after the stream has been fully decoded and written. pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { let mut ben_decoder = BenDecoder::new(reader)?; ben_decoder.write_all_jsonl(writer) diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs index 0245efe..b6b1b92 100644 --- a/ben/src/codec/decode/ben32.rs +++ b/ben/src/codec/decode/ben32.rs @@ -3,6 +3,19 @@ use byteorder::{BigEndian, ReadBytesExt}; use serde_json::json; use std::io::{self, BufRead, Write}; +/// Decode a single ben32 frame into an assignment vector and repetition count. +/// +/// This helper is crate-private because ben32 is an implementation detail of +/// XBEN, but it underpins both the stream decoders and the translation logic. +/// +/// # Arguments +/// +/// * `reader` - A reader positioned at the start of a single ben32 frame. +/// * `variant` - The BEN variant used to interpret the frame tail. +/// +/// # Returns +/// +/// Returns the expanded assignment vector together with its repetition count. pub(crate) fn decode_ben32_line( mut reader: R, variant: BenVariant, @@ -42,6 +55,19 @@ pub(crate) fn decode_ben32_line( Ok((output_vec, count)) } +/// Decode a ben32 stream into JSONL assignment records. +/// +/// # Arguments +/// +/// * `reader` - The ben32 input stream. +/// * `writer` - The destination for the JSONL output. +/// * `starting_sample` - The 0-based sample offset that should be added to the +/// emitted sample numbers. +/// * `variant` - The BEN variant used to interpret repetition counts. +/// +/// # Returns +/// +/// Returns `Ok(())` after the ben32 stream has been fully decoded. pub(crate) fn jsonl_decode_ben32( mut reader: R, mut writer: W, diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 590d8a2..355cb56 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -4,6 +4,19 @@ use crate::{progress, BenVariant}; use std::io::{self, BufRead, Error, Read, Write}; use xz2::read::XzDecoder; +/// Decode an XBEN stream into an equivalent BEN stream. +/// +/// The output begins with the normal BEN banner followed by uncompressed BEN +/// frames. +/// +/// # Arguments +/// +/// * `reader` - The compressed XBEN input stream. +/// * `writer` - The destination for the uncompressed BEN stream. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full XBEN stream has been decoded into BEN. pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io::Result<()> { let mut decoder = XzDecoder::new(reader); @@ -78,6 +91,16 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: Ok(()) } +/// Decompress a general XZ byte stream without applying any BEN-specific logic. +/// +/// # Arguments +/// +/// * `reader` - The compressed XZ stream. +/// * `writer` - The destination for the decompressed bytes. +/// +/// # Returns +/// +/// Returns `Ok(())` once the compressed stream has been fully expanded. pub fn xz_decompress(reader: R, mut writer: W) -> io::Result<()> { let mut decoder = XzDecoder::new(reader); let mut buffer = [0u8; 4096]; @@ -92,6 +115,17 @@ pub fn xz_decompress(reader: R, mut writer: W) -> io::Resu Ok(()) } +/// Decode an XBEN stream directly into JSONL assignment records. +/// +/// # Arguments +/// +/// * `reader` - The compressed XBEN input stream. +/// * `writer` - The destination that will receive one JSON object per decoded +/// sample. +/// +/// # Returns +/// +/// Returns `Ok(())` after the XBEN stream has been fully decoded into JSONL. pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { let mut decoder = XzDecoder::new(reader); diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 6c7f1ad..2c54c42 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -2,6 +2,17 @@ use crate::util::rle::assign_to_rle; use serde_json::Value; use std::io; +/// Encode a JSON assignment record into the ben32 frame representation used by +/// XBEN streams. +/// +/// # Arguments +/// +/// * `data` - A JSON object containing an `assignment` array. +/// +/// # Returns +/// +/// Returns the encoded ben32 frame bytes terminated by the four-byte `0` +/// sentinel. pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( @@ -56,11 +67,33 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { Ok(ret) } +/// Encode a full assignment vector into a single BEN frame. +/// +/// # Arguments +/// +/// * `assign_vec` - The full assignment vector to encode. +/// +/// # Returns +/// +/// Returns the encoded BEN frame bytes, including the per-frame header. pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { let rle_vec: Vec<(u16, u16)> = assign_to_rle(assign_vec); encode_ben_vec_from_rle(rle_vec) } +/// Encode a run-length encoded assignment vector into a BEN frame. +/// +/// The returned byte vector contains the per-frame BEN header followed by the +/// packed `(value, run_length)` payload. +/// +/// # Arguments +/// +/// * `rle_vec` - The run-length encoded assignment vector as `(value, count)` +/// pairs. +/// +/// # Returns +/// +/// Returns the encoded BEN frame bytes, including the per-frame header. pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> Vec { let mut output_vec: Vec = Vec::new(); diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 7432a49..cb4b743 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -5,6 +5,24 @@ use std::io::{self, BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; +/// Encode JSONL assignment records directly into an XBEN stream. +/// +/// Each input line must be a JSON object with an `assignment` array. The output +/// stream begins with the standard BEN banner inside the compressed payload and +/// then stores each assignment in ben32 form. +/// +/// # Arguments +/// +/// * `reader` - A JSONL input stream with one assignment record per line. +/// * `writer` - The destination for the compressed XBEN bytes. +/// * `variant` - The BEN variant to use inside the XBEN payload. +/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe +/// default is chosen. +/// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// +/// # Returns +/// +/// Returns `Ok(())` after all JSONL lines have been encoded and written. pub fn encode_jsonl_to_xben( reader: R, writer: W, @@ -54,6 +72,21 @@ pub fn encode_jsonl_to_xben( Ok(()) } +/// Encode JSONL assignment records into an uncompressed BEN file. +/// +/// The input is expected to contain one JSON object per line with an +/// `assignment` array. The `sample` field is ignored because BEN sample order is +/// determined by the stream position. +/// +/// # Arguments +/// +/// * `reader` - A JSONL input stream with one assignment record per line. +/// * `writer` - The destination for the BEN bytes. +/// * `variant` - The BEN variant to use when writing the output stream. +/// +/// # Returns +/// +/// Returns `Ok(())` after all JSONL lines have been encoded and written. pub fn encode_jsonl_to_ben( reader: R, writer: W, diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 5e3ec3e..905f725 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -4,6 +4,22 @@ use std::io::{self, BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; +/// Compress an arbitrary byte stream with XZ/LZMA2. +/// +/// This is a general-purpose helper used by the XBEN tooling, but it can also +/// be used for plain XZ compression when BEN-specific framing is not needed. +/// +/// # Arguments +/// +/// * `reader` - The input byte stream to compress. +/// * `writer` - The destination for the compressed XZ bytes. +/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe +/// default is chosen. +/// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// +/// # Returns +/// +/// Returns `Ok(())` after the input stream has been fully compressed. pub fn xz_compress( mut reader: R, writer: W, @@ -41,6 +57,22 @@ pub fn xz_compress( Ok(()) } +/// Convert an existing BEN stream into an XBEN stream. +/// +/// The input must begin with a BEN banner so that the variant can be preserved +/// in the compressed output. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the compressed XBEN bytes. +/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe +/// default is chosen. +/// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// +/// # Returns +/// +/// Returns `Ok(())` after the BEN stream has been translated and compressed. pub fn encode_ben_to_xben( mut reader: R, writer: W, diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index 905e898..9903d3b 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -1,3 +1,10 @@ +//! Core format conversion logic for BEN-related representations. +//! +//! This module is split into three layers: +//! - [`encode`] for producing BEN or XBEN streams +//! - [`decode`] for recovering BEN, XBEN, or JSONL data +//! - [`translate`] for converting between BEN frames and their ben32 form + pub mod decode; pub mod encode; pub mod translate; diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index f681ee6..99c4a84 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -7,6 +7,15 @@ use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; use crate::{progress, BenVariant}; +/// Convert a single ben32 frame into a BEN frame payload. +/// +/// # Arguments +/// +/// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator. +/// +/// # Returns +/// +/// Returns the encoded BEN frame payload and header. fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { let mut buffer = [0u8; 4]; let mut ben32_rle: Vec<(u16, u16)> = Vec::new(); @@ -41,6 +50,21 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { Ok(encode_ben_vec_from_rle(ben32_rle)) } +/// Translate a stream of ben32 frames into BEN frames. +/// +/// This is primarily used while decoding XBEN, where the compressed payload is +/// stored in ben32 form. +/// +/// # Arguments +/// +/// * `reader` - The ben32 input stream. +/// * `writer` - The destination for the translated BEN frames. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each ben32 frame. +/// +/// # Returns +/// +/// Returns `Ok(())` after the input stream has been fully translated. pub fn ben32_to_ben_lines( mut reader: R, mut writer: W, @@ -82,6 +106,18 @@ pub fn ben32_to_ben_lines( Ok(()) } +/// Convert a single BEN frame payload into its ben32 representation. +/// +/// # Arguments +/// +/// * `reader` - A reader positioned at the BEN frame payload. +/// * `max_val_bits` - The number of bits used to encode each label value. +/// * `max_len_bits` - The number of bits used to encode each run length. +/// * `n_bytes` - The number of payload bytes to read. +/// +/// # Returns +/// +/// Returns the ben32 frame bytes, including the four-byte terminator. fn ben_to_ben32_line( reader: R, max_val_bits: u8, @@ -102,6 +138,21 @@ fn ben_to_ben32_line( Ok(ben32_vec) } +/// Translate a BEN stream into ben32 frames. +/// +/// This is the format used inside XBEN after the outer XZ compression layer is +/// removed. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the translated ben32 frames. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each translated frame. +/// +/// # Returns +/// +/// Returns `Ok(())` after the input stream has been fully translated. pub fn ben_to_ben32_lines( mut reader: R, mut writer: W, diff --git a/ben/src/io/mod.rs b/ben/src/io/mod.rs index c9134a0..785e3c0 100644 --- a/ben/src/io/mod.rs +++ b/ben/src/io/mod.rs @@ -1,2 +1,4 @@ +//! Stream-oriented readers and writers for BEN and XBEN files. + pub mod reader; pub mod writer; diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 91cde9c..f455091 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -9,20 +9,44 @@ use std::iter::Peekable; use std::path::{Path, PathBuf}; use xz2::read::XzDecoder; +/// A decoded assignment together with the number of times it repeats. pub type MkvRecord = (Vec, u16); +/// A raw ben32 frame together with the number of times it repeats. pub type Ben32Frame = (Vec, u16); +/// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. pub type FrameIter = Box> + Send>; #[derive(Debug)] +/// Errors produced while validating the header of a decoder input stream. pub enum DecoderInitError { + /// The leading bytes did not match any supported BEN banner. InvalidFileFormat(Vec), + /// An I/O error occurred while reading the header. Io(io::Error), } +/// Check whether a header prefix matches the XZ file signature. +/// +/// # Arguments +/// +/// * `h` - The bytes to inspect. +/// +/// # Returns +/// +/// Returns `true` when `h` begins with the standard XZ magic bytes. fn is_xz_header(h: &[u8]) -> bool { h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" } +/// Convert a byte slice into a space-separated uppercase hex string. +/// +/// # Arguments +/// +/// * `bytes` - The bytes to render. +/// +/// # Returns +/// +/// Returns the formatted hex string. fn to_hex(bytes: &[u8]) -> String { bytes .iter() @@ -32,6 +56,7 @@ fn to_hex(bytes: &[u8]) -> String { } impl std::fmt::Display for DecoderInitError { + /// Format the decoder initialization error for display. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Io(e) => write!(f, "IO error: {e}"), @@ -59,6 +84,7 @@ impl std::fmt::Display for DecoderInitError { } impl std::error::Error for DecoderInitError { + /// Return the underlying source error when one exists. fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DecoderInitError::Io(e) => Some(e), @@ -68,12 +94,14 @@ impl std::error::Error for DecoderInitError { } impl From for DecoderInitError { + /// Wrap a plain I/O error as a decoder initialization error. fn from(error: io::Error) -> Self { DecoderInitError::Io(error) } } impl From for io::Error { + /// Convert a decoder initialization error into a plain I/O error. fn from(error: DecoderInitError) -> Self { match error { DecoderInitError::Io(e) => e, @@ -84,6 +112,7 @@ impl From for io::Error { } } +/// Iterator over decoded assignments in an uncompressed BEN stream. pub struct BenDecoder { reader: R, sample_count: usize, @@ -91,15 +120,36 @@ pub struct BenDecoder { } #[derive(Clone)] +/// A single raw BEN frame. +/// +/// `raw_data` contains only the packed `(value, run_length)` payload and does +/// not include the outer frame header fields. pub struct BenFrame { + /// Number of bits used to encode each label value in `raw_data`. pub max_val_bits: u8, + /// Number of bits used to encode each run length in `raw_data`. pub max_len_bits: u8, + /// Number of repeated samples represented by this frame. pub count: u16, + /// Length in bytes of the packed payload stored in `raw_data`. pub n_bytes: u32, + /// Packed BEN payload for this frame. pub raw_data: Vec, } impl BenDecoder { + /// Create a decoder for an uncompressed BEN stream. + /// + /// The reader must begin with one of the BEN banners such as + /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. + /// + /// # Arguments + /// + /// * `reader` - The input BEN stream, including its 17-byte banner. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first BEN frame. pub fn new(mut reader: R) -> Result { let mut check_buffer = [0u8; 17]; @@ -122,6 +172,16 @@ impl BenDecoder { } } + /// Decode the remaining BEN stream and write it as JSONL. + /// + /// # Arguments + /// + /// * `writer` - The destination that will receive one JSON object per + /// decoded sample. + /// + /// # Returns + /// + /// Returns `Ok(())` after the remaining stream has been fully decoded. pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { while let Some(result_tuple) = self.next() { match result_tuple { @@ -145,6 +205,12 @@ impl BenDecoder { Ok(()) } + /// Read and return the next raw BEN frame from the underlying stream. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. fn pop_frame_from_reader(&mut self) -> Option> { let mut b1 = [0u8; 1]; let max_val_bits = match self.reader.read_exact(&mut b1) { @@ -193,10 +259,24 @@ impl BenDecoder { })) } + /// Consume this decoder and iterate over raw BEN frames instead of + /// materialized assignments. + /// + /// # Returns + /// + /// Returns an iterator that yields raw BEN frames from the remaining input. pub fn into_frames(self) -> BenFrameDecoeder { BenFrameDecoeder { inner: self } } + /// Count the number of samples remaining in the BEN stream. + /// + /// This consumes the decoder but only walks frame boundaries rather than + /// expanding every assignment into a full vector. + /// + /// # Returns + /// + /// Returns the number of remaining samples in the stream. pub fn count_samples(self) -> io::Result { let mut total = 0usize; for frame_res in self.into_frames() { @@ -207,6 +287,15 @@ impl BenDecoder { } } +/// Decode a raw BEN frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame` - The raw BEN frame to decode. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { decode_ben_line( Cursor::new(&frame.raw_data), @@ -220,6 +309,7 @@ fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { impl Iterator for BenDecoder { type Item = io::Result; + /// Decode and return the next assignment from the BEN stream. fn next(&mut self) -> Option> { let ben_frame = match self.pop_frame_from_reader() { Some(Ok(frame)) => frame, @@ -238,11 +328,21 @@ impl Iterator for BenDecoder { } } +/// Iterator over raw BEN frames. pub struct BenFrameDecoeder { inner: BenDecoder, } impl BenFrameDecoeder { + /// Create a raw BEN frame iterator from a reader. + /// + /// # Arguments + /// + /// * `reader` - The input BEN stream, including its 17-byte banner. + /// + /// # Returns + /// + /// Returns an iterator over raw BEN frames. pub fn new(reader: R) -> io::Result { Ok(Self { inner: BenDecoder::new(reader)?, @@ -253,19 +353,32 @@ impl BenFrameDecoeder { impl Iterator for BenFrameDecoeder { type Item = io::Result; + /// Return the next raw BEN frame from the input stream. fn next(&mut self) -> Option { self.inner.pop_frame_from_reader() } } +/// Iterator over decoded assignments in an XBEN stream. pub struct XBenDecoder { xz: BufReader>, + /// Variant encoded in the XBEN banner. pub variant: BenVariant, overflow: Vec, buf: Box<[u8]>, } impl XBenDecoder { + /// Create a decoder for an XBEN stream. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first ben32 frame in the + /// decompressed payload. pub fn new(reader: R) -> io::Result { let xz = XzDecoder::new(reader); let mut xz = BufReader::with_capacity(1 << 20, xz); @@ -291,6 +404,17 @@ impl XBenDecoder { }) } + /// Try to extract one complete ben32 frame from the buffered overflow. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain one or more + /// complete ben32 frames. + /// + /// # Returns + /// + /// Returns the frame bytes, the number of consumed bytes, and the decoded + /// repetition count when a complete frame is available. fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { match self.variant { BenVariant::Standard => { @@ -325,10 +449,22 @@ impl XBenDecoder { } } + /// Consume this decoder and iterate over raw ben32 frames instead of + /// materialized assignments. + /// + /// # Returns + /// + /// Returns an iterator that yields raw ben32 frames from the remaining + /// input. pub fn into_frames(self) -> XBenFrameDecoder { XBenFrameDecoder { inner: self } } + /// Count the number of samples remaining in the XBEN stream. + /// + /// # Returns + /// + /// Returns the number of remaining samples in the stream. pub fn count_samples(self) -> io::Result { let mut total = 0usize; for frame_res in self.into_frames() { @@ -339,6 +475,16 @@ impl XBenDecoder { } } +/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame_bytes` - The ben32 frame bytes. +/// * `variant` - The BEN variant used to interpret the frame tail. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. fn decode_xben_frame_to_assignment( frame_bytes: &[u8], variant: BenVariant, @@ -351,6 +497,7 @@ fn decode_xben_frame_to_assignment( impl Iterator for XBenDecoder { type Item = io::Result; + /// Decode and return the next assignment from the XBEN stream. fn next(&mut self) -> Option { loop { if let Some((frame_bytes, consumed, count)) = @@ -383,11 +530,21 @@ impl Iterator for XBenDecoder { } } +/// Iterator over raw ben32 frames inside an XBEN stream. pub struct XBenFrameDecoder { inner: XBenDecoder, } impl XBenFrameDecoder { + /// Create a raw XBEN frame iterator from a reader. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns an iterator over raw ben32 frames. pub fn new(reader: R) -> io::Result { Ok(Self { inner: XBenDecoder::new(reader)?, @@ -398,6 +555,7 @@ impl XBenFrameDecoder { impl Iterator for XBenFrameDecoder { type Item = io::Result; + /// Return the next raw ben32 frame from the input stream. fn next(&mut self) -> Option { loop { if let Some((frame, consumed, count)) = @@ -430,17 +588,33 @@ impl Iterator for XBenFrameDecoder { } #[derive(Clone)] +/// A generalized frame type used by the subsampling machinery. pub enum Frame { + /// A raw BEN frame. Ben(BenFrame), + /// A raw ben32 frame from an XBEN stream together with its variant. XBen(Vec, BenVariant), } +/// A selection strategy for extracting only part of a frame stream. pub enum Selection { + /// Select explicit 1-based indices. Indices(Peekable>), + /// Select every `step` samples starting at the 1-based `offset`. Every { step: usize, offset: usize }, + /// Select the inclusive 1-based range `[start, end]`. Range { start: usize, end: usize }, } +/// Decode a generic frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame` - Either a BEN frame or an XBEN ben32 frame. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. fn decode_frame_to_assignment(frame: &Frame) -> io::Result> { match frame { Frame::Ben(f) => decode_ben_frame_to_assignment(f), @@ -448,6 +622,7 @@ fn decode_frame_to_assignment(frame: &Frame) -> io::Result> { } } +/// Iterator adaptor that decodes only selected samples from a frame stream. pub struct SubsampleFrameDecoder where I: Iterator>, @@ -461,6 +636,16 @@ impl SubsampleFrameDecoder where I: Iterator>, { + /// Create a subsampling iterator from a lower-level frame iterator. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `selection` - The sample-selection rule to apply. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn new(inner: I, selection: Selection) -> Self { Self { inner, @@ -469,6 +654,18 @@ where } } + /// Select a set of 1-based sample indices. + /// + /// Indices are sorted and deduplicated before iteration begins. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn by_indices(inner: I, indices: T) -> Self where T: IntoIterator, @@ -479,6 +676,17 @@ where Self::new(inner, Selection::Indices(v.into_iter().peekable())) } + /// Select the inclusive 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn by_range(inner: I, start: usize, end: usize) -> Self { assert!( start >= 1 && end >= start, @@ -487,11 +695,32 @@ where Self::new(inner, Selection::Range { start, end }) } + /// Select every `step` samples beginning from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn every(inner: I, step: usize, offset: usize) -> Self { assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); Self::new(inner, Selection::Every { step, offset }) } + /// Count how many selected samples fall within an inclusive sample interval. + /// + /// # Arguments + /// + /// * `lo` - The first 1-based sample index covered by the current frame. + /// * `hi` - The last 1-based sample index covered by the current frame. + /// + /// # Returns + /// + /// Returns the number of selected samples represented by the frame. fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { match &mut self.selection { Selection::Indices(iter) => { @@ -541,6 +770,7 @@ where { type Item = io::Result; + /// Return the next decoded sample selected by the subsampling rule. fn next(&mut self) -> Option { loop { if let Selection::Range { end, .. } = self.selection { @@ -575,6 +805,19 @@ where } } +/// Build a generic frame iterator from a BEN or XBEN file path. +/// +/// Frame iteration is useful for subsampling and counting because it avoids +/// decoding every sample into a full assignment vector. +/// +/// # Arguments +/// +/// * `file_path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns a boxed iterator over generic frames and their repetition counts. pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { let file = File::options().read(true).open(file_path)?; let reader = BufReader::new(file); @@ -603,6 +846,16 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result BenDecoder { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + /// + /// # Arguments + /// + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_by_indices( self, indices: T, @@ -619,6 +872,17 @@ impl BenDecoder { SubsampleFrameDecoder::by_indices(frames, indices) } + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_by_range( self, start: usize, @@ -633,6 +897,17 @@ impl BenDecoder { SubsampleFrameDecoder::by_range(frames, start, end) } + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_every( self, step: usize, @@ -649,6 +924,16 @@ impl BenDecoder { } impl XBenDecoder { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + /// + /// # Arguments + /// + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_by_indices( self, indices: T, @@ -663,6 +948,17 @@ impl XBenDecoder { SubsampleFrameDecoder::by_indices(Box::new(frames), indices) } + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_by_range( self, start: usize, @@ -675,6 +971,17 @@ impl XBenDecoder { SubsampleFrameDecoder::by_range(Box::new(frames), start, end) } + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. pub fn into_subsample_every( self, step: usize, @@ -688,6 +995,19 @@ impl XBenDecoder { } } +/// Count the number of samples in a BEN or XBEN file on disk. +/// +/// The file is walked frame-by-frame, so this is linear in file size but avoids +/// materializing full assignment vectors. +/// +/// # Arguments +/// +/// * `path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns the number of samples in the file. pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { let iter = build_frame_iter(&path.to_path_buf(), mode)?; let mut total = 0usize; diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 0ac2143..f1bc05b 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -17,6 +17,16 @@ pub struct BenEncoder { } impl BenEncoder { + /// Create a new BEN writer and immediately emit the BEN banner. + /// + /// # Arguments + /// + /// * `writer` - The destination that will receive the BEN stream. + /// * `variant` - The BEN variant to encode. + /// + /// # Returns + /// + /// Returns a new encoder ready to accept assignments or RLE frames. pub fn new(mut writer: W, variant: BenVariant) -> Self { match variant { BenVariant::Standard => { @@ -35,6 +45,15 @@ impl BenEncoder { } } + /// Encode and write a run-length encoded assignment vector as one BEN frame. + /// + /// # Arguments + /// + /// * `rle_vec` - The assignment vector in `(value, count)` form. + /// + /// # Returns + /// + /// Returns `Ok(())` after the frame has been queued or written. pub fn write_rle(&mut self, rle_vec: Vec<(u16, u16)>) -> Result<()> { match self.variant { BenVariant::Standard => { @@ -59,12 +78,33 @@ impl BenEncoder { } } + /// Encode and write a full assignment vector. + /// + /// # Arguments + /// + /// * `assign_vec` - The full assignment vector to encode. + /// + /// # Returns + /// + /// Returns `Ok(())` after the assignment has been queued or written. pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { let rle_vec = assign_to_rle(assign_vec); self.write_rle(rle_vec)?; Ok(()) } + /// Encode and write a JSON assignment record. + /// + /// The input must contain an `assignment` array of integers. Other fields + /// are ignored. + /// + /// # Arguments + /// + /// * `data` - A JSON object containing an `assignment` array. + /// + /// # Returns + /// + /// Returns `Ok(())` after the record has been validated and encoded. pub fn write_json_value(&mut self, data: Value) -> Result<()> { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( @@ -99,6 +139,14 @@ impl BenEncoder { Ok(()) } + /// Flush any buffered repetition state to the underlying writer. + /// + /// This matters for [`BenVariant::MkvChain`], where repeated consecutive + /// samples are emitted only once together with their repetition count. + /// + /// # Returns + /// + /// Returns `Ok(())` once any buffered repetition state has been flushed. pub fn finish(&mut self) -> Result<()> { if self.complete { return Ok(()); @@ -117,6 +165,7 @@ impl BenEncoder { } impl Drop for BenEncoder { + /// Flush any buffered BEN state during drop. fn drop(&mut self) { let _ = self.finish(); } @@ -131,6 +180,17 @@ pub struct XBenEncoder { } impl XBenEncoder { + /// Create a new XBEN writer around an already-configured XZ encoder. + /// + /// # Arguments + /// + /// * `encoder` - The configured XZ encoder that will receive the ben32 + /// payload. + /// * `variant` - The BEN variant to encode inside the compressed stream. + /// + /// # Returns + /// + /// Returns a new XBEN encoder ready to accept assignments or BEN frames. pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { match variant { BenVariant::Standard => { @@ -154,6 +214,15 @@ impl XBenEncoder { } } + /// Encode and write a JSON assignment record into the compressed XBEN stream. + /// + /// # Arguments + /// + /// * `data` - A JSON object containing an `assignment` array. + /// + /// # Returns + /// + /// Returns `Ok(())` after the record has been validated and encoded. pub fn write_json_value(&mut self, data: Value) -> Result<()> { let encoded = encode_ben32_line(data)?; match self.variant { @@ -176,6 +245,18 @@ impl XBenEncoder { Ok(()) } + /// Read BEN frames from `reader` and write them into this XBEN stream. + /// + /// If the source still contains the 17-byte BEN banner, it is consumed and + /// replaced by the banner already written by this encoder. + /// + /// # Arguments + /// + /// * `reader` - The BEN input stream, with or without its banner. + /// + /// # Returns + /// + /// Returns `Ok(())` after the BEN stream has been translated into XBEN. pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { let peek = reader.fill_buf()?; let has_banner = peek.len() >= 17 @@ -190,6 +271,7 @@ impl XBenEncoder { } impl Drop for XBenEncoder { + /// Flush any buffered XBEN repetition state during drop. fn drop(&mut self) { if self.variant == BenVariant::MkvChain && self.count > 0 { self.encoder diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index f22a8bb..8206bd0 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -7,6 +7,17 @@ use std::io::{Read, Result, Write}; use std::result::Result as StdResult; /// Sorts a JSON-formatted NetworkX graph file by a key. +/// +/// # Arguments +/// +/// * `reader` - The source JSON graph in the NetworkX node-link style used by +/// the relabeling workflow. +/// * `writer` - The destination for the sorted JSON graph. +/// * `key` - The node attribute used to determine the new ordering. +/// +/// # Returns +/// +/// Returns a map from the original node id to the new node id. pub fn sort_json_file_by_key( reader: R, mut writer: W, diff --git a/ben/src/json/mod.rs b/ben/src/json/mod.rs index 6f94350..b8c9516 100644 --- a/ben/src/json/mod.rs +++ b/ben/src/json/mod.rs @@ -1 +1,3 @@ +//! JSON-oriented helpers that support BEN workflows. + pub mod graph; diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 8469303..b96cd51 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -1,6 +1,8 @@ -//! This crate provides several command line tools and functions for converting -//! ensembles of districting plans contained in a JSONL file with lines of the -//! form +//! Tools for working with binary ensembles of districting plans. +//! +//! This crate provides several command line tools and library functions for +//! converting ensembles of districting plans contained in a JSONL file with +//! lines of the form //! //! ```text //! {"assignment": , "sample": } @@ -18,14 +20,26 @@ //! - `reben`: A tool for relabeling BEN files to improve compression ratios. //! +/// Command-line entrypoints shared by the thin binaries in `src/bin`. pub mod cli; +/// Encoding, decoding, and format-to-format translation helpers. pub mod codec; +/// Streaming readers and writers for BEN and XBEN files. pub mod io; +/// JSON graph utilities used by relabeling workflows. pub mod json; +/// Logging and progress-output helpers used by the CLI and library. pub mod logging; +/// Higher-level operations such as extraction and relabeling. pub mod ops; +/// Miscellaneous utilities that do not fit into the other modules. pub mod util; +/// Print an in-place progress update when trace logging is enabled. +/// +/// This is intentionally separate from normal structured logging because many +/// callsites want carriage-return based terminal updates instead of line-based +/// log records. #[macro_export] macro_rules! progress { ($($arg:tt)*) => {{ @@ -34,7 +48,10 @@ macro_rules! progress { } #[derive(Debug, Clone, Copy, PartialEq)] +/// The BEN/XBEN variant used when encoding or decoding a stream. pub enum BenVariant { + /// Store each sample independently. Standard, + /// Store one frame plus a repetition count for repeated consecutive samples. MkvChain, } diff --git a/ben/src/logging.rs b/ben/src/logging.rs index 2be4aeb..31fafae 100644 --- a/ben/src/logging.rs +++ b/ben/src/logging.rs @@ -4,6 +4,16 @@ use std::sync::Once; static INIT_LOGGER: Once = Once::new(); +/// Initialize the global `tracing` subscriber used by the BEN CLIs. +/// +/// The subscriber reads `RUST_LOG` when present and otherwise defaults to +/// logging being disabled. Initialization is guarded so it is safe to call +/// multiple times. +/// +/// # Returns +/// +/// This function does not return a value. Repeated calls after the first are +/// no-ops. pub fn init_logging() { INIT_LOGGER.call_once(|| { let filter = EnvFilter::try_from_default_env() @@ -24,6 +34,19 @@ pub fn init_logging() { }); } +/// Emit a progress update to stderr when trace logging is enabled. +/// +/// This helper exists for progress-style output such as `"Encoding line: 42\r"` +/// that should redraw the current terminal line instead of creating a normal +/// structured log event. +/// +/// # Arguments +/// +/// * `args` - The formatted progress message to emit. +/// +/// # Returns +/// +/// This function does not return a value. pub fn trace_progress(args: std::fmt::Arguments<'_>) { if tracing::enabled!(Level::TRACE) { eprint!("{args}"); diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 88861d1..ce2864b 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -9,6 +9,7 @@ use std::io::Cursor; use std::io::{self, Read}; #[derive(Debug)] +/// Error categories returned when extracting an individual sample from a file. pub enum SampleErrorKind { InvalidSampleNumber, SampleNotFound { sample_number: usize }, @@ -17,11 +18,22 @@ pub enum SampleErrorKind { } #[derive(Debug)] +/// Error returned by sample extraction helpers. pub struct SampleError { + /// The underlying extraction failure category. pub kind: SampleErrorKind, } impl SampleError { + /// Wrap a plain I/O error as a [`SampleError`]. + /// + /// # Arguments + /// + /// * `error` - The underlying I/O error. + /// + /// # Returns + /// + /// Returns a new [`SampleError`] with [`SampleErrorKind::IoError`]. pub fn new_io_error(error: io::Error) -> Self { SampleError { kind: SampleErrorKind::IoError(error), @@ -30,6 +42,7 @@ impl SampleError { } impl fmt::Display for SampleError { + /// Format the sample extraction error for display. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match &self.kind { SampleErrorKind::InvalidSampleNumber => { @@ -53,6 +66,7 @@ impl fmt::Display for SampleError { } impl std::error::Error for SampleError { + /// Return the underlying source error when one exists. fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match &self.kind { SampleErrorKind::InvalidSampleNumber => None, @@ -64,12 +78,14 @@ impl std::error::Error for SampleError { } impl From for SampleError { + /// Wrap a plain I/O error as a sample extraction error. fn from(error: io::Error) -> Self { SampleError::new_io_error(error) } } impl From for SampleError { + /// Wrap a JSON parsing error as a sample extraction error. fn from(error: SerdeError) -> Self { SampleError { kind: SampleErrorKind::JsonError(error), @@ -77,6 +93,16 @@ impl From for SampleError { } } +/// Extract a single 1-based sample from an uncompressed BEN stream. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its 17-byte banner. +/// * `sample_number` - The 1-based sample index to retrieve. +/// +/// # Returns +/// +/// Returns the decoded assignment vector for the requested sample. pub fn extract_assignment_ben( mut reader: R, sample_number: usize, @@ -115,6 +141,16 @@ pub fn extract_assignment_ben( }) } +/// Extract a single 1-based sample from an XBEN stream. +/// +/// # Arguments +/// +/// * `reader` - The compressed XBEN input stream. +/// * `sample_number` - The 1-based sample index to retrieve. +/// +/// # Returns +/// +/// Returns the decoded assignment vector for the requested sample. pub fn extract_assignment_xben( mut reader: R, sample_number: usize, diff --git a/ben/src/ops/mod.rs b/ben/src/ops/mod.rs index 59eae35..098837c 100644 --- a/ben/src/ops/mod.rs +++ b/ben/src/ops/mod.rs @@ -1,2 +1,4 @@ +//! Higher-level operations built on top of BEN readers and writers. + pub mod extract; pub mod relabel; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 20eafb8..c8f4ac9 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -8,6 +8,21 @@ use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; use std::io::{self, Error, Read, Write}; +/// Canonicalize the labels used inside each BEN frame. +/// +/// Labels are reassigned in first-seen order within each assignment vector, +/// which can improve downstream compression ratios. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// +/// # Returns +/// +/// Returns `Ok(())` after all frames have been relabeled and written. pub fn relabel_ben_lines( mut reader: R, mut writer: W, @@ -66,6 +81,16 @@ pub fn relabel_ben_lines( Ok(()) } +/// Relabel an entire BEN file, preserving its leading BEN banner. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full BEN file has been relabeled. pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io::Result<()> { let mut check_buffer = [0u8; 17]; reader.read_exact(&mut check_buffer)?; @@ -88,6 +113,23 @@ pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io:: Ok(()) } +/// Relabel BEN frames using an externally supplied node map. +/// +/// `new_to_old_node_map` maps the new node index to the position that should be +/// read from the original assignment vector. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// +/// # Returns +/// +/// Returns `Ok(())` after all frames have been relabeled and written. pub fn relabel_ben_lines_with_map( mut reader: R, mut writer: W, @@ -144,6 +186,18 @@ pub fn relabel_ben_lines_with_map( Ok(()) } +/// Relabel an entire BEN file using an externally supplied node map. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full BEN file has been relabeled. pub fn relabel_ben_file_with_map( mut reader: R, mut writer: W, diff --git a/ben/src/util/mod.rs b/ben/src/util/mod.rs index d09d240..ab5e166 100644 --- a/ben/src/util/mod.rs +++ b/ben/src/util/mod.rs @@ -1 +1,3 @@ +//! Small utility helpers shared across the crate. + pub mod rle; diff --git a/ben/src/util/rle/mod.rs b/ben/src/util/rle/mod.rs index 43ec5f9..aa5c87b 100644 --- a/ben/src/util/rle/mod.rs +++ b/ben/src/util/rle/mod.rs @@ -1,6 +1,14 @@ //! Utility functions for run-length encoding assignment vectors. /// Convert a vector of assignments to a run-length encoded (RLE) vector. +/// +/// # Arguments +/// +/// * `assign_vec` - The full assignment vector. +/// +/// # Returns +/// +/// Returns the assignment vector as `(value, count)` pairs. pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { let mut prev_assign: u16 = 0; let mut count: u16 = 0; @@ -30,6 +38,14 @@ pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { } /// Convert a run-length encoded (RLE) vector to a vector of assignments. +/// +/// # Arguments +/// +/// * `rle_vec` - The run-length encoded assignment vector. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec { let mut output_vec: Vec = Vec::new(); for (val, len) in rle_vec { From 72fcaf1712437b48de7bd9282d7b33e7858a6371 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:09:31 -0600 Subject: [PATCH 011/221] Try to make relabeling faster --- ben/src/ops/relabel/mod.rs | 67 +++++++++++++++++++++++++------- ben/src/util/rle/mod.rs | 78 +++++++++++++++++++++++++++----------- 2 files changed, 110 insertions(+), 35 deletions(-) diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index c8f4ac9..12ed6ed 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -2,12 +2,41 @@ use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; -use crate::util::rle::{assign_to_rle, rle_to_vec}; +use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; use std::io::{self, Error, Read, Write}; +/// Convert a sparse permutation map into a dense index vector. +/// +/// # Arguments +/// +/// * `new_to_old_node_map` - The sparse map from new index to old index. +/// +/// # Returns +/// +/// Returns a dense permutation vector where `perm[new_idx] == old_idx`. +fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result> { + let Some(max_key) = new_to_old_node_map.keys().copied().max() else { + return Ok(Vec::new()); + }; + + let mut permutation = vec![usize::MAX; max_key + 1]; + for (&new_idx, &old_idx) in new_to_old_node_map { + permutation[new_idx] = old_idx; + } + + if permutation.iter().any(|&old_idx| old_idx == usize::MAX) { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Relabel map must contain a contiguous set of new indices", + )); + } + + Ok(permutation) +} + /// Canonicalize the labels used inside each BEN frame. /// /// Labels are reassigned in first-seen order within each assignment vector, @@ -29,6 +58,7 @@ pub fn relabel_ben_lines( variant: BenVariant, ) -> io::Result<()> { let mut sample_number = 0; + let mut label_map = HashMap::new(); loop { let mut tmp_buffer = [0u8]; let max_val_bits = match reader.read_exact(&mut tmp_buffer) { @@ -47,7 +77,8 @@ pub fn relabel_ben_lines( let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; let mut label = 0; - let mut label_map = HashMap::new(); + label_map.clear(); + label_map.reserve(ben_line.len()); for (val, _len) in &mut ben_line { let new_val = match label_map.get(val) { Some(v) => *v, @@ -137,6 +168,10 @@ pub fn relabel_ben_lines_with_map( variant: BenVariant, ) -> io::Result<()> { let mut sample_number = 0; + let permutation = dense_permutation(&new_to_old_node_map)?; + let mut assignment_vec = Vec::new(); + let mut new_assignment_vec = vec![0u16; permutation.len()]; + let mut new_rle = Vec::new(); loop { let mut tmp_buffer = [0u8]; let max_val_bits = match reader.read_exact(&mut tmp_buffer) { @@ -153,20 +188,26 @@ pub fn relabel_ben_lines_with_map( let n_bytes = reader.read_u32::()?; let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + rle_to_vec_in_place(&ben_line, &mut assignment_vec); - let assignment_vec = rle_to_vec(ben_line); - let new_assignment_vec = assignment_vec - .iter() - .enumerate() - .map(|(i, _)| { - let new_val_pos = new_to_old_node_map.get(&i).unwrap(); - assignment_vec[*new_val_pos] - }) - .collect::>(); + if assignment_vec.len() != permutation.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "Relabel map length {} does not match assignment length {}", + permutation.len(), + assignment_vec.len() + ), + )); + } + + for (new_idx, &old_idx) in permutation.iter().enumerate() { + new_assignment_vec[new_idx] = assignment_vec[old_idx]; + } - let new_rle = assign_to_rle(new_assignment_vec); + assign_slice_to_rle(&new_assignment_vec, &mut new_rle); - let relabeled = encode_ben_vec_from_rle(new_rle); + let relabeled = encode_ben_vec_from_rle(new_rle.clone()); writer.write_all(&relabeled)?; let count_occurrences = if variant == BenVariant::MkvChain { diff --git a/ben/src/util/rle/mod.rs b/ben/src/util/rle/mod.rs index aa5c87b..0f4d532 100644 --- a/ben/src/util/rle/mod.rs +++ b/ben/src/util/rle/mod.rs @@ -10,12 +10,66 @@ /// /// Returns the assignment vector as `(value, count)` pairs. pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { + let mut rle_vec: Vec<(u16, u16)> = Vec::new(); + assign_slice_to_rle(&assign_vec, &mut rle_vec); + rle_vec +} + +/// Convert a run-length encoded (RLE) vector to a vector of assignments. +/// +/// # Arguments +/// +/// * `rle_vec` - The run-length encoded assignment vector. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec { + let mut output_vec: Vec = Vec::new(); + rle_to_vec_in_place(&rle_vec, &mut output_vec); + output_vec +} + +/// Expand an RLE vector into a provided output buffer. +/// +/// # Arguments +/// +/// * `rle_vec` - The run-length encoded assignment vector. +/// * `output_vec` - The buffer that will receive the expanded assignments. +/// +/// # Returns +/// +/// This function does not return a value. +pub(crate) fn rle_to_vec_in_place(rle_vec: &[(u16, u16)], output_vec: &mut Vec) { + output_vec.clear(); + let total_len: usize = rle_vec.iter().map(|(_, len)| *len as usize).sum(); + if output_vec.capacity() < total_len { + output_vec.reserve(total_len - output_vec.capacity()); + } + for &(val, len) in rle_vec { + for _ in 0..len { + output_vec.push(val); + } + } +} + +/// Encode an assignment slice into a provided RLE output buffer. +/// +/// # Arguments +/// +/// * `assign_vec` - The full assignment vector. +/// * `rle_vec` - The buffer that will receive `(value, count)` pairs. +/// +/// # Returns +/// +/// This function does not return a value. +pub(crate) fn assign_slice_to_rle(assign_vec: &[u16], rle_vec: &mut Vec<(u16, u16)>) { + rle_vec.clear(); let mut prev_assign: u16 = 0; let mut count: u16 = 0; let mut first = true; - let mut rle_vec: Vec<(u16, u16)> = Vec::new(); - for assign in assign_vec { + for &assign in assign_vec { if first { prev_assign = assign; count = 1; @@ -34,26 +88,6 @@ pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { if count > 0 { rle_vec.push((prev_assign, count)); } - rle_vec -} - -/// Convert a run-length encoded (RLE) vector to a vector of assignments. -/// -/// # Arguments -/// -/// * `rle_vec` - The run-length encoded assignment vector. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec { - let mut output_vec: Vec = Vec::new(); - for (val, len) in rle_vec { - for _ in 0..len { - output_vec.push(val); - } - } - output_vec } #[cfg(test)] From 34c3907a5e99061c4b928ebbab8be1f86b85b1c5 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:22:52 -0600 Subject: [PATCH 012/221] fix pben counting logic --- ben/src/io/reader.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index f455091..5fc7234 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -186,11 +186,11 @@ impl BenDecoder { while let Some(result_tuple) = self.next() { match result_tuple { Ok((assignment, count)) => { - for _ in 0..count { - self.sample_count += 1; + let starting_sample = self.sample_count + 1 - count as usize; + for offset in 0..count as usize { let line = json!({ "assignment": assignment, - "sample": self.sample_count, + "sample": starting_sample + offset, }) .to_string() + "\n"; @@ -320,10 +320,8 @@ impl Iterator for BenDecoder { Ok(assgn) => assgn, Err(e) => return Some(Err(e)), }; - progress!( - "Decoding sample: {}\r", - self.sample_count + ben_frame.count as usize - ); + self.sample_count += ben_frame.count as usize; + progress!("Decoding sample: {}\r", self.sample_count); Some(Ok((assignment, ben_frame.count))) } } From 475c6075ade8ae953052bfea53f7ffb983e1bd3d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:26:23 -0600 Subject: [PATCH 013/221] speed up pben conversoin logic --- ben/src/cli/pben.rs | 106 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 18 deletions(-) diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 4ae0ca2..45c208c 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -1,4 +1,4 @@ -use crate::cli::common::set_verbose; +use crate::cli::common::{check_overwrite, set_verbose}; use crate::io::reader::BenDecoder; use crate::io::writer::{BenEncoder, XBenEncoder}; use crate::BenVariant; @@ -60,13 +60,18 @@ pub fn run() -> Result<()> { Mode::BenToPc => { tracing::trace!("Converting BEN to PCOMPRESS"); - let ben_reader: Box = match args.input_file { - Some(file) => Box::new(BufReader::new(File::open(&file).unwrap())), + let ben_reader: Box = match args.input_file.as_ref() { + Some(file) => Box::new(BufReader::new(File::open(file).unwrap())), None => Box::new(io::stdin()), }; - let mut pcompress_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + let mut pcompress_writer: BufWriter> = match resolved_output_path( + Mode::BenToPc, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), None => BufWriter::new(Box::new(io::stdout())), }; @@ -83,13 +88,18 @@ pub fn run() -> Result<()> { Mode::PcToBen => { tracing::trace!("Converting PCOMPRESS to BEN"); - let mut pcompress_reader: BufReader> = match args.input_file { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), + let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), None => BufReader::new(Box::new(io::stdin())), }; - let mut ben_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + let mut ben_writer: BufWriter> = match resolved_output_path( + Mode::PcToBen, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), None => BufWriter::new(Box::new(io::stdout())), }; @@ -106,13 +116,18 @@ pub fn run() -> Result<()> { Mode::PcToXben => { tracing::trace!("Converting PCOMPRESS to XBEN"); - let mut pcompress_reader: BufReader> = match args.input_file { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(&file).unwrap()))), + let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), None => BufReader::new(Box::new(io::stdin())), }; - let mut ben_writer: BufWriter> = match args.output_file { - Some(file) => BufWriter::new(Box::new(File::create(&file).unwrap())), + let mut ben_writer: BufWriter> = match resolved_output_path( + Mode::PcToXben, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), None => BufWriter::new(Box::new(io::stdout())), }; @@ -129,18 +144,53 @@ pub fn run() -> Result<()> { } } +/// Resolve the output file path for a `pben` mode. +fn resolved_output_path( + mode: Mode, + input_file: Option<&str>, + output_file: Option<&str>, + overwrite: bool, +) -> io::Result> { + let Some(path) = output_file + .map(ToOwned::to_owned) + .or_else(|| input_file.map(|input| derive_output_path(mode, input))) + else { + return Ok(None); + }; + + check_overwrite(&path, overwrite)?; + Ok(Some(path)) +} + +/// Derive the default output file name for a `pben` conversion mode. +fn derive_output_path(mode: Mode, input_file: &str) -> String { + match mode { + Mode::BenToPc => input_file + .strip_suffix(".ben") + .map(|prefix| format!("{prefix}.pcompress")) + .unwrap_or_else(|| format!("{input_file}.pcompress")), + Mode::PcToBen => input_file + .strip_suffix(".pcompress") + .or_else(|| input_file.strip_suffix(".pc")) + .map(|prefix| format!("{prefix}.ben")) + .unwrap_or_else(|| format!("{input_file}.ben")), + Mode::PcToXben => input_file + .strip_suffix(".pcompress") + .or_else(|| input_file.strip_suffix(".pc")) + .map(|prefix| format!("{prefix}.xben")) + .unwrap_or_else(|| format!("{input_file}.xben")), + } +} + /// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { let ben_reader = BenDecoder::new(&mut reader)?; + let mut line = String::new(); for result in ben_reader { match result { Ok((assignment, count)) => { - let assignment: Vec = assignment - .into_iter() - .map(|x| x.saturating_sub(1) as usize) - .collect(); - let line = serde_json::to_string(&assignment).unwrap(); + render_zero_based_assignment_line(&assignment, &mut line); for _ in 0..count { writeln!(writer, "{line}")?; } @@ -152,6 +202,19 @@ fn assignment_decode_ben(mut reader: R, mut writer: W) -> io: Ok(()) } +/// Render a BEN assignment vector as a zero-based JSON array for PCOMPRESS. +fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { + output.clear(); + output.push('['); + for (idx, value) in assignment.iter().enumerate() { + if idx > 0 { + output.push(','); + } + output.push_str(&value.saturating_sub(1).to_string()); + } + output.push(']'); +} + /// Read zero-based assignment vectors and encode them as BEN. fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain); @@ -222,6 +285,13 @@ mod tests { assert!(args.verbose); } + #[test] + fn derive_output_path_replaces_expected_suffixes() { + assert_eq!(derive_output_path(Mode::BenToPc, "plans.ben"), "plans.pcompress"); + assert_eq!(derive_output_path(Mode::PcToBen, "plans.pcompress"), "plans.ben"); + assert_eq!(derive_output_path(Mode::PcToXben, "plans.pc"), "plans.xben"); + } + #[test] fn assignment_decode_ben_writes_json_lines() { let jsonl = br#"{"assignment":[1,1,2],"sample":1} From 34ead308fe6eec5880ddb232c0e101935f00b64e Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:54:42 -0600 Subject: [PATCH 014/221] Add in spectral and Cuthill-McKee ordering to just see how well it does --- ben/src/cli/reben.rs | 122 ++++++++--- ben/src/json/graph/mod.rs | 396 ++++++++++++++++++++++++++++++++---- ben/src/json/graph/tests.rs | 56 +++++ ben/tests/test_cli.rs | 57 +++++- 4 files changed, 563 insertions(+), 68 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 3f86ccc..9ccb7d6 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -1,6 +1,6 @@ use crate::cli::common::set_verbose; use crate::{ - json::graph::sort_json_file_by_key, + json::graph::{sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod}, ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, }; use clap::{Parser, ValueEnum}; @@ -19,6 +19,15 @@ enum Mode { Ben, } +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Topology-based ordering methods for JSON graph relabeling. +enum OrderingMethod { + /// Spectral ordering based on the graph Laplacian. + Spectral, + /// Reverse Cuthill-McKee ordering. + ReverseCuthillMckee, +} + #[derive(Parser, Debug)] #[command( name = "Relabeling Binary Ensemble CLI Tool", @@ -40,6 +49,9 @@ struct Args { /// Key to sort the JSON or BEN file by. #[arg(short, long)] key: Option, + /// Topology-based ordering method to use instead of a key sort. + #[arg(long, value_enum)] + ordering: Option, /// Shape file to use for sorting the BEN file. Only needed /// in BEN mode when a map is not provided. #[arg(short, long)] @@ -48,9 +60,9 @@ struct Args { #[arg(short = 'p', long)] map_file: Option, /// Mode to run the program in (either JSON or BEN). - /// The JSON mode will sort a JSON file by a given key. - /// The BEN mode will relabel a BEN file according to a map file - /// or a key (the latter also requires a dual-graph file). If no + /// The JSON mode will sort a JSON file by a given key or graph-ordering + /// method. The BEN mode will relabel a BEN file according to a map file + /// or a graph-ordering request (which also requires a dual-graph file). If no /// map file or key is provided, the BEN mode will canonicalize /// the assignment vectors in the BEN file. #[arg(short, long)] @@ -69,14 +81,14 @@ pub fn run() { Mode::Json => { let input_file = File::open(&args.input_file).expect("Could not open input file."); let reader = BufReader::new(input_file); - - let key = args.key.as_ref().expect("No key provided."); + let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref()) + .expect("Provide either --key or --ordering."); let output_file_name = match args.output_file { Some(name) => name, None => { args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", key).as_str() + + format!("_sorted_by_{}.json", label).as_str() } }; @@ -84,10 +96,18 @@ pub fn run() { File::create(&output_file_name).expect("Could not create output file."); let writer = BufWriter::new(output_file); - let map = sort_json_file_by_key(reader, writer, key); + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(reader, writer, key) + } else { + sort_json_file_by_ordering( + reader, + writer, + to_graph_ordering(args.ordering.as_ref().unwrap()), + ) + }; let map_file_name = args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", key).as_str() + + format!("_sorted_by_{}", label).as_str() + "_map.json"; let map_file = File::create(map_file_name).expect("Could not create map file."); let mut map_writer = BufWriter::new(map_file); @@ -95,7 +115,8 @@ pub fn run() { let map_json = json!({ "input_file": args.input_file, "output_file": output_file_name, - "key": key, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), "relabeling_old_to_new_nodes_map": map.unwrap() }); @@ -107,7 +128,7 @@ pub fn run() { let input_file = File::open(&args.input_file).expect("Could not open input file."); let reader = BufReader::new(input_file); - if args.map_file.is_none() && args.key.is_none() { + if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { tracing::trace!("Canonicalizing assignment vectors in ben file."); let output_file_name = match args.output_file { @@ -126,21 +147,23 @@ pub fn run() { return; } - if args.map_file.is_some() && args.key.is_some() { + if args.map_file.is_some() && (args.key.is_some() || args.ordering.is_some()) { panic!(concat!( - "Cannot provide both a map file and a key. ", - "Please provide either the map file or the key and the ", + "Cannot provide both a map file and a sorting option. ", + "Please provide either the map file or the key/ordering and the ", "(JSON formatted) dual-graph file needed to generate a map file." )); } let mut map_file_name = String::new(); - if let Some(key) = args.key { + if args.key.is_some() || args.ordering.is_some() { if let Some(shape) = args.shape_file { - tracing::trace!("Creating map file for key: {}", key); + let label = + relabeling_label(args.key.as_deref(), args.ordering.as_ref()).unwrap(); + tracing::trace!("Creating map file for ordering: {}", label); let output_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", key).as_str(); + + format!("_sorted_by_{}.json", label).as_str(); let output_file = File::create(&output_file_name).expect("Could not create output file."); @@ -148,10 +171,18 @@ pub fn run() { let shape_reader = BufReader::new(File::open(&shape).expect("Could not open shape file.")); - let map = sort_json_file_by_key(shape_reader, writer, &key); + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(shape_reader, writer, key) + } else { + sort_json_file_by_ordering( + shape_reader, + writer, + to_graph_ordering(args.ordering.as_ref().unwrap()), + ) + }; map_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", key).as_str() + + format!("_sorted_by_{}", label).as_str() + "_map.json"; let map_file = File::create(&map_file_name).expect("Could not create map file."); @@ -160,7 +191,8 @@ pub fn run() { let map_json = json!({ "input_file": args.input_file, "output_file": output_file_name, - "key": key, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), "relabeling_old_to_new_nodes_map": map.unwrap() }); @@ -168,7 +200,7 @@ pub fn run() { .write_all(map_json.to_string().as_bytes()) .expect("Could not write map file."); } else { - panic!("{}", format!("No shape file provided to go with key {:}", key)); + panic!("No shape file provided to go with the requested ordering."); } } @@ -187,13 +219,17 @@ pub fn run() { .map(|(k, v)| (v.as_u64().unwrap() as usize, k.parse::().unwrap())) .collect::>(); - let key = data["key"].as_str().unwrap(); + let label = data["key"] + .as_str() + .map(ToOwned::to_owned) + .or_else(|| data["ordering_method"].as_str().map(ToOwned::to_owned)) + .unwrap_or_else(|| "map".to_string()); let output_file_name = match args.output_file { Some(name) => name, None => { args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + format!("_sorted_by_{}.jsonl.ben", key).as_str() + + format!("_sorted_by_{}.jsonl.ben", label).as_str() } }; let output_file = @@ -207,6 +243,29 @@ pub fn run() { } } +fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { + match ordering { + OrderingMethod::Spectral => GraphOrderingMethod::Spectral, + OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, + } +} + +fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { + match ordering { + OrderingMethod::Spectral => "spectral", + OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", + } +} + +fn relabeling_label(key: Option<&str>, ordering: Option<&OrderingMethod>) -> Option { + match (key, ordering) { + (Some(_), Some(_)) => panic!("Provide either --key or --ordering, not both."), + (Some(key), None) => Some(key.to_string()), + (None, Some(ordering)) => Some(ordering_method_name(ordering).to_string()), + (None, None) => None, + } +} + #[cfg(test)] mod tests { use super::*; @@ -244,4 +303,21 @@ mod tests { assert_eq!(args.output_file.as_deref(), Some("sorted.json")); assert!(args.verbose); } + + #[test] + fn parse_json_mode_ordering_args() { + let args = Args::try_parse_from([ + "reben", + "dual_graph.json", + "--mode", + "json", + "--ordering", + "spectral", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Json); + assert_eq!(args.ordering, Some(OrderingMethod::Spectral)); + assert!(args.key.is_none()); + } } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 8206bd0..1e8acce 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -2,10 +2,78 @@ use crate::progress; use serde_json::{json, Value}; -use std::collections::HashMap; -use std::io::{Read, Result, Write}; +use std::cmp::Ordering; +use std::collections::{HashMap, VecDeque}; +use std::io::{self, Read, Result, Write}; use std::result::Result as StdResult; +/// Topology-based graph ordering methods supported by `reben`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GraphOrderingMethod { + /// Order nodes using the Fiedler-vector style spectral ordering. + Spectral, + /// Order nodes using Reverse Cuthill-McKee. + ReverseCuthillMckee, +} + +#[derive(Clone)] +struct GraphJson { + data: Value, + nodes: Vec, + adjacency: Vec>, + node_ids: Vec, + adjacency_indices: Vec>, +} + +impl GraphJson { + fn from_reader(reader: R) -> io::Result { + let data: Value = serde_json::from_reader(reader)?; + let nodes = data["nodes"].as_array().cloned().unwrap_or_default(); + let adjacency = data["adjacency"] + .as_array() + .map(|rows| { + rows.iter() + .map(|row| row.as_array().cloned().unwrap_or_default()) + .collect::>() + }) + .unwrap_or_default(); + + let node_ids = nodes + .iter() + .map(parse_node_id) + .collect::>>()?; + let id_to_index = node_ids + .iter() + .enumerate() + .map(|(idx, &id)| (id, idx)) + .collect::>(); + let adjacency_indices = adjacency + .iter() + .map(|row| { + row.iter() + .map(|link| { + let id = parse_link_id(link)?; + id_to_index.get(&id).copied().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Adjacency references unknown node id {id}"), + ) + }) + }) + .collect::>>() + }) + .collect::>>()?; + + Ok(Self { + data, + nodes, + adjacency, + node_ids, + adjacency_indices, + }) + } +} + /// Sorts a JSON-formatted NetworkX graph file by a key. /// /// # Arguments @@ -20,66 +88,306 @@ use std::result::Result as StdResult; /// Returns a map from the original node id to the new node id. pub fn sort_json_file_by_key( reader: R, - mut writer: W, + writer: W, key: &str, ) -> Result> { tracing::trace!("Loading JSON file..."); - let mut data: Value = serde_json::from_reader(reader).unwrap(); + let graph = GraphJson::from_reader(reader)?; + let mut order: Vec = (0..graph.nodes.len()).collect(); tracing::trace!("Sorting JSON file by key: {}", key); - if let Some(nodes) = data["nodes"].as_array_mut() { - nodes.sort_by(|a, b| { - let extract_value = |val: &Value| -> StdResult { - match &val[key] { - Value::String(s) => s.parse::().map_err(|_| s.clone()), - Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()), - _ => Err(val[key].to_string()), - } - }; + order.sort_by(|&a, &b| compare_node_key(&graph.nodes[a], &graph.nodes[b], key)); - match (extract_value(a), extract_value(b)) { - (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), - (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), - (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), - (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), - } - }); - } + reorder_graph(graph, order, writer) +} - let mut node_map = HashMap::new(); - let mut rev_node_map = HashMap::new(); - if let Some(nodes) = data["nodes"].as_array_mut() { - for (i, node) in nodes.iter_mut().enumerate() { - progress!("Relabeling node: {}\r", i + 1); - node_map.insert(node["id"].to_string().parse::().unwrap(), i); - rev_node_map.insert(i, node["id"].to_string().parse::().unwrap()); - node["id"] = json!(i); +/// Reorder a JSON-formatted NetworkX graph file using a topology-based method. +/// +/// # Arguments +/// +/// * `reader` - The source JSON graph in the NetworkX node-link style used by +/// the relabeling workflow. +/// * `writer` - The destination for the reordered JSON graph. +/// * `method` - The topology-based ordering algorithm to apply. +/// +/// # Returns +/// +/// Returns a map from the original node id to the new node id. +pub fn sort_json_file_by_ordering( + reader: R, + writer: W, + method: GraphOrderingMethod, +) -> Result> { + tracing::trace!("Loading JSON file..."); + let graph = GraphJson::from_reader(reader)?; + tracing::trace!("Sorting JSON file by ordering method: {:?}", method); + + let order = match method { + GraphOrderingMethod::Spectral => spectral_order(&graph), + GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), + }; + + reorder_graph(graph, order, writer) +} + +fn parse_node_id(node: &Value) -> io::Result { + node["id"].as_u64().map(|v| v as usize).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Node id is not an unsigned integer: {}", node["id"]), + ) + }) +} + +fn parse_link_id(link: &Value) -> io::Result { + link["id"].as_u64().map(|v| v as usize).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Edge target id is not an unsigned integer: {}", link["id"]), + ) + }) +} + +fn compare_node_key(a: &Value, b: &Value, key: &str) -> Ordering { + let extract_value = |val: &Value| -> StdResult { + match &val[key] { + Value::String(s) => s.parse::().map_err(|_| s.clone()), + Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()), + _ => Err(val[key].to_string()), } + }; + + match (extract_value(a), extract_value(b)) { + (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), + (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), + (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), + (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), + } +} + +fn reorder_graph( + mut graph: GraphJson, + order: Vec, + mut writer: W, +) -> io::Result> { + let mut old_id_to_new = HashMap::with_capacity(order.len()); + let mut new_nodes = Vec::with_capacity(order.len()); + let mut new_adjacency = Vec::with_capacity(order.len()); + + for (new_idx, &old_idx) in order.iter().enumerate() { + progress!("Relabeling node: {}\r", new_idx + 1); + old_id_to_new.insert(graph.node_ids[old_idx], new_idx); } tracing::trace!(""); - let mut edge_array = Vec::new(); - if let Some(edges) = data["adjacency"].as_array() { - for i in 0..edges.len() { - progress!("Relabeling edge: {}\r", i + 1); - let edge_list_location = - rev_node_map[&data["nodes"][i]["id"].to_string().parse::().unwrap()]; - let mut new_edge_lst = edges[edge_list_location].as_array().unwrap().clone(); - for link in &mut new_edge_lst { - let new = node_map[&link["id"].to_string().parse::().unwrap()]; - link["id"] = json!(new); - } - edge_array.push(new_edge_lst); + for (new_idx, &old_idx) in order.iter().enumerate() { + let mut node = graph.nodes[old_idx].clone(); + node["id"] = json!(new_idx); + new_nodes.push(node); + } + + for (new_idx, &old_idx) in order.iter().enumerate() { + progress!("Relabeling edge: {}\r", new_idx + 1); + let mut new_edge_lst = graph.adjacency[old_idx].clone(); + for link in &mut new_edge_lst { + let old_neighbor_id = parse_link_id(link)?; + let new_neighbor = old_id_to_new[&old_neighbor_id]; + link["id"] = json!(new_neighbor); } + new_adjacency.push(Value::Array(new_edge_lst)); } tracing::trace!(""); - data["adjacency"] = json!(edge_array); + graph.data["nodes"] = Value::Array(new_nodes); + graph.data["adjacency"] = Value::Array(new_adjacency); tracing::trace!("Writing new json to file..."); - writer.write_all(serde_json::to_string(&data).unwrap().as_bytes())?; + let rendered = serde_json::to_string(&graph.data) + .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?; + writer.write_all(rendered.as_bytes())?; + + Ok(old_id_to_new) +} + +fn connected_components(graph: &GraphJson) -> Vec> { + let n = graph.nodes.len(); + let mut seen = vec![false; n]; + let mut components = Vec::new(); - Ok(node_map) + for start in 0..n { + if seen[start] { + continue; + } + let mut queue = VecDeque::from([start]); + let mut component = Vec::new(); + seen[start] = true; + + while let Some(node) = queue.pop_front() { + component.push(node); + for &neighbor in &graph.adjacency_indices[node] { + if !seen[neighbor] { + seen[neighbor] = true; + queue.push_back(neighbor); + } + } + } + + components.push(component); + } + + components.sort_by_key(|component| graph.node_ids[component[0]]); + components +} + +fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { + let mut order = Vec::with_capacity(graph.nodes.len()); + let degrees = graph + .adjacency_indices + .iter() + .map(Vec::len) + .collect::>(); + + for component in connected_components(graph) { + let component_set = component.iter().copied().collect::>(); + let start = component + .iter() + .copied() + .min_by_key(|&node| (degrees[node], graph.node_ids[node])) + .unwrap(); + + let mut visited = HashMap::new(); + let mut queue = VecDeque::from([start]); + visited.insert(start, true); + let mut component_order = Vec::with_capacity(component.len()); + + while let Some(node) = queue.pop_front() { + component_order.push(node); + let mut neighbors = graph.adjacency_indices[node] + .iter() + .copied() + .filter(|neighbor| component_set.contains(neighbor) && !visited.contains_key(neighbor)) + .collect::>(); + neighbors.sort_by_key(|&neighbor| (degrees[neighbor], graph.node_ids[neighbor])); + for neighbor in neighbors { + visited.insert(neighbor, true); + queue.push_back(neighbor); + } + } + + component_order.reverse(); + order.extend(component_order); + } + + order +} + +fn spectral_order(graph: &GraphJson) -> Vec { + let mut order = Vec::with_capacity(graph.nodes.len()); + let degrees = graph + .adjacency_indices + .iter() + .map(Vec::len) + .collect::>(); + + for component in connected_components(graph) { + if component.len() <= 2 { + let mut tiny = component.clone(); + tiny.sort_by_key(|&node| graph.node_ids[node]); + order.extend(tiny); + continue; + } + + let local_index = component + .iter() + .enumerate() + .map(|(idx, &node)| (node, idx)) + .collect::>(); + let max_degree = component + .iter() + .map(|&node| degrees[node]) + .max() + .unwrap_or(0) as f64; + + let mut x = component + .iter() + .map(|&node| pseudo_random_seed(graph.node_ids[node])) + .collect::>(); + orthogonalize_to_constant(&mut x); + normalize(&mut x); + + if x.iter().all(|value| value.abs() < 1e-12) { + for (idx, value) in x.iter_mut().enumerate() { + *value = idx as f64; + } + orthogonalize_to_constant(&mut x); + normalize(&mut x); + } + + let mut y = vec![0.0; component.len()]; + for _ in 0..128 { + for (local_idx, &node) in component.iter().enumerate() { + let degree = degrees[node] as f64; + let neighbor_sum = graph.adjacency_indices[node] + .iter() + .filter_map(|neighbor| local_index.get(neighbor).copied()) + .map(|neighbor_local| x[neighbor_local]) + .sum::(); + y[local_idx] = neighbor_sum + (max_degree - degree) * x[local_idx]; + } + + orthogonalize_to_constant(&mut y); + normalize(&mut y); + + let diff = x + .iter() + .zip(&y) + .map(|(a, b)| (a - b).abs()) + .fold(0.0_f64, f64::max); + x.copy_from_slice(&y); + if diff < 1e-10 { + break; + } + } + + let mut component_order = component + .iter() + .enumerate() + .map(|(local_idx, &node)| (node, x[local_idx])) + .collect::>(); + component_order.sort_by(|(a_node, a_val), (b_node, b_val)| { + a_val + .partial_cmp(b_val) + .unwrap_or(Ordering::Equal) + .then_with(|| graph.node_ids[*a_node].cmp(&graph.node_ids[*b_node])) + }); + order.extend(component_order.into_iter().map(|(node, _)| node)); + } + + order +} + +fn pseudo_random_seed(node_id: usize) -> f64 { + let raw = node_id.wrapping_mul(1_103_515_245).wrapping_add(12_345) % 1_000; + raw as f64 / 500.0 - 1.0 +} + +fn orthogonalize_to_constant(values: &mut [f64]) { + if values.is_empty() { + return; + } + let mean = values.iter().sum::() / values.len() as f64; + for value in values { + *value -= mean; + } +} + +fn normalize(values: &mut [f64]) { + let norm = values.iter().map(|value| value * value).sum::().sqrt(); + if norm > 0.0 { + for value in values { + *value /= norm; + } + } } #[cfg(test)] diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 4e9e45a..6d08eab 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -1,6 +1,23 @@ use super::*; use serde_json::Value; +fn path_graph_json() -> &'static [u8] { + br#"{ + "nodes": [ + {"id": 0}, + {"id": 1}, + {"id": 2}, + {"id": 3} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 0}, {"id": 2}], + [{"id": 1}, {"id": 3}], + [{"id": 2}] + ] + }"# +} + #[test] fn test_relabel_small_file() { let input = r#"{ @@ -267,3 +284,42 @@ fn test_sort_json_file_by_key_without_nodes_or_edges() { assert_eq!(output_json["graph"], serde_json::json!([])); assert_eq!(output_json["directed"], false); } + +#[test] +fn test_sort_json_file_by_reverse_cuthill_mckee() { + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + path_graph_json(), + &mut output, + GraphOrderingMethod::ReverseCuthillMckee, + ) + .unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(mapping.get(&3), Some(&0)); + assert_eq!(mapping.get(&2), Some(&1)); + assert_eq!(mapping.get(&1), Some(&2)); + assert_eq!(mapping.get(&0), Some(&3)); + assert_eq!(output_json["adjacency"][0][0]["id"], 1); +} + +#[test] +fn test_sort_json_file_by_spectral_ordering() { + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + path_graph_json(), + &mut output, + GraphOrderingMethod::Spectral, + ) + .unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + let endpoint_positions = [mapping[&0], mapping[&3]]; + let middle_positions = [mapping[&1], mapping[&2]]; + + assert!(endpoint_positions.contains(&0)); + assert!(endpoint_positions.contains(&3)); + assert!(middle_positions.contains(&1)); + assert!(middle_positions.contains(&2)); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); +} diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 64d7c46..6b90949 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -944,7 +944,8 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations temp.path(), ); assert_failure(&both); - assert!(String::from_utf8_lossy(&both.stderr).contains("Cannot provide both a map file and a key")); + assert!(String::from_utf8_lossy(&both.stderr) + .contains("Cannot provide both a map file and a sorting option")); let missing_shape = run( "reben", @@ -964,6 +965,60 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations assert_eq!(sorted_json["nodes"][0]["GEOID20"], "A"); } +#[test] +fn reben_cli_supports_spectral_and_rcm_orderings() { + let temp = TempDir::new("reben-orderings"); + let graph_path = temp.path().join("shape.json"); + let spectral_path = temp.path().join("spectral.json"); + let rcm_path = temp.path().join("rcm.json"); + + fs::write(&graph_path, sample_graph()).unwrap(); + + let spectral = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--ordering", + "spectral", + "--output-file", + spectral_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&spectral); + assert!(temp.path().join("shape_sorted_by_spectral_map.json").exists()); + + let rcm = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--ordering", + "reverse-cuthill-mckee", + "--output-file", + rcm_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&rcm); + assert!(temp + .path() + .join("shape_sorted_by_reverse-cuthill-mckee_map.json") + .exists()); + + let spectral_json: Value = + serde_json::from_str(&fs::read_to_string(&spectral_path).unwrap()).unwrap(); + let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); + assert_eq!( + spectral_json["nodes"].as_array().unwrap().len(), + rcm_json["nodes"].as_array().unwrap().len() + ); + assert!(!spectral_json["nodes"].as_array().unwrap().is_empty()); +} + #[test] fn pben_cli_converts_between_formats() { let temp = TempDir::new("pben"); From f737f333e0ed59b5db8a2e3e37cfb49256e19855 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:25:11 -0600 Subject: [PATCH 015/221] swap spectral for nested dissection --- ben/src/cli/reben.rs | 12 +- ben/src/json/graph/mod.rs | 231 ++++++++++++++++++++++-------------- ben/src/json/graph/tests.rs | 15 +-- ben/tests/test_cli.rs | 25 ++-- 4 files changed, 169 insertions(+), 114 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 9ccb7d6..2889280 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -22,8 +22,8 @@ enum Mode { #[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] /// Topology-based ordering methods for JSON graph relabeling. enum OrderingMethod { - /// Spectral ordering based on the graph Laplacian. - Spectral, + /// Nested dissection ordering based on recursive graph separators. + NestedDissection, /// Reverse Cuthill-McKee ordering. ReverseCuthillMckee, } @@ -245,14 +245,14 @@ pub fn run() { fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { match ordering { - OrderingMethod::Spectral => GraphOrderingMethod::Spectral, + OrderingMethod::NestedDissection => GraphOrderingMethod::NestedDissection, OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, } } fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { match ordering { - OrderingMethod::Spectral => "spectral", + OrderingMethod::NestedDissection => "nested-dissection", OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", } } @@ -312,12 +312,12 @@ mod tests { "--mode", "json", "--ordering", - "spectral", + "nested-dissection", ]) .unwrap(); assert_eq!(args.mode, Mode::Json); - assert_eq!(args.ordering, Some(OrderingMethod::Spectral)); + assert_eq!(args.ordering, Some(OrderingMethod::NestedDissection)); assert!(args.key.is_none()); } } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 1e8acce..a0c06b6 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -10,8 +10,8 @@ use std::result::Result as StdResult; /// Topology-based graph ordering methods supported by `reben`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum GraphOrderingMethod { - /// Order nodes using the Fiedler-vector style spectral ordering. - Spectral, + /// Order nodes using a recursive nested-dissection heuristic. + NestedDissection, /// Order nodes using Reverse Cuthill-McKee. ReverseCuthillMckee, } @@ -123,7 +123,7 @@ pub fn sort_json_file_by_ordering( tracing::trace!("Sorting JSON file by ordering method: {:?}", method); let order = match method { - GraphOrderingMethod::Spectral => spectral_order(&graph), + GraphOrderingMethod::NestedDissection => nested_dissection_order(&graph), GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), }; @@ -281,113 +281,168 @@ fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { order } -fn spectral_order(graph: &GraphJson) -> Vec { +fn nested_dissection_order(graph: &GraphJson) -> Vec { let mut order = Vec::with_capacity(graph.nodes.len()); - let degrees = graph - .adjacency_indices - .iter() - .map(Vec::len) - .collect::>(); for component in connected_components(graph) { - if component.len() <= 2 { - let mut tiny = component.clone(); - tiny.sort_by_key(|&node| graph.node_ids[node]); - order.extend(tiny); - continue; - } + order.extend(nested_dissection_component(graph, component)); + } + + order +} + +fn nested_dissection_component(graph: &GraphJson, component: Vec) -> Vec { + if component.len() <= 8 { + let mut base = component; + base.sort_by_key(|&node| graph.node_ids[node]); + return base; + } + + let component_mask = subset_mask(graph.nodes.len(), &component); + let start = component + .iter() + .copied() + .min_by_key(|&node| (graph.adjacency_indices[node].len(), graph.node_ids[node])) + .unwrap(); + let a = farthest_node_in_subset(graph, start, &component_mask); + let (b, dist_from_a) = farthest_node_with_distances(graph, a, &component_mask); + let dist_from_b = bfs_distances(graph, b, &component_mask); + + let Some(max_dist) = dist_from_a.iter().flatten().copied().max() else { + let mut base = component; + base.sort_by_key(|&node| graph.node_ids[node]); + return base; + }; + + let separator_target = max_dist / 2; + let mut separator = component + .iter() + .copied() + .filter(|&node| dist_from_a[node] == Some(separator_target)) + .collect::>(); - let local_index = component + if separator.is_empty() { + let best_delta = component .iter() - .enumerate() - .map(|(idx, &node)| (node, idx)) - .collect::>(); - let max_degree = component + .filter_map(|&node| Some((node, dist_from_a[node]?, dist_from_b[node]?))) + .map(|(_, da, db)| da.abs_diff(db)) + .min() + .unwrap_or(0); + separator = component .iter() - .map(|&node| degrees[node]) - .max() - .unwrap_or(0) as f64; + .copied() + .filter(|&node| { + matches!( + (dist_from_a[node], dist_from_b[node]), + (Some(da), Some(db)) if da.abs_diff(db) == best_delta + ) + }) + .collect(); + } - let mut x = component - .iter() - .map(|&node| pseudo_random_seed(graph.node_ids[node])) - .collect::>(); - orthogonalize_to_constant(&mut x); - normalize(&mut x); - - if x.iter().all(|value| value.abs() < 1e-12) { - for (idx, value) in x.iter_mut().enumerate() { - *value = idx as f64; - } - orthogonalize_to_constant(&mut x); - normalize(&mut x); - } + separator.sort_by_key(|&node| graph.node_ids[node]); + let separator_mask = subset_mask(graph.nodes.len(), &separator); + let remaining = component + .iter() + .copied() + .filter(|node| !separator_mask[*node]) + .collect::>(); + let mut subcomponents = connected_components_in_subset(graph, &remaining); + if subcomponents.len() <= 1 { + let mut fallback = component; + fallback.sort_by_key(|&node| graph.node_ids[node]); + return fallback; + } - let mut y = vec![0.0; component.len()]; - for _ in 0..128 { - for (local_idx, &node) in component.iter().enumerate() { - let degree = degrees[node] as f64; - let neighbor_sum = graph.adjacency_indices[node] - .iter() - .filter_map(|neighbor| local_index.get(neighbor).copied()) - .map(|neighbor_local| x[neighbor_local]) - .sum::(); - y[local_idx] = neighbor_sum + (max_degree - degree) * x[local_idx]; - } + subcomponents.sort_by_key(|part| { + part.iter() + .filter_map(|&node| dist_from_a[node]) + .min() + .unwrap_or(usize::MAX) + }); - orthogonalize_to_constant(&mut y); - normalize(&mut y); + let mut order = Vec::with_capacity(component.len()); + for subcomponent in subcomponents { + order.extend(nested_dissection_component(graph, subcomponent)); + } + order.extend(separator); + order +} - let diff = x - .iter() - .zip(&y) - .map(|(a, b)| (a - b).abs()) - .fold(0.0_f64, f64::max); - x.copy_from_slice(&y); - if diff < 1e-10 { - break; +fn subset_mask(size: usize, nodes: &[usize]) -> Vec { + let mut mask = vec![false; size]; + for &node in nodes { + mask[node] = true; + } + mask +} + +fn bfs_distances(graph: &GraphJson, start: usize, allowed: &[bool]) -> Vec> { + let mut distances = vec![None; graph.nodes.len()]; + let mut queue = VecDeque::from([start]); + distances[start] = Some(0); + + while let Some(node) = queue.pop_front() { + let distance = distances[node].unwrap(); + for &neighbor in &graph.adjacency_indices[node] { + if allowed[neighbor] && distances[neighbor].is_none() { + distances[neighbor] = Some(distance + 1); + queue.push_back(neighbor); } } - - let mut component_order = component - .iter() - .enumerate() - .map(|(local_idx, &node)| (node, x[local_idx])) - .collect::>(); - component_order.sort_by(|(a_node, a_val), (b_node, b_val)| { - a_val - .partial_cmp(b_val) - .unwrap_or(Ordering::Equal) - .then_with(|| graph.node_ids[*a_node].cmp(&graph.node_ids[*b_node])) - }); - order.extend(component_order.into_iter().map(|(node, _)| node)); } - order + distances } -fn pseudo_random_seed(node_id: usize) -> f64 { - let raw = node_id.wrapping_mul(1_103_515_245).wrapping_add(12_345) % 1_000; - raw as f64 / 500.0 - 1.0 +fn farthest_node_in_subset(graph: &GraphJson, start: usize, allowed: &[bool]) -> usize { + farthest_node_with_distances(graph, start, allowed).0 } -fn orthogonalize_to_constant(values: &mut [f64]) { - if values.is_empty() { - return; - } - let mean = values.iter().sum::() / values.len() as f64; - for value in values { - *value -= mean; - } +fn farthest_node_with_distances( + graph: &GraphJson, + start: usize, + allowed: &[bool], +) -> (usize, Vec>) { + let distances = bfs_distances(graph, start, allowed); + let farthest = distances + .iter() + .enumerate() + .filter(|(idx, distance)| allowed[*idx] && distance.is_some()) + .max_by_key(|(idx, distance)| (distance.unwrap(), graph.node_ids[*idx])) + .map(|(idx, _)| idx) + .unwrap_or(start); + (farthest, distances) } -fn normalize(values: &mut [f64]) { - let norm = values.iter().map(|value| value * value).sum::().sqrt(); - if norm > 0.0 { - for value in values { - *value /= norm; +fn connected_components_in_subset(graph: &GraphJson, nodes: &[usize]) -> Vec> { + let allowed = subset_mask(graph.nodes.len(), nodes); + let mut seen = vec![false; graph.nodes.len()]; + let mut components = Vec::new(); + + for &start in nodes { + if seen[start] { + continue; } + let mut queue = VecDeque::from([start]); + let mut component = Vec::new(); + seen[start] = true; + + while let Some(node) = queue.pop_front() { + component.push(node); + for &neighbor in &graph.adjacency_indices[node] { + if allowed[neighbor] && !seen[neighbor] { + seen[neighbor] = true; + queue.push_back(neighbor); + } + } + } + + component.sort_by_key(|&node| graph.node_ids[node]); + components.push(component); } + + components } #[cfg(test)] diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 6d08eab..942fbd1 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -304,22 +304,19 @@ fn test_sort_json_file_by_reverse_cuthill_mckee() { } #[test] -fn test_sort_json_file_by_spectral_ordering() { +fn test_sort_json_file_by_nested_dissection() { let mut output = Vec::new(); let mapping = sort_json_file_by_ordering( path_graph_json(), &mut output, - GraphOrderingMethod::Spectral, + GraphOrderingMethod::NestedDissection, ) .unwrap(); let output_json: Value = serde_json::from_slice(&output).unwrap(); - let endpoint_positions = [mapping[&0], mapping[&3]]; - let middle_positions = [mapping[&1], mapping[&2]]; - - assert!(endpoint_positions.contains(&0)); - assert!(endpoint_positions.contains(&3)); - assert!(middle_positions.contains(&1)); - assert!(middle_positions.contains(&2)); + let positions = [mapping[&0], mapping[&1], mapping[&2], mapping[&3]]; + let mut sorted = positions; + sorted.sort_unstable(); + assert_eq!(sorted, [0, 1, 2, 3]); assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 6b90949..ee7c8bd 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -966,29 +966,32 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations } #[test] -fn reben_cli_supports_spectral_and_rcm_orderings() { +fn reben_cli_supports_nested_dissection_and_rcm_orderings() { let temp = TempDir::new("reben-orderings"); let graph_path = temp.path().join("shape.json"); - let spectral_path = temp.path().join("spectral.json"); + let nested_path = temp.path().join("nested.json"); let rcm_path = temp.path().join("rcm.json"); fs::write(&graph_path, sample_graph()).unwrap(); - let spectral = run( + let nested = run( "reben", &[ graph_path.to_str().unwrap(), "--mode", "json", "--ordering", - "spectral", + "nested-dissection", "--output-file", - spectral_path.to_str().unwrap(), + nested_path.to_str().unwrap(), ], temp.path(), ); - assert_success(&spectral); - assert!(temp.path().join("shape_sorted_by_spectral_map.json").exists()); + assert_success(&nested); + assert!(temp + .path() + .join("shape_sorted_by_nested-dissection_map.json") + .exists()); let rcm = run( "reben", @@ -1009,14 +1012,14 @@ fn reben_cli_supports_spectral_and_rcm_orderings() { .join("shape_sorted_by_reverse-cuthill-mckee_map.json") .exists()); - let spectral_json: Value = - serde_json::from_str(&fs::read_to_string(&spectral_path).unwrap()).unwrap(); + let nested_json: Value = + serde_json::from_str(&fs::read_to_string(&nested_path).unwrap()).unwrap(); let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); assert_eq!( - spectral_json["nodes"].as_array().unwrap().len(), + nested_json["nodes"].as_array().unwrap().len(), rcm_json["nodes"].as_array().unwrap().len() ); - assert!(!spectral_json["nodes"].as_array().unwrap().is_empty()); + assert!(!nested_json["nodes"].as_array().unwrap().is_empty()); } #[test] From 07750f0a05f7b82378b51a724e17038805dd9f9f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:45:48 -0600 Subject: [PATCH 016/221] swap nested for mla --- ben/src/cli/reben.rs | 55 +++++-- ben/src/json/graph/mod.rs | 273 +++++++++++++++-------------------- ben/src/json/graph/tests.rs | 4 +- ben/src/ops/relabel/mod.rs | 199 +++++++++++++++++++++++-- ben/src/ops/relabel/tests.rs | 62 ++++++++ ben/tests/test_cli.rs | 119 +++++++++++++-- 6 files changed, 520 insertions(+), 192 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 2889280..917f9e8 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -1,7 +1,10 @@ use crate::cli::common::set_verbose; use crate::{ json::graph::{sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod}, - ops::relabel::{relabel_ben_file, relabel_ben_file_with_map}, + ops::relabel::{ + relabel_ben_file, relabel_ben_file_limit, relabel_ben_file_with_map, + relabel_ben_file_with_map_limit, + }, }; use clap::{Parser, ValueEnum}; use serde_json::{json, Value}; @@ -22,8 +25,9 @@ enum Mode { #[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] /// Topology-based ordering methods for JSON graph relabeling. enum OrderingMethod { - /// Nested dissection ordering based on recursive graph separators. - NestedDissection, + /// Minimum-linear-arrangement heuristic based on graph adjacency alone. + #[clap(alias = "mla")] + MinimumLinearArrangement, /// Reverse Cuthill-McKee ordering. ReverseCuthillMckee, } @@ -67,6 +71,9 @@ struct Args { /// the assignment vectors in the BEN file. #[arg(short, long)] mode: Mode, + /// Only relabel the first `n` expanded samples in BEN mode. + #[arg(long)] + n_items: Option, /// Verbosity level for the program. #[arg(short, long)] verbose: bool, @@ -79,6 +86,9 @@ pub fn run() { match &args.mode { Mode::Json => { + if args.n_items.is_some() { + panic!("--n-items is only supported in BEN mode."); + } let input_file = File::open(&args.input_file).expect("Could not open input file."); let reader = BufReader::new(input_file); let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref()) @@ -143,7 +153,11 @@ pub fn run() { File::create(&output_file_name).expect("Could not create output file."); let writer = BufWriter::new(output_file); - relabel_ben_file(reader, writer).unwrap(); + if let Some(limit) = args.n_items { + relabel_ben_file_limit(reader, writer, limit).unwrap(); + } else { + relabel_ben_file(reader, writer).unwrap(); + } return; } @@ -238,21 +252,28 @@ pub fn run() { tracing::trace!("Relabeling ben file according to map file {}", map_file_name,); - relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); + if let Some(limit) = args.n_items { + relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) + .unwrap(); + } else { + relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); + } } } } fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { match ordering { - OrderingMethod::NestedDissection => GraphOrderingMethod::NestedDissection, + OrderingMethod::MinimumLinearArrangement => { + GraphOrderingMethod::MinimumLinearArrangement + } OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, } } fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { match ordering { - OrderingMethod::NestedDissection => "nested-dissection", + OrderingMethod::MinimumLinearArrangement => "minimum-linear-arrangement", OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", } } @@ -312,12 +333,28 @@ mod tests { "--mode", "json", "--ordering", - "nested-dissection", + "minimum-linear-arrangement", ]) .unwrap(); assert_eq!(args.mode, Mode::Json); - assert_eq!(args.ordering, Some(OrderingMethod::NestedDissection)); + assert_eq!(args.ordering, Some(OrderingMethod::MinimumLinearArrangement)); assert!(args.key.is_none()); } + + #[test] + fn parse_ben_mode_n_items_args() { + let args = Args::try_parse_from([ + "reben", + "samples.jsonl.ben", + "--mode", + "ben", + "--n-items", + "25", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Ben); + assert_eq!(args.n_items, Some(25)); + } } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index a0c06b6..461a681 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -10,8 +10,8 @@ use std::result::Result as StdResult; /// Topology-based graph ordering methods supported by `reben`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum GraphOrderingMethod { - /// Order nodes using a recursive nested-dissection heuristic. - NestedDissection, + /// Order nodes using a minimum-linear-arrangement heuristic. + MinimumLinearArrangement, /// Order nodes using Reverse Cuthill-McKee. ReverseCuthillMckee, } @@ -123,7 +123,7 @@ pub fn sort_json_file_by_ordering( tracing::trace!("Sorting JSON file by ordering method: {:?}", method); let order = match method { - GraphOrderingMethod::NestedDissection => nested_dissection_order(&graph), + GraphOrderingMethod::MinimumLinearArrangement => minimum_linear_arrangement_order(&graph), GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), }; @@ -241,131 +241,81 @@ fn connected_components(graph: &GraphJson) -> Vec> { fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { let mut order = Vec::with_capacity(graph.nodes.len()); + + for component in connected_components(graph) { + order.extend(reverse_cuthill_mckee_component(graph, &component)); + } + + order +} + +fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Vec { let degrees = graph .adjacency_indices .iter() .map(Vec::len) .collect::>(); + let component_set = component.iter().copied().collect::>(); + let start = component + .iter() + .copied() + .min_by_key(|&node| (degrees[node], graph.node_ids[node])) + .unwrap(); - for component in connected_components(graph) { - let component_set = component.iter().copied().collect::>(); - let start = component + let mut visited = vec![false; graph.nodes.len()]; + let mut queue = VecDeque::from([start]); + visited[start] = true; + let mut component_order = Vec::with_capacity(component.len()); + + while let Some(node) = queue.pop_front() { + component_order.push(node); + let mut neighbors = graph.adjacency_indices[node] .iter() .copied() - .min_by_key(|&node| (degrees[node], graph.node_ids[node])) - .unwrap(); - - let mut visited = HashMap::new(); - let mut queue = VecDeque::from([start]); - visited.insert(start, true); - let mut component_order = Vec::with_capacity(component.len()); - - while let Some(node) = queue.pop_front() { - component_order.push(node); - let mut neighbors = graph.adjacency_indices[node] - .iter() - .copied() - .filter(|neighbor| component_set.contains(neighbor) && !visited.contains_key(neighbor)) - .collect::>(); - neighbors.sort_by_key(|&neighbor| (degrees[neighbor], graph.node_ids[neighbor])); - for neighbor in neighbors { - visited.insert(neighbor, true); - queue.push_back(neighbor); - } + .filter(|neighbor| component_set.contains(neighbor) && !visited[*neighbor]) + .collect::>(); + neighbors.sort_by_key(|&neighbor| (degrees[neighbor], graph.node_ids[neighbor])); + for neighbor in neighbors { + visited[neighbor] = true; + queue.push_back(neighbor); } - - component_order.reverse(); - order.extend(component_order); } - order + component_order.reverse(); + component_order } -fn nested_dissection_order(graph: &GraphJson) -> Vec { +fn minimum_linear_arrangement_order(graph: &GraphJson) -> Vec { let mut order = Vec::with_capacity(graph.nodes.len()); for component in connected_components(graph) { - order.extend(nested_dissection_component(graph, component)); + order.extend(minimum_linear_arrangement_component(graph, &component)); } order } -fn nested_dissection_component(graph: &GraphJson, component: Vec) -> Vec { - if component.len() <= 8 { - let mut base = component; - base.sort_by_key(|&node| graph.node_ids[node]); - return base; +fn minimum_linear_arrangement_component(graph: &GraphJson, component: &[usize]) -> Vec { + if component.len() <= 2 { + return component.to_vec(); } - let component_mask = subset_mask(graph.nodes.len(), &component); - let start = component - .iter() - .copied() - .min_by_key(|&node| (graph.adjacency_indices[node].len(), graph.node_ids[node])) - .unwrap(); - let a = farthest_node_in_subset(graph, start, &component_mask); - let (b, dist_from_a) = farthest_node_with_distances(graph, a, &component_mask); - let dist_from_b = bfs_distances(graph, b, &component_mask); - - let Some(max_dist) = dist_from_a.iter().flatten().copied().max() else { - let mut base = component; - base.sort_by_key(|&node| graph.node_ids[node]); - return base; - }; - - let separator_target = max_dist / 2; - let mut separator = component - .iter() - .copied() - .filter(|&node| dist_from_a[node] == Some(separator_target)) - .collect::>(); - - if separator.is_empty() { - let best_delta = component - .iter() - .filter_map(|&node| Some((node, dist_from_a[node]?, dist_from_b[node]?))) - .map(|(_, da, db)| da.abs_diff(db)) - .min() - .unwrap_or(0); - separator = component - .iter() - .copied() - .filter(|&node| { - matches!( - (dist_from_a[node], dist_from_b[node]), - (Some(da), Some(db)) if da.abs_diff(db) == best_delta - ) - }) - .collect(); - } - - separator.sort_by_key(|&node| graph.node_ids[node]); - let separator_mask = subset_mask(graph.nodes.len(), &separator); - let remaining = component - .iter() - .copied() - .filter(|node| !separator_mask[*node]) - .collect::>(); - let mut subcomponents = connected_components_in_subset(graph, &remaining); - if subcomponents.len() <= 1 { - let mut fallback = component; - fallback.sort_by_key(|&node| graph.node_ids[node]); - return fallback; + let component_mask = subset_mask(graph.nodes.len(), component); + let mut order = reverse_cuthill_mckee_component(graph, component); + + for _ in 0..8 { + let positions = positions_for_order(graph.nodes.len(), &order); + order.sort_by(|&a, &b| { + let a_score = barycenter_score(graph, a, &positions, &component_mask); + let b_score = barycenter_score(graph, b, &positions, &component_mask); + a_score + .partial_cmp(&b_score) + .unwrap_or(Ordering::Equal) + .then_with(|| graph.node_ids[a].cmp(&graph.node_ids[b])) + }); + local_adjacent_improvement(graph, &mut order, &component_mask); } - subcomponents.sort_by_key(|part| { - part.iter() - .filter_map(|&node| dist_from_a[node]) - .min() - .unwrap_or(usize::MAX) - }); - - let mut order = Vec::with_capacity(component.len()); - for subcomponent in subcomponents { - order.extend(nested_dissection_component(graph, subcomponent)); - } - order.extend(separator); order } @@ -377,72 +327,79 @@ fn subset_mask(size: usize, nodes: &[usize]) -> Vec { mask } -fn bfs_distances(graph: &GraphJson, start: usize, allowed: &[bool]) -> Vec> { - let mut distances = vec![None; graph.nodes.len()]; - let mut queue = VecDeque::from([start]); - distances[start] = Some(0); - - while let Some(node) = queue.pop_front() { - let distance = distances[node].unwrap(); - for &neighbor in &graph.adjacency_indices[node] { - if allowed[neighbor] && distances[neighbor].is_none() { - distances[neighbor] = Some(distance + 1); - queue.push_back(neighbor); - } - } +fn positions_for_order(size: usize, order: &[usize]) -> Vec { + let mut positions = vec![usize::MAX; size]; + for (idx, &node) in order.iter().enumerate() { + positions[node] = idx; } - - distances + positions } -fn farthest_node_in_subset(graph: &GraphJson, start: usize, allowed: &[bool]) -> usize { - farthest_node_with_distances(graph, start, allowed).0 -} - -fn farthest_node_with_distances( +fn barycenter_score( graph: &GraphJson, - start: usize, - allowed: &[bool], -) -> (usize, Vec>) { - let distances = bfs_distances(graph, start, allowed); - let farthest = distances - .iter() - .enumerate() - .filter(|(idx, distance)| allowed[*idx] && distance.is_some()) - .max_by_key(|(idx, distance)| (distance.unwrap(), graph.node_ids[*idx])) - .map(|(idx, _)| idx) - .unwrap_or(start); - (farthest, distances) -} + node: usize, + positions: &[usize], + component_mask: &[bool], +) -> f64 { + let mut sum = 0.0; + let mut count = 0.0; + for &neighbor in &graph.adjacency_indices[node] { + if component_mask[neighbor] { + sum += positions[neighbor] as f64; + count += 1.0; + } + } -fn connected_components_in_subset(graph: &GraphJson, nodes: &[usize]) -> Vec> { - let allowed = subset_mask(graph.nodes.len(), nodes); - let mut seen = vec![false; graph.nodes.len()]; - let mut components = Vec::new(); + if count == 0.0 { + positions[node] as f64 + } else { + sum / count + } +} - for &start in nodes { - if seen[start] { - continue; - } - let mut queue = VecDeque::from([start]); - let mut component = Vec::new(); - seen[start] = true; +fn local_adjacent_improvement(graph: &GraphJson, order: &mut [usize], component_mask: &[bool]) { + if order.len() < 2 { + return; + } - while let Some(node) = queue.pop_front() { - component.push(node); - for &neighbor in &graph.adjacency_indices[node] { - if allowed[neighbor] && !seen[neighbor] { - seen[neighbor] = true; - queue.push_back(neighbor); - } + let mut improved = true; + while improved { + improved = false; + let mut positions = positions_for_order(graph.nodes.len(), order); + for idx in 0..order.len() - 1 { + let current_cost = node_span_cost(graph, order[idx], &positions, component_mask) + + node_span_cost(graph, order[idx + 1], &positions, component_mask); + + order.swap(idx, idx + 1); + positions[order[idx]] = idx; + positions[order[idx + 1]] = idx + 1; + + let swapped_cost = node_span_cost(graph, order[idx], &positions, component_mask) + + node_span_cost(graph, order[idx + 1], &positions, component_mask); + + if swapped_cost <= current_cost { + improved = swapped_cost < current_cost; + } else { + order.swap(idx, idx + 1); + positions[order[idx]] = idx; + positions[order[idx + 1]] = idx + 1; } } - - component.sort_by_key(|&node| graph.node_ids[node]); - components.push(component); } +} - components +fn node_span_cost( + graph: &GraphJson, + node: usize, + positions: &[usize], + component_mask: &[bool], +) -> usize { + graph.adjacency_indices[node] + .iter() + .copied() + .filter(|&neighbor| component_mask[neighbor]) + .map(|neighbor| positions[node].abs_diff(positions[neighbor])) + .sum() } #[cfg(test)] diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 942fbd1..10c0efa 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -304,12 +304,12 @@ fn test_sort_json_file_by_reverse_cuthill_mckee() { } #[test] -fn test_sort_json_file_by_nested_dissection() { +fn test_sort_json_file_by_minimum_linear_arrangement() { let mut output = Vec::new(); let mapping = sort_json_file_by_ordering( path_graph_json(), &mut output, - GraphOrderingMethod::NestedDissection, + GraphOrderingMethod::MinimumLinearArrangement, ) .unwrap(); let output_json: Value = serde_json::from_slice(&output).unwrap(); diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 12ed6ed..1ed2982 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -56,10 +56,49 @@ pub fn relabel_ben_lines( mut reader: R, mut writer: W, variant: BenVariant, +) -> io::Result<()> { + relabel_ben_lines_impl(&mut reader, &mut writer, variant, None) +} + +/// Canonicalize up to a bounded number of samples from a BEN frame stream. +/// +/// Labels are reassigned in first-seen order within each assignment vector, +/// which can improve downstream compression ratios. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and +/// written. +pub fn relabel_ben_lines_limit( + mut reader: R, + mut writer: W, + variant: BenVariant, + max_samples: usize, +) -> io::Result<()> { + relabel_ben_lines_impl(&mut reader, &mut writer, variant, Some(max_samples)) +} + +/// Shared implementation for canonical BEN relabeling. +fn relabel_ben_lines_impl( + mut reader: R, + mut writer: W, + variant: BenVariant, + max_samples: Option, ) -> io::Result<()> { let mut sample_number = 0; let mut label_map = HashMap::new(); loop { + if max_samples.is_some_and(|limit| sample_number >= limit) { + break; + } let mut tmp_buffer = [0u8]; let max_val_bits = match reader.read_exact(&mut tmp_buffer) { Ok(_) => tmp_buffer[0], @@ -91,17 +130,22 @@ pub fn relabel_ben_lines( *val = new_val; } - let relabeled = encode_ben_vec_from_rle(ben_line); - writer.write_all(&relabeled)?; - let count_occurrences = if variant == BenVariant::MkvChain { let count = reader.read_u16::()?; - writer.write_all(&count.to_be_bytes())?; - count + let out_count = max_samples + .map(|limit| ((limit - sample_number).min(count as usize)) as u16) + .unwrap_or(count); + out_count } else { 1 }; + let relabeled = encode_ben_vec_from_rle(ben_line); + writer.write_all(&relabeled)?; + if variant == BenVariant::MkvChain { + writer.write_all(&count_occurrences.to_be_bytes())?; + } + sample_number += count_occurrences as usize; progress!("Relabeling line: {}\r", sample_number); @@ -123,6 +167,35 @@ pub fn relabel_ben_lines( /// /// Returns `Ok(())` after the full BEN file has been relabeled. pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io::Result<()> { + relabel_ben_file_impl(&mut reader, &mut writer, None) +} + +/// Relabel at most `max_samples` expanded samples from a BEN file, preserving +/// its leading BEN banner. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled. +pub fn relabel_ben_file_limit( + mut reader: R, + mut writer: W, + max_samples: usize, +) -> io::Result<()> { + relabel_ben_file_impl(&mut reader, &mut writer, Some(max_samples)) +} + +/// Shared implementation for BEN-file canonical relabeling. +fn relabel_ben_file_impl( + mut reader: R, + mut writer: W, + max_samples: Option, +) -> io::Result<()> { let mut check_buffer = [0u8; 17]; reader.read_exact(&mut check_buffer)?; @@ -139,7 +212,7 @@ pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io:: writer.write_all(&check_buffer)?; - relabel_ben_lines(&mut reader, &mut writer, variant)?; + relabel_ben_lines_impl(&mut reader, &mut writer, variant, max_samples)?; Ok(()) } @@ -166,6 +239,56 @@ pub fn relabel_ben_lines_with_map( mut writer: W, new_to_old_node_map: HashMap, variant: BenVariant, +) -> io::Result<()> { + relabel_ben_lines_with_map_impl( + &mut reader, + &mut writer, + new_to_old_node_map, + variant, + None, + ) +} + +/// Relabel BEN frames using an externally supplied node map, up to a bounded +/// number of expanded samples. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and +/// written. +pub fn relabel_ben_lines_with_map_limit( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, + variant: BenVariant, + max_samples: usize, +) -> io::Result<()> { + relabel_ben_lines_with_map_impl( + &mut reader, + &mut writer, + new_to_old_node_map, + variant, + Some(max_samples), + ) +} + +/// Shared implementation for mapped BEN relabeling. +fn relabel_ben_lines_with_map_impl( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, + variant: BenVariant, + max_samples: Option, ) -> io::Result<()> { let mut sample_number = 0; let permutation = dense_permutation(&new_to_old_node_map)?; @@ -173,6 +296,9 @@ pub fn relabel_ben_lines_with_map( let mut new_assignment_vec = vec![0u16; permutation.len()]; let mut new_rle = Vec::new(); loop { + if max_samples.is_some_and(|limit| sample_number >= limit) { + break; + } let mut tmp_buffer = [0u8]; let max_val_bits = match reader.read_exact(&mut tmp_buffer) { Ok(_) => tmp_buffer[0], @@ -207,17 +333,22 @@ pub fn relabel_ben_lines_with_map( assign_slice_to_rle(&new_assignment_vec, &mut new_rle); - let relabeled = encode_ben_vec_from_rle(new_rle.clone()); - writer.write_all(&relabeled)?; - let count_occurrences = if variant == BenVariant::MkvChain { let count = reader.read_u16::()?; - writer.write_all(&count.to_be_bytes())?; - count + let out_count = max_samples + .map(|limit| ((limit - sample_number).min(count as usize)) as u16) + .unwrap_or(count); + out_count } else { 1 }; + let relabeled = encode_ben_vec_from_rle(new_rle.clone()); + writer.write_all(&relabeled)?; + if variant == BenVariant::MkvChain { + writer.write_all(&count_occurrences.to_be_bytes())?; + } + sample_number += count_occurrences as usize; progress!("Relabeling line: {}\r", sample_number); } @@ -243,6 +374,44 @@ pub fn relabel_ben_file_with_map( mut reader: R, mut writer: W, new_to_old_node_map: HashMap, +) -> io::Result<()> { + relabel_ben_file_with_map_impl(&mut reader, &mut writer, new_to_old_node_map, None) +} + +/// Relabel at most `max_samples` expanded samples from a BEN file using an +/// externally supplied node map. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled. +pub fn relabel_ben_file_with_map_limit( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, + max_samples: usize, +) -> io::Result<()> { + relabel_ben_file_with_map_impl( + &mut reader, + &mut writer, + new_to_old_node_map, + Some(max_samples), + ) +} + +/// Shared implementation for BEN-file mapped relabeling. +fn relabel_ben_file_with_map_impl( + mut reader: R, + mut writer: W, + new_to_old_node_map: HashMap, + max_samples: Option, ) -> io::Result<()> { let mut check_buffer = [0u8; 17]; reader.read_exact(&mut check_buffer)?; @@ -260,7 +429,13 @@ pub fn relabel_ben_file_with_map( writer.write_all(&check_buffer)?; - relabel_ben_lines_with_map(&mut reader, &mut writer, new_to_old_node_map, variant)?; + relabel_ben_lines_with_map_impl( + &mut reader, + &mut writer, + new_to_old_node_map, + variant, + max_samples, + )?; Ok(()) } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 81faebb..7a3a9ce 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -134,6 +134,33 @@ fn test_relabel_simple_file_mkv() { assert_eq!(output_str, out_file); } +#[test] +fn test_relabel_simple_file_mkv_with_limit() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + "{\"assignment\":[2,3,1],\"sample\":4}\n" + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::MkvChain) + .unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_file_limit(encoded.as_slice(), io::BufWriter::new(&mut relabeled), 2).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n" + ); + assert_eq!(output_str, expected); +} + #[test] fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; @@ -329,6 +356,41 @@ fn test_relabel_simple_file_with_map_mkv() { assert_eq!(output_str, out_file); } +#[test] +fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + "{\"assignment\":[3,1,2],\"sample\":4}\n" + ); + + let new_to_old_map: HashMap = [(0, 1), (1, 2), (2, 0)].iter().cloned().collect(); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::MkvChain) + .unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_file_with_map_limit( + encoded.as_slice(), + io::BufWriter::new(&mut relabeled), + new_to_old_map, + 2, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[2,3,1],\"sample\":1}\n", + "{\"assignment\":[2,3,1],\"sample\":2}\n" + ); + assert_eq!(output_str, expected); +} + #[test] fn test_relabel_file_rejects_invalid_header() { let err = relabel_ben_file(b"not a valid banner".as_slice(), Vec::new()).unwrap_err(); diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index ee7c8bd..b52f6cc 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -891,6 +891,103 @@ fn reben_cli_json_and_ben_modes_work() { assert!(relabeled_text.contains(r#""assignment":[9,4,9]"#)); } +#[test] +fn reben_cli_can_limit_ben_relabeling_to_first_n_items() { + let temp = TempDir::new("reben-limit"); + let graph_path = temp.path().join("dual_graph.json"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.jsonl.ben"); + let canonical_path = temp.path().join("canonicalized_first_one.ben"); + let map_path = temp.path().join("dual_graph_sorted_by_GEOID20_map.json"); + let map_relabel_path = temp.path().join("map_relabel_first_one.ben"); + + fs::write(&graph_path, sample_graph()).unwrap(); + fs::write( + &jsonl_path, + r#"{"assignment":[9,9,4],"sample":1} +{"assignment":[4,7,7],"sample":2} +"#, + ) + .unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(fs::File::open(&jsonl_path).unwrap()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let sort_graph = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--key", + "GEOID20", + ], + temp.path(), + ); + assert_success(&sort_graph); + assert!(map_path.exists()); + + let canonicalize = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--n-items", + "1", + "--output-file", + canonical_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&canonicalize); + + let relabel = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--n-items", + "1", + "--output-file", + map_relabel_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&relabel); + + let mut canonical_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&canonical_path).unwrap()), + &mut canonical_jsonl, + ) + .unwrap(); + assert_eq!( + String::from_utf8(canonical_jsonl).unwrap(), + "{\"assignment\":[1,1,2],\"sample\":1}\n" + ); + + let mut relabeled_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&map_relabel_path).unwrap()), + &mut relabeled_jsonl, + ) + .unwrap(); + assert_eq!( + String::from_utf8(relabeled_jsonl).unwrap(), + "{\"assignment\":[9,4,9],\"sample\":1}\n" + ); +} + #[test] fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations() { let temp = TempDir::new("reben-more"); @@ -966,31 +1063,31 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations } #[test] -fn reben_cli_supports_nested_dissection_and_rcm_orderings() { +fn reben_cli_supports_mla_and_rcm_orderings() { let temp = TempDir::new("reben-orderings"); let graph_path = temp.path().join("shape.json"); - let nested_path = temp.path().join("nested.json"); + let mla_path = temp.path().join("mla.json"); let rcm_path = temp.path().join("rcm.json"); fs::write(&graph_path, sample_graph()).unwrap(); - let nested = run( + let mla = run( "reben", &[ graph_path.to_str().unwrap(), "--mode", "json", "--ordering", - "nested-dissection", + "minimum-linear-arrangement", "--output-file", - nested_path.to_str().unwrap(), + mla_path.to_str().unwrap(), ], temp.path(), ); - assert_success(&nested); + assert_success(&mla); assert!(temp .path() - .join("shape_sorted_by_nested-dissection_map.json") + .join("shape_sorted_by_minimum-linear-arrangement_map.json") .exists()); let rcm = run( @@ -1012,14 +1109,14 @@ fn reben_cli_supports_nested_dissection_and_rcm_orderings() { .join("shape_sorted_by_reverse-cuthill-mckee_map.json") .exists()); - let nested_json: Value = - serde_json::from_str(&fs::read_to_string(&nested_path).unwrap()).unwrap(); + let mla_json: Value = + serde_json::from_str(&fs::read_to_string(&mla_path).unwrap()).unwrap(); let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); assert_eq!( - nested_json["nodes"].as_array().unwrap().len(), + mla_json["nodes"].as_array().unwrap().len(), rcm_json["nodes"].as_array().unwrap().len() ); - assert!(!nested_json["nodes"].as_array().unwrap().is_empty()); + assert!(!mla_json["nodes"].as_array().unwrap().is_empty()); } #[test] From a1862b92c57cf7c489e22a1dcbb37609afc8926b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:59:47 -0600 Subject: [PATCH 017/221] swap mla for mlc --- ben/src/cli/reben.rs | 9 +- ben/src/json/graph/mod.rs | 238 +++++++++++++++++++++++++++++++++++- ben/src/json/graph/tests.rs | 18 +++ ben/tests/test_cli.rs | 31 +++++ 4 files changed, 292 insertions(+), 4 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 917f9e8..382e212 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -28,6 +28,9 @@ enum OrderingMethod { /// Minimum-linear-arrangement heuristic based on graph adjacency alone. #[clap(alias = "mla")] MinimumLinearArrangement, + /// Recursive multilevel clustering based on local neighborhoods. + #[clap(alias = "mlc")] + MultiLevelCluster, /// Reverse Cuthill-McKee ordering. ReverseCuthillMckee, } @@ -267,6 +270,7 @@ fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { OrderingMethod::MinimumLinearArrangement => { GraphOrderingMethod::MinimumLinearArrangement } + OrderingMethod::MultiLevelCluster => GraphOrderingMethod::MultiLevelCluster, OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, } } @@ -274,6 +278,7 @@ fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { match ordering { OrderingMethod::MinimumLinearArrangement => "minimum-linear-arrangement", + OrderingMethod::MultiLevelCluster => "multi-level-cluster", OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", } } @@ -333,12 +338,12 @@ mod tests { "--mode", "json", "--ordering", - "minimum-linear-arrangement", + "multi-level-cluster", ]) .unwrap(); assert_eq!(args.mode, Mode::Json); - assert_eq!(args.ordering, Some(OrderingMethod::MinimumLinearArrangement)); + assert_eq!(args.ordering, Some(OrderingMethod::MultiLevelCluster)); assert!(args.key.is_none()); } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 461a681..c7b4732 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -2,8 +2,8 @@ use crate::progress; use serde_json::{json, Value}; -use std::cmp::Ordering; -use std::collections::{HashMap, VecDeque}; +use std::cmp::{Ordering, Reverse}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::io::{self, Read, Result, Write}; use std::result::Result as StdResult; @@ -12,6 +12,8 @@ use std::result::Result as StdResult; pub enum GraphOrderingMethod { /// Order nodes using a minimum-linear-arrangement heuristic. MinimumLinearArrangement, + /// Order nodes using recursive multilevel clustering. + MultiLevelCluster, /// Order nodes using Reverse Cuthill-McKee. ReverseCuthillMckee, } @@ -124,6 +126,7 @@ pub fn sort_json_file_by_ordering( let order = match method { GraphOrderingMethod::MinimumLinearArrangement => minimum_linear_arrangement_order(&graph), + GraphOrderingMethod::MultiLevelCluster => multi_level_cluster_order(&graph), GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), }; @@ -295,6 +298,10 @@ fn minimum_linear_arrangement_order(graph: &GraphJson) -> Vec { order } +fn multi_level_cluster_order(graph: &GraphJson) -> Vec { + multilevel_cluster_order_generic(&graph.adjacency_indices, &graph.node_ids) +} + fn minimum_linear_arrangement_component(graph: &GraphJson, component: &[usize]) -> Vec { if component.len() <= 2 { return component.to_vec(); @@ -327,6 +334,233 @@ fn subset_mask(size: usize, nodes: &[usize]) -> Vec { mask } +fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> Vec> { + let mut seen = vec![false; adjacency.len()]; + let mut components = Vec::new(); + + for start in 0..adjacency.len() { + if seen[start] { + continue; + } + let mut queue = VecDeque::from([start]); + let mut component = Vec::new(); + seen[start] = true; + + while let Some(node) = queue.pop_front() { + component.push(node); + for &neighbor in &adjacency[node] { + if !seen[neighbor] { + seen[neighbor] = true; + queue.push_back(neighbor); + } + } + } + + components.push(component); + } + + components.sort_by_key(|component| { + component + .iter() + .map(|&node| labels[node]) + .min() + .unwrap_or(usize::MAX) + }); + components +} + +fn rcm_component_generic(adjacency: &[Vec], labels: &[usize], component: &[usize]) -> Vec { + let component_set = component.iter().copied().collect::>(); + let start = component + .iter() + .copied() + .min_by_key(|&node| { + ( + adjacency[node] + .iter() + .filter(|&&neighbor| component_set.contains(&neighbor)) + .count(), + labels[node], + ) + }) + .unwrap(); + + let mut visited = vec![false; adjacency.len()]; + let mut queue = VecDeque::from([start]); + let mut order = Vec::with_capacity(component.len()); + visited[start] = true; + + while let Some(node) = queue.pop_front() { + order.push(node); + let mut neighbors = adjacency[node] + .iter() + .copied() + .filter(|neighbor| component_set.contains(neighbor) && !visited[*neighbor]) + .collect::>(); + neighbors.sort_by_key(|&neighbor| { + ( + adjacency[neighbor] + .iter() + .filter(|&&next| component_set.contains(&next)) + .count(), + labels[neighbor], + ) + }); + for neighbor in neighbors { + visited[neighbor] = true; + queue.push_back(neighbor); + } + } + + order.reverse(); + order +} + +fn multilevel_cluster_order_generic(adjacency: &[Vec], labels: &[usize]) -> Vec { + let mut order = Vec::with_capacity(adjacency.len()); + for component in connected_components_generic(adjacency, labels) { + order.extend(multilevel_cluster_component_generic(adjacency, labels, &component)); + } + order +} + +fn multilevel_cluster_component_generic( + adjacency: &[Vec], + labels: &[usize], + component: &[usize], +) -> Vec { + if component.len() <= 8 { + return rcm_component_generic(adjacency, labels, component); + } + + let clusters = greedy_cluster_partition(adjacency, labels, component, 6); + if clusters.len() <= 1 || clusters.len() == component.len() { + return rcm_component_generic(adjacency, labels, component); + } + + let cluster_orders = clusters + .iter() + .map(|cluster| rcm_component_generic(adjacency, labels, cluster)) + .collect::>(); + let (coarse_adjacency, coarse_labels) = build_coarse_graph(adjacency, labels, &clusters); + let coarse_order = multilevel_cluster_order_generic(&coarse_adjacency, &coarse_labels); + + let mut order = Vec::with_capacity(component.len()); + for cluster_idx in coarse_order { + order.extend(cluster_orders[cluster_idx].iter().copied()); + } + order +} + +fn greedy_cluster_partition( + adjacency: &[Vec], + labels: &[usize], + component: &[usize], + max_cluster_size: usize, +) -> Vec> { + let component_mask = subset_mask(adjacency.len(), component); + let mut assigned = vec![false; adjacency.len()]; + let mut remaining = component.len(); + let mut clusters = Vec::new(); + + while remaining > 0 { + let seed = component + .iter() + .copied() + .filter(|&node| !assigned[node]) + .min_by_key(|&node| { + ( + adjacency[node] + .iter() + .filter(|&&neighbor| component_mask[neighbor] && !assigned[neighbor]) + .count(), + labels[node], + ) + }) + .unwrap(); + + let mut cluster = vec![seed]; + assigned[seed] = true; + remaining -= 1; + + let mut candidates = adjacency[seed] + .iter() + .copied() + .filter(|&neighbor| component_mask[neighbor] && !assigned[neighbor]) + .collect::>(); + candidates.sort_by_key(|&neighbor| { + let shared = adjacency[neighbor] + .iter() + .filter(|&&next| component_mask[next] && adjacency[seed].contains(&next)) + .count(); + ( + Reverse(shared), + adjacency[neighbor] + .iter() + .filter(|&&next| component_mask[next] && !assigned[next]) + .count(), + labels[neighbor], + ) + }); + + for neighbor in candidates.into_iter().take(max_cluster_size.saturating_sub(1)) { + assigned[neighbor] = true; + remaining -= 1; + cluster.push(neighbor); + } + + clusters.push(cluster); + } + + clusters +} + +fn build_coarse_graph( + adjacency: &[Vec], + labels: &[usize], + clusters: &[Vec], +) -> (Vec>, Vec) { + let mut cluster_of = vec![usize::MAX; adjacency.len()]; + for (cluster_idx, cluster) in clusters.iter().enumerate() { + for &node in cluster { + cluster_of[node] = cluster_idx; + } + } + + let mut coarse_sets = vec![HashSet::new(); clusters.len()]; + for (cluster_idx, cluster) in clusters.iter().enumerate() { + for &node in cluster { + for &neighbor in &adjacency[node] { + let neighbor_cluster = cluster_of[neighbor]; + if neighbor_cluster != cluster_idx && neighbor_cluster != usize::MAX { + coarse_sets[cluster_idx].insert(neighbor_cluster); + } + } + } + } + + let coarse_adjacency = coarse_sets + .into_iter() + .map(|neighbors| { + let mut neighbors = neighbors.into_iter().collect::>(); + neighbors.sort_unstable(); + neighbors + }) + .collect::>(); + let coarse_labels = clusters + .iter() + .map(|cluster| { + cluster + .iter() + .map(|&node| labels[node]) + .min() + .unwrap_or(usize::MAX) + }) + .collect::>(); + + (coarse_adjacency, coarse_labels) +} + fn positions_for_order(size: usize, order: &[usize]) -> Vec { let mut positions = vec![usize::MAX; size]; for (idx, &node) in order.iter().enumerate() { diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 10c0efa..6bcf6d0 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -320,3 +320,21 @@ fn test_sort_json_file_by_minimum_linear_arrangement() { assert_eq!(sorted, [0, 1, 2, 3]); assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); } + +#[test] +fn test_sort_json_file_by_multi_level_cluster() { + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + path_graph_json(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + let positions = [mapping[&0], mapping[&1], mapping[&2], mapping[&3]]; + let mut sorted = positions; + sorted.sort_unstable(); + assert_eq!(sorted, [0, 1, 2, 3]); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); +} diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index b52f6cc..d1bc099 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1119,6 +1119,37 @@ fn reben_cli_supports_mla_and_rcm_orderings() { assert!(!mla_json["nodes"].as_array().unwrap().is_empty()); } +#[test] +fn reben_cli_supports_multi_level_cluster_ordering() { + let temp = TempDir::new("reben-mlc"); + let graph_path = temp.path().join("shape.json"); + let mlc_path = temp.path().join("mlc.json"); + + fs::write(&graph_path, sample_graph()).unwrap(); + + let mlc = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--ordering", + "multi-level-cluster", + "--output-file", + mlc_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&mlc); + assert!(temp + .path() + .join("shape_sorted_by_multi-level-cluster_map.json") + .exists()); + + let mlc_json: Value = serde_json::from_str(&fs::read_to_string(&mlc_path).unwrap()).unwrap(); + assert!(!mlc_json["nodes"].as_array().unwrap().is_empty()); +} + #[test] fn pben_cli_converts_between_formats() { let temp = TempDir::new("pben"); From 82ca34b61febe7ce36e03080a6ed4eec75685f39 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 14:26:06 -0600 Subject: [PATCH 018/221] optimize mlc a bit --- ben/src/json/graph/mod.rs | 72 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index c7b4732..e5babc0 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -370,18 +370,13 @@ fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> V } fn rcm_component_generic(adjacency: &[Vec], labels: &[usize], component: &[usize]) -> Vec { - let component_set = component.iter().copied().collect::>(); + let component_mask = subset_mask(adjacency.len(), component); + let local_degree = local_degree_in_subset(adjacency, &component_mask, component); let start = component .iter() .copied() .min_by_key(|&node| { - ( - adjacency[node] - .iter() - .filter(|&&neighbor| component_set.contains(&neighbor)) - .count(), - labels[node], - ) + (local_degree[node], labels[node]) }) .unwrap(); @@ -395,17 +390,9 @@ fn rcm_component_generic(adjacency: &[Vec], labels: &[usize], component: let mut neighbors = adjacency[node] .iter() .copied() - .filter(|neighbor| component_set.contains(neighbor) && !visited[*neighbor]) + .filter(|&neighbor| component_mask[neighbor] && !visited[neighbor]) .collect::>(); - neighbors.sort_by_key(|&neighbor| { - ( - adjacency[neighbor] - .iter() - .filter(|&&next| component_set.contains(&next)) - .count(), - labels[neighbor], - ) - }); + neighbors.sort_by_key(|&neighbor| (local_degree[neighbor], labels[neighbor])); for neighbor in neighbors { visited[neighbor] = true; queue.push_back(neighbor); @@ -459,29 +446,30 @@ fn greedy_cluster_partition( max_cluster_size: usize, ) -> Vec> { let component_mask = subset_mask(adjacency.len(), component); + let local_degree = local_degree_in_subset(adjacency, &component_mask, component); let mut assigned = vec![false; adjacency.len()]; - let mut remaining = component.len(); + let mut unassigned = component.to_vec(); + unassigned.sort_by_key(|&node| (local_degree[node], labels[node])); + let mut remaining = unassigned.len(); let mut clusters = Vec::new(); + let mut seed_marks = vec![0usize; adjacency.len()]; + let mut mark_epoch = 1usize; while remaining > 0 { - let seed = component + let seed = unassigned .iter() .copied() - .filter(|&node| !assigned[node]) - .min_by_key(|&node| { - ( - adjacency[node] - .iter() - .filter(|&&neighbor| component_mask[neighbor] && !assigned[neighbor]) - .count(), - labels[node], - ) - }) + .find(|&node| !assigned[node]) .unwrap(); let mut cluster = vec![seed]; assigned[seed] = true; remaining -= 1; + for &neighbor in &adjacency[seed] { + if component_mask[neighbor] { + seed_marks[neighbor] = mark_epoch; + } + } let mut candidates = adjacency[seed] .iter() @@ -491,14 +479,11 @@ fn greedy_cluster_partition( candidates.sort_by_key(|&neighbor| { let shared = adjacency[neighbor] .iter() - .filter(|&&next| component_mask[next] && adjacency[seed].contains(&next)) + .filter(|&&next| component_mask[next] && seed_marks[next] == mark_epoch) .count(); ( Reverse(shared), - adjacency[neighbor] - .iter() - .filter(|&&next| component_mask[next] && !assigned[next]) - .count(), + local_degree[neighbor], labels[neighbor], ) }); @@ -509,12 +494,29 @@ fn greedy_cluster_partition( cluster.push(neighbor); } + mark_epoch = mark_epoch.wrapping_add(1); + if mark_epoch == 0 { + seed_marks.fill(0); + mark_epoch = 1; + } + clusters.push(cluster); } clusters } +fn local_degree_in_subset(adjacency: &[Vec], subset_mask: &[bool], subset: &[usize]) -> Vec { + let mut local_degree = vec![0usize; adjacency.len()]; + for &node in subset { + local_degree[node] = adjacency[node] + .iter() + .filter(|&&neighbor| subset_mask[neighbor]) + .count(); + } + local_degree +} + fn build_coarse_graph( adjacency: &[Vec], labels: &[usize], From bffa8df7209593e72cb48fda6041469dba851066 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:49:36 -0600 Subject: [PATCH 019/221] Start of twodelta Rework the types so that BenFrames are canonicalized First steps in TwoDelta --- ben/src/codec/decode/xz.rs | 6 + ben/src/codec/encode/ben.rs | 173 ++++++++++---- ben/src/codec/encode/jsonl.rs | 7 + ben/src/codec/encode/mod.rs | 4 +- ben/src/codec/encode/types.rs | 382 +++++++++++++++++++++++++++++++ ben/src/codec/encode/xz.rs | 6 + ben/src/codec/translate/mod.rs | 8 +- ben/src/io/reader.rs | 251 ++++++++++++++++++-- ben/src/io/writer.rs | 168 +++++++++----- ben/src/lib.rs | 2 + ben/src/ops/extract/mod.rs | 26 +-- ben/src/ops/relabel/mod.rs | 4 +- ben/src/util/rle/mod.rs | 4 +- ben/tests/test_impls_pipeline.rs | 80 ++++++- 14 files changed, 971 insertions(+), 150 deletions(-) create mode 100644 ben/src/codec/encode/types.rs diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 355cb56..8f5279f 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -77,6 +77,9 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: } } } + BenVariant::TwoDelta => { + panic!("not implemented"); + } } if last_valid_assignment == 0 { @@ -181,6 +184,9 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i } } } + BenVariant::TwoDelta => { + panic!("not implemented"); + } } if last_valid_assignment == 0 { diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 2c54c42..46130d8 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,4 +1,4 @@ -use crate::util::rle::assign_to_rle; +use super::types::{BenFrame, IdVec, TwoDeltaFrame}; use serde_json::Value; use std::io; @@ -13,7 +13,7 @@ use std::io; /// /// Returns the encoded ben32 frame bytes terminated by the four-byte `0` /// sentinel. -pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { +pub(crate) fn encode_ben32_line(data: Value) -> io::Result { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidData, @@ -64,7 +64,7 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { } ret.extend([0, 0, 0, 0]); - Ok(ret) + Ok(IdVec::U8(ret)) } /// Encode a full assignment vector into a single BEN frame. @@ -76,9 +76,8 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result> { /// # Returns /// /// Returns the encoded BEN frame bytes, including the per-frame header. -pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { - let rle_vec: Vec<(u16, u16)> = assign_to_rle(assign_vec); - encode_ben_vec_from_rle(rle_vec) +pub fn encode_ben_vec_from_assign(assign_vec: impl AsRef<[u16]>) -> BenFrame { + BenFrame::from_assignment(assign_vec) } /// Encode a run-length encoded assignment vector into a BEN frame. @@ -94,59 +93,141 @@ pub fn encode_ben_vec_from_assign(assign_vec: Vec) -> Vec { /// # Returns /// /// Returns the encoded BEN frame bytes, including the per-frame header. -pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> Vec { - let mut output_vec: Vec = Vec::new(); - - let max_val: u16 = rle_vec.iter().max_by_key(|x| x.0).unwrap().0; - let max_len: u16 = rle_vec.iter().max_by_key(|x| x.1).unwrap().1; - let max_val_bits: u8 = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bits: u8 = 16 - max_len.leading_zeros() as u8; - let assign_bits: u32 = (max_val_bits + max_len_bits) as u32; - let n_bytes: u32 = if (assign_bits * rle_vec.len() as u32).is_multiple_of(8) { - (assign_bits * rle_vec.len() as u32) / 8 - } else { - (assign_bits * rle_vec.len() as u32) / 8 + 1 - }; +pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> BenFrame { + BenFrame::from_rle(rle_vec) +} - output_vec.push(max_val_bits); - output_vec.push(max_len_bits); - output_vec.extend(n_bytes.to_be_bytes().as_slice()); +/// Encode a sample transition as a TwoDelta frame. +/// +/// The transition is valid only when all changed positions involve exactly two +/// assignment ids and positions outside that pair remain unchanged. +/// +/// # Arguments +/// +/// * `previous_assignment` - The previous full assignment vector. +/// * `new_assignment` - The next full assignment vector. +/// +/// # Returns +/// +/// Returns a serialized TwoDelta frame describing the transition. +pub fn encode_twodelta_vec( + previous_assignment: impl AsRef<[u16]>, + new_assignment: impl AsRef<[u16]>, +) -> io::Result { + let previous_assignment = previous_assignment.as_ref(); + let new_assignment = new_assignment.as_ref(); + + if previous_assignment.len() != new_assignment.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta requires assignment vectors of equal length", + )); + } - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; + let mut pair_ids = [0u16; 2]; + let mut pair_len = 0usize; + for (&previous, ¤t) in previous_assignment.iter().zip(new_assignment.iter()) { + if previous == current { + continue; + } + for value in [previous, current] { + if !pair_ids[..pair_len].contains(&value) { + if pair_len == 2 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta transitions may involve at most two assignment ids", + )); + } + pair_ids[pair_len] = value; + pair_len += 1; + } + } + } - for (val, len) in rle_vec { - let mut new_val: u32 = (remainder << max_val_bits) | (val as u32); + if pair_len == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta cannot encode identical assignments as a delta frame", + )); + } - let mut buff: u8; + let pair = if pair_len == 1 { + (pair_ids[0], pair_ids[0]) + } else { + (pair_ids[0], pair_ids[1]) + }; - let mut n_bits_left: u8 = remainder_bits + max_val_bits; + let mut pair_positions = Vec::new(); + pair_positions.reserve(previous_assignment.len()); + for (idx, (&previous, ¤t)) in previous_assignment + .iter() + .zip(new_assignment.iter()) + .enumerate() + { + let previous_in_pair = previous == pair.0 || previous == pair.1; + let current_in_pair = current == pair.0 || current == pair.1; + + if previous_in_pair != current_in_pair { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta requires the changed id pair to occupy the same positions", + )); + } - while n_bits_left >= 8 { - n_bits_left -= 8; - buff = (new_val >> n_bits_left) as u8; - output_vec.push(buff); - new_val &= !((0xFFFFFFFF as u32) << n_bits_left); + if !previous_in_pair && previous != current { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta found a change outside the selected id pair", + )); } - new_val = (new_val << max_len_bits) | (len as u32); - n_bits_left += max_len_bits; + if previous_in_pair { + pair_positions.push(idx); + } + } - while n_bits_left >= 8 { - n_bits_left -= 8; - buff = (new_val >> n_bits_left) as u8; - output_vec.push(buff); - new_val &= !((0xFFFFFFFF as u32) << n_bits_left); + if pair_positions.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta requires at least one occurrence of the selected id pair", + )); + } + + let first_value = new_assignment[pair_positions[0]]; + let second_value = if pair.0 == pair.1 { + pair.0 + } else if first_value == pair.0 { + pair.1 + } else { + pair.0 + }; + let ordered_pair = (first_value, second_value); + + let mut run_lengths = Vec::new(); + let mut current_value = first_value; + let mut current_run = 0u16; + + for &idx in &pair_positions { + let value = new_assignment[idx]; + if value != ordered_pair.0 && value != ordered_pair.1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta payload encountered an assignment outside the selected id pair", + )); } - remainder_bits = n_bits_left; - remainder = new_val; + if value == current_value { + current_run += 1; + } else { + run_lengths.push(current_run); + current_value = value; + current_run = 1; + } } - if remainder_bits > 0 { - let buff = (remainder << (8 - remainder_bits)) as u8; - output_vec.push(buff); + if current_run > 0 { + run_lengths.push(current_run); } - output_vec + Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) } diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index cb4b743..588bc30 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -30,6 +30,13 @@ pub fn encode_jsonl_to_xben( n_threads: Option, compression_level: Option, ) -> Result<()> { + if variant == BenVariant::TwoDelta { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "TwoDelta is currently implemented only for uncompressed .ben streams", + )); + } + let mut n_cpus: u32 = n_threads.unwrap_or(1); n_cpus = n_cpus .min( diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 65f34d8..fa6b28a 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -2,11 +2,13 @@ mod ben; mod jsonl; +mod types; mod xz; pub(crate) use ben::encode_ben32_line; -pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle}; +pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; pub use xz::{encode_ben_to_xben, xz_compress}; #[cfg(test)] diff --git a/ben/src/codec/encode/types.rs b/ben/src/codec/encode/types.rs new file mode 100644 index 0000000..a7dd008 --- /dev/null +++ b/ben/src/codec/encode/types.rs @@ -0,0 +1,382 @@ +use std::io; + +/// Typed identifier storage used by experimental delta encoders. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum IdVec { + U8(Vec), + U16(Vec), +} + +/// A single typed identifier item. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)] +pub enum IdItem { + U8(u8), + U16(u16), +} + +impl IdVec { + /// Borrow the inner `u8` bytes. + pub fn as_slice(&self) -> &[u8] { + self.as_u8_slice().expect("expected U8-encoded payload") + } + + /// Borrow the inner `u8` bytes, returning an error on variant mismatch. + pub fn as_u8_slice(&self) -> io::Result<&[u8]> { + match self { + IdVec::U8(v) => Ok(v.as_slice()), + IdVec::U16(_) => Err(io::Error::new( + io::ErrorKind::InvalidData, + "expected U8-encoded payload", + )), + } + } + + /// Consume into raw `u8` bytes. + pub fn into_u8_vec(self) -> io::Result> { + match self { + IdVec::U8(v) => Ok(v), + IdVec::U16(_) => Err(io::Error::new( + io::ErrorKind::InvalidData, + "expected U8-encoded payload", + )), + } + } + + /// Return the logical element count. + pub fn len(&self) -> usize { + match self { + IdVec::U8(v) => v.len(), + IdVec::U16(v) => v.len(), + } + } + + /// Return whether the container is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Iterate over items while preserving the original scalar type. + pub fn iter(&self) -> impl Iterator + '_ { + match self { + IdVec::U8(v) => { + Box::new(v.iter().copied().map(IdItem::U8)) as Box> + } + IdVec::U16(v) => Box::new(v.iter().copied().map(IdItem::U16)), + } + } + + /// Return the item at index `i`, if any. + pub fn get(&self, i: usize) -> Option { + match self { + IdVec::U8(v) => v.get(i).copied().map(IdItem::U8), + IdVec::U16(v) => v.get(i).copied().map(IdItem::U16), + } + } +} + +impl<'a> IntoIterator for &'a IdVec { + type Item = IdItem; + type IntoIter = Box + 'a>; + + fn into_iter(self) -> Self::IntoIter { + Box::new(self.iter()) + } +} + +impl AsRef<[u8]> for IdVec { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for IdVec { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for IdVec { + fn eq(&self, other: &Vec) -> bool { + matches!(self, IdVec::U8(v) if v == other) + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &IdVec) -> bool { + matches!(other, IdVec::U8(v) if self == v) + } +} + +fn pack_fixed_width_items(items: &[u16], item_bits: u8) -> (u32, Vec) { + let payload_bits = item_bits as u32 * items.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let mut bytes = Vec::with_capacity(n_bytes as usize); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &item in items { + let mut packed = (remainder << item_bits) | item as u32; + let mut bits_left = remainder_bits + item_bits; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + (n_bytes, bytes) +} + +/// Canonical representation of a BEN frame. +/// +/// The frame stores the semantic RLE runs together with the derived header +/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN +/// frame, including the two one-byte bit-width fields and the four-byte payload +/// length. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BenFrame { + runs: Vec<(u16, u16)>, + max_val_bits: u8, + max_len_bits: u8, + n_bytes: u32, + bytes: Vec, +} + +impl BenFrame { + /// Build a frame from an RLE run vector. + pub fn from_rle(runs: Vec<(u16, u16)>) -> Self { + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bits = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bits = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bits + max_len_bits) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + + let mut bytes = Vec::with_capacity(6 + n_bytes as usize); + bytes.push(max_val_bits); + bytes.push(max_len_bits); + bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &(val, len) in &runs { + let mut packed = (remainder << max_val_bits) | (val as u32); + let mut bits_left = remainder_bits + max_val_bits; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + packed = (packed << max_len_bits) | (len as u32); + bits_left += max_len_bits; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + Self { + runs, + max_val_bits, + max_len_bits, + n_bytes, + bytes, + } + } + + /// Build a frame from a full assignment vector. + pub fn from_assignment(assignments: impl AsRef<[u16]>) -> Self { + Self::from_rle(crate::util::rle::assign_to_rle(assignments)) + } + + /// Borrow the canonical RLE runs. + pub fn runs(&self) -> &[(u16, u16)] { + &self.runs + } + + /// Return the number of bits used to store each value. + pub fn max_val_bits(&self) -> u8 { + self.max_val_bits + } + + /// Return the number of bits used to store each run length. + pub fn max_len_bits(&self) -> u8 { + self.max_len_bits + } + + /// Return the payload length in bytes. + pub fn n_bytes(&self) -> u32 { + self.n_bytes + } + + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.bytes + } +} + +impl AsRef<[u8]> for BenFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenFrame { + fn eq(&self, other: &Vec) -> bool { + self.bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenFrame) -> bool { + *self == other.bytes + } +} + +/// Canonical representation of a TwoDelta frame. +/// +/// A TwoDelta frame stores the two assignment ids that may change relative to +/// the previous sample and then encodes the lengths of alternating runs over +/// just those two ids. The first run always corresponds to `pair.0`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TwoDeltaFrame { + pair: (u16, u16), + max_len_bits: u8, + n_bytes: u32, + bytes: Vec, +} + +impl TwoDeltaFrame { + /// Build a TwoDelta frame from a pair ordering and run lengths. + pub fn from_run_lengths(pair: (u16, u16), run_lengths: Vec) -> Self { + let max_len = run_lengths.iter().copied().max().unwrap_or(0); + let max_len_bits = (16 - max_len.leading_zeros() as u8).max(1); + let (n_bytes, payload_bytes) = pack_fixed_width_items(&run_lengths, max_len_bits); + + let mut bytes = Vec::with_capacity(9 + payload_bytes.len()); + bytes.extend_from_slice(&pair.0.to_be_bytes()); + bytes.extend_from_slice(&pair.1.to_be_bytes()); + bytes.push(max_len_bits); + bytes.extend_from_slice(&n_bytes.to_be_bytes()); + bytes.extend_from_slice(&payload_bytes); + + Self { + pair, + max_len_bits, + n_bytes, + bytes, + } + } + + /// Rebuild a TwoDelta frame from already-parsed header fields and payload bytes. + pub fn from_parts(pair: (u16, u16), max_len_bits: u8, payload: Vec) -> Self { + let n_bytes = payload.len() as u32; + let mut bytes = Vec::with_capacity(9 + payload.len()); + bytes.extend_from_slice(&pair.0.to_be_bytes()); + bytes.extend_from_slice(&pair.1.to_be_bytes()); + bytes.push(max_len_bits); + bytes.extend_from_slice(&n_bytes.to_be_bytes()); + bytes.extend_from_slice(&payload); + + Self { + pair, + max_len_bits, + n_bytes, + bytes, + } + } + + /// Return the ordered pair of ids used by the delta frame. + pub fn pair(&self) -> (u16, u16) { + self.pair + } + + /// Return the bit width of each encoded run length. + pub fn max_len_bits(&self) -> u8 { + self.max_len_bits + } + + /// Return the packed payload length in bytes. + pub fn n_bytes(&self) -> u32 { + self.n_bytes + } + + /// Borrow just the packed payload bytes. + pub fn payload(&self) -> &[u8] { + &self.bytes[9..] + } + + /// Borrow the serialized TwoDelta frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.bytes + } + + /// Clone out the serialized TwoDelta frame bytes. + pub fn to_bytes(&self) -> Vec { + self.bytes.clone() + } + + /// Consume the frame and return the serialized bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.bytes + } +} + +impl AsRef<[u8]> for TwoDeltaFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for TwoDeltaFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 905f725..0d60fd0 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -104,6 +104,12 @@ pub fn encode_ben_to_xben( let mut ben_encoder = match &check_buffer { b"STANDARD BEN FILE" => XBenEncoder::new(encoder, BenVariant::Standard), b"MKVCHAIN BEN FILE" => XBenEncoder::new(encoder, BenVariant::MkvChain), + b"TWODELTA BEN FILE" => { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "TwoDelta BEN streams cannot yet be translated to XBEN", + )); + } _ => { return Err(io::Error::new( io::ErrorKind::InvalidData, diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 99c4a84..08346b5 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -47,7 +47,7 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { )); } - Ok(encode_ben_vec_from_rle(ben32_rle)) + Ok(encode_ben_vec_from_rle(ben32_rle).into_bytes()) } /// Translate a stream of ben32 frames into BEN frames. @@ -192,6 +192,12 @@ pub fn ben_to_ben32_lines( writer.write_all(&ben32_vec)?; writer.write_all(&n_reps.to_be_bytes())?; } + BenVariant::TwoDelta => { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "TwoDelta BEN streams cannot yet be translated to ben32/XBEN", + )); + } } } diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 5fc7234..5f08c5b 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -1,4 +1,5 @@ use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::codec::encode::{encode_ben_vec_from_assign, TwoDeltaFrame}; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -117,6 +118,8 @@ pub struct BenDecoder { reader: R, sample_count: usize, variant: BenVariant, + previous_assignment: Option>, + twodelta_consumed_first_frame: bool, } #[derive(Clone)] @@ -137,6 +140,20 @@ pub struct BenFrame { pub raw_data: Vec, } +enum StoredBenFrame { + Ben(BenFrame), + TwoDelta { frame: TwoDeltaFrame, count: u16 }, +} + +impl StoredBenFrame { + fn count(&self) -> u16 { + match self { + Self::Ben(frame) => frame.count, + Self::TwoDelta { count, .. } => *count, + } + } +} + impl BenDecoder { /// Create a decoder for an uncompressed BEN stream. /// @@ -162,11 +179,22 @@ impl BenDecoder { reader, sample_count: 0, variant: BenVariant::Standard, + previous_assignment: None, + twodelta_consumed_first_frame: false, }), b"MKVCHAIN BEN FILE" => Ok(BenDecoder { reader, sample_count: 0, variant: BenVariant::MkvChain, + previous_assignment: None, + twodelta_consumed_first_frame: false, + }), + b"TWODELTA BEN FILE" => Ok(BenDecoder { + reader, + sample_count: 0, + variant: BenVariant::TwoDelta, + previous_assignment: None, + twodelta_consumed_first_frame: false, }), _ => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), } @@ -205,13 +233,8 @@ impl BenDecoder { Ok(()) } - /// Read and return the next raw BEN frame from the underlying stream. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_frame_from_reader(&mut self) -> Option> { + /// Read and return the next raw BEN frame stored in standard BEN layout. + fn pop_standard_frame_from_reader(&mut self, with_count: bool) -> Option> { let mut b1 = [0u8; 1]; let max_val_bits = match self.reader.read_exact(&mut b1) { Ok(()) => b1[0], @@ -241,7 +264,7 @@ impl BenDecoder { return Some(Err(e)); } - let count = if self.variant == BenVariant::MkvChain { + let count = if with_count { match self.reader.read_u16::() { Ok(c) => c, Err(e) => return Some(Err(e)), @@ -259,6 +282,78 @@ impl BenDecoder { })) } + /// Read and return the next raw TwoDelta frame from the underlying stream. + fn pop_twodelta_frame_from_reader(&mut self) -> Option> { + let pair_a = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + tracing::trace!(""); + tracing::trace!("Done!"); + return None; + } + return Some(Err(e)); + } + }; + + let pair_b = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + let mut bits = [0u8; 1]; + if let Err(e) = self.reader.read_exact(&mut bits) { + return Some(Err(e)); + } + let max_len_bits = bits[0]; + + let n_bytes = match self.reader.read_u32::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + let mut payload = vec![0u8; n_bytes as usize]; + if let Err(e) = self.reader.read_exact(&mut payload) { + return Some(Err(e)); + } + + let count = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + Some(Ok(StoredBenFrame::TwoDelta { + frame: TwoDeltaFrame::from_parts((pair_a, pair_b), max_len_bits, payload), + count, + })) + } + + /// Read and return the next stored frame from the underlying BEN stream. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. + fn pop_frame_from_reader(&mut self) -> Option> { + match self.variant { + BenVariant::Standard => self + .pop_standard_frame_from_reader(false) + .map(|res| res.map(StoredBenFrame::Ben)), + BenVariant::MkvChain => self + .pop_standard_frame_from_reader(true) + .map(|res| res.map(StoredBenFrame::Ben)), + BenVariant::TwoDelta => { + if !self.twodelta_consumed_first_frame { + self.twodelta_consumed_first_frame = true; + self.pop_standard_frame_from_reader(true) + .map(|res| res.map(StoredBenFrame::Ben)) + } else { + self.pop_twodelta_frame_from_reader() + } + } + } + } + /// Consume this decoder and iterate over raw BEN frames instead of /// materialized assignments. /// @@ -278,10 +373,10 @@ impl BenDecoder { /// /// Returns the number of remaining samples in the stream. pub fn count_samples(self) -> io::Result { + let mut this = self; let mut total = 0usize; - for frame_res in self.into_frames() { - let f = frame_res?; - total += f.count as usize; + while let Some(frame_res) = this.pop_frame_from_reader() { + total += frame_res?.count() as usize; } Ok(total) } @@ -306,23 +401,119 @@ fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { .map(rle_to_vec) } +/// Decode the run-length payload of a TwoDelta frame. +fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { + let mut items = Vec::new(); + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + let mut current: Option = None; + + for &byte in frame.payload() { + buffer |= (byte as u32).to_be() >> n_bits_in_buff; + n_bits_in_buff += 8; + + if n_bits_in_buff >= frame.max_len_bits() as u16 && current.is_none() { + current = Some((buffer >> (32 - frame.max_len_bits())) as u16); + buffer <<= frame.max_len_bits(); + n_bits_in_buff -= frame.max_len_bits() as u16; + } + + if let Some(item) = current.take() { + if item > 0 { + items.push(item); + } + } + + while n_bits_in_buff >= frame.max_len_bits() as u16 { + let item = (buffer >> (32 - frame.max_len_bits())) as u16; + buffer <<= frame.max_len_bits(); + n_bits_in_buff -= frame.max_len_bits() as u16; + if item > 0 { + items.push(item); + } + } + } + + Ok(items) +} + +/// Decode a raw TwoDelta frame into a full assignment vector. +fn decode_twodelta_frame_to_assignment( + previous_assignment: &[u16], + frame: &TwoDeltaFrame, +) -> io::Result> { + let mut pair_positions = Vec::new(); + pair_positions.reserve(previous_assignment.len()); + let (first, second) = frame.pair(); + + for (idx, &assignment) in previous_assignment.iter().enumerate() { + if assignment == first || assignment == second { + pair_positions.push(idx); + } + } + + let run_lengths = decode_twodelta_run_lengths(frame)?; + let expected_total: usize = run_lengths.iter().map(|&len| len as usize).sum(); + if expected_total != pair_positions.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta payload does not match the previous assignment's pair positions", + )); + } + + let mut assignment = previous_assignment.to_vec(); + let mut write_idx = 0usize; + let mut current_value = first; + + for run_len in run_lengths { + for _ in 0..run_len { + assignment[pair_positions[write_idx]] = current_value; + write_idx += 1; + } + current_value = if current_value == first { second } else { first }; + } + + Ok(assignment) +} + +fn decode_stored_frame_to_assignment( + previous_assignment: Option<&[u16]>, + frame: &StoredBenFrame, +) -> io::Result> { + match frame { + StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), + StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( + previous_assignment.ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta frame encountered before an initial BEN frame", + ) + })?, + frame, + ), + } +} + impl Iterator for BenDecoder { type Item = io::Result; /// Decode and return the next assignment from the BEN stream. fn next(&mut self) -> Option> { - let ben_frame = match self.pop_frame_from_reader() { + let frame = match self.pop_frame_from_reader() { Some(Ok(frame)) => frame, Some(Err(e)) => return Some(Err(e)), None => return None, }; - let assignment = match decode_ben_frame_to_assignment(&ben_frame) { + let assignment = match decode_stored_frame_to_assignment(self.previous_assignment.as_deref(), &frame) + { Ok(assgn) => assgn, Err(e) => return Some(Err(e)), }; - self.sample_count += ben_frame.count as usize; + let count = frame.count(); + self.previous_assignment = Some(assignment.clone()); + self.sample_count += count as usize; progress!("Decoding sample: {}\r", self.sample_count); - Some(Ok((assignment, ben_frame.count))) + Some(Ok((assignment, count))) } } @@ -353,7 +544,32 @@ impl Iterator for BenFrameDecoeder { /// Return the next raw BEN frame from the input stream. fn next(&mut self) -> Option { - self.inner.pop_frame_from_reader() + match self.inner.variant { + BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() { + Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), + Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + "unexpected TwoDelta frame in non-TwoDelta BEN stream", + ))), + Some(Err(err)) => Some(Err(err)), + None => None, + }, + BenVariant::TwoDelta => match self.inner.next() { + Some(Ok((assignment, count))) => { + let encoded = encode_ben_vec_from_assign(&assignment); + let raw_data = encoded.as_slice()[6..].to_vec(); + Some(Ok(BenFrame { + max_val_bits: encoded.max_val_bits(), + max_len_bits: encoded.max_len_bits(), + count, + n_bytes: encoded.n_bytes(), + raw_data, + })) + } + Some(Err(err)) => Some(Err(err)), + None => None, + }, + } } } @@ -444,6 +660,9 @@ impl XBenDecoder { } None } + BenVariant::TwoDelta => { + panic!("not implemented"); + } } } diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index f1bc05b..0d5f5ab 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -1,17 +1,33 @@ -use crate::codec::encode::encode_ben32_line; -use crate::codec::encode::encode_ben_vec_from_rle; +use crate::codec::encode::{ + encode_ben32_line, encode_ben_vec_from_assign, encode_twodelta_vec, BenFrame, IdVec, + TwoDeltaFrame, +}; use crate::codec::translate::ben_to_ben32_lines; -use crate::util::rle::assign_to_rle; use crate::BenVariant; use serde_json::Value; use std::io::{self, BufRead, Result, Write}; use xz2::write::XzEncoder; +enum BufferedBenFrame { + Ben(BenFrame), + TwoDelta(TwoDeltaFrame), +} + +impl BufferedBenFrame { + fn as_slice(&self) -> &[u8] { + match self { + Self::Ben(frame) => frame.as_slice(), + Self::TwoDelta(frame) => frame.as_slice(), + } + } +} + /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { writer: W, - previous_sample: Vec, - count: u16, + previous_sample: Vec, + previous_encoded_sample: Option, + sample_count: u16, variant: BenVariant, complete: bool, } @@ -29,68 +45,95 @@ impl BenEncoder { /// Returns a new encoder ready to accept assignments or RLE frames. pub fn new(mut writer: W, variant: BenVariant) -> Self { match variant { - BenVariant::Standard => { - writer.write_all(b"STANDARD BEN FILE").unwrap(); - } - BenVariant::MkvChain => { - writer.write_all(b"MKVCHAIN BEN FILE").unwrap(); - } - } + BenVariant::Standard => writer.write_all(b"STANDARD BEN FILE").unwrap(), + BenVariant::MkvChain => writer.write_all(b"MKVCHAIN BEN FILE").unwrap(), + BenVariant::TwoDelta => writer.write_all(b"TWODELTA BEN FILE").unwrap(), + }; + BenEncoder { writer, previous_sample: Vec::new(), - count: 0, + previous_encoded_sample: None, + sample_count: 0, complete: false, variant, } } - /// Encode and write a run-length encoded assignment vector as one BEN frame. + fn flush_pending_frame(&mut self) -> Result<()> { + if self.sample_count == 0 { + return Ok(()); + } + + let encoded = self + .previous_encoded_sample + .as_ref() + .expect("missing previous BEN frame"); + self.writer.write_all(encoded.as_slice())?; + + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { + self.writer.write_all(&self.sample_count.to_be_bytes())?; + } + + Ok(()) + } + + /// Encode and write a full assignment vector. /// /// # Arguments /// - /// * `rle_vec` - The assignment vector in `(value, count)` form. + /// * `assign_vec` - The full assignment vector to encode. /// /// # Returns /// - /// Returns `Ok(())` after the frame has been queued or written. - pub fn write_rle(&mut self, rle_vec: Vec<(u16, u16)>) -> Result<()> { + /// Returns `Ok(())` after the assignment has been queued or written. + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { match self.variant { BenVariant::Standard => { - let encoded = encode_ben_vec_from_rle(rle_vec); - self.writer.write_all(&encoded)?; + let encoded = encode_ben_vec_from_assign(&assign_vec); + self.writer.write_all(encoded.as_slice())?; Ok(()) } BenVariant::MkvChain => { - let encoded = encode_ben_vec_from_rle(rle_vec); - if encoded == self.previous_sample { - self.count += 1; - } else { - if self.count > 0 { - self.writer.write_all(&self.previous_sample)?; - self.writer.write_all(&self.count.to_be_bytes())?; - } - self.previous_sample = encoded; - self.count = 1; + let repeated = assign_vec == self.previous_sample; + if repeated { + self.sample_count += 1; + return Ok(()); + } + + if self.sample_count > 0 { + self.flush_pending_frame()?; } + + let encoded = encode_ben_vec_from_assign(&assign_vec); + self.previous_encoded_sample = Some(BufferedBenFrame::Ben(encoded)); + self.previous_sample = assign_vec; + self.sample_count = 1; + Ok(()) } - } - } + BenVariant::TwoDelta => { + if self.previous_sample.is_empty() { + let encoded = encode_ben_vec_from_assign(&assign_vec); + self.previous_encoded_sample = Some(BufferedBenFrame::Ben(encoded)); + self.previous_sample = assign_vec; + self.sample_count = 1; + return Ok(()); + } - /// Encode and write a full assignment vector. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - let rle_vec = assign_to_rle(assign_vec); - self.write_rle(rle_vec)?; - Ok(()) + if assign_vec == self.previous_sample { + self.sample_count += 1; + return Ok(()); + } + + let encoded = encode_twodelta_vec(&self.previous_sample, &assign_vec)?; + self.flush_pending_frame()?; + self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); + self.previous_sample = assign_vec; + self.sample_count = 1; + Ok(()) + } + } } /// Encode and write a JSON assignment record. @@ -134,9 +177,7 @@ impl BenEncoder { }) .collect::>>()?; - let rle_vec = assign_to_rle(converted_vec); - self.write_rle(rle_vec)?; - Ok(()) + self.write_assignment(converted_vec) } /// Flush any buffered repetition state to the underlying writer. @@ -151,14 +192,8 @@ impl BenEncoder { if self.complete { return Ok(()); } - if self.variant == BenVariant::MkvChain && self.count > 0 { - self.writer - .write_all(&self.previous_sample) - .expect("Error while writing last line to file"); - self.writer - .write_all(&self.count.to_be_bytes()) - .expect("Error while writing last count to file"); - } + self.flush_pending_frame() + .expect("Error while flushing trailing BEN frame"); self.complete = true; Ok(()) } @@ -174,7 +209,7 @@ impl Drop for BenEncoder { /// A struct to make the writing of XBEN files easier and more ergonomic. pub struct XBenEncoder { encoder: XzEncoder, - previous_sample: Vec, + previous_sample: IdVec, count: u16, variant: BenVariant, } @@ -197,7 +232,7 @@ impl XBenEncoder { encoder.write_all(b"STANDARD BEN FILE").unwrap(); XBenEncoder { encoder, - previous_sample: Vec::new(), + previous_sample: IdVec::U8(Vec::new()), count: 0, variant: BenVariant::Standard, } @@ -206,11 +241,14 @@ impl XBenEncoder { encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); XBenEncoder { encoder, - previous_sample: Vec::new(), + previous_sample: IdVec::U8(Vec::new()), count: 0, variant: BenVariant::MkvChain, } } + BenVariant::TwoDelta => { + panic!("not implemented"); + } } } @@ -227,20 +265,24 @@ impl XBenEncoder { let encoded = encode_ben32_line(data)?; match self.variant { BenVariant::Standard => { - self.encoder.write_all(&encoded)?; + self.encoder.write_all(encoded.as_u8_slice()?)?; } BenVariant::MkvChain => { if encoded == self.previous_sample { self.count += 1; } else { if self.count > 0 { - self.encoder.write_all(&self.previous_sample)?; + self.encoder + .write_all(self.previous_sample.as_u8_slice()?)?; self.encoder.write_all(&self.count.to_be_bytes())?; } self.previous_sample = encoded; self.count = 1; } } + BenVariant::TwoDelta => { + panic!("not implemented"); + } } Ok(()) } @@ -275,7 +317,11 @@ impl Drop for XBenEncoder { fn drop(&mut self) { if self.variant == BenVariant::MkvChain && self.count > 0 { self.encoder - .write_all(&self.previous_sample) + .write_all( + self.previous_sample + .as_u8_slice() + .expect("Error writing last line to file"), + ) .expect("Error writing last line to file"); self.encoder .write_all(&self.count.to_be_bytes()) diff --git a/ben/src/lib.rs b/ben/src/lib.rs index b96cd51..cde7f7c 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -54,4 +54,6 @@ pub enum BenVariant { Standard, /// Store one frame plus a repetition count for repeated consecutive samples. MkvChain, + /// Reserved for a future delta-based variant. + TwoDelta, } diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index ce2864b..1e3c11a 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -1,8 +1,7 @@ //! Sample extraction helpers for BEN and XBEN streams. -use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::codec::decode::decode_ben32_line; use crate::io::reader::{BenDecoder, XBenDecoder}; -use crate::util::rle::rle_to_vec; use serde_json::Error as SerdeError; use std::fmt; use std::io::Cursor; @@ -113,25 +112,14 @@ pub fn extract_assignment_ben( }); } - let inner_decoder = BenDecoder::new(&mut reader).expect("Failed to create XBenDecoder"); - let frame_iterator = inner_decoder.into_frames(); - let mut current_sample = 1; - for frame in frame_iterator { - let frame = frame.map_err(SampleError::new_io_error)?; - if current_sample == sample_number || current_sample + frame.count as usize > sample_number - { - match decode_ben_line( - Cursor::new(&frame.raw_data), - frame.max_val_bits, - frame.max_len_bits, - frame.n_bytes, - ) { - Ok(assignment_rle) => return Ok(rle_to_vec(assignment_rle)), - Err(e) => return Err(SampleError::new_io_error(e)), - }; + let inner_decoder = BenDecoder::new(&mut reader).expect("Failed to create XBenDecoder"); + for record in inner_decoder { + let (assignment, count) = record.map_err(SampleError::new_io_error)?; + if current_sample == sample_number || current_sample + count as usize > sample_number { + return Ok(assignment); } - current_sample += frame.count as usize; + current_sample += count as usize; } Err(SampleError { diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 1ed2982..522ca1d 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -141,7 +141,7 @@ fn relabel_ben_lines_impl( }; let relabeled = encode_ben_vec_from_rle(ben_line); - writer.write_all(&relabeled)?; + writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; } @@ -344,7 +344,7 @@ fn relabel_ben_lines_with_map_impl( }; let relabeled = encode_ben_vec_from_rle(new_rle.clone()); - writer.write_all(&relabeled)?; + writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; } diff --git a/ben/src/util/rle/mod.rs b/ben/src/util/rle/mod.rs index 0f4d532..0739819 100644 --- a/ben/src/util/rle/mod.rs +++ b/ben/src/util/rle/mod.rs @@ -9,9 +9,9 @@ /// # Returns /// /// Returns the assignment vector as `(value, count)` pairs. -pub fn assign_to_rle(assign_vec: Vec) -> Vec<(u16, u16)> { +pub fn assign_to_rle(assign_vec: impl AsRef<[u16]>) -> Vec<(u16, u16)> { let mut rle_vec: Vec<(u16, u16)> = Vec::new(); - assign_slice_to_rle(&assign_vec, &mut rle_vec); + assign_slice_to_rle(assign_vec.as_ref(), &mut rle_vec); rle_vec } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 08d79db..0d8d5c5 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -4,8 +4,8 @@ use binary_ensemble::codec::decode::{ decode_ben_line, decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, }; use binary_ensemble::codec::encode::{ - encode_ben_to_xben, encode_ben_vec_from_rle, encode_jsonl_to_ben, encode_jsonl_to_xben, - xz_compress, + encode_ben_to_xben, encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_jsonl_to_ben, + encode_jsonl_to_xben, xz_compress, }; use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, DecoderInitError, Frame, @@ -1055,3 +1055,79 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { vec![2, 4] ); } + +#[test] +fn twodelta_roundtrips_and_counts_repeated_frames() { + let assignments = vec![ + vec![1u16, 1, 2, 2, 3, 3], + vec![1u16, 1, 2, 2, 3, 3], + vec![1u16, 2, 2, 1, 3, 3], + vec![1u16, 2, 2, 1, 3, 3], + vec![2u16, 2, 1, 1, 3, 3], + ]; + + let mut ben = Vec::new(); + { + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + for assignment in &assignments { + encoder.write_assignment(assignment.clone()).unwrap(); + } + encoder.finish().unwrap(); + } + + let records = collect_records(BenDecoder::new(ben.as_slice()).unwrap()).unwrap(); + assert_eq!( + records, + vec![ + (assignments[0].clone(), 2), + (assignments[2].clone(), 2), + (assignments[4].clone(), 1), + ] + ); + + let mut jsonl = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut jsonl).unwrap(); + assert_eq!(jsonl, jsonl_from_assignments(&assignments)); + + let frames = BenDecoder::new(ben.as_slice()).unwrap().into_frames(); + assert_eq!(collect_frames(frames.map(|res| res.map(|f| (Frame::Ben(f.clone()), f.count)))).unwrap().len(), 3); +} + +#[test] +fn twodelta_first_frame_carries_repeat_trailer() { + let first = vec![1u16, 1, 2, 2, 3, 3]; + let second = vec![1u16, 2, 2, 1, 3, 3]; + + let mut ben = Vec::new(); + { + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + encoder.write_assignment(first.clone()).unwrap(); + encoder.write_assignment(first.clone()).unwrap(); + encoder.write_assignment(second).unwrap(); + encoder.finish().unwrap(); + } + + let expected_first = encode_ben_vec_from_assign(&first); + assert_eq!(&ben[..17], b"TWODELTA BEN FILE"); + assert_eq!( + &ben[17..17 + expected_first.as_slice().len()], + expected_first.as_slice() + ); + let count_offset = 17 + expected_first.as_slice().len(); + assert_eq!( + u16::from_be_bytes([ben[count_offset], ben[count_offset + 1]]), + 2 + ); +} + +#[test] +fn twodelta_rejects_non_pair_transition() { + let mut ben = Vec::new(); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); + let err = encoder + .write_assignment(vec![1u16, 3, 2, 4]) + .err() + .unwrap(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} From 57e3cd9558eb65bcb0a287044770c6de11583f43 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 18:58:11 -0600 Subject: [PATCH 020/221] more tests --- ben/tests/test_impls_pipeline.rs | 165 +++++++++++++++++++++++++++++++ ben/tests/test_pipeline.rs | 82 +++++++++++++++ 2 files changed, 247 insertions(+) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 0d8d5c5..aeade60 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -12,6 +12,7 @@ use binary_ensemble::io::reader::{ SubsampleFrameDecoder, XBenDecoder, }; use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::ops::extract::extract_assignment_ben; use binary_ensemble::BenVariant; use proptest::prelude::*; @@ -115,6 +116,71 @@ fn strat_assignment_seq() -> impl Strategy>> { }) } +/// Strategy for sequences where every transition is valid for TwoDelta. +fn strat_twodelta_seq() -> impl Strategy>> { + ( + strat_assignment(32, 24, 300), + prop::collection::vec(any::(), 0..=40), + ) + .prop_map(|(base, ops)| { + let mut current = base; + let mut seq = vec![current.clone()]; + + for op in ops { + let mut next = current.clone(); + let mut distinct: Vec = current.clone(); + distinct.sort_unstable(); + distinct.dedup(); + + if distinct.len() < 2 || op % 5 == 0 { + seq.push(next.clone()); + current = next; + continue; + } + + let a = distinct[(op as usize) % distinct.len()]; + let mut b = distinct[((op >> 8) as usize) % distinct.len()]; + if a == b { + b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + } + + let positions: Vec = current + .iter() + .enumerate() + .filter_map(|(idx, &value)| ((value == a) || (value == b)).then_some(idx)) + .collect(); + + if positions.is_empty() { + seq.push(next.clone()); + current = next; + continue; + } + + let mut remaining = positions.len(); + let mut write_idx = 0usize; + let mut seed = op.rotate_left(13) ^ 0x9E37_79B9_7F4A_7C15; + let mut value = if op & 1 == 0 { a } else { b }; + + while remaining > 0 { + let run_len = 1 + (seed as usize % remaining); + for _ in 0..run_len { + next[positions[write_idx]] = value; + write_idx += 1; + } + remaining -= run_len; + value = if value == a { b } else { a }; + seed = seed.rotate_left(7) ^ 0xA076_1D64_78BD_642F; + } + + seq.push(next.clone()); + current = next; + } + + seq + }) + .prop_filter("TwoDelta sequences must be non-empty", |seq| !seq.is_empty()) +} + // Random (small) thread count and compression level for MT encoder. fn strat_threads_levels() -> impl Strategy { (1u32..=4, 0u32..=9) @@ -149,6 +215,19 @@ proptest! { prop_assert_eq!(out, jsonl); } + // JSONL -> BEN(TwoDelta) -> JSONL round-trip. + #[test] + fn fuzz_roundtrip_ben_twodelta(seq in strat_twodelta_seq()) { + let jsonl = jsonl_from_assignments(&seq); + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::TwoDelta).unwrap(); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + prop_assert_eq!(out, jsonl); + } + // JSONL -> XBEN(Standard) -> BEN -> JSONL // Also vary threads & compression level. #[test] @@ -271,6 +350,20 @@ proptest! { } + // Iterator surface: BenDecoder over TwoDelta BEN matches JSONL. + #[test] + fn fuzz_bendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq()) { + let jsonl = jsonl_from_assignments(&seq); + + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::TwoDelta).unwrap(); + + let mut dec = BenDecoder::new(ben.as_slice()).unwrap(); + let recs = collect_records(&mut dec).unwrap(); + let out = jsonl_from_records(&recs, 0); + prop_assert_eq!(out, jsonl); + } + // SubsampleDecoder: select indices (by_indices) #[test] fn fuzz_subsample_by_indices(seq in strat_assignment_seq(), params in strat_threads_levels()) { @@ -380,6 +473,39 @@ proptest! { prop_assert_eq!(picked, truth); } + #[test] + fn fuzz_subsample_by_indices_twodelta(seq in strat_twodelta_seq()) { + let jsonl = jsonl_from_assignments(&seq); + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::TwoDelta).unwrap(); + + let n = seq.len().max(1); + let mut want: Vec = (1..=n).step_by(3).collect(); + if want.is_empty() { + want.push(1); + } + + let mut sub = BenDecoder::new(ben.as_slice()) + .unwrap() + .into_subsample_by_indices(want.clone()); + let recs = collect_records(&mut sub).unwrap(); + + let truth: Vec> = (1..=n) + .zip(seq.iter()) + .filter(|(i, _)| want.contains(i)) + .map(|(_, v)| v.clone()) + .collect(); + + let mut picked: Vec> = Vec::new(); + for (assignment, count) in recs { + for _ in 0..count { + picked.push(assignment.clone()); + } + } + + prop_assert_eq!(picked, truth); + } + // xz_compress / xz_decompress round-trip on arbitrary bytes. #[test] fn fuzz_xz_roundtrip(bytes in proptest::collection::vec(any::(), 0..=200_000), params in strat_threads_levels()) { @@ -1131,3 +1257,42 @@ fn twodelta_rejects_non_pair_transition() { .unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } + +#[test] +fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { + let assignments = vec![ + vec![1u16, 1, 2, 2, 3, 3], + vec![1u16, 1, 2, 2, 3, 3], + vec![1u16, 2, 2, 1, 3, 3], + vec![2u16, 2, 1, 1, 3, 3], + ]; + + let mut ben = Vec::new(); + let jsonl = jsonl_from_assignments(&assignments); + encode_jsonl_to_ben( + BufReader::new(jsonl.as_slice()), + &mut ben, + BenVariant::TwoDelta, + ) + .unwrap(); + + assert_eq!(BenDecoder::new(ben.as_slice()).unwrap().count_samples().unwrap(), 4); + + let frames: Vec<_> = BenDecoder::new(ben.as_slice()) + .unwrap() + .into_frames() + .collect::>>() + .unwrap(); + assert_eq!(frames.len(), 3); + assert_eq!(frames[0].count, 2); + assert_eq!(frames[1].count, 1); + assert_eq!(frames[2].count, 1); + + let picked = extract_assignment_ben(ben.as_slice(), 3).unwrap(); + assert_eq!(picked, assignments[2]); + + let ben_path = unique_temp_path("twodelta_sample.ben"); + fs::write(&ben_path, &ben).unwrap(); + assert_eq!(count_samples_from_file(&ben_path, "ben").unwrap(), 4); + fs::remove_file(ben_path).unwrap(); +} diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 1b8be71..410ca09 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -127,6 +127,88 @@ fn test_mkvben_pipeline() { assert_eq!(original_data, output_writer); } +#[test] +fn test_twodeltaben_pipeline() { + let seed = 129530786u64; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + + let n_samples = 100; + let shape = 2.0; + let scale = 50.0; + let gamma = Gamma::new(shape, scale).unwrap(); + let mu = Uniform::new(1, 11).expect("Could not make uniform sampler"); + + let mut current: Vec = (0..400).map(|_| mu.sample(&mut rng) as u16).collect(); + let mut buffer = Cursor::new(Vec::new()); + + for i in 0..n_samples { + eprint!("Generating sample: {}\r", i + 1); + if i > 0 && i % 5 != 0 { + let mut distinct = current.clone(); + distinct.sort_unstable(); + distinct.dedup(); + + if distinct.len() >= 2 { + let a = distinct[(i * 7) % distinct.len()]; + let mut b = distinct[(i * 11) % distinct.len()]; + if a == b { + b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + } + + let positions: Vec = current + .iter() + .enumerate() + .filter_map(|(idx, &value)| ((value == a) || (value == b)).then_some(idx)) + .collect(); + + let mut next = current.clone(); + let mut remaining = positions.len(); + let mut cursor = 0usize; + let mut seed_word = i as u64 ^ 0x9E37_79B9_7F4A_7C15; + let mut value = if i % 2 == 0 { a } else { b }; + + while remaining > 0 { + let run_len = 1 + (seed_word as usize % remaining); + for _ in 0..run_len { + next[positions[cursor]] = value; + cursor += 1; + } + remaining -= run_len; + value = if value == a { b } else { a }; + seed_word = seed_word.rotate_left(9) ^ gamma.sample(&mut rng) as u64; + } + + current = next; + } + } + + writeln!( + &mut buffer, + "{}", + json!({ + "assignment": current.clone(), + "sample": i + 1, + }) + ) + .unwrap(); + } + + buffer.set_position(0); + + let mut input_writer = Vec::new(); + let mut output_writer = Vec::new(); + + encode_jsonl_to_ben(&mut buffer, &mut input_writer, BenVariant::TwoDelta).unwrap(); + buffer.set_position(0); + decode_ben_to_jsonl(&input_writer[..], &mut output_writer).unwrap(); + + buffer.set_position(0); + let mut original_data = Vec::new(); + buffer.read_to_end(&mut original_data).unwrap(); + + assert_eq!(original_data, output_writer); +} + #[test] fn test_xben_pipeline() { let seed = 129530786u64; From 80f5eb67f4228c4b117b6e0d9a253c4f9e72e676 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 20:36:26 -0600 Subject: [PATCH 021/221] Opitimize the checks for eqality to improve write speed --- ben/src/codec/encode/ben.rs | 146 +++++++++++------ ben/src/codec/encode/mod.rs | 2 +- ben/src/io/writer.rs | 260 +++++++++++++++++++++++++------ ben/tests/test_impls_pipeline.rs | 14 ++ 4 files changed, 328 insertions(+), 94 deletions(-) diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 46130d8..26fe8dd 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,5 +1,6 @@ use super::types::{BenFrame, IdVec, TwoDeltaFrame}; use serde_json::Value; +use std::collections::HashMap; use std::io; /// Encode a JSON assignment record into the ben32 frame representation used by @@ -113,6 +114,15 @@ pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> BenFrame { pub fn encode_twodelta_vec( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, +) -> io::Result { + encode_twodelta_vec_with_hint(previous_assignment, new_assignment, None, None) +} + +pub(crate) fn encode_twodelta_vec_with_hint( + previous_assignment: impl AsRef<[u16]>, + new_assignment: impl AsRef<[u16]>, + delta_pair: Option<(u16, u16)>, + masks: Option<&HashMap>>, ) -> io::Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); @@ -124,67 +134,98 @@ pub fn encode_twodelta_vec( )); } - let mut pair_ids = [0u16; 2]; - let mut pair_len = 0usize; - for (&previous, ¤t) in previous_assignment.iter().zip(new_assignment.iter()) { - if previous == current { - continue; - } - for value in [previous, current] { - if !pair_ids[..pair_len].contains(&value) { - if pair_len == 2 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta transitions may involve at most two assignment ids", - )); + let pair = if let Some(pair) = delta_pair { + pair + } else { + let mut pair_ids = [0u16; 2]; + let mut pair_len = 0usize; + for (&previous, ¤t) in previous_assignment.iter().zip(new_assignment.iter()) { + if previous == current { + continue; + } + for value in [previous, current] { + if !pair_ids[..pair_len].contains(&value) { + if pair_len == 2 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta transitions may involve at most two assignment ids", + )); + } + pair_ids[pair_len] = value; + pair_len += 1; } - pair_ids[pair_len] = value; - pair_len += 1; } } - } - if pair_len == 0 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta cannot encode identical assignments as a delta frame", - )); - } - - let pair = if pair_len == 1 { - (pair_ids[0], pair_ids[0]) - } else { - (pair_ids[0], pair_ids[1]) - }; - - let mut pair_positions = Vec::new(); - pair_positions.reserve(previous_assignment.len()); - for (idx, (&previous, ¤t)) in previous_assignment - .iter() - .zip(new_assignment.iter()) - .enumerate() - { - let previous_in_pair = previous == pair.0 || previous == pair.1; - let current_in_pair = current == pair.0 || current == pair.1; - - if previous_in_pair != current_in_pair { + if pair_len == 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "TwoDelta requires the changed id pair to occupy the same positions", + "TwoDelta cannot encode identical assignments as a delta frame", )); } - if !previous_in_pair && previous != current { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta found a change outside the selected id pair", - )); + if pair_len == 1 { + (pair_ids[0], pair_ids[0]) + } else { + (pair_ids[0], pair_ids[1]) } + }; - if previous_in_pair { - pair_positions.push(idx); + let pair_positions = if let Some(masks) = masks { + match (masks.get(&pair.0), masks.get(&pair.1)) { + (Some(mask_a), Some(mask_b)) if pair.0 != pair.1 => { + let mut merged = Vec::with_capacity(mask_a.len() + mask_b.len()); + let (mut i, mut j) = (0usize, 0usize); + while i < mask_a.len() || j < mask_b.len() { + if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { + merged.push(mask_a[i]); + i += 1; + } else { + merged.push(mask_b[j]); + j += 1; + } + } + merged + } + (Some(mask), _) => mask.clone(), + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta pair mask is missing for the previous assignment", + )); + } } - } + } else { + let mut pair_positions = Vec::new(); + pair_positions.reserve(previous_assignment.len()); + for (idx, (&previous, ¤t)) in previous_assignment + .iter() + .zip(new_assignment.iter()) + .enumerate() + { + let previous_in_pair = previous == pair.0 || previous == pair.1; + let current_in_pair = current == pair.0 || current == pair.1; + + if previous_in_pair != current_in_pair { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta requires the changed id pair to occupy the same positions", + )); + } + + if !previous_in_pair && previous != current { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta found a change outside the selected id pair", + )); + } + + if previous_in_pair { + pair_positions.push(idx); + } + } + pair_positions + }; if pair_positions.is_empty() { return Err(io::Error::new( @@ -208,7 +249,14 @@ pub fn encode_twodelta_vec( let mut current_run = 0u16; for &idx in &pair_positions { + let previous = previous_assignment[idx]; let value = new_assignment[idx]; + if previous != pair.0 && previous != pair.1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta pair mask referenced an index outside the selected id pair", + )); + } if value != ordered_pair.0 && value != ordered_pair.1 { return Err(io::Error::new( io::ErrorKind::InvalidData, diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index fa6b28a..000e936 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -5,7 +5,7 @@ mod jsonl; mod types; mod xz; -pub(crate) use ben::encode_ben32_line; +pub(crate) use ben::{encode_ben32_line, encode_twodelta_vec_with_hint}; pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 0d5f5ab..87c23a8 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -1,10 +1,11 @@ use crate::codec::encode::{ - encode_ben32_line, encode_ben_vec_from_assign, encode_twodelta_vec, BenFrame, IdVec, + encode_ben32_line, encode_ben_vec_from_assign, encode_twodelta_vec_with_hint, BenFrame, IdVec, TwoDeltaFrame, }; use crate::codec::translate::ben_to_ben32_lines; use crate::BenVariant; use serde_json::Value; +use std::collections::HashMap; use std::io::{self, BufRead, Result, Write}; use xz2::write::XzEncoder; @@ -22,10 +23,17 @@ impl BufferedBenFrame { } } +#[derive(Clone, Copy, Debug, Default)] +struct AssignmentHints { + is_repeated: bool, + delta_pair: Option<(u16, u16)>, +} + /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { writer: W, previous_sample: Vec, + previous_masks: HashMap>, previous_encoded_sample: Option, sample_count: u16, variant: BenVariant, @@ -53,6 +61,7 @@ impl BenEncoder { BenEncoder { writer, previous_sample: Vec::new(), + previous_masks: HashMap::new(), previous_encoded_sample: None, sample_count: 0, complete: false, @@ -60,43 +69,125 @@ impl BenEncoder { } } - fn flush_pending_frame(&mut self) -> Result<()> { - if self.sample_count == 0 { - return Ok(()); + fn rebuild_previous_masks(&mut self) { + self.previous_masks.clear(); + for (idx, &assignment) in self.previous_sample.iter().enumerate() { + self.previous_masks.entry(assignment).or_default().push(idx); } + } - let encoded = self - .previous_encoded_sample - .as_ref() - .expect("missing previous BEN frame"); - self.writer.write_all(encoded.as_slice())?; + fn set_previous_sample( + &mut self, + sample: Vec, + encoded: BufferedBenFrame, + sample_count: u16, + ) { + self.previous_sample = sample; + self.rebuild_previous_masks(); + self.previous_encoded_sample = Some(encoded); + self.sample_count = sample_count; + } - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { - self.writer.write_all(&self.sample_count.to_be_bytes())?; + fn analyze_assignment_transition( + previous_sample: &[u16], + assign_vec: &[u16], + ) -> AssignmentHints { + Self::analyze_twodelta_transition(previous_sample, assign_vec) + } + + fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return false; } - Ok(()) + for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { + if previous != current { + return false; + } + } + + true } - /// Encode and write a full assignment vector. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { + fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> AssignmentHints { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return AssignmentHints::default(); + } + + let Some(first_mismatch) = previous_sample + .iter() + .zip(assign_vec.iter()) + .position(|(&previous, ¤t)| previous != current) + else { + return AssignmentHints { + is_repeated: true, + delta_pair: None, + }; + }; + + let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); + + for (&previous, ¤t) in previous_sample + .iter() + .zip(assign_vec.iter()) + .skip(first_mismatch + 1) + { + if previous == current { + continue; + } + + if previous != pair.0 && previous != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + + if current != pair.0 && current != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + } + + AssignmentHints { + is_repeated: false, + delta_pair: Some(pair), + } + } + + fn write_assignment_with_hints( + &mut self, + assign_vec: Vec, + hints: AssignmentHints, + ) -> Result<()> { match self.variant { BenVariant::Standard => { + let repeated = Self::is_repeated_assignment(&self.previous_sample, &assign_vec); + if hints.is_repeated { + if let Some(encoded) = self.previous_encoded_sample.as_ref() { + self.writer.write_all(encoded.as_slice())?; + self.previous_sample = assign_vec; + return Ok(()); + } + } + + if repeated { + if let Some(encoded) = self.previous_encoded_sample.as_ref() { + self.writer.write_all(encoded.as_slice())?; + self.previous_sample = assign_vec; + return Ok(()); + } + } + let encoded = encode_ben_vec_from_assign(&assign_vec); self.writer.write_all(encoded.as_slice())?; + self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 0); Ok(()) } BenVariant::MkvChain => { - let repeated = assign_vec == self.previous_sample; - if repeated { + if Self::is_repeated_assignment(&self.previous_sample, &assign_vec) { self.sample_count += 1; return Ok(()); } @@ -106,36 +197,70 @@ impl BenEncoder { } let encoded = encode_ben_vec_from_assign(&assign_vec); - self.previous_encoded_sample = Some(BufferedBenFrame::Ben(encoded)); - self.previous_sample = assign_vec; - self.sample_count = 1; - + self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); Ok(()) } BenVariant::TwoDelta => { if self.previous_sample.is_empty() { let encoded = encode_ben_vec_from_assign(&assign_vec); - self.previous_encoded_sample = Some(BufferedBenFrame::Ben(encoded)); - self.previous_sample = assign_vec; - self.sample_count = 1; + self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); return Ok(()); } - if assign_vec == self.previous_sample { + if hints.is_repeated { self.sample_count += 1; return Ok(()); } - let encoded = encode_twodelta_vec(&self.previous_sample, &assign_vec)?; + let encoded = encode_twodelta_vec_with_hint( + &self.previous_sample, + &assign_vec, + hints.delta_pair, + Some(&self.previous_masks), + )?; self.flush_pending_frame()?; - self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); - self.previous_sample = assign_vec; - self.sample_count = 1; + self.set_previous_sample(assign_vec, BufferedBenFrame::TwoDelta(encoded), 1); Ok(()) } } } + fn flush_pending_frame(&mut self) -> Result<()> { + if self.sample_count == 0 { + return Ok(()); + } + + let encoded = self + .previous_encoded_sample + .as_ref() + .expect("missing previous BEN frame"); + self.writer.write_all(encoded.as_slice())?; + + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { + self.writer.write_all(&self.sample_count.to_be_bytes())?; + } + + Ok(()) + } + + /// Encode and write a full assignment vector. + /// + /// # Arguments + /// + /// * `assign_vec` - The full assignment vector to encode. + /// + /// # Returns + /// + /// Returns `Ok(())` after the assignment has been queued or written. + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { + let hints = if self.variant == BenVariant::TwoDelta { + Self::analyze_assignment_transition(&self.previous_sample, &assign_vec) + } else { + AssignmentHints::default() + }; + self.write_assignment_with_hints(assign_vec, hints) + } + /// Encode and write a JSON assignment record. /// /// The input must contain an `assignment` array of integers. Other fields @@ -155,9 +280,22 @@ impl BenEncoder { "'assignment' field either missing or is not an array of integers", ) })?; + let previous_len = self.previous_sample.len(); + let can_compare = previous_len == assign_vec.len(); + let mut hints = AssignmentHints::default(); + let mut mismatch_pair: Option<(u16, u16)> = None; + let mut twodelta_valid = true; + let track_repeated = matches!(self.variant, BenVariant::Standard | BenVariant::MkvChain) + && can_compare + && !self.previous_sample.is_empty(); + let track_twodelta = self.variant == BenVariant::TwoDelta && can_compare; + let mut twodelta_is_repeated = track_twodelta && !self.previous_sample.is_empty(); + let mut is_repeated = track_repeated; + let converted_vec = assign_vec .iter() - .map(|x| { + .enumerate() + .map(|(idx, x)| { let u = x.as_u64().ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidData, @@ -168,16 +306,50 @@ impl BenEncoder { ) })?; - u16::try_from(u).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", u), - ) - }) + u16::try_from(u) + .map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", u), + ) + }) + .inspect(|value| { + if track_repeated && is_repeated && self.previous_sample[idx] != *value { + is_repeated = false; + } + + if track_twodelta { + let previous = self.previous_sample[idx]; + if previous != *value { + twodelta_is_repeated = false; + if let Some(pair) = mismatch_pair { + if previous != pair.0 && previous != pair.1 + || *value != pair.0 && *value != pair.1 + { + twodelta_valid = false; + } + } else { + mismatch_pair = Some((previous, *value)); + } + } + } + }) }) .collect::>>()?; - self.write_assignment(converted_vec) + if track_repeated { + hints.is_repeated = is_repeated; + } else if track_twodelta { + hints.is_repeated = twodelta_is_repeated; + } else if self.variant == BenVariant::Standard || self.variant == BenVariant::MkvChain { + hints.is_repeated = false; + } + + if track_twodelta && !hints.is_repeated && twodelta_valid { + hints.delta_pair = mismatch_pair; + } + + self.write_assignment_with_hints(converted_vec, hints) } /// Flush any buffered repetition state to the underlying writer. diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index aeade60..68355c9 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1258,6 +1258,20 @@ fn twodelta_rejects_non_pair_transition() { assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } +#[test] +fn twodelta_write_json_value_rejects_non_pair_transition() { + let mut ben = Vec::new(); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + encoder + .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) + .unwrap(); + let err = encoder + .write_json_value(json!({"assignment": [1u16, 3, 2, 4]})) + .err() + .unwrap(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + #[test] fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { let assignments = vec![ From a19fb264500e36870cc172aff5e5a28b12dd3ded Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 20:47:28 -0600 Subject: [PATCH 022/221] add twodelta to reben --- ben/src/ops/relabel/mod.rs | 131 +++++++++++++++++++++++++++++++---- ben/src/ops/relabel/tests.rs | 69 ++++++++++++++++++ ben/tests/test_cli.rs | 91 ++++++++++++++++++++++++ 3 files changed, 279 insertions(+), 12 deletions(-) diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 522ca1d..cdb3694 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -2,6 +2,8 @@ use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; +use crate::io::reader::BenDecoder; +use crate::io::writer::BenEncoder; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -37,6 +39,84 @@ fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result< Ok(permutation) } +fn canonicalize_assignment(assignment: &[u16]) -> Vec { + let mut label_map = HashMap::new(); + let mut next_label = 0u16; + let mut out = Vec::with_capacity(assignment.len()); + + for &value in assignment { + let mapped = match label_map.get(&value) { + Some(mapped) => *mapped, + None => { + next_label += 1; + label_map.insert(value, next_label); + next_label + } + }; + out.push(mapped); + } + + out +} + +fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result> { + if assignment.len() != permutation.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "Relabel map length {} does not match assignment length {}", + permutation.len(), + assignment.len() + ), + )); + } + + let mut out = vec![0u16; permutation.len()]; + for (new_idx, &old_idx) in permutation.iter().enumerate() { + out[new_idx] = assignment[old_idx]; + } + Ok(out) +} + +fn relabel_ben_file_via_decoder( + reader: R, + writer: W, + variant: BenVariant, + max_samples: Option, + mut transform: F, +) -> io::Result<()> +where + F: FnMut(&[u16]) -> io::Result>, +{ + let decoder = BenDecoder::new(reader)?; + let mut encoder = BenEncoder::new(writer, variant); + let mut sample_number = 0usize; + + for record in decoder { + let (assignment, count) = record?; + if max_samples.is_some_and(|limit| sample_number >= limit) { + break; + } + + let relabeled = transform(&assignment)?; + let out_count = max_samples + .map(|limit| (limit - sample_number).min(count as usize)) + .unwrap_or(count as usize); + + for _ in 0..out_count { + encoder.write_assignment(relabeled.clone())?; + } + + sample_number += out_count; + progress!("Relabeling line: {}\r", sample_number); + } + + tracing::trace!(""); + tracing::trace!("Done!"); + encoder.finish()?; + Ok(()) +} + /// Canonicalize the labels used inside each BEN frame. /// /// Labels are reassigned in first-seen order within each assignment vector, @@ -202,6 +282,7 @@ fn relabel_ben_file_impl( let variant = match &check_buffer { b"STANDARD BEN FILE" => BenVariant::Standard, b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + b"TWODELTA BEN FILE" => BenVariant::TwoDelta, _ => { return Err(Error::new( io::ErrorKind::InvalidData, @@ -210,9 +291,19 @@ fn relabel_ben_file_impl( } }; - writer.write_all(&check_buffer)?; - - relabel_ben_lines_impl(&mut reader, &mut writer, variant, max_samples)?; + match variant { + BenVariant::Standard | BenVariant::MkvChain => { + writer.write_all(&check_buffer)?; + relabel_ben_lines_impl(&mut reader, &mut writer, variant, max_samples)? + } + BenVariant::TwoDelta => { + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder(full_stream.as_slice(), &mut writer, variant, max_samples, |assignment| { + Ok(canonicalize_assignment(assignment)) + })? + } + } Ok(()) } @@ -419,6 +510,7 @@ fn relabel_ben_file_with_map_impl( let variant = match &check_buffer { b"STANDARD BEN FILE" => BenVariant::Standard, b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + b"TWODELTA BEN FILE" => BenVariant::TwoDelta, _ => { return Err(Error::new( io::ErrorKind::InvalidData, @@ -427,15 +519,30 @@ fn relabel_ben_file_with_map_impl( } }; - writer.write_all(&check_buffer)?; - - relabel_ben_lines_with_map_impl( - &mut reader, - &mut writer, - new_to_old_node_map, - variant, - max_samples, - )?; + match variant { + BenVariant::Standard | BenVariant::MkvChain => { + writer.write_all(&check_buffer)?; + relabel_ben_lines_with_map_impl( + &mut reader, + &mut writer, + new_to_old_node_map, + variant, + max_samples, + )? + } + BenVariant::TwoDelta => { + let permutation = dense_permutation(&new_to_old_node_map)?; + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + &mut writer, + variant, + max_samples, + |assignment| permute_assignment(assignment, &permutation), + )? + } + } Ok(()) } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 7a3a9ce..80d8d33 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -161,6 +161,35 @@ fn test_relabel_simple_file_mkv_with_limit() { assert_eq!(output_str, expected); } +#[test] +fn test_relabel_simple_file_twodelta() { + let file = concat!( + "{\"assignment\":[1,1,2,2,3,3],\"sample\":1}\n", + "{\"assignment\":[1,1,2,2,3,3],\"sample\":2}\n", + "{\"assignment\":[1,2,2,1,3,3],\"sample\":3}\n", + "{\"assignment\":[2,2,1,1,3,3],\"sample\":4}\n" + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::TwoDelta) + .unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_file(encoded.as_slice(), io::BufWriter::new(&mut relabeled)).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,1,2,2,3,3],\"sample\":1}\n", + "{\"assignment\":[1,1,2,2,3,3],\"sample\":2}\n", + "{\"assignment\":[1,2,2,1,3,3],\"sample\":3}\n", + "{\"assignment\":[1,1,2,2,3,3],\"sample\":4}\n" + ); + assert_eq!(output_str, expected); +} + #[test] fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; @@ -356,6 +385,46 @@ fn test_relabel_simple_file_with_map_mkv() { assert_eq!(output_str, out_file); } +#[test] +fn test_relabel_simple_file_with_map_twodelta() { + let file = concat!( + "{\"assignment\":[1,1,2,2,3,3],\"sample\":1}\n", + "{\"assignment\":[1,1,2,2,3,3],\"sample\":2}\n", + "{\"assignment\":[1,2,2,1,3,3],\"sample\":3}\n", + "{\"assignment\":[2,2,1,1,3,3],\"sample\":4}\n" + ); + + let new_to_old_map: HashMap = + [(0, 2), (1, 3), (2, 0), (3, 1), (4, 4), (5, 5)] + .iter() + .cloned() + .collect(); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::TwoDelta) + .unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_file_with_map( + encoded.as_slice(), + io::BufWriter::new(&mut relabeled), + new_to_old_map, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[2,2,1,1,3,3],\"sample\":1}\n", + "{\"assignment\":[2,2,1,1,3,3],\"sample\":2}\n", + "{\"assignment\":[2,1,1,2,3,3],\"sample\":3}\n", + "{\"assignment\":[1,1,2,2,3,3],\"sample\":4}\n" + ); + assert_eq!(output_str, expected); +} + #[test] fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { let file = concat!( diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index d1bc099..71ada77 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -988,6 +988,97 @@ fn reben_cli_can_limit_ben_relabeling_to_first_n_items() { ); } +#[test] +fn reben_cli_supports_twodelta_ben_mode() { + let temp = TempDir::new("reben-twodelta"); + let graph_path = temp.path().join("dual_graph.json"); + let ben_path = temp.path().join("samples.twodelta.ben"); + let canonical_path = temp.path().join("canonicalized_twodelta.ben"); + let map_relabel_path = temp.path().join("map_relabel_twodelta.ben"); + + fs::write(&graph_path, sample_graph()).unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new( + r#"{"assignment":[1,1,2],"sample":1} +{"assignment":[1,1,2],"sample":2} +{"assignment":[1,2,1],"sample":3} +{"assignment":[2,2,1],"sample":4} +"# + .as_bytes(), + ), + &mut ben_bytes, + BenVariant::TwoDelta, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let sort_graph = run( + "reben", + &[ + graph_path.to_str().unwrap(), + "--mode", + "json", + "--key", + "GEOID20", + ], + temp.path(), + ); + assert_success(&sort_graph); + + let map_path = temp.path().join("dual_graph_sorted_by_GEOID20_map.json"); + assert!(map_path.exists()); + + let canonicalize = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--output-file", + canonical_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&canonicalize); + + let relabel = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-file", + map_relabel_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&relabel); + + let mut canonical_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&canonical_path).unwrap()), + &mut canonical_jsonl, + ) + .unwrap(); + assert!(String::from_utf8(canonical_jsonl) + .unwrap() + .contains(r#""assignment":[1,1,2]"#)); + + let mut relabeled_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&map_relabel_path).unwrap()), + &mut relabeled_jsonl, + ) + .unwrap(); + assert!(String::from_utf8(relabeled_jsonl) + .unwrap() + .contains(r#""assignment":[1,2,1]"#)); +} + #[test] fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations() { let temp = TempDir::new("reben-more"); From e0054a1f978c0d85b927eaf9df33178d5b8e21d5 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 21:15:44 -0600 Subject: [PATCH 023/221] add twodelta to xben --- ben/src/codec/decode/xz.rs | 46 +++- ben/src/codec/encode/ben.rs | 65 +++-- ben/src/codec/encode/jsonl.rs | 7 - ben/src/codec/encode/mod.rs | 6 +- ben/src/codec/encode/xz.rs | 7 +- ben/src/io/reader.rs | 208 ++++++++++++-- ben/src/io/writer.rs | 446 ++++++++++++++++++------------- ben/tests/test_impls_pipeline.rs | 70 +++++ ben/tests/test_pipeline.rs | 93 +++++++ 9 files changed, 702 insertions(+), 246 deletions(-) diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 8f5279f..0b11fca 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,7 +1,10 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::codec::translate::ben32_to_ben_lines; +use crate::io::reader::XBenDecoder; +use crate::io::writer::BenEncoder; use crate::{progress, BenVariant}; -use std::io::{self, BufRead, Error, Read, Write}; +use serde_json::json; +use std::io::{self, BufRead, BufReader, Error, Read, Write}; use xz2::read::XzDecoder; /// Decode an XBEN stream into an equivalent BEN stream. @@ -35,6 +38,18 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: writer.write_all(b"MKVCHAIN BEN FILE")?; BenVariant::MkvChain } + b"TWODELTA BEN FILE" => { + let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); + let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta); + for record in &mut xben { + let (assignment, count) = record?; + ben.write_assignment(assignment.clone())?; + for _ in 1..count { + ben.write_assignment(assignment.clone())?; + } + } + return Ok(()); + } _ => { return Err(Error::new( io::ErrorKind::InvalidData, @@ -77,9 +92,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: } } } - BenVariant::TwoDelta => { - panic!("not implemented"); - } + BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), } if last_valid_assignment == 0 { @@ -141,6 +154,27 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let variant = match &first_buffer { b"STANDARD BEN FILE" => BenVariant::Standard, b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + b"TWODELTA BEN FILE" => { + let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); + let mut sample_number = 1usize; + for record in &mut xben { + let (assignment, count) = record?; + for _ in 0..count { + progress!("Decoding sample: {}\r", sample_number); + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + sample_number += 1; + } + } + tracing::trace!(""); + tracing::trace!("Done!"); + return Ok(()); + } _ => { return Err(Error::new( io::ErrorKind::InvalidData, @@ -184,9 +218,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i } } } - BenVariant::TwoDelta => { - panic!("not implemented"); - } + BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), } if last_valid_assignment == 0 { diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 26fe8dd..6d4b586 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -3,6 +3,8 @@ use serde_json::Value; use std::collections::HashMap; use std::io; +pub(crate) type TwoDeltaRuns = ((u16, u16), Vec); + /// Encode a JSON assignment record into the ben32 frame representation used by /// XBEN streams. /// @@ -14,6 +16,7 @@ use std::io; /// /// Returns the encoded ben32 frame bytes terminated by the four-byte `0` /// sentinel. +#[cfg_attr(not(test), allow(dead_code))] pub(crate) fn encode_ben32_line(data: Value) -> io::Result { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( @@ -21,28 +24,39 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result { "'assignment' field either missing or is not an array of integers", ) })?; + encode_ben32_assignments( + assign_vec + .iter() + .map(|assignment| { + let assign_u64 = assignment.as_u64().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + assignment + ), + ) + })?; + u16::try_from(assign_u64).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", assign_u64), + ) + }) + }) + .collect::>>()?, + ) +} + +pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> io::Result { + let assign_vec = assign_vec.as_ref(); let mut prev_assign: u16 = 0; let mut count: u16 = 0; let mut first = true; let mut ret = Vec::new(); - for assignment in assign_vec { - let assign_u64 = assignment.as_u64().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!( - "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", - assignment - ), - ) - })?; - let assign = u16::try_from(assign_u64).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", assign_u64), - ) - })?; + for &assign in assign_vec { if first { prev_assign = assign; count = 1; @@ -115,15 +129,17 @@ pub fn encode_twodelta_vec( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, ) -> io::Result { - encode_twodelta_vec_with_hint(previous_assignment, new_assignment, None, None) + let (ordered_pair, run_lengths) = + build_twodelta_runs_with_hint(previous_assignment, new_assignment, None, None)?; + Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) } -pub(crate) fn encode_twodelta_vec_with_hint( +pub(crate) fn build_twodelta_runs_with_hint( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, delta_pair: Option<(u16, u16)>, masks: Option<&HashMap>>, -) -> io::Result { +) -> io::Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); @@ -277,5 +293,16 @@ pub(crate) fn encode_twodelta_vec_with_hint( run_lengths.push(current_run); } + Ok((ordered_pair, run_lengths)) +} + +pub(crate) fn encode_twodelta_vec_with_hint( + previous_assignment: impl AsRef<[u16]>, + new_assignment: impl AsRef<[u16]>, + delta_pair: Option<(u16, u16)>, + masks: Option<&HashMap>>, +) -> io::Result { + let (ordered_pair, run_lengths) = + build_twodelta_runs_with_hint(previous_assignment, new_assignment, delta_pair, masks)?; Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) } diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 588bc30..cb4b743 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -30,13 +30,6 @@ pub fn encode_jsonl_to_xben( n_threads: Option, compression_level: Option, ) -> Result<()> { - if variant == BenVariant::TwoDelta { - return Err(io::Error::new( - io::ErrorKind::Unsupported, - "TwoDelta is currently implemented only for uncompressed .ben streams", - )); - } - let mut n_cpus: u32 = n_threads.unwrap_or(1); n_cpus = n_cpus .min( diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 000e936..124bd11 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -5,7 +5,11 @@ mod jsonl; mod types; mod xz; -pub(crate) use ben::{encode_ben32_line, encode_twodelta_vec_with_hint}; +pub(crate) use ben::{ + build_twodelta_runs_with_hint, encode_ben32_assignments, encode_twodelta_vec_with_hint, +}; +#[cfg(test)] +pub(crate) use ben::encode_ben32_line; pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 0d60fd0..a7c8c8b 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -104,12 +104,7 @@ pub fn encode_ben_to_xben( let mut ben_encoder = match &check_buffer { b"STANDARD BEN FILE" => XBenEncoder::new(encoder, BenVariant::Standard), b"MKVCHAIN BEN FILE" => XBenEncoder::new(encoder, BenVariant::MkvChain), - b"TWODELTA BEN FILE" => { - return Err(io::Error::new( - io::ErrorKind::Unsupported, - "TwoDelta BEN streams cannot yet be translated to XBEN", - )); - } + b"TWODELTA BEN FILE" => XBenEncoder::new(encoder, BenVariant::TwoDelta), _ => { return Err(io::Error::new( io::ErrorKind::InvalidData, diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 5f08c5b..149089c 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -1,5 +1,5 @@ use crate::codec::decode::{decode_ben32_line, decode_ben_line}; -use crate::codec::encode::{encode_ben_vec_from_assign, TwoDeltaFrame}; +use crate::codec::encode::{encode_ben32_assignments, encode_ben_vec_from_assign, TwoDeltaFrame}; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -10,6 +10,9 @@ use std::iter::Peekable; use std::path::{Path, PathBuf}; use xz2::read::XzDecoder; +const XBEN_TWODELTA_FULL_TAG: u8 = 0; +const XBEN_TWODELTA_DELTA_TAG: u8 = 1; + /// A decoded assignment together with the number of times it repeats. pub type MkvRecord = (Vec, u16); /// A raw ben32 frame together with the number of times it repeats. @@ -145,6 +148,16 @@ enum StoredBenFrame { TwoDelta { frame: TwoDeltaFrame, count: u16 }, } +enum XBenTwoDeltaFrame { + Full { + runs: Vec<(u16, u16)>, + }, + Delta { + pair: (u16, u16), + run_lengths: Vec, + }, +} + impl StoredBenFrame { fn count(&self) -> u16 { match self { @@ -438,13 +451,14 @@ fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { } /// Decode a raw TwoDelta frame into a full assignment vector. -fn decode_twodelta_frame_to_assignment( +fn apply_twodelta_runs_to_assignment( previous_assignment: &[u16], - frame: &TwoDeltaFrame, + pair: (u16, u16), + run_lengths: &[u16], ) -> io::Result> { let mut pair_positions = Vec::new(); pair_positions.reserve(previous_assignment.len()); - let (first, second) = frame.pair(); + let (first, second) = pair; for (idx, &assignment) in previous_assignment.iter().enumerate() { if assignment == first || assignment == second { @@ -452,7 +466,6 @@ fn decode_twodelta_frame_to_assignment( } } - let run_lengths = decode_twodelta_run_lengths(frame)?; let expected_total: usize = run_lengths.iter().map(|&len| len as usize).sum(); if expected_total != pair_positions.len() { return Err(io::Error::new( @@ -465,7 +478,7 @@ fn decode_twodelta_frame_to_assignment( let mut write_idx = 0usize; let mut current_value = first; - for run_len in run_lengths { + for &run_len in run_lengths { for _ in 0..run_len { assignment[pair_positions[write_idx]] = current_value; write_idx += 1; @@ -476,6 +489,15 @@ fn decode_twodelta_frame_to_assignment( Ok(assignment) } +/// Decode a raw TwoDelta frame into a full assignment vector. +fn decode_twodelta_frame_to_assignment( + previous_assignment: &[u16], + frame: &TwoDeltaFrame, +) -> io::Result> { + let run_lengths = decode_twodelta_run_lengths(frame)?; + apply_twodelta_runs_to_assignment(previous_assignment, frame.pair(), &run_lengths) +} + fn decode_stored_frame_to_assignment( previous_assignment: Option<&[u16]>, frame: &StoredBenFrame, @@ -580,9 +602,23 @@ pub struct XBenDecoder { pub variant: BenVariant, overflow: Vec, buf: Box<[u8]>, + previous_assignment: Option>, } impl XBenDecoder { + pub(crate) fn from_decompressed_stream( + xz: BufReader>, + variant: BenVariant, + ) -> Self { + Self { + xz, + variant, + overflow: Vec::with_capacity(1 << 20), + buf: vec![0u8; 1 << 20].into_boxed_slice(), + previous_assignment: None, + } + } + /// Create a decoder for an XBEN stream. /// /// # Arguments @@ -602,20 +638,16 @@ impl XBenDecoder { let variant = match &first { b"STANDARD BEN FILE" => BenVariant::Standard, b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, + b"TWODELTA BEN FILE" => BenVariant::TwoDelta, _ => { return Err(io::Error::new( io::ErrorKind::InvalidData, - "Invalid .xben header (expecting STANDARD/MKVCHAIN BEN FILE)", + "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", )); } }; - Ok(Self { - xz, - variant, - overflow: Vec::with_capacity(1 << 20), - buf: vec![0u8; 1 << 20].into_boxed_slice(), - }) + Ok(Self::from_decompressed_stream(xz, variant)) } /// Try to extract one complete ben32 frame from the buffered overflow. @@ -661,8 +693,81 @@ impl XBenDecoder { None } BenVariant::TwoDelta => { - panic!("not implemented"); + None + } + } + } + + fn pop_twodelta_frame_from_overflow( + &self, + overflow: &[u8], + ) -> Option> { + let tag = *overflow.first()?; + match tag { + XBEN_TWODELTA_FULL_TAG => { + if overflow.len() < 7 { + return None; + } + let run_count = u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) + as usize; + let payload_len = run_count.checked_mul(4)?; + let total_len = 1usize + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut runs = Vec::with_capacity(run_count); + let mut cursor = 5usize; + for _ in 0..run_count { + let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); + runs.push((value, len)); + cursor += 4; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) + } + XBEN_TWODELTA_DELTA_TAG => { + if overflow.len() < 11 { + return None; + } + let pair = ( + u16::from_be_bytes([overflow[1], overflow[2]]), + u16::from_be_bytes([overflow[3], overflow[4]]), + ); + let run_count = + u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) as usize; + let payload_len = run_count.checked_mul(2)?; + let total_len = 1usize + .checked_add(2)? + .checked_add(2)? + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut run_lengths = Vec::with_capacity(run_count); + let mut cursor = 9usize; + for _ in 0..run_count { + run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); + cursor += 2; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok(( + XBenTwoDeltaFrame::Delta { pair, run_lengths }, + total_len, + count, + ))) } + _ => Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid TwoDelta XBEN frame tag", + ))), } } @@ -717,15 +822,64 @@ impl Iterator for XBenDecoder { /// Decode and return the next assignment from the XBEN stream. fn next(&mut self) -> Option { loop { - if let Some((frame_bytes, consumed, count)) = - self.pop_frame_from_overflow(&self.overflow) - { - let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { - Ok(assignment) => Ok((assignment, count)), - Err(e) => Err(e), - }; - self.overflow.drain(..consumed); - return Some(res); + match self.variant { + BenVariant::Standard | BenVariant::MkvChain => { + if let Some((frame_bytes, consumed, count)) = + self.pop_frame_from_overflow(&self.overflow) + { + let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + Ok((assignment, count)) + } + Err(e) => Err(e), + }; + self.overflow.drain(..consumed); + return Some(res); + } + } + BenVariant::TwoDelta => { + if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { + let res = match parsed { + Ok((frame, consumed, count)) => { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.as_deref() { + Some(previous_assignment) => { + apply_twodelta_runs_to_assignment( + previous_assignment, + pair, + &run_lengths, + ) + } + None => Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN frame encountered before an initial BEN frame", + )), + } + } + }; + match assignment { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + self.overflow.drain(..consumed); + Ok((assignment, count)) + } + Err(err) => { + self.overflow.drain(..consumed); + Err(err) + } + } + } + Err(err) => { + self.overflow.clear(); + Err(err) + } + }; + return Some(res); + } + } } let read = match self.xz.read(&mut self.buf) { @@ -774,6 +928,14 @@ impl Iterator for XBenFrameDecoder { /// Return the next raw ben32 frame from the input stream. fn next(&mut self) -> Option { + if self.inner.variant == BenVariant::TwoDelta { + return self.inner.next().map(|result| { + result.and_then(|(assignment, count)| { + Ok((encode_ben32_assignments(&assignment)?.into_u8_vec()?, count)) + }) + }); + } + loop { if let Some((frame, consumed, count)) = self.inner.pop_frame_from_overflow(&self.inner.overflow) diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 87c23a8..9a9bcf4 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -1,14 +1,19 @@ use crate::codec::encode::{ - encode_ben32_line, encode_ben_vec_from_assign, encode_twodelta_vec_with_hint, BenFrame, IdVec, - TwoDeltaFrame, + build_twodelta_runs_with_hint, encode_ben32_assignments, encode_ben_vec_from_assign, + encode_twodelta_vec_with_hint, BenFrame, TwoDeltaFrame, }; use crate::codec::translate::ben_to_ben32_lines; +use crate::io::reader::BenDecoder; +use crate::util::rle::assign_to_rle; use crate::BenVariant; use serde_json::Value; use std::collections::HashMap; use std::io::{self, BufRead, Result, Write}; use xz2::write::XzEncoder; +const XBEN_TWODELTA_FULL_TAG: u8 = 0; +const XBEN_TWODELTA_DELTA_TAG: u8 = 1; + enum BufferedBenFrame { Ben(BenFrame), TwoDelta(TwoDeltaFrame), @@ -29,6 +34,130 @@ struct AssignmentHints { delta_pair: Option<(u16, u16)>, } +fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return false; + } + + for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { + if previous != current { + return false; + } + } + + true +} + +fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> AssignmentHints { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return AssignmentHints::default(); + } + + let Some(first_mismatch) = previous_sample + .iter() + .zip(assign_vec.iter()) + .position(|(&previous, ¤t)| previous != current) + else { + return AssignmentHints { + is_repeated: true, + delta_pair: None, + }; + }; + + let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); + + for (&previous, ¤t) in previous_sample + .iter() + .zip(assign_vec.iter()) + .skip(first_mismatch + 1) + { + if previous == current { + continue; + } + + if previous != pair.0 && previous != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + + if current != pair.0 && current != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + } + + AssignmentHints { + is_repeated: false, + delta_pair: Some(pair), + } +} + +fn parse_json_assignment(data: Value) -> Result> { + let assign_vec = data["assignment"].as_array().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "'assignment' field either missing or is not an array of integers", + ) + })?; + + assign_vec + .iter() + .map(|x| { + let u = x.as_u64().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + x + ), + ) + })?; + + u16::try_from(u).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", u), + ) + }) + }) + .collect() +} + +fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { + let runs = assign_to_rle(assignments); + let mut bytes = Vec::with_capacity(1 + 4 + runs.len() * 4); + bytes.push(XBEN_TWODELTA_FULL_TAG); + bytes.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for (value, len) in runs { + bytes.extend_from_slice(&value.to_be_bytes()); + bytes.extend_from_slice(&len.to_be_bytes()); + } + bytes +} + +fn encode_xben_twodelta_delta_frame( + previous_assignment: &[u16], + new_assignment: &[u16], + delta_pair: Option<(u16, u16)>, + masks: Option<&HashMap>>, +) -> io::Result> { + let (ordered_pair, run_lengths) = + build_twodelta_runs_with_hint(previous_assignment, new_assignment, delta_pair, masks)?; + let mut bytes = Vec::with_capacity(1 + 2 + 2 + 4 + run_lengths.len() * 2); + bytes.push(XBEN_TWODELTA_DELTA_TAG); + bytes.extend_from_slice(&ordered_pair.0.to_be_bytes()); + bytes.extend_from_slice(&ordered_pair.1.to_be_bytes()); + bytes.extend_from_slice(&(run_lengths.len() as u32).to_be_bytes()); + for run_length in run_lengths { + bytes.extend_from_slice(&run_length.to_be_bytes()); + } + Ok(bytes) +} + /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { writer: W, @@ -88,75 +217,6 @@ impl BenEncoder { self.sample_count = sample_count; } - fn analyze_assignment_transition( - previous_sample: &[u16], - assign_vec: &[u16], - ) -> AssignmentHints { - Self::analyze_twodelta_transition(previous_sample, assign_vec) - } - - fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return false; - } - - for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { - if previous != current { - return false; - } - } - - true - } - - fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> AssignmentHints { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return AssignmentHints::default(); - } - - let Some(first_mismatch) = previous_sample - .iter() - .zip(assign_vec.iter()) - .position(|(&previous, ¤t)| previous != current) - else { - return AssignmentHints { - is_repeated: true, - delta_pair: None, - }; - }; - - let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); - - for (&previous, ¤t) in previous_sample - .iter() - .zip(assign_vec.iter()) - .skip(first_mismatch + 1) - { - if previous == current { - continue; - } - - if previous != pair.0 && previous != pair.1 { - return AssignmentHints { - is_repeated: false, - delta_pair: None, - }; - } - - if current != pair.0 && current != pair.1 { - return AssignmentHints { - is_repeated: false, - delta_pair: None, - }; - } - } - - AssignmentHints { - is_repeated: false, - delta_pair: Some(pair), - } - } - fn write_assignment_with_hints( &mut self, assign_vec: Vec, @@ -164,7 +224,7 @@ impl BenEncoder { ) -> Result<()> { match self.variant { BenVariant::Standard => { - let repeated = Self::is_repeated_assignment(&self.previous_sample, &assign_vec); + let repeated = is_repeated_assignment(&self.previous_sample, &assign_vec); if hints.is_repeated { if let Some(encoded) = self.previous_encoded_sample.as_ref() { self.writer.write_all(encoded.as_slice())?; @@ -187,7 +247,7 @@ impl BenEncoder { Ok(()) } BenVariant::MkvChain => { - if Self::is_repeated_assignment(&self.previous_sample, &assign_vec) { + if is_repeated_assignment(&self.previous_sample, &assign_vec) { self.sample_count += 1; return Ok(()); } @@ -254,7 +314,7 @@ impl BenEncoder { /// Returns `Ok(())` after the assignment has been queued or written. pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { let hints = if self.variant == BenVariant::TwoDelta { - Self::analyze_assignment_transition(&self.previous_sample, &assign_vec) + analyze_twodelta_transition(&self.previous_sample, &assign_vec) } else { AssignmentHints::default() }; @@ -274,82 +334,7 @@ impl BenEncoder { /// /// Returns `Ok(())` after the record has been validated and encoded. pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let assign_vec = data["assignment"].as_array().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "'assignment' field either missing or is not an array of integers", - ) - })?; - let previous_len = self.previous_sample.len(); - let can_compare = previous_len == assign_vec.len(); - let mut hints = AssignmentHints::default(); - let mut mismatch_pair: Option<(u16, u16)> = None; - let mut twodelta_valid = true; - let track_repeated = matches!(self.variant, BenVariant::Standard | BenVariant::MkvChain) - && can_compare - && !self.previous_sample.is_empty(); - let track_twodelta = self.variant == BenVariant::TwoDelta && can_compare; - let mut twodelta_is_repeated = track_twodelta && !self.previous_sample.is_empty(); - let mut is_repeated = track_repeated; - - let converted_vec = assign_vec - .iter() - .enumerate() - .map(|(idx, x)| { - let u = x.as_u64().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!( - "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", - x - ), - ) - })?; - - u16::try_from(u) - .map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", u), - ) - }) - .inspect(|value| { - if track_repeated && is_repeated && self.previous_sample[idx] != *value { - is_repeated = false; - } - - if track_twodelta { - let previous = self.previous_sample[idx]; - if previous != *value { - twodelta_is_repeated = false; - if let Some(pair) = mismatch_pair { - if previous != pair.0 && previous != pair.1 - || *value != pair.0 && *value != pair.1 - { - twodelta_valid = false; - } - } else { - mismatch_pair = Some((previous, *value)); - } - } - } - }) - }) - .collect::>>()?; - - if track_repeated { - hints.is_repeated = is_repeated; - } else if track_twodelta { - hints.is_repeated = twodelta_is_repeated; - } else if self.variant == BenVariant::Standard || self.variant == BenVariant::MkvChain { - hints.is_repeated = false; - } - - if track_twodelta && !hints.is_repeated && twodelta_valid { - hints.delta_pair = mismatch_pair; - } - - self.write_assignment_with_hints(converted_vec, hints) + self.write_assignment(parse_json_assignment(data)?) } /// Flush any buffered repetition state to the underlying writer. @@ -381,12 +366,40 @@ impl Drop for BenEncoder { /// A struct to make the writing of XBEN files easier and more ergonomic. pub struct XBenEncoder { encoder: XzEncoder, - previous_sample: IdVec, + previous_assignment: Vec, + previous_masks: HashMap>, + previous_frame: Vec, count: u16, variant: BenVariant, } impl XBenEncoder { + fn rebuild_previous_masks(&mut self) { + self.previous_masks.clear(); + for (idx, &assignment) in self.previous_assignment.iter().enumerate() { + self.previous_masks.entry(assignment).or_default().push(idx); + } + } + + fn set_previous_assignment(&mut self, assignment: Vec, frame: Vec, count: u16) { + self.previous_assignment = assignment; + self.rebuild_previous_masks(); + self.previous_frame = frame; + self.count = count; + } + + fn flush_pending_frame(&mut self) -> Result<()> { + if self.count == 0 { + return Ok(()); + } + + self.encoder.write_all(&self.previous_frame)?; + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { + self.encoder.write_all(&self.count.to_be_bytes())?; + } + Ok(()) + } + /// Create a new XBEN writer around an already-configured XZ encoder. /// /// # Arguments @@ -404,7 +417,9 @@ impl XBenEncoder { encoder.write_all(b"STANDARD BEN FILE").unwrap(); XBenEncoder { encoder, - previous_sample: IdVec::U8(Vec::new()), + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), count: 0, variant: BenVariant::Standard, } @@ -413,50 +428,93 @@ impl XBenEncoder { encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); XBenEncoder { encoder, - previous_sample: IdVec::U8(Vec::new()), + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), count: 0, variant: BenVariant::MkvChain, } } BenVariant::TwoDelta => { - panic!("not implemented"); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant: BenVariant::TwoDelta, + } } } } - /// Encode and write a JSON assignment record into the compressed XBEN stream. + /// Encode and write a full assignment vector into the compressed XBEN stream. /// /// # Arguments /// - /// * `data` - A JSON object containing an `assignment` array. + /// * `assign_vec` - The full assignment vector to encode. /// /// # Returns /// - /// Returns `Ok(())` after the record has been validated and encoded. - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let encoded = encode_ben32_line(data)?; + /// Returns `Ok(())` after the assignment has been queued or written. + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { match self.variant { BenVariant::Standard => { - self.encoder.write_all(encoded.as_u8_slice()?)?; + let encoded = encode_ben32_assignments(&assign_vec)?.into_u8_vec()?; + self.encoder.write_all(&encoded)?; + self.previous_assignment = assign_vec; + self.previous_frame = encoded; + Ok(()) } BenVariant::MkvChain => { - if encoded == self.previous_sample { + if is_repeated_assignment(&self.previous_assignment, &assign_vec) { self.count += 1; - } else { - if self.count > 0 { - self.encoder - .write_all(self.previous_sample.as_u8_slice()?)?; - self.encoder.write_all(&self.count.to_be_bytes())?; - } - self.previous_sample = encoded; - self.count = 1; + return Ok(()); } + + self.flush_pending_frame()?; + let encoded = encode_ben32_assignments(&assign_vec)?.into_u8_vec()?; + self.set_previous_assignment(assign_vec, encoded, 1); + Ok(()) } BenVariant::TwoDelta => { - panic!("not implemented"); + if self.previous_assignment.is_empty() { + let encoded = encode_xben_twodelta_full_frame(&assign_vec); + self.set_previous_assignment(assign_vec, encoded, 1); + return Ok(()); + } + + let hints = analyze_twodelta_transition(&self.previous_assignment, &assign_vec); + if hints.is_repeated { + self.count += 1; + return Ok(()); + } + + let encoded = encode_xben_twodelta_delta_frame( + &self.previous_assignment, + &assign_vec, + hints.delta_pair, + Some(&self.previous_masks), + )?; + self.flush_pending_frame()?; + self.set_previous_assignment(assign_vec, encoded, 1); + Ok(()) } } - Ok(()) + } + + /// Encode and write a JSON assignment record into the compressed XBEN stream. + /// + /// # Arguments + /// + /// * `data` - A JSON object containing an `assignment` array. + /// + /// # Returns + /// + /// Returns `Ok(())` after the record has been validated and encoded. + pub fn write_json_value(&mut self, data: Value) -> Result<()> { + self.write_assignment(parse_json_assignment(data)?) } /// Read BEN frames from `reader` and write them into this XBEN stream. @@ -474,12 +532,42 @@ impl XBenEncoder { pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { let peek = reader.fill_buf()?; let has_banner = peek.len() >= 17 - && (peek.starts_with(b"STANDARD BEN FILE") || peek.starts_with(b"MKVCHAIN BEN FILE")); + && (peek.starts_with(b"STANDARD BEN FILE") + || peek.starts_with(b"MKVCHAIN BEN FILE") + || peek.starts_with(b"TWODELTA BEN FILE")); if has_banner { + if self.variant == BenVariant::TwoDelta { + let mut banner_prefixed = Vec::new(); + banner_prefixed.extend_from_slice(&peek[..17]); + reader.consume(17); + reader.read_to_end(&mut banner_prefixed)?; + + let decoder = BenDecoder::new(io::Cursor::new(banner_prefixed))?; + for record in decoder { + let (assignment, count) = record?; + self.write_assignment(assignment.clone())?; + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && count > 1 + { + self.count += count - 1; + } else if self.variant == BenVariant::Standard { + for _ in 1..count { + self.write_assignment(assignment.clone())?; + } + } + } + return Ok(()); + } reader.consume(17); } + if self.variant == BenVariant::TwoDelta { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN translation requires a BEN stream with its banner", + )); + } + ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) } } @@ -487,17 +575,9 @@ impl XBenEncoder { impl Drop for XBenEncoder { /// Flush any buffered XBEN repetition state during drop. fn drop(&mut self) { - if self.variant == BenVariant::MkvChain && self.count > 0 { - self.encoder - .write_all( - self.previous_sample - .as_u8_slice() - .expect("Error writing last line to file"), - ) - .expect("Error writing last line to file"); - self.encoder - .write_all(&self.count.to_be_bytes()) - .expect("Error writing last line count to file"); + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { + self.flush_pending_frame() + .expect("Error writing last XBEN frame to file"); } } } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 68355c9..a6af7c6 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -278,6 +278,30 @@ proptest! { prop_assert_eq!(out, jsonl); } + // JSONL -> XBEN(TwoDelta) -> BEN -> JSONL + #[test] + fn fuzz_roundtrip_xben_twodelta(seq in strat_twodelta_seq(), params in strat_threads_levels()) { + let (threads, level) = params; + let jsonl = jsonl_from_assignments(&seq); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::TwoDelta, + Some(threads), + Some(level), + ).unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + prop_assert_eq!(out, jsonl); + } + // Direct XBEN -> JSONL via jsonl_decode_xben matches the long path. #[test] fn fuzz_decode_xben_direct_equals_via_ben(seq in strat_assignment_seq(), params in strat_threads_levels()) { @@ -333,6 +357,31 @@ proptest! { prop_assert_eq!(iter_jsonl, direct); } + // Iterator surface: XBenDecoder over TwoDelta XBEN matches direct JSONL. + #[test] + fn fuzz_xbendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq(), params in strat_threads_levels()) { + let (threads, level) = params; + let jsonl = jsonl_from_assignments(&seq); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::TwoDelta, + Some(threads), + Some(level), + ).unwrap(); + + let mut dec = XBenDecoder::new(xben.as_slice()).unwrap(); + let recs = collect_records(&mut dec).unwrap(); + let iter_jsonl = jsonl_from_records(&recs, 0); + + let mut direct = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut direct).unwrap(); + + prop_assert_eq!(iter_jsonl, direct); + } + // Iterator surface: BenDecoder over BEN produced by BenEncoder. #[test] fn fuzz_bendecoder_iterator_matches_jsonl(seq in strat_assignment_seq()) { @@ -1101,6 +1150,27 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!(XBenDecoder::new(xben.as_slice()).unwrap().count_samples().unwrap(), 3); + + let twodelta_jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,2,2,1],"sample":2} +{"assignment":[1,2,2,1],"sample":3} +"#; + let mut twodelta_xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(twodelta_jsonl.as_bytes()), + &mut twodelta_xben, + BenVariant::TwoDelta, + Some(1), + Some(0), + ) + .unwrap(); + assert_eq!( + XBenDecoder::new(twodelta_xben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 3 + ); } #[test] diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 410ca09..9dd73b9 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -342,3 +342,96 @@ fn test_xmkvben_pipeline() { assert_eq!(original_data, xoutput_writer); } + +#[test] +fn test_xtwodeltaben_pipeline() { + let seed = 129530786u64; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + + let n_samples = 50; + let shape = 2.0; + let scale = 50.0; + let gamma = Gamma::new(shape, scale).unwrap(); + let mu = Uniform::new(1, 11).expect("Could not make uniform sampler"); + + let mut current: Vec = (0..400).map(|_| mu.sample(&mut rng) as u16).collect(); + let mut buffer = Vec::new(); + let mut sample_writer = Cursor::new(&mut buffer); + + for i in 0..n_samples { + eprint!("Generating sample: {}\r", i + 1); + if i > 0 && i % 5 != 0 { + let mut distinct = current.clone(); + distinct.sort_unstable(); + distinct.dedup(); + + if distinct.len() >= 2 { + let a = distinct[(i * 7) % distinct.len()]; + let mut b = distinct[(i * 11) % distinct.len()]; + if a == b { + b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + } + + let positions: Vec = current + .iter() + .enumerate() + .filter_map(|(idx, &value)| ((value == a) || (value == b)).then_some(idx)) + .collect(); + + let mut next = current.clone(); + let mut remaining = positions.len(); + let mut cursor = 0usize; + let mut seed_word = i as u64 ^ 0x9E37_79B9_7F4A_7C15; + let mut value = if i % 2 == 0 { a } else { b }; + + while remaining > 0 { + let run_len = 1 + (seed_word as usize % remaining); + for _ in 0..run_len { + next[positions[cursor]] = value; + cursor += 1; + } + remaining -= run_len; + value = if value == a { b } else { a }; + seed_word = seed_word.rotate_left(9) ^ gamma.sample(&mut rng) as u64; + } + + current = next; + } + } + + writeln!( + &mut sample_writer, + "{}", + json!({ + "assignment": current.clone(), + "sample": i + 1, + }) + ) + .unwrap(); + } + eprintln!(); + + sample_writer.set_position(0); + let mut original_data = Vec::new(); + sample_writer.read_to_end(&mut original_data).unwrap(); + + sample_writer.set_position(0); + + let mut input_writer = Vec::new(); + let mut output_writer = Vec::new(); + + encode_jsonl_to_xben( + sample_writer, + &mut input_writer, + BenVariant::TwoDelta, + Some(1), + Some(1), + ) + .unwrap(); + decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); + + let mut xoutput_writer = Vec::new(); + decode_ben_to_jsonl(&output_writer[..], &mut xoutput_writer).unwrap(); + + assert_eq!(original_data, xoutput_writer); +} From b023278aa4a7b04894b0c81afd34e942d2c7e57c Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 21:19:46 -0600 Subject: [PATCH 024/221] move banners to their own section --- ben/src/codec/decode/xz.rs | 29 +++++++++++++++-------------- ben/src/codec/encode/xz.rs | 18 +++++------------- ben/src/format/banners.rs | 38 ++++++++++++++++++++++++++++++++++++++ ben/src/format/mod.rs | 3 +++ ben/src/io/reader.rs | 32 ++++++++++++++------------------ ben/src/io/writer.rs | 22 +++++++--------------- ben/src/lib.rs | 2 ++ ben/src/ops/relabel/mod.rs | 31 +++++++------------------------ 8 files changed, 91 insertions(+), 84 deletions(-) create mode 100644 ben/src/format/banners.rs create mode 100644 ben/src/format/mod.rs diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 0b11fca..f722231 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,5 +1,6 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::codec::translate::ben32_to_ben_lines; +use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; use crate::io::reader::XBenDecoder; use crate::io::writer::BenEncoder; use crate::{progress, BenVariant}; @@ -23,22 +24,22 @@ use xz2::read::XzDecoder; pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io::Result<()> { let mut decoder = XzDecoder::new(reader); - let mut first_buffer = [0u8; 17]; + let mut first_buffer = [0u8; BANNER_LEN]; if let Err(e) = decoder.read_exact(&mut first_buffer) { return Err(e); } - let variant = match &first_buffer { - b"STANDARD BEN FILE" => { - writer.write_all(b"STANDARD BEN FILE")?; + let variant = match variant_from_banner(&first_buffer) { + Some(BenVariant::Standard) => { + writer.write_all(banner_for_variant(BenVariant::Standard))?; BenVariant::Standard } - b"MKVCHAIN BEN FILE" => { - writer.write_all(b"MKVCHAIN BEN FILE")?; + Some(BenVariant::MkvChain) => { + writer.write_all(banner_for_variant(BenVariant::MkvChain))?; BenVariant::MkvChain } - b"TWODELTA BEN FILE" => { + Some(BenVariant::TwoDelta) => { let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta); for record in &mut xben { @@ -50,7 +51,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: } return Ok(()); } - _ => { + None => { return Err(Error::new( io::ErrorKind::InvalidData, "Invalid file format", @@ -145,16 +146,16 @@ pub fn xz_decompress(reader: R, mut writer: W) -> io::Resu pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { let mut decoder = XzDecoder::new(reader); - let mut first_buffer = [0u8; 17]; + let mut first_buffer = [0u8; BANNER_LEN]; if let Err(e) = decoder.read_exact(&mut first_buffer) { return Err(e); } - let variant = match &first_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - b"TWODELTA BEN FILE" => { + let variant = match variant_from_banner(&first_buffer) { + Some(BenVariant::Standard) => BenVariant::Standard, + Some(BenVariant::MkvChain) => BenVariant::MkvChain, + Some(BenVariant::TwoDelta) => { let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); let mut sample_number = 1usize; for record in &mut xben { @@ -175,7 +176,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i tracing::trace!("Done!"); return Ok(()); } - _ => { + None => { return Err(Error::new( io::ErrorKind::InvalidData, "Invalid file format", diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index a7c8c8b..52a1a72 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -1,5 +1,5 @@ +use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::io::writer::XBenEncoder; -use crate::BenVariant; use std::io::{self, BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; @@ -79,7 +79,7 @@ pub fn encode_ben_to_xben( n_threads: Option, compression_level: Option, ) -> Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let mut n_cpus: u32 = n_threads.unwrap_or(1); @@ -101,17 +101,9 @@ pub fn encode_ben_to_xben( .expect("init MT encoder"); let encoder = XzEncoder::new_stream(writer, mt); - let mut ben_encoder = match &check_buffer { - b"STANDARD BEN FILE" => XBenEncoder::new(encoder, BenVariant::Standard), - b"MKVCHAIN BEN FILE" => XBenEncoder::new(encoder, BenVariant::MkvChain), - b"TWODELTA BEN FILE" => XBenEncoder::new(encoder, BenVariant::TwoDelta), - _ => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; + let variant = variant_from_banner(&check_buffer) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; + let mut ben_encoder = XBenEncoder::new(encoder, variant); ben_encoder.write_ben_file(reader)?; diff --git a/ben/src/format/banners.rs b/ben/src/format/banners.rs new file mode 100644 index 0000000..5ecfb0a --- /dev/null +++ b/ben/src/format/banners.rs @@ -0,0 +1,38 @@ +//! Banner constants and helpers for BEN and XBEN streams. + +use crate::BenVariant; + +/// Fixed byte length of every BEN/XBEN banner. +pub const BANNER_LEN: usize = 17; +/// Banner for standard BEN/XBEN streams. +pub const STANDARD_BEN_BANNER: &[u8; BANNER_LEN] = b"STANDARD BEN FILE"; +/// Banner for MKVChain BEN/XBEN streams. +pub const MKVCHAIN_BEN_BANNER: &[u8; BANNER_LEN] = b"MKVCHAIN BEN FILE"; +/// Banner for TwoDelta BEN/XBEN streams. +pub const TWODELTA_BEN_BANNER: &[u8; BANNER_LEN] = b"TWODELTA BEN FILE"; + +/// Return the banner used by a BEN variant. +pub fn banner_for_variant(variant: BenVariant) -> &'static [u8; BANNER_LEN] { + match variant { + BenVariant::Standard => STANDARD_BEN_BANNER, + BenVariant::MkvChain => MKVCHAIN_BEN_BANNER, + BenVariant::TwoDelta => TWODELTA_BEN_BANNER, + } +} + +/// Parse a BEN/XBEN banner into its variant. +pub fn variant_from_banner(banner: &[u8; BANNER_LEN]) -> Option { + match banner { + STANDARD_BEN_BANNER => Some(BenVariant::Standard), + MKVCHAIN_BEN_BANNER => Some(BenVariant::MkvChain), + TWODELTA_BEN_BANNER => Some(BenVariant::TwoDelta), + _ => None, + } +} + +/// Return whether the given bytes begin with a known BEN/XBEN banner. +pub fn has_known_banner_prefix(bytes: &[u8]) -> bool { + bytes.starts_with(STANDARD_BEN_BANNER) + || bytes.starts_with(MKVCHAIN_BEN_BANNER) + || bytes.starts_with(TWODELTA_BEN_BANNER) +} diff --git a/ben/src/format/mod.rs b/ben/src/format/mod.rs new file mode 100644 index 0000000..8811025 --- /dev/null +++ b/ben/src/format/mod.rs @@ -0,0 +1,3 @@ +//! Shared on-disk format metadata for BEN and XBEN streams. + +pub mod banners; diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 149089c..28c040e 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -1,5 +1,6 @@ use crate::codec::decode::{decode_ben32_line, decode_ben_line}; use crate::codec::encode::{encode_ben32_assignments, encode_ben_vec_from_assign, TwoDeltaFrame}; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -181,35 +182,35 @@ impl BenDecoder { /// /// Returns a new decoder positioned at the first BEN frame. pub fn new(mut reader: R) -> Result { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; if let Err(e) = reader.read_exact(&mut check_buffer) { return Err(DecoderInitError::Io(e)); } - match &check_buffer { - b"STANDARD BEN FILE" => Ok(BenDecoder { + match variant_from_banner(&check_buffer) { + Some(BenVariant::Standard) => Ok(BenDecoder { reader, sample_count: 0, variant: BenVariant::Standard, previous_assignment: None, twodelta_consumed_first_frame: false, }), - b"MKVCHAIN BEN FILE" => Ok(BenDecoder { + Some(BenVariant::MkvChain) => Ok(BenDecoder { reader, sample_count: 0, variant: BenVariant::MkvChain, previous_assignment: None, twodelta_consumed_first_frame: false, }), - b"TWODELTA BEN FILE" => Ok(BenDecoder { + Some(BenVariant::TwoDelta) => Ok(BenDecoder { reader, sample_count: 0, variant: BenVariant::TwoDelta, previous_assignment: None, twodelta_consumed_first_frame: false, }), - _ => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), + None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), } } @@ -633,19 +634,14 @@ impl XBenDecoder { let xz = XzDecoder::new(reader); let mut xz = BufReader::with_capacity(1 << 20, xz); - let mut first = [0u8; 17]; + let mut first = [0u8; BANNER_LEN]; xz.read_exact(&mut first)?; - let variant = match &first { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - b"TWODELTA BEN FILE" => BenVariant::TwoDelta, - _ => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", - )); - } - }; + let variant = variant_from_banner(&first).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", + ) + })?; Ok(Self::from_decompressed_stream(xz, variant)) } diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 9a9bcf4..66e02aa 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -3,6 +3,7 @@ use crate::codec::encode::{ encode_twodelta_vec_with_hint, BenFrame, TwoDeltaFrame, }; use crate::codec::translate::ben_to_ben32_lines; +use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; use crate::io::reader::BenDecoder; use crate::util::rle::assign_to_rle; use crate::BenVariant; @@ -181,11 +182,7 @@ impl BenEncoder { /// /// Returns a new encoder ready to accept assignments or RLE frames. pub fn new(mut writer: W, variant: BenVariant) -> Self { - match variant { - BenVariant::Standard => writer.write_all(b"STANDARD BEN FILE").unwrap(), - BenVariant::MkvChain => writer.write_all(b"MKVCHAIN BEN FILE").unwrap(), - BenVariant::TwoDelta => writer.write_all(b"TWODELTA BEN FILE").unwrap(), - }; + writer.write_all(banner_for_variant(variant)).unwrap(); BenEncoder { writer, @@ -412,9 +409,9 @@ impl XBenEncoder { /// /// Returns a new XBEN encoder ready to accept assignments or BEN frames. pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { + encoder.write_all(banner_for_variant(variant)).unwrap(); match variant { BenVariant::Standard => { - encoder.write_all(b"STANDARD BEN FILE").unwrap(); XBenEncoder { encoder, previous_assignment: Vec::new(), @@ -425,7 +422,6 @@ impl XBenEncoder { } } BenVariant::MkvChain => { - encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); XBenEncoder { encoder, previous_assignment: Vec::new(), @@ -436,7 +432,6 @@ impl XBenEncoder { } } BenVariant::TwoDelta => { - encoder.write_all(b"TWODELTA BEN FILE").unwrap(); XBenEncoder { encoder, previous_assignment: Vec::new(), @@ -531,16 +526,13 @@ impl XBenEncoder { /// Returns `Ok(())` after the BEN stream has been translated into XBEN. pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { let peek = reader.fill_buf()?; - let has_banner = peek.len() >= 17 - && (peek.starts_with(b"STANDARD BEN FILE") - || peek.starts_with(b"MKVCHAIN BEN FILE") - || peek.starts_with(b"TWODELTA BEN FILE")); + let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); if has_banner { if self.variant == BenVariant::TwoDelta { let mut banner_prefixed = Vec::new(); - banner_prefixed.extend_from_slice(&peek[..17]); - reader.consume(17); + banner_prefixed.extend_from_slice(&peek[..BANNER_LEN]); + reader.consume(BANNER_LEN); reader.read_to_end(&mut banner_prefixed)?; let decoder = BenDecoder::new(io::Cursor::new(banner_prefixed))?; @@ -558,7 +550,7 @@ impl XBenEncoder { } return Ok(()); } - reader.consume(17); + reader.consume(BANNER_LEN); } if self.variant == BenVariant::TwoDelta { diff --git a/ben/src/lib.rs b/ben/src/lib.rs index cde7f7c..687c332 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -24,6 +24,8 @@ pub mod cli; /// Encoding, decoding, and format-to-format translation helpers. pub mod codec; +/// Shared on-disk format metadata such as stream banners. +pub mod format; /// Streaming readers and writers for BEN and XBEN files. pub mod io; /// JSON graph utilities used by relabeling workflows. diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index cdb3694..864df67 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -2,6 +2,7 @@ use crate::codec::decode::decode_ben_line; use crate::codec::encode::encode_ben_vec_from_rle; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::io::reader::BenDecoder; use crate::io::writer::BenEncoder; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; @@ -276,20 +277,11 @@ fn relabel_ben_file_impl( mut writer: W, max_samples: Option, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; - let variant = match &check_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - b"TWODELTA BEN FILE" => BenVariant::TwoDelta, - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; + let variant = variant_from_banner(&check_buffer) + .ok_or_else(|| Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; match variant { BenVariant::Standard | BenVariant::MkvChain => { @@ -504,20 +496,11 @@ fn relabel_ben_file_with_map_impl( new_to_old_node_map: HashMap, max_samples: Option, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; - let variant = match &check_buffer { - b"STANDARD BEN FILE" => BenVariant::Standard, - b"MKVCHAIN BEN FILE" => BenVariant::MkvChain, - b"TWODELTA BEN FILE" => BenVariant::TwoDelta, - _ => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); - } - }; + let variant = variant_from_banner(&check_buffer) + .ok_or_else(|| Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; match variant { BenVariant::Standard | BenVariant::MkvChain => { From 98b8541157c78440ce3745bc6aa853a99a2495fa Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 21:21:28 -0600 Subject: [PATCH 025/221] add way to convert between ben versions --- ben/src/cli/ben.rs | 8 +- ben/src/cli/pben.rs | 22 +++- ben/src/cli/reben.rs | 124 ++++++++++++++++++-- ben/src/codec/decode/tests.rs | 18 ++- ben/src/io/reader.rs | 19 +-- ben/src/json/graph/mod.rs | 36 +++--- ben/src/json/graph/tests.rs | 5 +- ben/src/logging.rs | 2 +- ben/src/ops/relabel/mod.rs | 160 +++++++++++++++++++++++-- ben/src/ops/relabel/tests.rs | 88 +++++++++----- ben/tests/test_cli.rs | 193 +++++++++++++++++++++++++++---- ben/tests/test_impls_pipeline.rs | 94 +++++++++++---- ben/tests/test_pipeline.rs | 3 +- pyben/src/common.rs | 5 +- pyben/src/decode/mod.rs | 9 +- pyben/src/encode/mod.rs | 4 +- 16 files changed, 656 insertions(+), 134 deletions(-) diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 1dc9470..8121cf8 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -376,8 +376,12 @@ pub fn run() { let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { Some(file) if !args.print => { - match decode_setup(file.clone(), args.output_file.clone(), false, args.overwrite) - { + match decode_setup( + file.clone(), + args.output_file.clone(), + false, + args.overwrite, + ) { Ok(path) => open_derived_writer(path), Err(err) => { eprintln!("Error: {:?}", err); diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 45c208c..9a2b819 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -3,8 +3,8 @@ use crate::io::reader::BenDecoder; use crate::io::writer::{BenEncoder, XBenEncoder}; use crate::BenVariant; use clap::{Parser, ValueEnum}; -use serde_json::json; use pipe::pipe; +use serde_json::json; use std::{ fs::File, io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}, @@ -88,7 +88,10 @@ pub fn run() -> Result<()> { Mode::PcToBen => { tracing::trace!("Converting PCOMPRESS to BEN"); - let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + let mut pcompress_reader: BufReader> = match args + .input_file + .as_ref() + { Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), None => BufReader::new(Box::new(io::stdin())), }; @@ -116,7 +119,10 @@ pub fn run() -> Result<()> { Mode::PcToXben => { tracing::trace!("Converting PCOMPRESS to XBEN"); - let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + let mut pcompress_reader: BufReader> = match args + .input_file + .as_ref() + { Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), None => BufReader::new(Box::new(io::stdin())), }; @@ -287,8 +293,14 @@ mod tests { #[test] fn derive_output_path_replaces_expected_suffixes() { - assert_eq!(derive_output_path(Mode::BenToPc, "plans.ben"), "plans.pcompress"); - assert_eq!(derive_output_path(Mode::PcToBen, "plans.pcompress"), "plans.ben"); + assert_eq!( + derive_output_path(Mode::BenToPc, "plans.ben"), + "plans.pcompress" + ); + assert_eq!( + derive_output_path(Mode::PcToBen, "plans.pcompress"), + "plans.ben" + ); assert_eq!(derive_output_path(Mode::PcToXben, "plans.pc"), "plans.xben"); } diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 382e212..5130a33 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -2,9 +2,12 @@ use crate::cli::common::set_verbose; use crate::{ json::graph::{sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod}, ops::relabel::{ - relabel_ben_file, relabel_ben_file_limit, relabel_ben_file_with_map, + convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, + relabel_ben_file_as_variant_limit, relabel_ben_file_limit, relabel_ben_file_with_map, + relabel_ben_file_with_map_as_variant, relabel_ben_file_with_map_as_variant_limit, relabel_ben_file_with_map_limit, }, + BenVariant, }; use clap::{Parser, ValueEnum}; use serde_json::{json, Value}; @@ -35,6 +38,15 @@ enum OrderingMethod { ReverseCuthillMckee, } +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// BEN variants supported for BEN-mode output. +enum BenCliVariant { + Standard, + MkvChain, + #[clap(alias = "twodelta")] + TwoDelta, +} + #[derive(Parser, Debug)] #[command( name = "Relabeling Binary Ensemble CLI Tool", @@ -77,6 +89,12 @@ struct Args { /// Only relabel the first `n` expanded samples in BEN mode. #[arg(long)] n_items: Option, + /// BEN variant to use for the BEN-mode output file. + #[arg(long, value_enum)] + output_variant: Option, + /// Rewrite the BEN stream without canonicalizing or map relabeling. + #[arg(long)] + convert_only: bool, /// Verbosity level for the program. #[arg(short, long)] verbose: bool, @@ -138,17 +156,36 @@ pub fn run() { .expect("Could not write map file."); } Mode::Ben => { + if args.convert_only && args.output_variant.is_none() { + panic!("--convert-only requires --output-variant."); + } + if args.convert_only + && (args.map_file.is_some() || args.key.is_some() || args.ordering.is_some()) + { + panic!("--convert-only cannot be combined with relabeling options."); + } + let input_file = File::open(&args.input_file).expect("Could not open input file."); let reader = BufReader::new(input_file); + let output_variant = args.output_variant.as_ref().map(to_ben_variant); if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { - tracing::trace!("Canonicalizing assignment vectors in ben file."); + if args.convert_only { + tracing::trace!("Converting BEN file to requested variant."); + } else { + tracing::trace!("Canonicalizing assignment vectors in ben file."); + } let output_file_name = match args.output_file { Some(name) => name, None => { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + "_canonicalized_assignments.jsonl.ben" + if let Some(variant) = output_variant { + args.input_file.trim_end_matches(".ben").to_owned() + + format!("_{}.ben", ben_variant_name(variant)).as_str() + } else { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + "_canonicalized_assignments.jsonl.ben" + } } }; @@ -156,7 +193,20 @@ pub fn run() { File::create(&output_file_name).expect("Could not create output file."); let writer = BufWriter::new(output_file); - if let Some(limit) = args.n_items { + if args.convert_only { + let variant = output_variant.unwrap(); + if let Some(limit) = args.n_items { + convert_ben_file_limit(reader, writer, variant, limit).unwrap(); + } else { + convert_ben_file(reader, writer, variant).unwrap(); + } + } else if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_as_variant_limit(reader, writer, variant, limit).unwrap(); + } else { + relabel_ben_file_as_variant(reader, writer, variant).unwrap(); + } + } else if let Some(limit) = args.n_items { relabel_ben_file_limit(reader, writer, limit).unwrap(); } else { relabel_ben_file(reader, writer).unwrap(); @@ -253,9 +303,31 @@ pub fn run() { File::create(&output_file_name).expect("Could not create output file."); let writer = BufWriter::new(output_file); - tracing::trace!("Relabeling ben file according to map file {}", map_file_name,); + tracing::trace!( + "Relabeling ben file according to map file {}", + map_file_name, + ); - if let Some(limit) = args.n_items { + if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_with_map_as_variant_limit( + reader, + writer, + new_to_old_node_map, + variant, + limit, + ) + .unwrap(); + } else { + relabel_ben_file_with_map_as_variant( + reader, + writer, + new_to_old_node_map, + variant, + ) + .unwrap(); + } + } else if let Some(limit) = args.n_items { relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) .unwrap(); } else { @@ -267,9 +339,7 @@ pub fn run() { fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { match ordering { - OrderingMethod::MinimumLinearArrangement => { - GraphOrderingMethod::MinimumLinearArrangement - } + OrderingMethod::MinimumLinearArrangement => GraphOrderingMethod::MinimumLinearArrangement, OrderingMethod::MultiLevelCluster => GraphOrderingMethod::MultiLevelCluster, OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, } @@ -283,6 +353,22 @@ fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { } } +fn ben_variant_name(variant: BenVariant) -> &'static str { + match variant { + BenVariant::Standard => "standard", + BenVariant::MkvChain => "mkvchain", + BenVariant::TwoDelta => "twodelta", + } +} + +fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { + match variant { + BenCliVariant::Standard => BenVariant::Standard, + BenCliVariant::MkvChain => BenVariant::MkvChain, + BenCliVariant::TwoDelta => BenVariant::TwoDelta, + } +} + fn relabeling_label(key: Option<&str>, ordering: Option<&OrderingMethod>) -> Option { match (key, ordering) { (Some(_), Some(_)) => panic!("Provide either --key or --ordering, not both."), @@ -362,4 +448,22 @@ mod tests { assert_eq!(args.mode, Mode::Ben); assert_eq!(args.n_items, Some(25)); } + + #[test] + fn parse_ben_mode_output_variant_args() { + let args = Args::try_parse_from([ + "reben", + "samples.jsonl.ben", + "--mode", + "ben", + "--output-variant", + "twodelta", + "--convert-only", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Ben); + assert_eq!(args.output_variant, Some(BenCliVariant::TwoDelta)); + assert!(args.convert_only); + } } diff --git a/ben/src/codec/decode/tests.rs b/ben/src/codec/decode/tests.rs index fcee0dd..f6f5453 100644 --- a/ben/src/codec/decode/tests.rs +++ b/ben/src/codec/decode/tests.rs @@ -293,8 +293,13 @@ fn test_jsonl_decode_ben32_propagates_non_eof_error() { #[test] fn test_decode_xben_to_ben_rejects_invalid_inner_header() { let mut xz = Vec::new(); - xz_compress(BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), &mut xz, Some(1), Some(0)) - .unwrap(); + xz_compress( + BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), + &mut xz, + Some(1), + Some(0), + ) + .unwrap(); let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), Vec::new()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -303,8 +308,13 @@ fn test_decode_xben_to_ben_rejects_invalid_inner_header() { #[test] fn test_decode_xben_to_jsonl_rejects_invalid_inner_header() { let mut xz = Vec::new(); - xz_compress(BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), &mut xz, Some(1), Some(0)) - .unwrap(); + xz_compress( + BufReader::new(b"BAD BAD BAD BAD!!".as_slice()), + &mut xz, + Some(1), + Some(0), + ) + .unwrap(); let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), Vec::new()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 28c040e..aa43224 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -484,7 +484,11 @@ fn apply_twodelta_runs_to_assignment( assignment[pair_positions[write_idx]] = current_value; write_idx += 1; } - current_value = if current_value == first { second } else { first }; + current_value = if current_value == first { + second + } else { + first + }; } Ok(assignment) @@ -527,11 +531,11 @@ impl Iterator for BenDecoder { Some(Err(e)) => return Some(Err(e)), None => return None, }; - let assignment = match decode_stored_frame_to_assignment(self.previous_assignment.as_deref(), &frame) - { - Ok(assgn) => assgn, - Err(e) => return Some(Err(e)), - }; + let assignment = + match decode_stored_frame_to_assignment(self.previous_assignment.as_deref(), &frame) { + Ok(assgn) => assgn, + Err(e) => return Some(Err(e)), + }; let count = frame.count(); self.previous_assignment = Some(assignment.clone()); self.sample_count += count as usize; @@ -568,7 +572,8 @@ impl Iterator for BenFrameDecoeder { /// Return the next raw BEN frame from the input stream. fn next(&mut self) -> Option { match self.inner.variant { - BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() { + BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() + { Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::new( io::ErrorKind::InvalidData, diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index e5babc0..4c80c98 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -258,7 +258,10 @@ fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Ve .iter() .map(Vec::len) .collect::>(); - let component_set = component.iter().copied().collect::>(); + let component_set = component + .iter() + .copied() + .collect::>(); let start = component .iter() .copied() @@ -369,15 +372,17 @@ fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> V components } -fn rcm_component_generic(adjacency: &[Vec], labels: &[usize], component: &[usize]) -> Vec { +fn rcm_component_generic( + adjacency: &[Vec], + labels: &[usize], + component: &[usize], +) -> Vec { let component_mask = subset_mask(adjacency.len(), component); let local_degree = local_degree_in_subset(adjacency, &component_mask, component); let start = component .iter() .copied() - .min_by_key(|&node| { - (local_degree[node], labels[node]) - }) + .min_by_key(|&node| (local_degree[node], labels[node])) .unwrap(); let mut visited = vec![false; adjacency.len()]; @@ -406,7 +411,9 @@ fn rcm_component_generic(adjacency: &[Vec], labels: &[usize], component: fn multilevel_cluster_order_generic(adjacency: &[Vec], labels: &[usize]) -> Vec { let mut order = Vec::with_capacity(adjacency.len()); for component in connected_components_generic(adjacency, labels) { - order.extend(multilevel_cluster_component_generic(adjacency, labels, &component)); + order.extend(multilevel_cluster_component_generic( + adjacency, labels, &component, + )); } order } @@ -481,14 +488,13 @@ fn greedy_cluster_partition( .iter() .filter(|&&next| component_mask[next] && seed_marks[next] == mark_epoch) .count(); - ( - Reverse(shared), - local_degree[neighbor], - labels[neighbor], - ) + (Reverse(shared), local_degree[neighbor], labels[neighbor]) }); - for neighbor in candidates.into_iter().take(max_cluster_size.saturating_sub(1)) { + for neighbor in candidates + .into_iter() + .take(max_cluster_size.saturating_sub(1)) + { assigned[neighbor] = true; remaining -= 1; cluster.push(neighbor); @@ -506,7 +512,11 @@ fn greedy_cluster_partition( clusters } -fn local_degree_in_subset(adjacency: &[Vec], subset_mask: &[bool], subset: &[usize]) -> Vec { +fn local_degree_in_subset( + adjacency: &[Vec], + subset_mask: &[bool], + subset: &[usize], +) -> Vec { let mut local_degree = vec![0usize; adjacency.len()]; for &node in subset { local_degree[node] = adjacency[node] diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 6bcf6d0..9c9dfb8 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -248,7 +248,10 @@ fn test_sort_json_file_by_key_with_non_numeric_values() { assert_eq!(output_json["nodes"][0]["key"], 7); assert_eq!(output_json["nodes"][1]["key"], "abc"); - assert_eq!(output_json["nodes"][2]["key"], serde_json::json!({"nested": true})); + assert_eq!( + output_json["nodes"][2]["key"], + serde_json::json!({"nested": true}) + ); } #[test] diff --git a/ben/src/logging.rs b/ben/src/logging.rs index 31fafae..cd3b868 100644 --- a/ben/src/logging.rs +++ b/ben/src/logging.rs @@ -1,6 +1,6 @@ +use std::sync::Once; use tracing::Level; use tracing_subscriber::EnvFilter; -use std::sync::Once; static INIT_LOGGER: Once = Once::new(); diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 864df67..c4508e3 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -118,6 +118,58 @@ where Ok(()) } +fn detect_ben_variant(header: &[u8; 17]) -> io::Result { + match header { + b"STANDARD BEN FILE" => Ok(BenVariant::Standard), + b"MKVCHAIN BEN FILE" => Ok(BenVariant::MkvChain), + b"TWODELTA BEN FILE" => Ok(BenVariant::TwoDelta), + _ => Err(Error::new( + io::ErrorKind::InvalidData, + "Invalid file format", + )), + } +} + +fn convert_ben_file_impl( + mut reader: R, + writer: W, + target_variant: BenVariant, + max_samples: Option, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + let _input_variant = detect_ben_variant(&check_buffer)?; + + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + writer, + target_variant, + max_samples, + |assignment| Ok(assignment.to_vec()), + ) +} + +/// Rewrite a BEN file into the requested BEN variant. +pub fn convert_ben_file( + reader: R, + writer: W, + target_variant: BenVariant, +) -> io::Result<()> { + convert_ben_file_impl(reader, writer, target_variant, None) +} + +/// Rewrite at most `max_samples` expanded samples into the requested BEN variant. +pub fn convert_ben_file_limit( + reader: R, + writer: W, + target_variant: BenVariant, + max_samples: usize, +) -> io::Result<()> { + convert_ben_file_impl(reader, writer, target_variant, Some(max_samples)) +} + /// Canonicalize the labels used inside each BEN frame. /// /// Labels are reassigned in first-seen order within each assignment vector, @@ -291,9 +343,13 @@ fn relabel_ben_file_impl( BenVariant::TwoDelta => { let mut full_stream = check_buffer.to_vec(); reader.read_to_end(&mut full_stream)?; - relabel_ben_file_via_decoder(full_stream.as_slice(), &mut writer, variant, max_samples, |assignment| { - Ok(canonicalize_assignment(assignment)) - })? + relabel_ben_file_via_decoder( + full_stream.as_slice(), + &mut writer, + variant, + max_samples, + |assignment| Ok(canonicalize_assignment(assignment)), + )? } } @@ -323,13 +379,7 @@ pub fn relabel_ben_lines_with_map( new_to_old_node_map: HashMap, variant: BenVariant, ) -> io::Result<()> { - relabel_ben_lines_with_map_impl( - &mut reader, - &mut writer, - new_to_old_node_map, - variant, - None, - ) + relabel_ben_lines_with_map_impl(&mut reader, &mut writer, new_to_old_node_map, variant, None) } /// Relabel BEN frames using an externally supplied node map, up to a bounded @@ -530,5 +580,95 @@ fn relabel_ben_file_with_map_impl( Ok(()) } +/// Canonicalize BEN assignments and write them using the requested BEN variant. +pub fn relabel_ben_file_as_variant( + mut reader: R, + writer: W, + target_variant: BenVariant, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + let _input_variant = detect_ben_variant(&check_buffer)?; + + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + writer, + target_variant, + None, + |assignment| Ok(canonicalize_assignment(assignment)), + ) +} + +/// Canonicalize up to `max_samples` expanded samples and write the requested BEN variant. +pub fn relabel_ben_file_as_variant_limit( + mut reader: R, + writer: W, + target_variant: BenVariant, + max_samples: usize, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + let _input_variant = detect_ben_variant(&check_buffer)?; + + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + writer, + target_variant, + Some(max_samples), + |assignment| Ok(canonicalize_assignment(assignment)), + ) +} + +/// Relabel a BEN file with a supplied node map and write the requested BEN variant. +pub fn relabel_ben_file_with_map_as_variant( + mut reader: R, + writer: W, + new_to_old_node_map: HashMap, + target_variant: BenVariant, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + let _input_variant = detect_ben_variant(&check_buffer)?; + + let permutation = dense_permutation(&new_to_old_node_map)?; + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + writer, + target_variant, + None, + |assignment| permute_assignment(assignment, &permutation), + ) +} + +/// Relabel up to `max_samples` expanded samples with a supplied node map and write the requested BEN variant. +pub fn relabel_ben_file_with_map_as_variant_limit( + mut reader: R, + writer: W, + new_to_old_node_map: HashMap, + target_variant: BenVariant, + max_samples: usize, +) -> io::Result<()> { + let mut check_buffer = [0u8; 17]; + reader.read_exact(&mut check_buffer)?; + let _input_variant = detect_ben_variant(&check_buffer)?; + + let permutation = dense_permutation(&new_to_old_node_map)?; + let mut full_stream = check_buffer.to_vec(); + reader.read_to_end(&mut full_stream)?; + relabel_ben_file_via_decoder( + full_stream.as_slice(), + writer, + target_variant, + Some(max_samples), + |assignment| permute_assignment(assignment, &permutation), + ) +} + #[cfg(test)] mod tests; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 80d8d33..98ccefc 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -144,8 +144,12 @@ fn test_relabel_simple_file_mkv_with_limit() { ); let mut encoded = Vec::new(); - encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::MkvChain) - .unwrap(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::MkvChain, + ) + .unwrap(); let mut relabeled = Vec::new(); relabel_ben_file_limit(encoded.as_slice(), io::BufWriter::new(&mut relabeled), 2).unwrap(); @@ -171,8 +175,12 @@ fn test_relabel_simple_file_twodelta() { ); let mut encoded = Vec::new(); - encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::TwoDelta) - .unwrap(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::TwoDelta, + ) + .unwrap(); let mut relabeled = Vec::new(); relabel_ben_file(encoded.as_slice(), io::BufWriter::new(&mut relabeled)).unwrap(); @@ -292,11 +300,20 @@ fn test_relabel_simple_file_with_map() { "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}" ); - let new_to_old_map: HashMap = - [(0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 0), (8, 1)] - .iter() - .cloned() - .collect(); + let new_to_old_map: HashMap = [ + (0, 2), + (1, 3), + (2, 4), + (3, 5), + (4, 6), + (5, 7), + (6, 8), + (7, 0), + (8, 1), + ] + .iter() + .cloned() + .collect(); let input = file.as_bytes(); @@ -345,11 +362,20 @@ fn test_relabel_simple_file_with_map_mkv() { "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}", ); - let new_to_old_map: HashMap = - [(0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 0), (8, 1)] - .iter() - .cloned() - .collect(); + let new_to_old_map: HashMap = [ + (0, 2), + (1, 3), + (2, 4), + (3, 5), + (4, 6), + (5, 7), + (6, 8), + (7, 0), + (8, 1), + ] + .iter() + .cloned() + .collect(); let input = file.as_bytes(); @@ -394,15 +420,18 @@ fn test_relabel_simple_file_with_map_twodelta() { "{\"assignment\":[2,2,1,1,3,3],\"sample\":4}\n" ); - let new_to_old_map: HashMap = - [(0, 2), (1, 3), (2, 0), (3, 1), (4, 4), (5, 5)] - .iter() - .cloned() - .collect(); + let new_to_old_map: HashMap = [(0, 2), (1, 3), (2, 0), (3, 1), (4, 4), (5, 5)] + .iter() + .cloned() + .collect(); let mut encoded = Vec::new(); - encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::TwoDelta) - .unwrap(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::TwoDelta, + ) + .unwrap(); let mut relabeled = Vec::new(); relabel_ben_file_with_map( @@ -437,8 +466,12 @@ fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { let new_to_old_map: HashMap = [(0, 1), (1, 2), (2, 0)].iter().cloned().collect(); let mut encoded = Vec::new(); - encode_jsonl_to_ben(file.as_bytes(), io::BufWriter::new(&mut encoded), BenVariant::MkvChain) - .unwrap(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::MkvChain, + ) + .unwrap(); let mut relabeled = Vec::new(); relabel_ben_file_with_map_limit( @@ -469,12 +502,9 @@ fn test_relabel_file_rejects_invalid_header() { #[test] fn test_relabel_file_with_map_rejects_invalid_header() { - let err = relabel_ben_file_with_map( - b"not a valid banner".as_slice(), - Vec::new(), - HashMap::new(), - ) - .unwrap_err(); + let err = + relabel_ben_file_with_map(b"not a valid banner".as_slice(), Vec::new(), HashMap::new()) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!(err.to_string(), "Invalid file format"); } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 71ada77..56a386f 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -264,7 +264,10 @@ fn ben_cli_supports_stdin_stdout_workflows() { &xencode_jsonl.stdout, ); assert_success(&xdecode_jsonl); - assert_eq!(String::from_utf8(xdecode_jsonl.stdout).unwrap(), sample_jsonl()); + assert_eq!( + String::from_utf8(xdecode_jsonl.stdout).unwrap(), + sample_jsonl() + ); let mut ben_bytes = Vec::new(); encode_jsonl_to_ben( @@ -422,14 +425,23 @@ fn ben_cli_uses_default_output_names() { let encode = run( "ben", - &["--mode", "encode", jsonl_path.to_str().unwrap(), "--save-all"], + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--save-all", + ], temp.path(), ); assert_success(&encode); assert!(ben_path.exists()); fs::remove_file(&jsonl_path).unwrap(); - let decode = run("ben", &["--mode", "decode", ben_path.to_str().unwrap()], temp.path()); + let decode = run( + "ben", + &["--mode", "decode", ben_path.to_str().unwrap()], + temp.path(), + ); assert_success(&decode); assert_eq!(fs::read_to_string(&jsonl_path).unwrap(), sample_jsonl()); @@ -442,7 +454,11 @@ fn ben_cli_uses_default_output_names() { assert!(xz_path.exists()); fs::remove_file(&jsonl_path).unwrap(); - let decompress = run("ben", &["--mode", "xz-decompress", xz_path.to_str().unwrap()], temp.path()); + let decompress = run( + "ben", + &["--mode", "xz-decompress", xz_path.to_str().unwrap()], + temp.path(), + ); assert_success(&decompress); assert_eq!(fs::read_to_string(&jsonl_path).unwrap(), sample_jsonl()); } @@ -463,9 +479,8 @@ fn ben_cli_reports_expected_error_paths() { temp.path(), ); assert_success(&xencode); - assert!( - String::from_utf8_lossy(&xencode.stderr).contains("Unsupported file type(s) for xencode mode") - ); + assert!(String::from_utf8_lossy(&xencode.stderr) + .contains("Unsupported file type(s) for xencode mode")); let decode = run( "ben", @@ -477,7 +492,11 @@ fn ben_cli_reports_expected_error_paths() { String::from_utf8_lossy(&decode.stderr).contains("Unsupported file type for decode mode") ); - let read = run("ben", &["--mode", "read", bogus_jsonl.to_str().unwrap()], temp.path()); + let read = run( + "ben", + &["--mode", "read", bogus_jsonl.to_str().unwrap()], + temp.path(), + ); assert_success(&read); assert!( String::from_utf8_lossy(&read.stderr).contains("Sample number is required in read mode") @@ -489,9 +508,8 @@ fn ben_cli_reports_expected_error_paths() { temp.path(), ); assert_success(&xz); - assert!( - String::from_utf8_lossy(&xz.stderr).contains("Unsupported file type for xz decompress mode") - ); + assert!(String::from_utf8_lossy(&xz.stderr) + .contains("Unsupported file type for xz decompress mode")); let bad_xben = run_stdin_stdout("ben", &["--mode", "x-decode"], temp.path(), b"not-an-xben"); assert_success(&bad_xben); @@ -714,9 +732,8 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { let unsupported_decode = run_stdin_stdout("ben", &["--mode", "decode"], temp.path(), b""); assert_success(&unsupported_decode); - assert!( - String::from_utf8_lossy(&unsupported_decode.stderr).contains("Unsupported file type(s) for decode mode") - ); + assert!(String::from_utf8_lossy(&unsupported_decode.stderr) + .contains("Unsupported file type(s) for decode mode")); let read_too_large = run( "ben", @@ -1079,6 +1096,132 @@ fn reben_cli_supports_twodelta_ben_mode() { .contains(r#""assignment":[1,2,1]"#)); } +#[test] +fn reben_cli_can_convert_between_ben_variants() { + let temp = TempDir::new("reben-convert"); + let ben_path = temp.path().join("samples.standard.ben"); + let twodelta_path = temp.path().join("samples.twodelta.ben"); + let mkv_path = temp.path().join("samples.mkv.ben"); + + let source_jsonl = r#"{"assignment":[4,4,9],"sample":1} +{"assignment":[4,4,9],"sample":2} +{"assignment":[4,9,4],"sample":3} +{"assignment":[9,9,4],"sample":4} +"#; + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(source_jsonl.as_bytes()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let to_twodelta = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "twodelta", + "--convert-only", + "--output-file", + twodelta_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&to_twodelta); + + let twodelta_bytes = fs::read(&twodelta_path).unwrap(); + assert_eq!(&twodelta_bytes[..17], b"TWODELTA BEN FILE"); + + let to_mkv = run( + "reben", + &[ + twodelta_path.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "mkv-chain", + "--convert-only", + "--output-file", + mkv_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&to_mkv); + + let mkv_bytes = fs::read(&mkv_path).unwrap(); + assert_eq!(&mkv_bytes[..17], b"MKVCHAIN BEN FILE"); + + let mut original_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&ben_path).unwrap()), + &mut original_jsonl, + ) + .unwrap(); + + let mut converted_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&mkv_path).unwrap()), + &mut converted_jsonl, + ) + .unwrap(); + + assert_eq!(original_jsonl, converted_jsonl); +} + +#[test] +fn reben_cli_can_canonicalize_into_a_different_ben_variant() { + let temp = TempDir::new("reben-canonicalize-convert"); + let ben_path = temp.path().join("samples.standard.ben"); + let output_path = temp.path().join("canonicalized.twodelta.ben"); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new( + r#"{"assignment":[9,9,4],"sample":1} +{"assignment":[4,7,7],"sample":2} +"# + .as_bytes(), + ), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let canonicalize = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "twodelta", + "--output-file", + output_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&canonicalize); + + let bytes = fs::read(&output_path).unwrap(); + assert_eq!(&bytes[..17], b"TWODELTA BEN FILE"); + + let mut canonical_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&output_path).unwrap()), + &mut canonical_jsonl, + ) + .unwrap(); + let canonical_text = String::from_utf8(canonical_jsonl).unwrap(); + assert!(canonical_text.contains(r#""assignment":[1,1,2]"#)); + assert!(canonical_text.contains(r#""assignment":[1,2,2]"#)); +} + #[test] fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations() { let temp = TempDir::new("reben-more"); @@ -1112,7 +1255,10 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations temp.path(), ); assert_success(&relabel); - assert!(temp.path().join("shape_sorted_by_GEOID20_map.json").exists()); + assert!(temp + .path() + .join("shape_sorted_by_GEOID20_map.json") + .exists()); let generated_graph = temp.path().join("shape_sorted_by_GEOID20.json"); let generated_map = temp.path().join("shape_sorted_by_GEOID20_map.json"); @@ -1149,7 +1295,8 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations assert_failure(&missing_shape); assert!(String::from_utf8_lossy(&missing_shape.stderr).contains("No shape file provided")); - let sorted_json: Value = serde_json::from_str(&fs::read_to_string(generated_graph).unwrap()).unwrap(); + let sorted_json: Value = + serde_json::from_str(&fs::read_to_string(generated_graph).unwrap()).unwrap(); assert_eq!(sorted_json["nodes"][0]["GEOID20"], "A"); } @@ -1200,8 +1347,7 @@ fn reben_cli_supports_mla_and_rcm_orderings() { .join("shape_sorted_by_reverse-cuthill-mckee_map.json") .exists()); - let mla_json: Value = - serde_json::from_str(&fs::read_to_string(&mla_path).unwrap()).unwrap(); + let mla_json: Value = serde_json::from_str(&fs::read_to_string(&mla_path).unwrap()).unwrap(); let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); assert_eq!( mla_json["nodes"].as_array().unwrap().len(), @@ -1309,16 +1455,13 @@ fn pben_cli_converts_between_formats() { &mut roundtrip_jsonl, ) .unwrap(); - assert!(String::from_utf8(roundtrip_jsonl).unwrap().contains(r#""assignment":[2,2,3]"#)); + assert!(String::from_utf8(roundtrip_jsonl) + .unwrap() + .contains(r#""assignment":[2,2,3]"#)); let xdecode = run( "ben", - &[ - "--mode", - "x-decode", - xben_path.to_str().unwrap(), - "--print", - ], + &["--mode", "x-decode", xben_path.to_str().unwrap(), "--print"], temp.path(), ); assert_success(&xdecode); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index a6af7c6..e52f9dc 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -141,7 +141,8 @@ fn strat_twodelta_seq() -> impl Strategy>> { let a = distinct[(op as usize) % distinct.len()]; let mut b = distinct[((op >> 8) as usize) % distinct.len()]; if a == b { - b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + b = distinct + [(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; } let positions: Vec = current @@ -178,7 +179,9 @@ fn strat_twodelta_seq() -> impl Strategy>> { seq }) - .prop_filter("TwoDelta sequences must be non-empty", |seq| !seq.is_empty()) + .prop_filter("TwoDelta sequences must be non-empty", |seq| { + !seq.is_empty() + }) } // Random (small) thread count and compression level for MT encoder. @@ -903,8 +906,11 @@ fn ben_encoder_write_assignment_path_roundtrips() { let mut out = Vec::new(); decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); - assert_eq!(out, br#"{"assignment":[9,9,2,2,2],"sample":1} -"#); + assert_eq!( + out, + br#"{"assignment":[9,9,2,2,2],"sample":1} +"# + ); } #[test] @@ -990,7 +996,8 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { .unwrap(); let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); let mut xben = binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard); - xben.write_ben_file(BufReader::new(payload_only.as_slice())).unwrap(); + xben.write_ben_file(BufReader::new(payload_only.as_slice())) + .unwrap(); } let mut ben = Vec::new(); @@ -998,8 +1005,11 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { let mut out = Vec::new(); decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); - assert_eq!(out, br#"{"assignment":[5,5,7],"sample":1} -"#); + assert_eq!( + out, + br#"{"assignment":[5,5,7],"sample":1} +"# + ); } struct FailAfterN { @@ -1016,7 +1026,10 @@ impl std::io::Read for FailAfterN { if self.pos >= self.data.len() { return Ok(0); } - let n = buf.len().min(self.data.len() - self.pos).min(self.fail_at - self.pos); + let n = buf + .len() + .min(self.data.len() - self.pos) + .min(self.fail_at - self.pos); buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]); self.pos += n; Ok(n) @@ -1112,7 +1125,13 @@ fn decoder_init_error_display_source_and_conversion_paths() { let xz_bytes = { let mut buf = Vec::new(); - xz_compress(BufReader::new(b"hello".as_slice()), &mut buf, Some(1), Some(0)).unwrap(); + xz_compress( + BufReader::new(b"hello".as_slice()), + &mut buf, + Some(1), + Some(0), + ) + .unwrap(); buf }; let xz_header = xz_bytes[..17].to_vec(); @@ -1137,8 +1156,19 @@ fn ben_decoder_and_xben_decoder_count_samples() { "#; let mut ben = Vec::new(); - encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); - assert_eq!(BenDecoder::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + encode_jsonl_to_ben( + BufReader::new(jsonl.as_bytes()), + &mut ben, + BenVariant::MkvChain, + ) + .unwrap(); + assert_eq!( + BenDecoder::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 3 + ); let mut xben = Vec::new(); encode_jsonl_to_xben( @@ -1149,7 +1179,13 @@ fn ben_decoder_and_xben_decoder_count_samples() { Some(0), ) .unwrap(); - assert_eq!(XBenDecoder::new(xben.as_slice()).unwrap().count_samples().unwrap(), 3); + assert_eq!( + XBenDecoder::new(xben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 3 + ); let twodelta_jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[1,2,2,1],"sample":2} @@ -1181,7 +1217,12 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { "#; let mut ben = Vec::new(); - encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); + encode_jsonl_to_ben( + BufReader::new(jsonl.as_bytes()), + &mut ben, + BenVariant::MkvChain, + ) + .unwrap(); let ben_path = unique_temp_path("sample.ben"); fs::write(&ben_path, &ben).unwrap(); @@ -1222,7 +1263,12 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { "#; let mut ben = Vec::new(); - encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, BenVariant::MkvChain).unwrap(); + encode_jsonl_to_ben( + BufReader::new(jsonl.as_bytes()), + &mut ben, + BenVariant::MkvChain, + ) + .unwrap(); let mut by_indices = BenDecoder::new(ben.as_slice()) .unwrap() @@ -1286,7 +1332,12 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { assert_eq!(jsonl, jsonl_from_assignments(&assignments)); let frames = BenDecoder::new(ben.as_slice()).unwrap().into_frames(); - assert_eq!(collect_frames(frames.map(|res| res.map(|f| (Frame::Ben(f.clone()), f.count)))).unwrap().len(), 3); + assert_eq!( + collect_frames(frames.map(|res| res.map(|f| (Frame::Ben(f.clone()), f.count)))) + .unwrap() + .len(), + 3 + ); } #[test] @@ -1321,10 +1372,7 @@ fn twodelta_rejects_non_pair_transition() { let mut ben = Vec::new(); let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); - let err = encoder - .write_assignment(vec![1u16, 3, 2, 4]) - .err() - .unwrap(); + let err = encoder.write_assignment(vec![1u16, 3, 2, 4]).err().unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } @@ -1360,7 +1408,13 @@ fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { ) .unwrap(); - assert_eq!(BenDecoder::new(ben.as_slice()).unwrap().count_samples().unwrap(), 4); + assert_eq!( + BenDecoder::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 4 + ); let frames: Vec<_> = BenDecoder::new(ben.as_slice()) .unwrap() diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 9dd73b9..802fff6 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -152,7 +152,8 @@ fn test_twodeltaben_pipeline() { let a = distinct[(i * 7) % distinct.len()]; let mut b = distinct[(i * 11) % distinct.len()]; if a == b { - b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + b = distinct + [(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; } let positions: Vec = current diff --git a/pyben/src/common.rs b/pyben/src/common.rs index a3fd7fa..bdd1673 100644 --- a/pyben/src/common.rs +++ b/pyben/src/common.rs @@ -51,8 +51,7 @@ pub fn open_output(out_file: &PathBuf, overwrite: bool) -> PyResult) -> PyResult Date: Mon, 16 Mar 2026 21:31:51 -0600 Subject: [PATCH 026/221] format --- ben/src/codec/decode/xz.rs | 10 +++++-- ben/src/codec/encode/mod.rs | 4 +-- ben/src/io/reader.rs | 12 ++++---- ben/src/io/writer.rs | 57 +++++++++++++++++-------------------- ben/tests/test_pipeline.rs | 3 +- 5 files changed, 44 insertions(+), 42 deletions(-) diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index f722231..d6a52cb 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -40,7 +40,10 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BenVariant::MkvChain } Some(BenVariant::TwoDelta) => { - let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); + let mut xben = XBenDecoder::from_decompressed_stream( + BufReader::new(decoder), + BenVariant::TwoDelta, + ); let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta); for record in &mut xben { let (assignment, count) = record?; @@ -156,7 +159,10 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i Some(BenVariant::Standard) => BenVariant::Standard, Some(BenVariant::MkvChain) => BenVariant::MkvChain, Some(BenVariant::TwoDelta) => { - let mut xben = XBenDecoder::from_decompressed_stream(BufReader::new(decoder), BenVariant::TwoDelta); + let mut xben = XBenDecoder::from_decompressed_stream( + BufReader::new(decoder), + BenVariant::TwoDelta, + ); let mut sample_number = 1usize; for record in &mut xben { let (assignment, count) = record?; diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 124bd11..c6fba90 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -5,11 +5,11 @@ mod jsonl; mod types; mod xz; +#[cfg(test)] +pub(crate) use ben::encode_ben32_line; pub(crate) use ben::{ build_twodelta_runs_with_hint, encode_ben32_assignments, encode_twodelta_vec_with_hint, }; -#[cfg(test)] -pub(crate) use ben::encode_ben32_line; pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index aa43224..a811a64 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -693,9 +693,7 @@ impl XBenDecoder { } None } - BenVariant::TwoDelta => { - None - } + BenVariant::TwoDelta => None, } } @@ -709,8 +707,9 @@ impl XBenDecoder { if overflow.len() < 7 { return None; } - let run_count = u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) - as usize; + let run_count = + u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) + as usize; let payload_len = run_count.checked_mul(4)?; let total_len = 1usize .checked_add(4)? @@ -740,7 +739,8 @@ impl XBenDecoder { u16::from_be_bytes([overflow[3], overflow[4]]), ); let run_count = - u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) as usize; + u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) + as usize; let payload_len = run_count.checked_mul(2)?; let total_len = 1usize .checked_add(2)? diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 66e02aa..bd117d4 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -411,36 +411,30 @@ impl XBenEncoder { pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { encoder.write_all(banner_for_variant(variant)).unwrap(); match variant { - BenVariant::Standard => { - XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::Standard, - } - } - BenVariant::MkvChain => { - XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::MkvChain, - } - } - BenVariant::TwoDelta => { - XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::TwoDelta, - } - } + BenVariant::Standard => XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant: BenVariant::Standard, + }, + BenVariant::MkvChain => XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant: BenVariant::MkvChain, + }, + BenVariant::TwoDelta => XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant: BenVariant::TwoDelta, + }, } } @@ -539,7 +533,8 @@ impl XBenEncoder { for record in decoder { let (assignment, count) = record?; self.write_assignment(assignment.clone())?; - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && count > 1 + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) + && count > 1 { self.count += count - 1; } else if self.variant == BenVariant::Standard { diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 802fff6..c204661 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -370,7 +370,8 @@ fn test_xtwodeltaben_pipeline() { let a = distinct[(i * 7) % distinct.len()]; let mut b = distinct[(i * 11) % distinct.len()]; if a == b { - b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + b = distinct + [(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; } let positions: Vec = current From befb6413f6a8d6548fbeae17ddfbd116aa4d9759 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 21:59:54 -0600 Subject: [PATCH 027/221] fix read-all bug --- Taskfile.yml | 2 +- ben/src/io/writer.rs | 9 +++--- ben/src/ops/relabel/mod.rs | 47 +++++++++++++------------------ ben/tests/test_cli.rs | 57 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 518ddc5..8c78cac 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -19,7 +19,7 @@ tasks: desc: List available tasks silent: true cmds: - - task --list-all + - 'command -v go-task >/dev/null 2>&1 && go-task --list-all || task --list-all' ensure-rust-linux: &ensure-rust-unix desc: Install Rust if it is not already available diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index bd117d4..9e9df09 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -9,7 +9,7 @@ use crate::util::rle::assign_to_rle; use crate::BenVariant; use serde_json::Value; use std::collections::HashMap; -use std::io::{self, BufRead, Result, Write}; +use std::io::{self, BufRead, Read, Result, Write}; use xz2::write::XzEncoder; const XBEN_TWODELTA_FULL_TAG: u8 = 0; @@ -524,12 +524,11 @@ impl XBenEncoder { if has_banner { if self.variant == BenVariant::TwoDelta { - let mut banner_prefixed = Vec::new(); - banner_prefixed.extend_from_slice(&peek[..BANNER_LEN]); + let mut banner = [0u8; BANNER_LEN]; + banner.copy_from_slice(&peek[..BANNER_LEN]); reader.consume(BANNER_LEN); - reader.read_to_end(&mut banner_prefixed)?; - let decoder = BenDecoder::new(io::Cursor::new(banner_prefixed))?; + let decoder = BenDecoder::new(io::Cursor::new(banner).chain(reader))?; for record in decoder { let (assignment, count) = record?; self.write_assignment(assignment.clone())?; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index c4508e3..2beda97 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -9,7 +9,7 @@ use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; -use std::io::{self, Error, Read, Write}; +use std::io::{self, Cursor, Error, Read, Write}; /// Convert a sparse permutation map into a dense index vector. /// @@ -136,14 +136,13 @@ fn convert_ben_file_impl( target_variant: BenVariant, max_samples: Option, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let _input_variant = detect_ben_variant(&check_buffer)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, writer, target_variant, max_samples, @@ -341,10 +340,9 @@ fn relabel_ben_file_impl( relabel_ben_lines_impl(&mut reader, &mut writer, variant, max_samples)? } BenVariant::TwoDelta => { - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, &mut writer, variant, max_samples, @@ -565,10 +563,9 @@ fn relabel_ben_file_with_map_impl( } BenVariant::TwoDelta => { let permutation = dense_permutation(&new_to_old_node_map)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, &mut writer, variant, max_samples, @@ -586,14 +583,13 @@ pub fn relabel_ben_file_as_variant( writer: W, target_variant: BenVariant, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let _input_variant = detect_ben_variant(&check_buffer)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, writer, target_variant, None, @@ -608,14 +604,13 @@ pub fn relabel_ben_file_as_variant_limit( target_variant: BenVariant, max_samples: usize, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let _input_variant = detect_ben_variant(&check_buffer)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, writer, target_variant, Some(max_samples), @@ -630,15 +625,14 @@ pub fn relabel_ben_file_with_map_as_variant( new_to_old_node_map: HashMap, target_variant: BenVariant, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let _input_variant = detect_ben_variant(&check_buffer)?; let permutation = dense_permutation(&new_to_old_node_map)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, writer, target_variant, None, @@ -654,15 +648,14 @@ pub fn relabel_ben_file_with_map_as_variant_limit( target_variant: BenVariant, max_samples: usize, ) -> io::Result<()> { - let mut check_buffer = [0u8; 17]; + let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; let _input_variant = detect_ben_variant(&check_buffer)?; let permutation = dense_permutation(&new_to_old_node_map)?; - let mut full_stream = check_buffer.to_vec(); - reader.read_to_end(&mut full_stream)?; + let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder( - full_stream.as_slice(), + chained, writer, target_variant, Some(max_samples), diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 56a386f..0f894df 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1173,6 +1173,63 @@ fn reben_cli_can_convert_between_ben_variants() { assert_eq!(original_jsonl, converted_jsonl); } +#[test] +fn reben_cli_can_limit_variant_conversion_to_first_n_items() { + let temp = TempDir::new("reben-convert-limit"); + let ben_path = temp.path().join("samples.standard.ben"); + let twodelta_path = temp.path().join("samples.twodelta.ben"); + + let source_jsonl = r#"{"assignment":[4,4,9],"sample":1} +{"assignment":[4,4,9],"sample":2} +{"assignment":[4,9,4],"sample":3} +{"assignment":[9,9,4],"sample":4} +"#; + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(source_jsonl.as_bytes()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + let limited_convert = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "twodelta", + "--convert-only", + "--n-items", + "2", + "--output-file", + twodelta_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&limited_convert); + + let twodelta_bytes = fs::read(&twodelta_path).unwrap(); + assert_eq!(&twodelta_bytes[..17], b"TWODELTA BEN FILE"); + + let mut converted_jsonl = Vec::new(); + decode_ben_to_jsonl( + BufReader::new(fs::File::open(&twodelta_path).unwrap()), + &mut converted_jsonl, + ) + .unwrap(); + + assert_eq!( + String::from_utf8(converted_jsonl).unwrap(), + r#"{"assignment":[4,4,9],"sample":1} +{"assignment":[4,4,9],"sample":2} +"# + ); +} + #[test] fn reben_cli_can_canonicalize_into_a_different_ben_variant() { let temp = TempDir::new("reben-canonicalize-convert"); From ed719805be48252d799e94ce5eef66b61d1522d7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:10:52 -0600 Subject: [PATCH 028/221] update docs --- ben/src/cli/reben.rs | 46 +++++++ ben/src/codec/encode/types.rs | 10 ++ ben/src/io/reader.rs | 89 +++++++++++- ben/src/io/writer.rs | 107 +++++++++++++++ ben/src/json/graph/mod.rs | 246 ++++++++++++++++++++++++++++++++++ ben/src/ops/relabel/mod.rs | 200 ++++++++++++++++++++++++--- 6 files changed, 676 insertions(+), 22 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 5130a33..91818e7 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -337,6 +337,15 @@ pub fn run() { } } +/// Convert a CLI ordering method variant to the library's graph ordering type. +/// +/// # Arguments +/// +/// * `ordering` - The CLI ordering method selected by the user. +/// +/// # Returns +/// +/// Returns the corresponding `GraphOrderingMethod`. fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { match ordering { OrderingMethod::MinimumLinearArrangement => GraphOrderingMethod::MinimumLinearArrangement, @@ -345,6 +354,15 @@ fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { } } +/// Return the kebab-case display name for an ordering method. +/// +/// # Arguments +/// +/// * `ordering` - The CLI ordering method variant. +/// +/// # Returns +/// +/// Returns a static string identifying the ordering method. fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { match ordering { OrderingMethod::MinimumLinearArrangement => "minimum-linear-arrangement", @@ -353,6 +371,15 @@ fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { } } +/// Return the lowercase display name for a BEN variant. +/// +/// # Arguments +/// +/// * `variant` - The BEN variant to name. +/// +/// # Returns +/// +/// Returns a static string identifying the variant. fn ben_variant_name(variant: BenVariant) -> &'static str { match variant { BenVariant::Standard => "standard", @@ -361,6 +388,15 @@ fn ben_variant_name(variant: BenVariant) -> &'static str { } } +/// Convert a CLI BEN variant to the library's `BenVariant` type. +/// +/// # Arguments +/// +/// * `variant` - The CLI BEN variant selected by the user. +/// +/// # Returns +/// +/// Returns the corresponding `BenVariant`. fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { match variant { BenCliVariant::Standard => BenVariant::Standard, @@ -369,6 +405,16 @@ fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { } } +/// Derive a human-readable label from the key or ordering method for file naming. +/// +/// # Arguments +/// +/// * `key` - An optional JSON key used for sorting. +/// * `ordering` - An optional topology-based ordering method. +/// +/// # Returns +/// +/// Returns the label string, or `None` if neither option is provided. fn relabeling_label(key: Option<&str>, ordering: Option<&OrderingMethod>) -> Option { match (key, ordering) { (Some(_), Some(_)) => panic!("Provide either --key or --ordering, not both."), diff --git a/ben/src/codec/encode/types.rs b/ben/src/codec/encode/types.rs index a7dd008..49f4639 100644 --- a/ben/src/codec/encode/types.rs +++ b/ben/src/codec/encode/types.rs @@ -109,6 +109,16 @@ impl PartialEq for Vec { } } +/// Pack a slice of items into a byte vector using a fixed bit width per item. +/// +/// # Arguments +/// +/// * `items` - The values to pack. +/// * `item_bits` - The number of bits used to encode each item. +/// +/// # Returns +/// +/// Returns the payload length in bytes and the packed byte vector. fn pack_fixed_width_items(items: &[u16], item_bits: u8) -> (u32, Vec) { let payload_bits = item_bits as u32 * items.len() as u32; let n_bytes = payload_bits.div_ceil(8); diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index a811a64..86e2931 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -248,6 +248,16 @@ impl BenDecoder { } /// Read and return the next raw BEN frame stored in standard BEN layout. + /// + /// # Arguments + /// + /// * `with_count` - When `true`, read a trailing `u16` repetition count; + /// otherwise the count defaults to `1`. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. fn pop_standard_frame_from_reader(&mut self, with_count: bool) -> Option> { let mut b1 = [0u8; 1]; let max_val_bits = match self.reader.read_exact(&mut b1) { @@ -297,6 +307,11 @@ impl BenDecoder { } /// Read and return the next raw TwoDelta frame from the underlying stream. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next TwoDelta frame, `Some(Err(...))` + /// for a read failure, or `None` at a clean end of stream. fn pop_twodelta_frame_from_reader(&mut self) -> Option> { let pair_a = match self.reader.read_u16::() { Ok(value) => value, @@ -344,6 +359,10 @@ impl BenDecoder { /// Read and return the next stored frame from the underlying BEN stream. /// + /// # Arguments + /// + /// * `&mut self` - The decoder whose internal reader is advanced. + /// /// # Returns /// /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read @@ -416,6 +435,14 @@ fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { } /// Decode the run-length payload of a TwoDelta frame. +/// +/// # Arguments +/// +/// * `frame` - The TwoDelta frame whose packed payload is decoded. +/// +/// # Returns +/// +/// Returns the sequence of non-zero run lengths extracted from the payload. fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { let mut items = Vec::new(); let mut buffer: u32 = 0; @@ -451,7 +478,20 @@ fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { Ok(items) } -/// Decode a raw TwoDelta frame into a full assignment vector. +/// Apply decoded TwoDelta run lengths to produce a new assignment vector. +/// +/// Positions in `previous_assignment` that hold either value of `pair` are +/// overwritten according to the alternating run-length encoding. +/// +/// # Arguments +/// +/// * `previous_assignment` - The assignment from the preceding frame. +/// * `pair` - The two label values that participate in the delta. +/// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. +/// +/// # Returns +/// +/// Returns the updated assignment vector. fn apply_twodelta_runs_to_assignment( previous_assignment: &[u16], pair: (u16, u16), @@ -495,6 +535,15 @@ fn apply_twodelta_runs_to_assignment( } /// Decode a raw TwoDelta frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `previous_assignment` - The assignment from the preceding frame. +/// * `frame` - The TwoDelta frame to decode. +/// +/// # Returns +/// +/// Returns the reconstructed assignment vector. fn decode_twodelta_frame_to_assignment( previous_assignment: &[u16], frame: &TwoDeltaFrame, @@ -503,6 +552,17 @@ fn decode_twodelta_frame_to_assignment( apply_twodelta_runs_to_assignment(previous_assignment, frame.pair(), &run_lengths) } +/// Decode a stored BEN frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `previous_assignment` - The assignment from the preceding frame, required +/// for TwoDelta frames. +/// * `frame` - The stored frame to decode. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. fn decode_stored_frame_to_assignment( previous_assignment: Option<&[u16]>, frame: &StoredBenFrame, @@ -612,6 +672,16 @@ pub struct XBenDecoder { } impl XBenDecoder { + /// Create an XBEN decoder from an already-opened decompressed stream. + /// + /// # Arguments + /// + /// * `xz` - A buffered XZ decompression reader positioned past the banner. + /// * `variant` - The BEN variant indicated by the banner. + /// + /// # Returns + /// + /// Returns a new decoder ready to yield frames from the stream. pub(crate) fn from_decompressed_stream( xz: BufReader>, variant: BenVariant, @@ -653,6 +723,9 @@ impl XBenDecoder { /// Try to extract one complete ben32 frame from the buffered overflow. /// + /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 + /// frame and, for MkvChain streams, reads the trailing repetition count. + /// /// # Arguments /// /// * `overflow` - Buffered decompressed bytes that may contain one or more @@ -697,6 +770,20 @@ impl XBenDecoder { } } + /// Try to extract one complete TwoDelta frame from the buffered overflow. + /// + /// Inspects the leading tag byte to determine whether the frame is a full + /// RLE frame or a delta frame, then reads the corresponding payload. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain a complete + /// TwoDelta frame. + /// + /// # Returns + /// + /// Returns the parsed frame, the number of consumed bytes, and the + /// repetition count when a complete frame is available. fn pop_twodelta_frame_from_overflow( &self, overflow: &[u8], diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 9e9df09..dbb0174 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -35,6 +35,16 @@ struct AssignmentHints { delta_pair: Option<(u16, u16)>, } +/// Check whether two assignment vectors are identical element-by-element. +/// +/// # Arguments +/// +/// * `previous_sample` - The previous assignment vector. +/// * `assign_vec` - The current assignment vector. +/// +/// # Returns +/// +/// Returns `true` if both vectors have the same length and every element matches. fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { return false; @@ -49,6 +59,20 @@ fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { true } +/// Analyze the transition between two assignment vectors for two-delta encoding. +/// +/// Determines whether the assignments are identical (repeated) or differ by +/// exactly one swapped pair of values, which qualifies for delta encoding. +/// +/// # Arguments +/// +/// * `previous_sample` - The previous assignment vector. +/// * `assign_vec` - The current assignment vector. +/// +/// # Returns +/// +/// Returns an `AssignmentHints` with `is_repeated` set if the vectors match, +/// or `delta_pair` set if all differences involve exactly two values. fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> AssignmentHints { if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { return AssignmentHints::default(); @@ -97,6 +121,16 @@ fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> A } } +/// Extract and validate the `assignment` array from a JSON object. +/// +/// # Arguments +/// +/// * `data` - A JSON value expected to contain an `assignment` array of integers. +/// +/// # Returns +/// +/// Returns a `Vec` of assignment values, or an error if the field is +/// missing, not an array, or contains values that do not fit in a `u16`. fn parse_json_assignment(data: Value) -> Result> { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( @@ -128,6 +162,18 @@ fn parse_json_assignment(data: Value) -> Result> { .collect() } +/// Encode an assignment vector as a full XBEN two-delta frame. +/// +/// The frame begins with a full-frame tag byte followed by RLE-encoded +/// assignment runs in big-endian format. +/// +/// # Arguments +/// +/// * `assignments` - The full assignment vector to encode. +/// +/// # Returns +/// +/// Returns the encoded frame as a byte vector. fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { let runs = assign_to_rle(assignments); let mut bytes = Vec::with_capacity(1 + 4 + runs.len() * 4); @@ -140,6 +186,21 @@ fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { bytes } +/// Encode the difference between two assignments as an XBEN two-delta delta frame. +/// +/// The frame begins with a delta tag byte, the swapped value pair, and then +/// run-length encoded flip positions in big-endian format. +/// +/// # Arguments +/// +/// * `previous_assignment` - The previous assignment vector. +/// * `new_assignment` - The current assignment vector. +/// * `delta_pair` - An optional pre-computed pair of swapped values. +/// * `masks` - An optional index map from value to positions in the previous assignment. +/// +/// # Returns +/// +/// Returns the encoded delta frame as a byte vector, or an error if encoding fails. fn encode_xben_twodelta_delta_frame( previous_assignment: &[u16], new_assignment: &[u16], @@ -195,6 +256,7 @@ impl BenEncoder { } } + /// Rebuild the value-to-position index map from the current previous sample. fn rebuild_previous_masks(&mut self) { self.previous_masks.clear(); for (idx, &assignment) in self.previous_sample.iter().enumerate() { @@ -202,6 +264,13 @@ impl BenEncoder { } } + /// Store a new previous sample along with its encoded frame and repetition count. + /// + /// # Arguments + /// + /// * `sample` - The assignment vector to cache. + /// * `encoded` - The already-encoded frame for this assignment. + /// * `sample_count` - The initial repetition count for this sample. fn set_previous_sample( &mut self, sample: Vec, @@ -214,6 +283,20 @@ impl BenEncoder { self.sample_count = sample_count; } + /// Encode and write an assignment vector using pre-computed transition hints. + /// + /// The encoding strategy depends on the configured `BenVariant`. Repeated + /// assignments may be deduplicated or counted, and two-delta hints enable + /// compact delta frames when applicable. + /// + /// # Arguments + /// + /// * `assign_vec` - The assignment vector to encode. + /// * `hints` - Pre-computed hints about repetition and delta-pair eligibility. + /// + /// # Returns + /// + /// Returns `Ok(())` after the assignment has been queued or written. fn write_assignment_with_hints( &mut self, assign_vec: Vec, @@ -282,6 +365,14 @@ impl BenEncoder { } } + /// Flush the buffered frame and its repetition count to the underlying writer. + /// + /// For MkvChain and TwoDelta variants, the repetition count is appended + /// after the encoded frame. This is a no-op when no samples are pending. + /// + /// # Returns + /// + /// Returns `Ok(())` once the pending frame has been written. fn flush_pending_frame(&mut self) -> Result<()> { if self.sample_count == 0 { return Ok(()); @@ -371,6 +462,7 @@ pub struct XBenEncoder { } impl XBenEncoder { + /// Rebuild the value-to-position index map from the current previous assignment. fn rebuild_previous_masks(&mut self) { self.previous_masks.clear(); for (idx, &assignment) in self.previous_assignment.iter().enumerate() { @@ -378,6 +470,13 @@ impl XBenEncoder { } } + /// Store a new previous assignment along with its encoded frame and repetition count. + /// + /// # Arguments + /// + /// * `assignment` - The assignment vector to cache. + /// * `frame` - The already-encoded frame bytes for this assignment. + /// * `count` - The initial repetition count for this assignment. fn set_previous_assignment(&mut self, assignment: Vec, frame: Vec, count: u16) { self.previous_assignment = assignment; self.rebuild_previous_masks(); @@ -385,6 +484,14 @@ impl XBenEncoder { self.count = count; } + /// Flush the buffered frame and its repetition count to the XZ encoder. + /// + /// For MkvChain and TwoDelta variants, the repetition count is appended + /// after the encoded frame. This is a no-op when no samples are pending. + /// + /// # Returns + /// + /// Returns `Ok(())` once the pending frame has been written. fn flush_pending_frame(&mut self) -> Result<()> { if self.count == 0 { return Ok(()); diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 4c80c98..3ea9857 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -28,6 +28,15 @@ struct GraphJson { } impl GraphJson { + /// Deserialize a NetworkX node-link JSON graph from a reader. + /// + /// # Arguments + /// + /// * `reader` - A source implementing `Read` that provides the JSON data. + /// + /// # Returns + /// + /// Returns a parsed `GraphJson` with precomputed node ids and adjacency indices. fn from_reader(reader: R) -> io::Result { let data: Value = serde_json::from_reader(reader)?; let nodes = data["nodes"].as_array().cloned().unwrap_or_default(); @@ -133,6 +142,15 @@ pub fn sort_json_file_by_ordering( reorder_graph(graph, order, writer) } +/// Extract the `id` field from a node JSON value as a `usize`. +/// +/// # Arguments +/// +/// * `node` - A JSON value representing a graph node. +/// +/// # Returns +/// +/// Returns the node id, or an error if the field is missing or not an unsigned integer. fn parse_node_id(node: &Value) -> io::Result { node["id"].as_u64().map(|v| v as usize).ok_or_else(|| { io::Error::new( @@ -142,6 +160,15 @@ fn parse_node_id(node: &Value) -> io::Result { }) } +/// Extract the `id` field from an adjacency link JSON value as a `usize`. +/// +/// # Arguments +/// +/// * `link` - A JSON value representing an adjacency link (edge target). +/// +/// # Returns +/// +/// Returns the target node id, or an error if the field is missing or not an unsigned integer. fn parse_link_id(link: &Value) -> io::Result { link["id"].as_u64().map(|v| v as usize).ok_or_else(|| { io::Error::new( @@ -151,6 +178,17 @@ fn parse_link_id(link: &Value) -> io::Result { }) } +/// Compare two nodes by a named attribute, using numeric ordering when possible. +/// +/// # Arguments +/// +/// * `a` - The first node JSON value. +/// * `b` - The second node JSON value. +/// * `key` - The attribute name to compare. +/// +/// # Returns +/// +/// Returns the ordering between the two nodes based on the attribute value. fn compare_node_key(a: &Value, b: &Value, key: &str) -> Ordering { let extract_value = |val: &Value| -> StdResult { match &val[key] { @@ -168,6 +206,17 @@ fn compare_node_key(a: &Value, b: &Value, key: &str) -> Ordering { } } +/// Apply a permutation to a graph and write the relabeled JSON to a writer. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph to reorder. +/// * `order` - A permutation where `order[new_index]` gives the old index. +/// * `writer` - The destination for the reordered JSON output. +/// +/// # Returns +/// +/// Returns a map from original node id to new node id. fn reorder_graph( mut graph: GraphJson, order: Vec, @@ -212,6 +261,16 @@ fn reorder_graph( Ok(old_id_to_new) } +/// Find connected components of a graph using breadth-first search. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph to decompose. +/// +/// # Returns +/// +/// Returns a list of components, each a vector of node indices, sorted by +/// smallest original node id. fn connected_components(graph: &GraphJson) -> Vec> { let n = graph.nodes.len(); let mut seen = vec![false; n]; @@ -242,6 +301,15 @@ fn connected_components(graph: &GraphJson) -> Vec> { components } +/// Compute a Reverse Cuthill-McKee ordering for the entire graph. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph to order. +/// +/// # Returns +/// +/// Returns a permutation of node indices that reduces bandwidth. fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { let mut order = Vec::with_capacity(graph.nodes.len()); @@ -252,6 +320,17 @@ fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { order } +/// Compute a Reverse Cuthill-McKee ordering for a single connected component. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph. +/// * `component` - The node indices belonging to the component. +/// +/// # Returns +/// +/// Returns a reversed BFS ordering of the component starting from the +/// minimum-degree node. fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Vec { let degrees = graph .adjacency_indices @@ -291,6 +370,16 @@ fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Ve component_order } +/// Compute a minimum-linear-arrangement heuristic ordering for the entire graph. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph to order. +/// +/// # Returns +/// +/// Returns a permutation of node indices that heuristically minimizes total +/// edge span. fn minimum_linear_arrangement_order(graph: &GraphJson) -> Vec { let mut order = Vec::with_capacity(graph.nodes.len()); @@ -301,10 +390,31 @@ fn minimum_linear_arrangement_order(graph: &GraphJson) -> Vec { order } +/// Compute a multilevel cluster ordering for the entire graph. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph to order. +/// +/// # Returns +/// +/// Returns a permutation of node indices produced by recursive multilevel +/// clustering. fn multi_level_cluster_order(graph: &GraphJson) -> Vec { multilevel_cluster_order_generic(&graph.adjacency_indices, &graph.node_ids) } +/// Compute a minimum-linear-arrangement heuristic ordering for a single component. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph. +/// * `component` - The node indices belonging to the component. +/// +/// # Returns +/// +/// Returns an ordering that heuristically minimizes total edge span within the +/// component, refined by iterated barycenter sorting and adjacent swaps. fn minimum_linear_arrangement_component(graph: &GraphJson, component: &[usize]) -> Vec { if component.len() <= 2 { return component.to_vec(); @@ -329,6 +439,16 @@ fn minimum_linear_arrangement_component(graph: &GraphJson, component: &[usize]) order } +/// Build a boolean mask indicating membership in a subset of nodes. +/// +/// # Arguments +/// +/// * `size` - The total number of nodes (length of the returned vector). +/// * `nodes` - The node indices that belong to the subset. +/// +/// # Returns +/// +/// Returns a boolean vector where `true` marks nodes present in the subset. fn subset_mask(size: usize, nodes: &[usize]) -> Vec { let mut mask = vec![false; size]; for &node in nodes { @@ -337,6 +457,17 @@ fn subset_mask(size: usize, nodes: &[usize]) -> Vec { mask } +/// Find connected components of a generic adjacency list using breadth-first search. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `labels` - Node labels used to sort components by smallest label. +/// +/// # Returns +/// +/// Returns a list of components, each a vector of node indices, sorted by +/// minimum label. fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> Vec> { let mut seen = vec![false; adjacency.len()]; let mut components = Vec::new(); @@ -372,6 +503,18 @@ fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> V components } +/// Compute a Reverse Cuthill-McKee ordering for a component of a generic graph. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `labels` - Node labels used for tie-breaking. +/// * `component` - The node indices belonging to the component. +/// +/// # Returns +/// +/// Returns a reversed BFS ordering of the component starting from the +/// minimum-degree node. fn rcm_component_generic( adjacency: &[Vec], labels: &[usize], @@ -408,6 +551,17 @@ fn rcm_component_generic( order } +/// Compute a multilevel cluster ordering for a generic graph. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `labels` - Node labels used for tie-breaking and component sorting. +/// +/// # Returns +/// +/// Returns a permutation of node indices produced by recursive multilevel +/// clustering across all connected components. fn multilevel_cluster_order_generic(adjacency: &[Vec], labels: &[usize]) -> Vec { let mut order = Vec::with_capacity(adjacency.len()); for component in connected_components_generic(adjacency, labels) { @@ -418,6 +572,18 @@ fn multilevel_cluster_order_generic(adjacency: &[Vec], labels: &[usize]) order } +/// Compute a multilevel cluster ordering for a single component of a generic graph. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `labels` - Node labels used for tie-breaking. +/// * `component` - The node indices belonging to the component. +/// +/// # Returns +/// +/// Returns an ordering that recursively partitions the component into clusters, +/// orders each cluster with RCM, builds a coarse graph of clusters, and recurses. fn multilevel_cluster_component_generic( adjacency: &[Vec], labels: &[usize], @@ -446,6 +612,18 @@ fn multilevel_cluster_component_generic( order } +/// Partition a component into small clusters using a greedy seed-expansion strategy. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `labels` - Node labels used for tie-breaking. +/// * `component` - The node indices to partition. +/// * `max_cluster_size` - The maximum number of nodes per cluster. +/// +/// # Returns +/// +/// Returns a list of clusters, each a vector of node indices. fn greedy_cluster_partition( adjacency: &[Vec], labels: &[usize], @@ -512,6 +690,18 @@ fn greedy_cluster_partition( clusters } +/// Compute the degree of each node restricted to a subset of the graph. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list for each node. +/// * `subset_mask` - A boolean mask indicating which nodes belong to the subset. +/// * `subset` - The node indices in the subset. +/// +/// # Returns +/// +/// Returns a vector indexed by node where each entry is the number of neighbors +/// within the subset. fn local_degree_in_subset( adjacency: &[Vec], subset_mask: &[bool], @@ -527,6 +717,18 @@ fn local_degree_in_subset( local_degree } +/// Build a coarse graph where each cluster is contracted into a single node. +/// +/// # Arguments +/// +/// * `adjacency` - The adjacency list of the original graph. +/// * `labels` - Node labels of the original graph. +/// * `clusters` - The cluster partition of the original graph. +/// +/// # Returns +/// +/// Returns a tuple of the coarse adjacency list and coarse labels, where each +/// coarse label is the minimum original label in the cluster. fn build_coarse_graph( adjacency: &[Vec], labels: &[usize], @@ -573,6 +775,17 @@ fn build_coarse_graph( (coarse_adjacency, coarse_labels) } +/// Invert a permutation to get the position of each node in the ordering. +/// +/// # Arguments +/// +/// * `size` - The total number of nodes (length of the returned vector). +/// * `order` - A permutation where `order[position]` gives the node index. +/// +/// # Returns +/// +/// Returns a vector indexed by node where each entry is the node's position in +/// the ordering. fn positions_for_order(size: usize, order: &[usize]) -> Vec { let mut positions = vec![usize::MAX; size]; for (idx, &node) in order.iter().enumerate() { @@ -581,6 +794,19 @@ fn positions_for_order(size: usize, order: &[usize]) -> Vec { positions } +/// Compute the barycenter score of a node as the mean position of its neighbors. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph. +/// * `node` - The node index to score. +/// * `positions` - The current position of each node in the ordering. +/// * `component_mask` - A boolean mask restricting which neighbors to consider. +/// +/// # Returns +/// +/// Returns the average position of the node's neighbors within the component, +/// or the node's own position if it has no neighbors in the component. fn barycenter_score( graph: &GraphJson, node: usize, @@ -603,6 +829,13 @@ fn barycenter_score( } } +/// Improve an ordering by repeatedly swapping adjacent pairs that reduce total edge span. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph. +/// * `order` - The current ordering, modified in place. +/// * `component_mask` - A boolean mask restricting which neighbors to consider. fn local_adjacent_improvement(graph: &GraphJson, order: &mut [usize], component_mask: &[bool]) { if order.len() < 2 { return; @@ -634,6 +867,19 @@ fn local_adjacent_improvement(graph: &GraphJson, order: &mut [usize], component_ } } +/// Compute the total edge span cost for a single node in the current ordering. +/// +/// # Arguments +/// +/// * `graph` - The parsed graph. +/// * `node` - The node index to evaluate. +/// * `positions` - The current position of each node in the ordering. +/// * `component_mask` - A boolean mask restricting which neighbors to consider. +/// +/// # Returns +/// +/// Returns the sum of absolute position differences between the node and each of +/// its neighbors within the component. fn node_span_cost( graph: &GraphJson, node: usize, diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 2beda97..26b28be 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -40,6 +40,16 @@ fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result< Ok(permutation) } +/// Canonicalize an assignment vector by remapping labels in first-seen order. +/// +/// # Arguments +/// +/// * `assignment` - The original assignment slice whose labels should be remapped. +/// +/// # Returns +/// +/// Returns a new vector with labels replaced by sequential integers starting at 1, +/// assigned in the order they first appear. fn canonicalize_assignment(assignment: &[u16]) -> Vec { let mut label_map = HashMap::new(); let mut next_label = 0u16; @@ -60,6 +70,17 @@ fn canonicalize_assignment(assignment: &[u16]) -> Vec { out } +/// Reorder an assignment vector according to a dense permutation. +/// +/// # Arguments +/// +/// * `assignment` - The original assignment slice to permute. +/// * `permutation` - A dense permutation vector where `permutation[new_idx] == old_idx`. +/// +/// # Returns +/// +/// Returns a new vector with elements rearranged so that `out[new_idx] == assignment[old_idx]`, +/// or an error if the lengths do not match. fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result> { if assignment.len() != permutation.len() { return Err(io::Error::new( @@ -79,6 +100,19 @@ fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result( reader: R, writer: W, @@ -118,6 +152,15 @@ where Ok(()) } +/// Determine the BEN variant from a 17-byte file banner. +/// +/// # Arguments +/// +/// * `header` - The 17-byte banner read from the start of a BEN file. +/// +/// # Returns +/// +/// Returns the detected `BenVariant`, or an error if the banner is not recognized. fn detect_ben_variant(header: &[u8; 17]) -> io::Result { match header { b"STANDARD BEN FILE" => Ok(BenVariant::Standard), @@ -130,6 +173,18 @@ fn detect_ben_variant(header: &[u8; 17]) -> io::Result { } } +/// Shared implementation for converting a BEN file into a different variant without relabeling. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the converted BEN output. +/// * `target_variant` - The BEN variant to encode into. +/// * `max_samples` - Optional upper bound on the number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after all (or up to `max_samples`) samples have been converted. fn convert_ben_file_impl( mut reader: R, writer: W, @@ -141,16 +196,22 @@ fn convert_ben_file_impl( let _input_variant = detect_ben_variant(&check_buffer)?; let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - writer, - target_variant, - max_samples, - |assignment| Ok(assignment.to_vec()), - ) + relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, |assignment| { + Ok(assignment.to_vec()) + }) } /// Rewrite a BEN file into the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the converted BEN output. +/// * `target_variant` - The BEN variant to encode into. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full BEN file has been converted. pub fn convert_ben_file( reader: R, writer: W, @@ -160,6 +221,17 @@ pub fn convert_ben_file( } /// Rewrite at most `max_samples` expanded samples into the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the converted BEN output. +/// * `target_variant` - The BEN variant to encode into. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been converted. pub fn convert_ben_file_limit( reader: R, writer: W, @@ -219,6 +291,18 @@ pub fn relabel_ben_lines_limit( } /// Shared implementation for canonical BEN relabeling. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// * `max_samples` - Optional upper bound on the number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. fn relabel_ben_lines_impl( mut reader: R, mut writer: W, @@ -323,6 +407,16 @@ pub fn relabel_ben_file_limit( } /// Shared implementation for BEN-file canonical relabeling. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// * `max_samples` - Optional upper bound on the number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. fn relabel_ben_file_impl( mut reader: R, mut writer: W, @@ -414,6 +508,20 @@ pub fn relabel_ben_lines_with_map_limit( } /// Shared implementation for mapped BEN relabeling. +/// +/// # Arguments +/// +/// * `reader` - The BEN input stream without its 17-byte file banner. +/// * `writer` - The destination for the relabeled BEN frames. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `variant` - The BEN variant, used to determine whether repetition counts +/// follow each frame. +/// * `max_samples` - Optional upper bound on the number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. fn relabel_ben_lines_with_map_impl( mut reader: R, mut writer: W, @@ -538,6 +646,18 @@ pub fn relabel_ben_file_with_map_limit( } /// Shared implementation for BEN-file mapped relabeling. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN file. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `max_samples` - Optional upper bound on the number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. fn relabel_ben_file_with_map_impl( mut reader: R, mut writer: W, @@ -578,6 +698,16 @@ fn relabel_ben_file_with_map_impl( } /// Canonicalize BEN assignments and write them using the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN output. +/// * `target_variant` - The BEN variant to encode into. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full BEN file has been relabeled and converted. pub fn relabel_ben_file_as_variant( mut reader: R, writer: W, @@ -588,16 +718,23 @@ pub fn relabel_ben_file_as_variant( let _input_variant = detect_ben_variant(&check_buffer)?; let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - writer, - target_variant, - None, - |assignment| Ok(canonicalize_assignment(assignment)), - ) + relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { + Ok(canonicalize_assignment(assignment)) + }) } /// Canonicalize up to `max_samples` expanded samples and write the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN output. +/// * `target_variant` - The BEN variant to encode into. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and converted. pub fn relabel_ben_file_as_variant_limit( mut reader: R, writer: W, @@ -619,6 +756,18 @@ pub fn relabel_ben_file_as_variant_limit( } /// Relabel a BEN file with a supplied node map and write the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN output. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `target_variant` - The BEN variant to encode into. +/// +/// # Returns +/// +/// Returns `Ok(())` after the full BEN file has been relabeled and converted. pub fn relabel_ben_file_with_map_as_variant( mut reader: R, writer: W, @@ -631,16 +780,25 @@ pub fn relabel_ben_file_with_map_as_variant( let permutation = dense_permutation(&new_to_old_node_map)?; let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - writer, - target_variant, - None, - |assignment| permute_assignment(assignment, &permutation), - ) + relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { + permute_assignment(assignment, &permutation) + }) } /// Relabel up to `max_samples` expanded samples with a supplied node map and write the requested BEN variant. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including its banner. +/// * `writer` - The destination for the relabeled BEN output. +/// * `new_to_old_node_map` - The permutation describing how node positions +/// should be reordered. +/// * `target_variant` - The BEN variant to encode into. +/// * `max_samples` - The maximum number of expanded samples to write. +/// +/// # Returns +/// +/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and converted. pub fn relabel_ben_file_with_map_as_variant_limit( mut reader: R, writer: W, From 8dc1ce92496137cff0e76d2aa4bbf86bfd6e3eca Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:15:59 -0600 Subject: [PATCH 029/221] fix labelling issue --- ben/src/io/reader.rs | 38 +++++++++++++++++++++----------------- ben/src/ops/relabel/mod.rs | 4 ++-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 86e2931..b4c9ca9 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -124,6 +124,7 @@ pub struct BenDecoder { variant: BenVariant, previous_assignment: Option>, twodelta_consumed_first_frame: bool, + silent: bool, } #[derive(Clone)] @@ -189,31 +190,32 @@ impl BenDecoder { } match variant_from_banner(&check_buffer) { - Some(BenVariant::Standard) => Ok(BenDecoder { + Some(variant) => Ok(BenDecoder { reader, sample_count: 0, - variant: BenVariant::Standard, - previous_assignment: None, - twodelta_consumed_first_frame: false, - }), - Some(BenVariant::MkvChain) => Ok(BenDecoder { - reader, - sample_count: 0, - variant: BenVariant::MkvChain, - previous_assignment: None, - twodelta_consumed_first_frame: false, - }), - Some(BenVariant::TwoDelta) => Ok(BenDecoder { - reader, - sample_count: 0, - variant: BenVariant::TwoDelta, + variant, previous_assignment: None, twodelta_consumed_first_frame: false, + silent: false, }), None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), } } + /// Suppress progress output from this decoder's iterator. + /// + /// # Arguments + /// + /// * `silent` - When `true`, the decoder will not emit progress messages. + /// + /// # Returns + /// + /// Returns `self` for method chaining. + pub fn silent(mut self, silent: bool) -> Self { + self.silent = silent; + self + } + /// Decode the remaining BEN stream and write it as JSONL. /// /// # Arguments @@ -599,7 +601,9 @@ impl Iterator for BenDecoder { let count = frame.count(); self.previous_assignment = Some(assignment.clone()); self.sample_count += count as usize; - progress!("Decoding sample: {}\r", self.sample_count); + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } Some(Ok((assignment, count))) } } diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 26b28be..45dfa90 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -123,7 +123,7 @@ fn relabel_ben_file_via_decoder( where F: FnMut(&[u16]) -> io::Result>, { - let decoder = BenDecoder::new(reader)?; + let decoder = BenDecoder::new(reader)?.silent(true); let mut encoder = BenEncoder::new(writer, variant); let mut sample_number = 0usize; @@ -143,7 +143,7 @@ where } sample_number += out_count; - progress!("Relabeling line: {}\r", sample_number); + progress!("Relabelling line: {}\r", sample_number); } tracing::trace!(""); From 3967b9a1b1e88a277a83cfdc68b925f8f304704b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:24:41 -0600 Subject: [PATCH 030/221] Speed up conversion --- ben/src/io/writer.rs | 88 +++++++++++++++++++++++++++++++++++++- ben/src/ops/relabel/mod.rs | 28 ++++++------ 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index dbb0174..c131f8c 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -359,7 +359,16 @@ impl BenEncoder { Some(&self.previous_masks), )?; self.flush_pending_frame()?; - self.set_previous_sample(assign_vec, BufferedBenFrame::TwoDelta(encoded), 1); + + if let Some(pair) = hints.delta_pair { + self.update_masks_for_delta(&assign_vec, pair); + self.previous_sample = assign_vec; + } else { + self.previous_sample = assign_vec; + self.rebuild_previous_masks(); + } + self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); + self.sample_count = 1; Ok(()) } } @@ -391,6 +400,83 @@ impl BenEncoder { Ok(()) } + /// Record additional repetitions of the most recently written assignment. + /// + /// For MkvChain and TwoDelta variants the repetition count is incremented + /// directly. For Standard, the cached encoded frame is re-emitted once per + /// additional repeat. + /// + /// # Arguments + /// + /// * `additional` - The number of extra copies beyond the one already written. + /// + /// # Returns + /// + /// Returns `Ok(())` after all additional repeats have been recorded. + pub fn repeat_previous(&mut self, additional: u16) -> Result<()> { + match self.variant { + BenVariant::Standard => { + if let Some(encoded) = self.previous_encoded_sample.as_ref() { + for _ in 0..additional { + self.writer.write_all(encoded.as_slice())?; + } + } + } + BenVariant::MkvChain | BenVariant::TwoDelta => { + self.sample_count += additional; + } + } + Ok(()) + } + + /// Update the value-to-position masks incrementally for a two-delta transition. + /// + /// Instead of rebuilding the entire mask HashMap, only the positions belonging + /// to the two swapped values are repartitioned. This is O(pair_positions) + /// rather than O(assignment_length). + /// + /// # Arguments + /// + /// * `new_sample` - The new assignment vector after the transition. + /// * `pair` - The two values involved in the delta swap. + fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { + if pair.0 == pair.1 { + return; + } + + let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); + let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); + + let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); + let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); + + let (mut i, mut j) = (0, 0); + while i < pos_a.len() || j < pos_b.len() { + let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { + let p = pos_a[i]; + i += 1; + p + } else { + let p = pos_b[j]; + j += 1; + p + }; + + if new_sample[pos] == pair.0 { + new_a.push(pos); + } else { + new_b.push(pos); + } + } + + if !new_a.is_empty() { + self.previous_masks.insert(pair.0, new_a); + } + if !new_b.is_empty() { + self.previous_masks.insert(pair.1, new_b); + } + } + /// Encode and write a full assignment vector. /// /// # Arguments diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 45dfa90..1d566b9 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -108,7 +108,8 @@ fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result( mut transform: F, ) -> io::Result<()> where - F: FnMut(&[u16]) -> io::Result>, + F: FnMut(Vec) -> io::Result>, { let decoder = BenDecoder::new(reader)?.silent(true); let mut encoder = BenEncoder::new(writer, variant); @@ -133,13 +134,14 @@ where break; } - let relabeled = transform(&assignment)?; + let relabeled = transform(assignment)?; let out_count = max_samples .map(|limit| (limit - sample_number).min(count as usize)) .unwrap_or(count as usize); - for _ in 0..out_count { - encoder.write_assignment(relabeled.clone())?; + encoder.write_assignment(relabeled)?; + if out_count > 1 { + encoder.repeat_previous((out_count - 1) as u16)?; } sample_number += out_count; @@ -196,9 +198,7 @@ fn convert_ben_file_impl( let _input_variant = detect_ben_variant(&check_buffer)?; let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, |assignment| { - Ok(assignment.to_vec()) - }) + relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, Ok) } /// Rewrite a BEN file into the requested BEN variant. @@ -440,7 +440,7 @@ fn relabel_ben_file_impl( &mut writer, variant, max_samples, - |assignment| Ok(canonicalize_assignment(assignment)), + |assignment| Ok(canonicalize_assignment(&assignment)), )? } } @@ -689,7 +689,7 @@ fn relabel_ben_file_with_map_impl( &mut writer, variant, max_samples, - |assignment| permute_assignment(assignment, &permutation), + |assignment| permute_assignment(&assignment, &permutation), )? } } @@ -719,7 +719,7 @@ pub fn relabel_ben_file_as_variant( let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - Ok(canonicalize_assignment(assignment)) + Ok(canonicalize_assignment(&assignment)) }) } @@ -751,7 +751,7 @@ pub fn relabel_ben_file_as_variant_limit( writer, target_variant, Some(max_samples), - |assignment| Ok(canonicalize_assignment(assignment)), + |assignment| Ok(canonicalize_assignment(&assignment)), ) } @@ -781,7 +781,7 @@ pub fn relabel_ben_file_with_map_as_variant( let permutation = dense_permutation(&new_to_old_node_map)?; let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - permute_assignment(assignment, &permutation) + permute_assignment(&assignment, &permutation) }) } @@ -817,7 +817,7 @@ pub fn relabel_ben_file_with_map_as_variant_limit( writer, target_variant, Some(max_samples), - |assignment| permute_assignment(assignment, &permutation), + |assignment| permute_assignment(&assignment, &permutation), ) } From d1a85d06b93fea6cb8cf51356f238fd23b64262f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:38:00 -0600 Subject: [PATCH 031/221] optimize the twodelta case --- ben/src/io/reader.rs | 18 +++--- ben/src/io/writer.rs | 128 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 132 insertions(+), 14 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index b4c9ca9..3ea5da1 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -495,16 +495,15 @@ fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { /// /// Returns the updated assignment vector. fn apply_twodelta_runs_to_assignment( - previous_assignment: &[u16], + mut assignment: Vec, pair: (u16, u16), run_lengths: &[u16], ) -> io::Result> { let mut pair_positions = Vec::new(); - pair_positions.reserve(previous_assignment.len()); let (first, second) = pair; - for (idx, &assignment) in previous_assignment.iter().enumerate() { - if assignment == first || assignment == second { + for (idx, &val) in assignment.iter().enumerate() { + if val == first || val == second { pair_positions.push(idx); } } @@ -517,7 +516,6 @@ fn apply_twodelta_runs_to_assignment( )); } - let mut assignment = previous_assignment.to_vec(); let mut write_idx = 0usize; let mut current_value = first; @@ -547,7 +545,7 @@ fn apply_twodelta_runs_to_assignment( /// /// Returns the reconstructed assignment vector. fn decode_twodelta_frame_to_assignment( - previous_assignment: &[u16], + previous_assignment: Vec, frame: &TwoDeltaFrame, ) -> io::Result> { let run_lengths = decode_twodelta_run_lengths(frame)?; @@ -566,13 +564,13 @@ fn decode_twodelta_frame_to_assignment( /// /// Returns the expanded assignment vector. fn decode_stored_frame_to_assignment( - previous_assignment: Option<&[u16]>, + previous_assignment: &mut Option>, frame: &StoredBenFrame, ) -> io::Result> { match frame { StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( - previous_assignment.ok_or_else(|| { + previous_assignment.take().ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidData, "TwoDelta frame encountered before an initial BEN frame", @@ -594,7 +592,7 @@ impl Iterator for BenDecoder { None => return None, }; let assignment = - match decode_stored_frame_to_assignment(self.previous_assignment.as_deref(), &frame) { + match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { Ok(assgn) => assgn, Err(e) => return Some(Err(e)), }; @@ -937,7 +935,7 @@ impl Iterator for XBenDecoder { let assignment = match frame { XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.as_deref() { + match self.previous_assignment.take() { Some(previous_assignment) => { apply_twodelta_runs_to_assignment( previous_assignment, diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index c131f8c..47c27d5 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -64,20 +64,73 @@ fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { /// Determines whether the assignments are identical (repeated) or differ by /// exactly one swapped pair of values, which qualifies for delta encoding. /// +/// When `masks` are available the pair is detected in O(K) where K is the +/// number of unique label values, by checking each label's mask positions for +/// changes rather than scanning the full assignment array. +/// /// # Arguments /// /// * `previous_sample` - The previous assignment vector. /// * `assign_vec` - The current assignment vector. +/// * `masks` - An optional index map from each label value to its sorted +/// positions in the previous assignment. /// /// # Returns /// /// Returns an `AssignmentHints` with `is_repeated` set if the vectors match, /// or `delta_pair` set if all differences involve exactly two values. -fn analyze_twodelta_transition(previous_sample: &[u16], assign_vec: &[u16]) -> AssignmentHints { +fn analyze_twodelta_transition( + previous_sample: &[u16], + assign_vec: &[u16], + masks: Option<&HashMap>>, +) -> AssignmentHints { if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { return AssignmentHints::default(); } + // Fast path: use masks to find the pair in O(K) instead of O(N). + if let Some(masks) = masks { + if previous_sample == assign_vec { + return AssignmentHints { + is_repeated: true, + delta_pair: None, + }; + } + + // Check each label's mask positions. Only labels involved in the swap + // will have any changed positions; all others short-circuit immediately. + let mut pair: Option<(u16, u16)> = None; + for (&label, positions) in masks { + for &pos in positions { + if assign_vec[pos] != label { + let other = assign_vec[pos]; + match pair { + None => { + pair = Some((label, other)); + break; + } + Some((a, b)) => { + if (label == a || label == b) && (other == a || other == b) { + break; + } + // More than two values involved. + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + } + } + } + } + + return AssignmentHints { + is_repeated: false, + delta_pair: pair, + }; + } + + // Slow path: full O(N) scan when masks are not available. let Some(first_mismatch) = previous_sample .iter() .zip(assign_vec.iter()) @@ -488,7 +541,12 @@ impl BenEncoder { /// Returns `Ok(())` after the assignment has been queued or written. pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { let hints = if self.variant == BenVariant::TwoDelta { - analyze_twodelta_transition(&self.previous_sample, &assign_vec) + let masks = if self.previous_masks.is_empty() { + None + } else { + Some(&self.previous_masks) + }; + analyze_twodelta_transition(&self.previous_sample, &assign_vec, masks) } else { AssignmentHints::default() }; @@ -570,6 +628,53 @@ impl XBenEncoder { self.count = count; } + /// Update the value-to-position masks incrementally for a two-delta transition. + /// + /// Instead of rebuilding the entire mask HashMap, only the positions belonging + /// to the two swapped values are repartitioned. This is O(pair_positions) + /// rather than O(assignment_length). + /// + /// # Arguments + /// + /// * `new_sample` - The new assignment vector after the transition. + /// * `pair` - The two values involved in the delta swap. + fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { + if pair.0 == pair.1 { + return; + } + + let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); + let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); + + let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); + let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); + + let (mut i, mut j) = (0, 0); + while i < pos_a.len() || j < pos_b.len() { + let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { + let p = pos_a[i]; + i += 1; + p + } else { + let p = pos_b[j]; + j += 1; + p + }; + if new_sample[pos] == pair.0 { + new_a.push(pos); + } else { + new_b.push(pos); + } + } + + if !new_a.is_empty() { + self.previous_masks.insert(pair.0, new_a); + } + if !new_b.is_empty() { + self.previous_masks.insert(pair.1, new_b); + } + } + /// Flush the buffered frame and its repetition count to the XZ encoder. /// /// For MkvChain and TwoDelta variants, the repetition count is appended @@ -667,7 +772,13 @@ impl XBenEncoder { return Ok(()); } - let hints = analyze_twodelta_transition(&self.previous_assignment, &assign_vec); + let masks = if self.previous_masks.is_empty() { + None + } else { + Some(&self.previous_masks) + }; + let hints = + analyze_twodelta_transition(&self.previous_assignment, &assign_vec, masks); if hints.is_repeated { self.count += 1; return Ok(()); @@ -680,7 +791,16 @@ impl XBenEncoder { Some(&self.previous_masks), )?; self.flush_pending_frame()?; - self.set_previous_assignment(assign_vec, encoded, 1); + + if let Some(pair) = hints.delta_pair { + self.update_masks_for_delta(&assign_vec, pair); + self.previous_assignment = assign_vec; + } else { + self.previous_assignment = assign_vec; + self.rebuild_previous_masks(); + } + self.previous_frame = encoded; + self.count = 1; Ok(()) } } From f2dd39b9e3a90045fdd3a1f128c59705cbfdef27 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:40:58 -0600 Subject: [PATCH 032/221] fix header double-check bug --- ben/src/codec/encode/xz.rs | 4 +-- ben/tests/test_impls_pipeline.rs | 48 ++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 52a1a72..ca5cd7e 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -1,6 +1,6 @@ use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::io::writer::XBenEncoder; -use std::io::{self, BufRead, Result, Write}; +use std::io::{self, BufRead, Cursor, Read, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; @@ -105,7 +105,7 @@ pub fn encode_ben_to_xben( .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; let mut ben_encoder = XBenEncoder::new(encoder, variant); - ben_encoder.write_ben_file(reader)?; + ben_encoder.write_ben_file(Cursor::new(check_buffer).chain(reader))?; Ok(()) } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index e52f9dc..1e902d1 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -848,31 +848,37 @@ fn subsample_by_indices_sorts_and_dedups() { #[test] fn ben_encode_xben_respects_existing_ben_header() { - // Build a BEN(Standard) - let jsonl = r#"{"assignment":[1,1],"sample":1} + let cases = [ + ( + BenVariant::Standard, + r#"{"assignment":[1,1],"sample":1} {"assignment":[2,2],"sample":2} -"#; - let mut ben = Vec::new(); - encode_jsonl_to_ben( - BufReader::new(jsonl.as_bytes()), - &mut ben, - BenVariant::Standard, - ) - .unwrap(); +"#, + ), + ( + BenVariant::TwoDelta, + r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,2,1,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#, + ), + ]; - // Now convert BEN -> XBEN - let mut xz = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xz, Some(1), Some(0)) - .expect("ben->xben failed"); + for (variant, jsonl) in cases { + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, variant).unwrap(); - // Decode back - let mut ben_back = Vec::new(); - decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut ben_back).unwrap(); + let mut xz = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xz, Some(1), Some(0)) + .expect("ben->xben failed"); - // Then to JSONL - let mut jsonl_back = Vec::new(); - decode_ben_to_jsonl(ben_back.as_slice(), &mut jsonl_back).unwrap(); - assert_eq!(jsonl_back, jsonl.as_bytes()); + let mut ben_back = Vec::new(); + decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut ben_back).unwrap(); + + let mut jsonl_back = Vec::new(); + decode_ben_to_jsonl(ben_back.as_slice(), &mut jsonl_back).unwrap(); + assert_eq!(jsonl_back, jsonl.as_bytes()); + } } #[test] From 9cc8efc733f38056b01053742b5c933686c7c052 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:50:08 -0600 Subject: [PATCH 033/221] small opt for xben encoder --- ben/src/io/writer.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 47c27d5..baab0bd 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -841,18 +841,13 @@ impl XBenEncoder { banner.copy_from_slice(&peek[..BANNER_LEN]); reader.consume(BANNER_LEN); - let decoder = BenDecoder::new(io::Cursor::new(banner).chain(reader))?; + let decoder = + BenDecoder::new(io::Cursor::new(banner).chain(reader))?.silent(true); for record in decoder { let (assignment, count) = record?; - self.write_assignment(assignment.clone())?; - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) - && count > 1 - { + self.write_assignment(assignment)?; + if count > 1 { self.count += count - 1; - } else if self.variant == BenVariant::Standard { - for _ in 1..count { - self.write_assignment(assignment.clone())?; - } } } return Ok(()); From 392cf9c7a77aeb0dd15b762ea832b98dd8db1860 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:54:14 -0600 Subject: [PATCH 034/221] opt decode --- ben/src/io/reader.rs | 61 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 3ea5da1..b81c44c 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -487,7 +487,7 @@ fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { /// /// # Arguments /// -/// * `previous_assignment` - The assignment from the preceding frame. +/// * `assignment` - The assignment from the preceding frame (mutated in place). /// * `pair` - The two label values that participate in the delta. /// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. /// @@ -499,36 +499,32 @@ fn apply_twodelta_runs_to_assignment( pair: (u16, u16), run_lengths: &[u16], ) -> io::Result> { - let mut pair_positions = Vec::new(); let (first, second) = pair; - for (idx, &val) in assignment.iter().enumerate() { - if val == first || val == second { - pair_positions.push(idx); - } - } - - let expected_total: usize = run_lengths.iter().map(|&len| len as usize).sum(); - if expected_total != pair_positions.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta payload does not match the previous assignment's pair positions", - )); - } - - let mut write_idx = 0usize; + let mut run_idx = 0usize; + let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); let mut current_value = first; - for &run_len in run_lengths { - for _ in 0..run_len { - assignment[pair_positions[write_idx]] = current_value; - write_idx += 1; + for val in assignment.iter_mut() { + if *val == first || *val == second { + if remaining_in_run == 0 { + run_idx += 1; + if run_idx >= run_lengths.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta payload exhausted before all pair positions were covered", + )); + } + remaining_in_run = run_lengths[run_idx]; + current_value = if current_value == first { + second + } else { + first + }; + } + *val = current_value; + remaining_in_run -= 1; } - current_value = if current_value == first { - second - } else { - first - }; } Ok(assignment) @@ -536,20 +532,23 @@ fn apply_twodelta_runs_to_assignment( /// Decode a raw TwoDelta frame into a full assignment vector. /// +/// Unpacks the bitpacked run lengths from the frame payload, then applies +/// them in a single pass over the assignment. +/// /// # Arguments /// -/// * `previous_assignment` - The assignment from the preceding frame. -/// * `frame` - The TwoDelta frame to decode. +/// * `assignment` - The assignment from the preceding frame (mutated in place). +/// * `frame` - The TwoDelta frame whose packed payload is decoded and applied. /// /// # Returns /// -/// Returns the reconstructed assignment vector. +/// Returns the updated assignment vector. fn decode_twodelta_frame_to_assignment( - previous_assignment: Vec, + assignment: Vec, frame: &TwoDeltaFrame, ) -> io::Result> { let run_lengths = decode_twodelta_run_lengths(frame)?; - apply_twodelta_runs_to_assignment(previous_assignment, frame.pair(), &run_lengths) + apply_twodelta_runs_to_assignment(assignment, frame.pair(), &run_lengths) } /// Decode a stored BEN frame into a full assignment vector. From 4a06e297e7f9a4207c57d6faee592d1f4f8e498d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 16 Mar 2026 23:02:29 -0600 Subject: [PATCH 035/221] get rid of extreneous copy --- ben/src/io/reader.rs | 98 ++++++++++++++++++++++++++++++-------- ben/src/io/writer.rs | 98 ++++++++++++++++++++++++++++++++------ ben/src/ops/relabel/mod.rs | 26 +++++----- 3 files changed, 175 insertions(+), 47 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index b81c44c..c64c5fd 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -227,26 +227,20 @@ impl BenDecoder { /// /// Returns `Ok(())` after the remaining stream has been fully decoded. pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - while let Some(result_tuple) = self.next() { - match result_tuple { - Ok((assignment, count)) => { - let starting_sample = self.sample_count + 1 - count as usize; - for offset in 0..count as usize { - let line = json!({ - "assignment": assignment, - "sample": starting_sample + offset, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes()).unwrap(); - } - } - Err(e) => { - return Err(e); - } + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; } - } - Ok(()) + Ok(true) + }) } /// Read and return the next raw BEN frame stored in standard BEN layout. @@ -415,6 +409,70 @@ impl BenDecoder { } Ok(total) } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// Unlike the `Iterator` implementation, this avoids cloning the assignment + /// buffer on every frame. The decoder owns a single buffer, mutates it in + /// place for TwoDelta frames, and lends `&[u16]` to the callback. This + /// eliminates one full-length memcpy per frame. + /// + /// The callback receives a borrowed assignment slice and its repetition + /// count. Return `true` to continue decoding or `false` to stop early. + /// + /// # Arguments + /// + /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. + /// + /// # Returns + /// + /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + loop { + let frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + + let count = frame.count(); + + match frame { + StoredBenFrame::Ben(ben_frame) => { + let assignment = decode_ben_frame_to_assignment(&ben_frame)?; + let keep_going = f(&assignment, count)?; + self.previous_assignment = Some(assignment); + if !keep_going { + return Ok(()); + } + } + StoredBenFrame::TwoDelta { frame, count } => { + let assignment = self.previous_assignment.take().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta frame encountered before an initial BEN frame", + ) + })?; + let run_lengths = decode_twodelta_run_lengths(&frame)?; + let assignment = + apply_twodelta_runs_to_assignment(assignment, frame.pair(), &run_lengths)?; + let keep_going = f(&assignment, count)?; + self.previous_assignment = Some(assignment); + if !keep_going { + return Ok(()); + } + } + } + + self.sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } + } + } } /// Decode a raw BEN frame into a full assignment vector. @@ -445,7 +503,7 @@ fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { /// # Returns /// /// Returns the sequence of non-zero run lengths extracted from the payload. -fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { +pub(crate) fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { let mut items = Vec::new(); let mut buffer: u32 = 0; let mut n_bits_in_buff: u16 = 0; diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index baab0bd..da78beb 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -1,12 +1,14 @@ +use crate::codec::decode::decode_ben_line; use crate::codec::encode::{ build_twodelta_runs_with_hint, encode_ben32_assignments, encode_ben_vec_from_assign, encode_twodelta_vec_with_hint, BenFrame, TwoDeltaFrame, }; use crate::codec::translate::ben_to_ben32_lines; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::io::reader::BenDecoder; +use crate::io::reader::decode_twodelta_run_lengths; use crate::util::rle::assign_to_rle; -use crate::BenVariant; +use crate::{progress, BenVariant}; +use byteorder::{BigEndian, ReadBytesExt}; use serde_json::Value; use std::collections::HashMap; use std::io::{self, BufRead, Read, Result, Write}; @@ -831,26 +833,92 @@ impl XBenEncoder { /// # Returns /// /// Returns `Ok(())` after the BEN stream has been translated into XBEN. + /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without + /// materializing full assignment vectors. + /// + /// The first frame (standard BEN RLE) is decoded to RLE runs and written as + /// an XBEN full frame. Subsequent delta frames have their bitpacked run + /// lengths unpacked and written as XBEN delta frames with raw u16 runs. + /// This avoids O(N) assignment reconstruction per frame entirely. + /// + /// # Arguments + /// + /// * `reader` - The BEN TwoDelta stream positioned after the banner. + /// + /// # Returns + /// + /// Returns `Ok(())` after the stream has been fully translated. + fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> Result<()> { + // First frame: standard BEN RLE → XBEN full frame. + let max_val_bits = reader.read_u8()?; + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + let first_count = reader.read_u16::()?; + + let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); + encoded.push(XBEN_TWODELTA_FULL_TAG); + encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for &(value, len) in &runs { + encoded.extend_from_slice(&value.to_be_bytes()); + encoded.extend_from_slice(&len.to_be_bytes()); + } + self.previous_frame = encoded; + self.count = first_count; + + let mut sample_count = first_count as usize; + progress!("Encoding line: {}\r", sample_count); + + // Delta frames: unpack bitpacked run lengths → XBEN delta frame. + loop { + let pair_a = match reader.read_u16::() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + }; + let pair_b = reader.read_u16::()?; + let delta_max_len_bits = reader.read_u8()?; + let delta_n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; delta_n_bytes as usize]; + reader.read_exact(&mut payload)?; + let count = reader.read_u16::()?; + + // Unpack bitpacked run lengths. + let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); + let run_lengths = decode_twodelta_run_lengths(&frame)?; + + // Write as XBEN delta frame. + let mut delta_encoded = Vec::with_capacity(1 + 2 + 2 + 4 + run_lengths.len() * 2); + delta_encoded.push(XBEN_TWODELTA_DELTA_TAG); + delta_encoded.extend_from_slice(&frame.pair().0.to_be_bytes()); + delta_encoded.extend_from_slice(&frame.pair().1.to_be_bytes()); + delta_encoded.extend_from_slice(&(run_lengths.len() as u32).to_be_bytes()); + for run_len in &run_lengths { + delta_encoded.extend_from_slice(&run_len.to_be_bytes()); + } + + self.flush_pending_frame()?; + self.previous_frame = delta_encoded; + self.count = count; + + sample_count += count as usize; + progress!("Encoding line: {}\r", sample_count); + } + + tracing::trace!(""); + tracing::trace!("Done!"); + Ok(()) + } + pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { let peek = reader.fill_buf()?; let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); if has_banner { if self.variant == BenVariant::TwoDelta { - let mut banner = [0u8; BANNER_LEN]; - banner.copy_from_slice(&peek[..BANNER_LEN]); reader.consume(BANNER_LEN); - - let decoder = - BenDecoder::new(io::Cursor::new(banner).chain(reader))?.silent(true); - for record in decoder { - let (assignment, count) = record?; - self.write_assignment(assignment)?; - if count > 1 { - self.count += count - 1; - } - } - return Ok(()); + return self.translate_ben_twodelta_to_xben(reader); } reader.consume(BANNER_LEN); } diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 1d566b9..37a763b 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -122,16 +122,15 @@ fn relabel_ben_file_via_decoder( mut transform: F, ) -> io::Result<()> where - F: FnMut(Vec) -> io::Result>, + F: FnMut(&[u16]) -> io::Result>, { - let decoder = BenDecoder::new(reader)?.silent(true); + let mut decoder = BenDecoder::new(reader)?.silent(true); let mut encoder = BenEncoder::new(writer, variant); let mut sample_number = 0usize; - for record in decoder { - let (assignment, count) = record?; + decoder.for_each_assignment(|assignment, count| { if max_samples.is_some_and(|limit| sample_number >= limit) { - break; + return Ok(false); } let relabeled = transform(assignment)?; @@ -146,7 +145,8 @@ where sample_number += out_count; progress!("Relabelling line: {}\r", sample_number); - } + Ok(true) + })?; tracing::trace!(""); tracing::trace!("Done!"); @@ -198,7 +198,9 @@ fn convert_ben_file_impl( let _input_variant = detect_ben_variant(&check_buffer)?; let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, Ok) + relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, |a| { + Ok(a.to_vec()) + }) } /// Rewrite a BEN file into the requested BEN variant. @@ -440,7 +442,7 @@ fn relabel_ben_file_impl( &mut writer, variant, max_samples, - |assignment| Ok(canonicalize_assignment(&assignment)), + |assignment| Ok(canonicalize_assignment(assignment)), )? } } @@ -689,7 +691,7 @@ fn relabel_ben_file_with_map_impl( &mut writer, variant, max_samples, - |assignment| permute_assignment(&assignment, &permutation), + |assignment| permute_assignment(assignment, &permutation), )? } } @@ -751,7 +753,7 @@ pub fn relabel_ben_file_as_variant_limit( writer, target_variant, Some(max_samples), - |assignment| Ok(canonicalize_assignment(&assignment)), + |assignment| Ok(canonicalize_assignment(assignment)), ) } @@ -781,7 +783,7 @@ pub fn relabel_ben_file_with_map_as_variant( let permutation = dense_permutation(&new_to_old_node_map)?; let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - permute_assignment(&assignment, &permutation) + permute_assignment(assignment, &permutation) }) } @@ -817,7 +819,7 @@ pub fn relabel_ben_file_with_map_as_variant_limit( writer, target_variant, Some(max_samples), - |assignment| permute_assignment(&assignment, &permutation), + |assignment| permute_assignment(assignment, &permutation), ) } From 69fabd9b372868fd796c494c17e7bd5367a41d47 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:42:11 -0600 Subject: [PATCH 036/221] possible xz compression improvement for twodelta --- ben/src/io/reader.rs | 128 ++++++++++++++++++++++++++++++++ ben/src/io/writer.rs | 171 ++++++++++++++++++++++++++++++++----------- 2 files changed, 257 insertions(+), 42 deletions(-) diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index c64c5fd..e032856 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -13,6 +13,7 @@ use xz2::read::XzDecoder; const XBEN_TWODELTA_FULL_TAG: u8 = 0; const XBEN_TWODELTA_DELTA_TAG: u8 = 1; +const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// A decoded assignment together with the number of times it repeats. pub type MkvRecord = (Vec, u16); @@ -728,6 +729,7 @@ pub struct XBenDecoder { overflow: Vec, buf: Box<[u8]>, previous_assignment: Option>, + chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, } impl XBenDecoder { @@ -751,6 +753,7 @@ impl XBenDecoder { overflow: Vec::with_capacity(1 << 20), buf: vec![0u8; 1 << 20].into_boxed_slice(), previous_assignment: None, + chunk_queue: std::collections::VecDeque::new(), } } @@ -911,6 +914,7 @@ impl XBenDecoder { count, ))) } + XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. _ => Some(Err(io::Error::new( io::ErrorKind::InvalidData, "invalid TwoDelta XBEN frame tag", @@ -918,6 +922,96 @@ impl XBenDecoder { } } + /// Try to parse a columnar TwoDelta chunk from the overflow buffer. + /// + /// If the overflow starts with the chunk tag and contains enough bytes for + /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. + /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, + /// or `None` when the overflow is incomplete. + fn try_parse_twodelta_chunk(&mut self) -> Option> { + if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { + return None; + } + if self.overflow.len() < 5 { + return None; + } + + let n_frames = u32::from_be_bytes([ + self.overflow[1], + self.overflow[2], + self.overflow[3], + self.overflow[4], + ]) as usize; + + // Calculate total chunk size: tag(1) + n_frames(4) + // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) + let header_len = 5; + let pairs_len = n_frames * 4; + let counts_len = n_frames * 2; + let run_counts_len = n_frames * 4; + let fixed_len = header_len + pairs_len + counts_len + run_counts_len; + + if self.overflow.len() < fixed_len { + return None; + } + + // Read run-length counts to determine total run data size. + let run_counts_start = header_len + pairs_len + counts_len; + let mut total_runs = 0usize; + let mut run_counts = Vec::with_capacity(n_frames); + for i in 0..n_frames { + let offset = run_counts_start + i * 4; + let rc = u32::from_be_bytes([ + self.overflow[offset], + self.overflow[offset + 1], + self.overflow[offset + 2], + self.overflow[offset + 3], + ]) as usize; + run_counts.push(rc); + total_runs += rc; + } + + let run_data_len = total_runs * 2; + let total_len = fixed_len + run_data_len; + if self.overflow.len() < total_len { + return None; + } + + // Parse pairs channel. + let pairs_start = header_len; + // Parse counts channel. + let counts_start = pairs_start + pairs_len; + // Run data starts after run counts. + let run_data_start = run_counts_start + run_counts_len; + + let mut run_cursor = run_data_start; + for i in 0..n_frames { + let po = pairs_start + i * 4; + let pair = ( + u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), + u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), + ); + let co = counts_start + i * 2; + let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); + + let rc = run_counts[i]; + let mut run_lengths = Vec::with_capacity(rc); + for _ in 0..rc { + run_lengths.push(u16::from_be_bytes([ + self.overflow[run_cursor], + self.overflow[run_cursor + 1], + ])); + run_cursor += 2; + } + + self.chunk_queue + .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); + } + + self.overflow.drain(..total_len); + Some(Ok(())) + } + /// Consume this decoder and iterate over raw ben32 frames instead of /// materialized assignments. /// @@ -986,6 +1080,40 @@ impl Iterator for XBenDecoder { } } BenVariant::TwoDelta => { + // Drain frames from a previously parsed chunk first. + if let Some((frame, count)) = self.chunk_queue.pop_front() { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.take() { + Some(prev) => { + apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) + } + None => Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN frame encountered before an initial BEN frame", + )), + } + } + }; + return Some(match assignment { + Ok(a) => { + self.previous_assignment = Some(a.clone()); + Ok((a, count)) + } + Err(e) => Err(e), + }); + } + + // Try to parse a columnar chunk. + if let Some(result) = self.try_parse_twodelta_chunk() { + match result { + Ok(()) => continue, // Loop to drain chunk_queue. + Err(e) => return Some(Err(e)), + } + } + + // Try a single legacy frame (tag 0 or 1). if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { let res = match parsed { Ok((frame, consumed, count)) => { diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index da78beb..7b7cd46 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -16,6 +16,17 @@ use xz2::write::XzEncoder; const XBEN_TWODELTA_FULL_TAG: u8 = 0; const XBEN_TWODELTA_DELTA_TAG: u8 = 1; +const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; + +/// Default number of delta frames per columnar chunk in XBEN TwoDelta. +pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; + +/// A buffered delta frame awaiting chunk serialization. +struct BufferedDeltaFrame { + pair: (u16, u16), + run_lengths: Vec, + count: u16, +} enum BufferedBenFrame { Ben(BenFrame), @@ -605,6 +616,8 @@ pub struct XBenEncoder { previous_frame: Vec, count: u16, variant: BenVariant, + chunk_size: usize, + chunk_buffer: Vec, } impl XBenEncoder { @@ -697,6 +710,55 @@ impl XBenEncoder { Ok(()) } + /// Write all buffered delta frames as a single columnar chunk. + /// + /// The chunk layout groups same-type fields together so XZ's dictionary + /// compression can exploit the resulting byte-level regularity: + /// + /// ```text + /// [chunk_tag=2] [n_frames: u32] + /// [pairs channel: (pair_a u16, pair_b u16) × n_frames] + /// [counts channel: count u16 × n_frames] + /// [run-length counts: n_runs u32 × n_frames] + /// [run-length data: u16 × total_runs] + /// ``` + fn flush_chunk(&mut self) -> Result<()> { + if self.chunk_buffer.is_empty() { + return Ok(()); + } + + let n = self.chunk_buffer.len() as u32; + self.encoder.write_all(&[XBEN_TWODELTA_CHUNK_TAG])?; + self.encoder.write_all(&n.to_be_bytes())?; + + // Pairs channel. + for frame in &self.chunk_buffer { + self.encoder.write_all(&frame.pair.0.to_be_bytes())?; + self.encoder.write_all(&frame.pair.1.to_be_bytes())?; + } + + // Counts channel. + for frame in &self.chunk_buffer { + self.encoder.write_all(&frame.count.to_be_bytes())?; + } + + // Run-length counts channel. + for frame in &self.chunk_buffer { + self.encoder + .write_all(&(frame.run_lengths.len() as u32).to_be_bytes())?; + } + + // Run-length data channel. + for frame in &self.chunk_buffer { + for &rl in &frame.run_lengths { + self.encoder.write_all(&rl.to_be_bytes())?; + } + } + + self.chunk_buffer.clear(); + Ok(()) + } + /// Create a new XBEN writer around an already-configured XZ encoder. /// /// # Arguments @@ -710,34 +772,35 @@ impl XBenEncoder { /// Returns a new XBEN encoder ready to accept assignments or BEN frames. pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { encoder.write_all(banner_for_variant(variant)).unwrap(); - match variant { - BenVariant::Standard => XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::Standard, - }, - BenVariant::MkvChain => XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::MkvChain, - }, - BenVariant::TwoDelta => XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant: BenVariant::TwoDelta, - }, + XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant, + chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, + chunk_buffer: Vec::new(), } } + /// Set the number of delta frames per columnar chunk. + /// + /// Only affects TwoDelta variant encoding. Larger chunks give XZ more + /// same-type data to compress together; smaller chunks reduce peak memory. + /// + /// # Arguments + /// + /// * `size` - Number of delta frames per chunk. + /// + /// # Returns + /// + /// Returns `self` for method chaining. + pub fn with_chunk_size(mut self, size: usize) -> Self { + self.chunk_size = size.max(1); + self + } + /// Encode and write a full assignment vector into the compressed XBEN stream. /// /// # Arguments @@ -782,17 +845,31 @@ impl XBenEncoder { let hints = analyze_twodelta_transition(&self.previous_assignment, &assign_vec, masks); if hints.is_repeated { - self.count += 1; + if self.chunk_buffer.is_empty() { + self.count += 1; + } else { + self.chunk_buffer.last_mut().unwrap().count += 1; + } return Ok(()); } - let encoded = encode_xben_twodelta_delta_frame( + // Flush the initial full frame before the first delta. + if self.chunk_buffer.is_empty() { + self.flush_pending_frame()?; + } + + let (ordered_pair, run_lengths) = build_twodelta_runs_with_hint( &self.previous_assignment, &assign_vec, hints.delta_pair, Some(&self.previous_masks), )?; - self.flush_pending_frame()?; + + self.chunk_buffer.push(BufferedDeltaFrame { + pair: ordered_pair, + run_lengths, + count: 1, + }); if let Some(pair) = hints.delta_pair { self.update_masks_for_delta(&assign_vec, pair); @@ -801,8 +878,10 @@ impl XBenEncoder { self.previous_assignment = assign_vec; self.rebuild_previous_masks(); } - self.previous_frame = encoded; - self.count = 1; + + if self.chunk_buffer.len() >= self.chunk_size { + self.flush_chunk()?; + } Ok(()) } } @@ -869,7 +948,7 @@ impl XBenEncoder { let mut sample_count = first_count as usize; progress!("Encoding line: {}\r", sample_count); - // Delta frames: unpack bitpacked run lengths → XBEN delta frame. + // Delta frames: unpack bitpacked run lengths and buffer into chunks. loop { let pair_a = match reader.read_u16::() { Ok(v) => v, @@ -888,24 +967,28 @@ impl XBenEncoder { let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); let run_lengths = decode_twodelta_run_lengths(&frame)?; - // Write as XBEN delta frame. - let mut delta_encoded = Vec::with_capacity(1 + 2 + 2 + 4 + run_lengths.len() * 2); - delta_encoded.push(XBEN_TWODELTA_DELTA_TAG); - delta_encoded.extend_from_slice(&frame.pair().0.to_be_bytes()); - delta_encoded.extend_from_slice(&frame.pair().1.to_be_bytes()); - delta_encoded.extend_from_slice(&(run_lengths.len() as u32).to_be_bytes()); - for run_len in &run_lengths { - delta_encoded.extend_from_slice(&run_len.to_be_bytes()); + // Flush the initial full frame before the first delta chunk. + if self.chunk_buffer.is_empty() && self.count > 0 { + self.flush_pending_frame()?; } - self.flush_pending_frame()?; - self.previous_frame = delta_encoded; - self.count = count; + self.chunk_buffer.push(BufferedDeltaFrame { + pair: frame.pair(), + run_lengths, + count, + }); + + if self.chunk_buffer.len() >= self.chunk_size { + self.flush_chunk()?; + } sample_count += count as usize; progress!("Encoding line: {}\r", sample_count); } + // Flush remaining partial chunk (Drop will also catch this, but be explicit). + self.flush_chunk()?; + tracing::trace!(""); tracing::trace!("Done!"); Ok(()) @@ -941,5 +1024,9 @@ impl Drop for XBenEncoder { self.flush_pending_frame() .expect("Error writing last XBEN frame to file"); } + if !self.chunk_buffer.is_empty() { + self.flush_chunk() + .expect("Error writing last XBEN TwoDelta chunk"); + } } } From c6bcfafc69a7c30cb5bb06488329e00f26ff0825 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 17 Mar 2026 16:30:29 -0600 Subject: [PATCH 037/221] remove mla --- ben/src/cli/ben.rs | 9 +- ben/src/cli/reben.rs | 5 - ben/src/codec/encode/ben.rs | 9 ++ ben/src/codec/encode/jsonl.rs | 4 + ben/src/codec/encode/xz.rs | 4 + ben/src/io/writer.rs | 36 +----- ben/src/json/graph/mod.rs | 187 ------------------------------- ben/src/json/graph/tests.rs | 18 --- ben/src/ops/extract/tests.rs | 1 + ben/tests/test_cli.rs | 29 +---- ben/tests/test_impls_pipeline.rs | 20 +++- ben/tests/test_pipeline.rs | 3 + pyben/src/encode/mod.rs | 4 +- 13 files changed, 53 insertions(+), 276 deletions(-) diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 8121cf8..8b75486 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -104,6 +104,11 @@ struct Args { /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. #[arg(short = 'l', long)] compression_level: Option, + /// Number of TwoDelta delta frames per columnar chunk in XBEN encoding. + /// Only affects TwoDelta variant. Larger chunks improve XZ compression. + /// Default is 10,000. + #[arg(long)] + chunk_size: Option, } /// Derive the output path for encode-style CLI modes. @@ -330,7 +335,7 @@ pub fn run() { if ben_and_xben { if let Err(err) = - encode_ben_to_xben(reader, writer, args.n_cpus, args.compression_level) + encode_ben_to_xben(reader, writer, args.n_cpus, args.compression_level, args.chunk_size) { eprintln!("Error: {:?}", err); } @@ -342,6 +347,7 @@ pub fn run() { BenVariant::Standard, args.n_cpus, args.compression_level, + args.chunk_size, ) } else { encode_jsonl_to_xben( @@ -350,6 +356,7 @@ pub fn run() { BenVariant::MkvChain, args.n_cpus, args.compression_level, + args.chunk_size, ) }; if let Err(e) = possible_error { diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 91818e7..28d6b54 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -28,9 +28,6 @@ enum Mode { #[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] /// Topology-based ordering methods for JSON graph relabeling. enum OrderingMethod { - /// Minimum-linear-arrangement heuristic based on graph adjacency alone. - #[clap(alias = "mla")] - MinimumLinearArrangement, /// Recursive multilevel clustering based on local neighborhoods. #[clap(alias = "mlc")] MultiLevelCluster, @@ -348,7 +345,6 @@ pub fn run() { /// Returns the corresponding `GraphOrderingMethod`. fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { match ordering { - OrderingMethod::MinimumLinearArrangement => GraphOrderingMethod::MinimumLinearArrangement, OrderingMethod::MultiLevelCluster => GraphOrderingMethod::MultiLevelCluster, OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, } @@ -365,7 +361,6 @@ fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { /// Returns a static string identifying the ordering method. fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { match ordering { - OrderingMethod::MinimumLinearArrangement => "minimum-linear-arrangement", OrderingMethod::MultiLevelCluster => "multi-level-cluster", OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", } diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 6d4b586..963ecf9 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -48,6 +48,15 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result { ) } +/// Encode an assignment vector into a Ben32 vector +/// +/// # Arguments +/// +/// * `assign_vec` - The full assignment vector to encode +/// +/// # Returns +/// +/// Returns the encoded BEN32 frame byte vector. pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> io::Result { let assign_vec = assign_vec.as_ref(); let mut prev_assign: u16 = 0; diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index cb4b743..22696aa 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -29,6 +29,7 @@ pub fn encode_jsonl_to_xben( variant: BenVariant, n_threads: Option, compression_level: Option, + chunk_size: Option, ) -> Result<()> { let mut n_cpus: u32 = n_threads.unwrap_or(1); n_cpus = n_cpus @@ -49,6 +50,9 @@ pub fn encode_jsonl_to_xben( .expect("init MT encoder"); let encoder = XzEncoder::new_stream(writer, mt); let mut ben_encoder = XBenEncoder::new(encoder, variant); + if let Some(cs) = chunk_size { + ben_encoder = ben_encoder.with_chunk_size(cs); + } let mut line_num = 1; diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index ca5cd7e..58761da 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -78,6 +78,7 @@ pub fn encode_ben_to_xben( writer: W, n_threads: Option, compression_level: Option, + chunk_size: Option, ) -> Result<()> { let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; @@ -104,6 +105,9 @@ pub fn encode_ben_to_xben( let variant = variant_from_banner(&check_buffer) .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; let mut ben_encoder = XBenEncoder::new(encoder, variant); + if let Some(cs) = chunk_size { + ben_encoder = ben_encoder.with_chunk_size(cs); + } ben_encoder.write_ben_file(Cursor::new(check_buffer).chain(reader))?; diff --git a/ben/src/io/writer.rs b/ben/src/io/writer.rs index 7b7cd46..2f89cf5 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer.rs @@ -15,7 +15,6 @@ use std::io::{self, BufRead, Read, Result, Write}; use xz2::write::XzEncoder; const XBEN_TWODELTA_FULL_TAG: u8 = 0; -const XBEN_TWODELTA_DELTA_TAG: u8 = 1; const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// Default number of delta frames per columnar chunk in XBEN TwoDelta. @@ -252,40 +251,6 @@ fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { bytes } -/// Encode the difference between two assignments as an XBEN two-delta delta frame. -/// -/// The frame begins with a delta tag byte, the swapped value pair, and then -/// run-length encoded flip positions in big-endian format. -/// -/// # Arguments -/// -/// * `previous_assignment` - The previous assignment vector. -/// * `new_assignment` - The current assignment vector. -/// * `delta_pair` - An optional pre-computed pair of swapped values. -/// * `masks` - An optional index map from value to positions in the previous assignment. -/// -/// # Returns -/// -/// Returns the encoded delta frame as a byte vector, or an error if encoding fails. -fn encode_xben_twodelta_delta_frame( - previous_assignment: &[u16], - new_assignment: &[u16], - delta_pair: Option<(u16, u16)>, - masks: Option<&HashMap>>, -) -> io::Result> { - let (ordered_pair, run_lengths) = - build_twodelta_runs_with_hint(previous_assignment, new_assignment, delta_pair, masks)?; - let mut bytes = Vec::with_capacity(1 + 2 + 2 + 4 + run_lengths.len() * 2); - bytes.push(XBEN_TWODELTA_DELTA_TAG); - bytes.extend_from_slice(&ordered_pair.0.to_be_bytes()); - bytes.extend_from_slice(&ordered_pair.1.to_be_bytes()); - bytes.extend_from_slice(&(run_lengths.len() as u32).to_be_bytes()); - for run_length in run_lengths { - bytes.extend_from_slice(&run_length.to_be_bytes()); - } - Ok(bytes) -} - /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { writer: W, @@ -707,6 +672,7 @@ impl XBenEncoder { if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { self.encoder.write_all(&self.count.to_be_bytes())?; } + self.count = 0; Ok(()) } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 3ea9857..013e571 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -10,8 +10,6 @@ use std::result::Result as StdResult; /// Topology-based graph ordering methods supported by `reben`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum GraphOrderingMethod { - /// Order nodes using a minimum-linear-arrangement heuristic. - MinimumLinearArrangement, /// Order nodes using recursive multilevel clustering. MultiLevelCluster, /// Order nodes using Reverse Cuthill-McKee. @@ -134,7 +132,6 @@ pub fn sort_json_file_by_ordering( tracing::trace!("Sorting JSON file by ordering method: {:?}", method); let order = match method { - GraphOrderingMethod::MinimumLinearArrangement => minimum_linear_arrangement_order(&graph), GraphOrderingMethod::MultiLevelCluster => multi_level_cluster_order(&graph), GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), }; @@ -370,26 +367,6 @@ fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Ve component_order } -/// Compute a minimum-linear-arrangement heuristic ordering for the entire graph. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph to order. -/// -/// # Returns -/// -/// Returns a permutation of node indices that heuristically minimizes total -/// edge span. -fn minimum_linear_arrangement_order(graph: &GraphJson) -> Vec { - let mut order = Vec::with_capacity(graph.nodes.len()); - - for component in connected_components(graph) { - order.extend(minimum_linear_arrangement_component(graph, &component)); - } - - order -} - /// Compute a multilevel cluster ordering for the entire graph. /// /// # Arguments @@ -404,51 +381,6 @@ fn multi_level_cluster_order(graph: &GraphJson) -> Vec { multilevel_cluster_order_generic(&graph.adjacency_indices, &graph.node_ids) } -/// Compute a minimum-linear-arrangement heuristic ordering for a single component. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph. -/// * `component` - The node indices belonging to the component. -/// -/// # Returns -/// -/// Returns an ordering that heuristically minimizes total edge span within the -/// component, refined by iterated barycenter sorting and adjacent swaps. -fn minimum_linear_arrangement_component(graph: &GraphJson, component: &[usize]) -> Vec { - if component.len() <= 2 { - return component.to_vec(); - } - - let component_mask = subset_mask(graph.nodes.len(), component); - let mut order = reverse_cuthill_mckee_component(graph, component); - - for _ in 0..8 { - let positions = positions_for_order(graph.nodes.len(), &order); - order.sort_by(|&a, &b| { - let a_score = barycenter_score(graph, a, &positions, &component_mask); - let b_score = barycenter_score(graph, b, &positions, &component_mask); - a_score - .partial_cmp(&b_score) - .unwrap_or(Ordering::Equal) - .then_with(|| graph.node_ids[a].cmp(&graph.node_ids[b])) - }); - local_adjacent_improvement(graph, &mut order, &component_mask); - } - - order -} - -/// Build a boolean mask indicating membership in a subset of nodes. -/// -/// # Arguments -/// -/// * `size` - The total number of nodes (length of the returned vector). -/// * `nodes` - The node indices that belong to the subset. -/// -/// # Returns -/// -/// Returns a boolean vector where `true` marks nodes present in the subset. fn subset_mask(size: usize, nodes: &[usize]) -> Vec { let mut mask = vec![false; size]; for &node in nodes { @@ -775,124 +707,5 @@ fn build_coarse_graph( (coarse_adjacency, coarse_labels) } -/// Invert a permutation to get the position of each node in the ordering. -/// -/// # Arguments -/// -/// * `size` - The total number of nodes (length of the returned vector). -/// * `order` - A permutation where `order[position]` gives the node index. -/// -/// # Returns -/// -/// Returns a vector indexed by node where each entry is the node's position in -/// the ordering. -fn positions_for_order(size: usize, order: &[usize]) -> Vec { - let mut positions = vec![usize::MAX; size]; - for (idx, &node) in order.iter().enumerate() { - positions[node] = idx; - } - positions -} - -/// Compute the barycenter score of a node as the mean position of its neighbors. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph. -/// * `node` - The node index to score. -/// * `positions` - The current position of each node in the ordering. -/// * `component_mask` - A boolean mask restricting which neighbors to consider. -/// -/// # Returns -/// -/// Returns the average position of the node's neighbors within the component, -/// or the node's own position if it has no neighbors in the component. -fn barycenter_score( - graph: &GraphJson, - node: usize, - positions: &[usize], - component_mask: &[bool], -) -> f64 { - let mut sum = 0.0; - let mut count = 0.0; - for &neighbor in &graph.adjacency_indices[node] { - if component_mask[neighbor] { - sum += positions[neighbor] as f64; - count += 1.0; - } - } - - if count == 0.0 { - positions[node] as f64 - } else { - sum / count - } -} - -/// Improve an ordering by repeatedly swapping adjacent pairs that reduce total edge span. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph. -/// * `order` - The current ordering, modified in place. -/// * `component_mask` - A boolean mask restricting which neighbors to consider. -fn local_adjacent_improvement(graph: &GraphJson, order: &mut [usize], component_mask: &[bool]) { - if order.len() < 2 { - return; - } - - let mut improved = true; - while improved { - improved = false; - let mut positions = positions_for_order(graph.nodes.len(), order); - for idx in 0..order.len() - 1 { - let current_cost = node_span_cost(graph, order[idx], &positions, component_mask) - + node_span_cost(graph, order[idx + 1], &positions, component_mask); - - order.swap(idx, idx + 1); - positions[order[idx]] = idx; - positions[order[idx + 1]] = idx + 1; - - let swapped_cost = node_span_cost(graph, order[idx], &positions, component_mask) - + node_span_cost(graph, order[idx + 1], &positions, component_mask); - - if swapped_cost <= current_cost { - improved = swapped_cost < current_cost; - } else { - order.swap(idx, idx + 1); - positions[order[idx]] = idx; - positions[order[idx + 1]] = idx + 1; - } - } - } -} - -/// Compute the total edge span cost for a single node in the current ordering. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph. -/// * `node` - The node index to evaluate. -/// * `positions` - The current position of each node in the ordering. -/// * `component_mask` - A boolean mask restricting which neighbors to consider. -/// -/// # Returns -/// -/// Returns the sum of absolute position differences between the node and each of -/// its neighbors within the component. -fn node_span_cost( - graph: &GraphJson, - node: usize, - positions: &[usize], - component_mask: &[bool], -) -> usize { - graph.adjacency_indices[node] - .iter() - .copied() - .filter(|&neighbor| component_mask[neighbor]) - .map(|neighbor| positions[node].abs_diff(positions[neighbor])) - .sum() -} - #[cfg(test)] mod tests; diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests.rs index 9c9dfb8..ec6f640 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests.rs @@ -306,24 +306,6 @@ fn test_sort_json_file_by_reverse_cuthill_mckee() { assert_eq!(output_json["adjacency"][0][0]["id"], 1); } -#[test] -fn test_sort_json_file_by_minimum_linear_arrangement() { - let mut output = Vec::new(); - let mapping = sort_json_file_by_ordering( - path_graph_json(), - &mut output, - GraphOrderingMethod::MinimumLinearArrangement, - ) - .unwrap(); - let output_json: Value = serde_json::from_slice(&output).unwrap(); - - let positions = [mapping[&0], mapping[&1], mapping[&2], mapping[&3]]; - let mut sorted = positions; - sorted.sort_unstable(); - assert_eq!(sorted, [0, 1, 2, 3]); - assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); -} - #[test] fn test_sort_json_file_by_multi_level_cluster() { let mut output = Vec::new(); diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index e03f487..3f769ad 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -145,6 +145,7 @@ fn test_extract_assignment_xben_roundtrip_and_errors() { BenVariant::MkvChain, Some(1), Some(0), + None, ) .unwrap(); diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 0f894df..0d566ac 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1358,33 +1358,13 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations } #[test] -fn reben_cli_supports_mla_and_rcm_orderings() { +fn reben_cli_supports_rcm_ordering() { let temp = TempDir::new("reben-orderings"); let graph_path = temp.path().join("shape.json"); - let mla_path = temp.path().join("mla.json"); let rcm_path = temp.path().join("rcm.json"); fs::write(&graph_path, sample_graph()).unwrap(); - let mla = run( - "reben", - &[ - graph_path.to_str().unwrap(), - "--mode", - "json", - "--ordering", - "minimum-linear-arrangement", - "--output-file", - mla_path.to_str().unwrap(), - ], - temp.path(), - ); - assert_success(&mla); - assert!(temp - .path() - .join("shape_sorted_by_minimum-linear-arrangement_map.json") - .exists()); - let rcm = run( "reben", &[ @@ -1404,13 +1384,8 @@ fn reben_cli_supports_mla_and_rcm_orderings() { .join("shape_sorted_by_reverse-cuthill-mckee_map.json") .exists()); - let mla_json: Value = serde_json::from_str(&fs::read_to_string(&mla_path).unwrap()).unwrap(); let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); - assert_eq!( - mla_json["nodes"].as_array().unwrap().len(), - rcm_json["nodes"].as_array().unwrap().len() - ); - assert!(!mla_json["nodes"].as_array().unwrap().is_empty()); + assert!(!rcm_json["nodes"].as_array().unwrap().is_empty()); } #[test] diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 1e902d1..e2a1ec2 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -245,6 +245,7 @@ proptest! { BenVariant::Standard, Some(threads), Some(level), + None, ).unwrap(); // Decode XBEN -> BEN -> JSONL @@ -270,6 +271,7 @@ proptest! { BenVariant::MkvChain, Some(threads), Some(level), + None, ).unwrap(); let mut ben = Vec::new(); @@ -294,6 +296,7 @@ proptest! { BenVariant::TwoDelta, Some(threads), Some(level), + None, ).unwrap(); let mut ben = Vec::new(); @@ -318,6 +321,7 @@ proptest! { BenVariant::MkvChain, Some(threads), Some(level), + None, ).unwrap(); // Path A: direct to JSONL @@ -346,6 +350,7 @@ proptest! { BenVariant::Standard, Some(threads), Some(level), + None, ).unwrap(); let mut dec = XBenDecoder::new(xben.as_slice()).unwrap(); @@ -373,6 +378,7 @@ proptest! { BenVariant::TwoDelta, Some(threads), Some(level), + None, ).unwrap(); let mut dec = XBenDecoder::new(xben.as_slice()).unwrap(); @@ -430,6 +436,7 @@ proptest! { BenVariant::MkvChain, Some(threads), Some(level), + None, ).unwrap(); // Choose some indices to keep (1-based). We derive from seq length. @@ -470,6 +477,7 @@ proptest! { BenVariant::MkvChain, Some(threads), Some(level), + None, ).unwrap(); let n = seq.len(); @@ -505,6 +513,7 @@ proptest! { BenVariant::MkvChain, Some(threads), Some(level), + None, ).unwrap(); let n = seq.len(); @@ -617,6 +626,7 @@ fn subsample_every_respects_offset() { BenVariant::MkvChain, Some(1), Some(0), + None, ) .unwrap(); @@ -675,6 +685,7 @@ fn xbenencoder_drop_flushes_tail_group() { BenVariant::MkvChain, Some(1), Some(0), + None, ) .unwrap(); out @@ -745,6 +756,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { BenVariant::Standard, Some(1), Some(0), + None, ) .unwrap(); @@ -830,6 +842,7 @@ fn subsample_by_indices_sorts_and_dedups() { BenVariant::Standard, Some(1), Some(0), + None, ) .unwrap(); let xb = XBenDecoder::new(xz.as_slice()).unwrap(); @@ -869,7 +882,7 @@ fn ben_encode_xben_respects_existing_ben_header() { encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, variant).unwrap(); let mut xz = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xz, Some(1), Some(0)) + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xz, Some(1), Some(0), None) .expect("ben->xben failed"); let mut ben_back = Vec::new(); @@ -892,6 +905,7 @@ fn xz_mt_params_are_capped_and_safe() { BenVariant::Standard, Some(10_000), Some(42), + None, ) .unwrap(); let mut ben = Vec::new(); @@ -962,6 +976,7 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { BenVariant::Standard, Some(1), Some(0), + None, ) .unwrap(); @@ -1183,6 +1198,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { BenVariant::MkvChain, Some(1), Some(0), + None, ) .unwrap(); assert_eq!( @@ -1204,6 +1220,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { BenVariant::TwoDelta, Some(1), Some(0), + None, ) .unwrap(); assert_eq!( @@ -1239,6 +1256,7 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { BenVariant::MkvChain, Some(1), Some(0), + None, ) .unwrap(); let xben_path = unique_temp_path("sample.xben"); diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index c204661..e4d283a 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -265,6 +265,7 @@ fn test_xben_pipeline() { BenVariant::Standard, Some(1), Some(1), + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -334,6 +335,7 @@ fn test_xmkvben_pipeline() { BenVariant::MkvChain, Some(1), Some(1), + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -428,6 +430,7 @@ fn test_xtwodeltaben_pipeline() { BenVariant::TwoDelta, Some(1), Some(1), + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index e514c22..df82964 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -82,7 +82,7 @@ pub fn compress_ben_to_xben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_ben_to_xben(reader, writer, n_threads, compression_level).map_err(|e| { + encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { PyIOError::new_err(format!( "Failed to convert BEN to XBEN from {} to {}: {e}", in_file.display(), @@ -135,7 +135,7 @@ pub fn compress_jsonl_to_xben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level).map_err(|e| { + encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err(|e| { PyIOError::new_err(format!( "Failed to convert JSONL to XBEN from {} to {}: {e}", in_file.display(), From 762c8f62a5eae39b761b8c781a3ab34cdd5a5bdf Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 18 Mar 2026 00:35:04 -0600 Subject: [PATCH 038/221] Cleanup of twodelta method --- Cargo.lock | 21 ++ ben/Cargo.toml | 1 + ben/src/codec/encode/ben.rs | 399 +++++++++++++++++++++------------ ben/src/codec/encode/errors.rs | 14 ++ ben/src/codec/encode/mod.rs | 5 +- ben/src/codec/encode/types.rs | 49 +--- ben/src/io/reader.rs | 26 +-- 7 files changed, 313 insertions(+), 202 deletions(-) create mode 100644 ben/src/codec/encode/errors.rs diff --git a/Cargo.lock b/Cargo.lock index eaad981..b3ceb07 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -101,6 +101,7 @@ dependencies = [ "rand_chacha 0.9.0", "rand_distr", "serde_json", + "thiserror", "tracing", "tracing-subscriber", "xz2", @@ -889,6 +890,26 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "thread_local" version = "1.1.9" diff --git a/ben/Cargo.toml b/ben/Cargo.toml index d04cb53..14fea11 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -20,6 +20,7 @@ clap = { version = "^4.5.2", features = ["derive"] } pcompress = "1.0.7" pipe = "0.4.0" serde_json = "^1.0.107" +thiserror = "2.0.18" tracing = "0.1.41" tracing-subscriber = { version = "0.3.20", features = ["env-filter", "fmt"] } xz2 = "0.1.7" diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 963ecf9..141969d 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,13 +1,16 @@ +use super::errors::BenEncodeError; use super::types::{BenFrame, IdVec, TwoDeltaFrame}; use serde_json::Value; use std::collections::HashMap; -use std::io; +use std::io::{Error, ErrorKind, Result}; pub(crate) type TwoDeltaRuns = ((u16, u16), Vec); /// Encode a JSON assignment record into the ben32 frame representation used by /// XBEN streams. /// +/// Note: This is a helper function that is only used in the testing suite. +/// /// # Arguments /// /// * `data` - A JSON object containing an `assignment` array. @@ -17,10 +20,10 @@ pub(crate) type TwoDeltaRuns = ((u16, u16), Vec); /// Returns the encoded ben32 frame bytes terminated by the four-byte `0` /// sentinel. #[cfg_attr(not(test), allow(dead_code))] -pub(crate) fn encode_ben32_line(data: Value) -> io::Result { +pub(crate) fn encode_ben32_line(data: Value) -> Result { let assign_vec = data["assignment"].as_array().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, + Error::new( + ErrorKind::InvalidData, "'assignment' field either missing or is not an array of integers", ) })?; @@ -29,8 +32,8 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result { .iter() .map(|assignment| { let assign_u64 = assignment.as_u64().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, + Error::new( + ErrorKind::InvalidData, format!( "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", assignment @@ -38,13 +41,13 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result { ) })?; u16::try_from(assign_u64).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, + Error::new( + ErrorKind::InvalidData, format!("The value '{}' is too large to fit in a u16.", assign_u64), ) }) }) - .collect::>>()?, + .collect::>>()?, ) } @@ -57,7 +60,7 @@ pub(crate) fn encode_ben32_line(data: Value) -> io::Result { /// # Returns /// /// Returns the encoded BEN32 frame byte vector. -pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> io::Result { +pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result { let assign_vec = assign_vec.as_ref(); let mut prev_assign: u16 = 0; let mut count: u16 = 0; @@ -137,181 +140,289 @@ pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> BenFrame { pub fn encode_twodelta_vec( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, -) -> io::Result { - let (ordered_pair, run_lengths) = - build_twodelta_runs_with_hint(previous_assignment, new_assignment, None, None)?; - Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) +) -> Result { + encode_twodelta_vec_with_hint(previous_assignment, new_assignment, None, None) } -pub(crate) fn build_twodelta_runs_with_hint( +/// Encode a sample transition as a TwoDelta frame, using hints to help speed up the +/// encoding process. +/// +/// In the case that the delta pair exists, we will take it as gospel that the pairs that were +/// swapped were the ones in the delta pair. This is a hyper optimization included to improve +/// encoding speed of recombination algorithms, in particular. +/// +/// # Arguments +/// +/// * `previous_assignment` - The previous full assignment vector. +/// * `new_assignment` - The next full assignment vector. +/// * `delta_pair` - An optional pair of assignment ids that are expected to be involved in +/// the transition. If provided, the function will check that only these two ids are involved in +/// the changes between the previous and new assignments, and that they occupy the same positions. +/// * `masks` - An optional mapping from assignment ids to their positions in the previous +/// assignment vector. If provided, the function will use these masks to efficiently compute the +/// positions of the changed ids, and will validate that they are consistent with the actual +/// changes between the previous and new assignments. +/// +/// # Returns +/// A serialized TwoDelta frame describing the transition, or an error if the hints are +/// invalid +pub(crate) fn encode_twodelta_vec_with_hint( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, delta_pair: Option<(u16, u16)>, - masks: Option<&HashMap>>, -) -> io::Result { + masks: Option<&mut HashMap>>, +) -> Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); if previous_assignment.len() != new_assignment.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta requires assignment vectors of equal length", + return Err(Error::new( + ErrorKind::InvalidData, + format!( + "TwoDelta requires previous and new assignment vectors to be of \ + equal length, but got lengths {} and {}", + previous_assignment.len(), + new_assignment.len() + ), )); } - let pair = if let Some(pair) = delta_pair { - pair - } else { - let mut pair_ids = [0u16; 2]; - let mut pair_len = 0usize; - for (&previous, ¤t) in previous_assignment.iter().zip(new_assignment.iter()) { - if previous == current { - continue; - } - for value in [previous, current] { - if !pair_ids[..pair_len].contains(&value) { - if pair_len == 2 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta transitions may involve at most two assignment ids", - )); - } - pair_ids[pair_len] = value; - pair_len += 1; - } - } + if delta_pair.is_some() { + if masks.is_none() { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair hint provided without corresponding masks", + )); } - - if pair_len == 0 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta cannot encode identical assignments as a delta frame", + let pair = delta_pair.unwrap(); + if pair.0 == pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair hint cannot have identical values for the two ids", )); } + } - if pair_len == 1 { - (pair_ids[0], pair_ids[0]) - } else { - (pair_ids[0], pair_ids[1]) + match (delta_pair, masks) { + (Some(pair), Some(masks)) => determine_twodelta_frame_from_pair_and_mask_hints( + previous_assignment, + new_assignment, + pair, + masks, + ), + (None, Some(masks)) => { + determine_twodelta_frame_from_mask_hint(previous_assignment, new_assignment, masks) } - }; + _ => determine_twodelta_frame_from_scratch(previous_assignment, new_assignment), + } - let pair_positions = if let Some(masks) = masks { - match (masks.get(&pair.0), masks.get(&pair.1)) { - (Some(mask_a), Some(mask_b)) if pair.0 != pair.1 => { - let mut merged = Vec::with_capacity(mask_a.len() + mask_b.len()); - let (mut i, mut j) = (0usize, 0usize); - while i < mask_a.len() || j < mask_b.len() { - if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { - merged.push(mask_a[i]); - i += 1; - } else { - merged.push(mask_b[j]); - j += 1; - } - } - merged - } - (Some(mask), _) => mask.clone(), - _ => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta pair mask is missing for the previous assignment", - )); - } - } - } else { - let mut pair_positions = Vec::new(); - pair_positions.reserve(previous_assignment.len()); - for (idx, (&previous, ¤t)) in previous_assignment - .iter() - .zip(new_assignment.iter()) - .enumerate() - { - let previous_in_pair = previous == pair.0 || previous == pair.1; - let current_in_pair = current == pair.0 || current == pair.1; - - if previous_in_pair != current_in_pair { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta requires the changed id pair to occupy the same positions", - )); - } + // Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) +} - if !previous_in_pair && previous != current { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta found a change outside the selected id pair", - )); - } +fn validate_masks_and_order_pairs( + pair: (u16, u16), + masks: &HashMap>, +) -> Result<(u16, u16)> { + let mask_a = match masks.get(&pair.0) { + Some(m) => m, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair mask is missing for the previous assignment", + )) + } + }; - if previous_in_pair { - pair_positions.push(idx); - } + let mask_b = match masks.get(&pair.1) { + Some(m) => m, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair mask is missing for the current assignment", + )) } - pair_positions }; - if pair_positions.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta requires at least one occurrence of the selected id pair", + if mask_a.len() == 0 { + return Err(Error::new( + ErrorKind::InvalidData, + format!("TwoDelta pair mask for the id {} is empty", pair.0), )); - } + }; + + if mask_b.len() == 0 { + return Err(Error::new( + ErrorKind::InvalidData, + format!("TwoDelta pair mask for the id {} is empty", pair.1), + )); + }; - let first_value = new_assignment[pair_positions[0]]; - let second_value = if pair.0 == pair.1 { - pair.0 - } else if first_value == pair.0 { - pair.1 + if mask_a[0] < mask_b[0] { + Ok((pair.0, pair.1)) } else { - pair.0 + Ok((pair.1, pair.0)) + } +} + +fn determine_twodelta_frame_from_pair_and_mask_hints( + previous: &[u16], + current: &[u16], + delta_pair: (u16, u16), + masks: &mut HashMap>, +) -> Result { + let pair = match validate_masks_and_order_pairs(delta_pair, masks) { + Ok(pair) => pair, + Err(e) => { + return Err(Error::new( + ErrorKind::InvalidData, + format!( + "Encountered when validating masks and ordering pairs in \ + `determine_twodelta_run_from_pair_and_mask_hints`:\n{}", + e + ), + )); + } }; - let ordered_pair = (first_value, second_value); - let mut run_lengths = Vec::new(); - let mut current_value = first_value; - let mut current_run = 0u16; - - for &idx in &pair_positions { - let previous = previous_assignment[idx]; - let value = new_assignment[idx]; - if previous != pair.0 && previous != pair.1 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, + let mask_a = masks + .get(&pair.0) + .expect("Failed to get mask for pair.0 after validation"); + let mask_b = masks + .get(&pair.1) + .expect("Failed to get mask for pair.1 after validation"); + + let new_capacity = mask_a.len() + mask_b.len(); + let mut run_lengths = Vec::with_capacity(new_capacity); + let mut new_mask_a = Vec::with_capacity(new_capacity); + let mut new_mask_b = Vec::with_capacity(new_capacity); + + let (mut i, mut j) = (0usize, 0usize); + let mut current_mask_count = 0u16; + let mut current_value = pair.0; + + let mut found_assignment_change = false; + + while i < mask_a.len() || j < mask_b.len() { + let idx = if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { + if current_value != pair.0 { + run_lengths.push(current_mask_count); + current_mask_count = 1; + current_value = pair.0; + } else { + current_mask_count += 1; + } + i += 1; + mask_a[i - 1] + } else { + if current_value != pair.1 { + run_lengths.push(current_mask_count); + current_mask_count = 1; + current_value = pair.1; + } else { + current_mask_count += 1; + } + j += 1; + mask_b[j - 1] + }; + + let previous_value = previous[idx]; + let current_value = current[idx]; + + if previous_value != pair.0 && previous_value != pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, "TwoDelta pair mask referenced an index outside the selected id pair", )); } - if value != ordered_pair.0 && value != ordered_pair.1 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, + if current_value != pair.0 && current_value != pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, "TwoDelta payload encountered an assignment outside the selected id pair", )); } + if current_value != previous_value { + found_assignment_change = true; + } - if value == current_value { - current_run += 1; + if current_value == pair.0 { + new_mask_a.push(idx); } else { - run_lengths.push(current_run); - current_value = value; - current_run = 1; + new_mask_b.push(idx); } } + run_lengths.push(current_mask_count); - if current_run > 0 { - run_lengths.push(current_run); + // Special error that signals that we can reuse the last TwoDelta frame + if !found_assignment_change { + return Err(BenEncodeError::RepeatedSample.into()); } - Ok((ordered_pair, run_lengths)) + masks.insert(pair.0, new_mask_a); + masks.insert(pair.1, new_mask_b); + Ok(TwoDeltaFrame::from_run_lengths(pair, run_lengths)) } -pub(crate) fn encode_twodelta_vec_with_hint( - previous_assignment: impl AsRef<[u16]>, - new_assignment: impl AsRef<[u16]>, - delta_pair: Option<(u16, u16)>, - masks: Option<&HashMap>>, -) -> io::Result { - let (ordered_pair, run_lengths) = - build_twodelta_runs_with_hint(previous_assignment, new_assignment, delta_pair, masks)?; - Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) +fn determine_twodelta_frame_from_mask_hint( + previous: &[u16], + current: &[u16], + masks: &mut HashMap>, +) -> Result { + for (&assign0, &assign1) in previous.iter().zip(current.iter()) { + if assign0 != assign1 { + return determine_twodelta_frame_from_pair_and_mask_hints( + previous, + current, + (assign0, assign1), + masks, + ); + } + } + + return Err(BenEncodeError::RepeatedSample.into()); +} + +fn determine_twodelta_frame_from_scratch( + previous: &[u16], + current: &[u16], +) -> Result { + let mut delta_pair = [0u16; 2]; + let mut pair_len = 0usize; + + let mut run_lengths = Vec::new(); + let mut current_value = 0u16; + let mut current_run_length = 0u16; + + for (&assign0, &assign1) in previous.iter().zip(current.iter()) { + if assign0 != assign1 { + // We are encoding the current, so the first value we encounter in the current should + // be added to the front of the pair + for value in [assign1, assign0] { + if !delta_pair[..pair_len].contains(&value) { + // We have found both values for the pair and yet encountered a third value + // so this is not a valid TwoDelta transition. + if pair_len == 2 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta transitions may involve at most two assignment ids", + )); + } + delta_pair[pair_len] = value; + pair_len += 1; + } + } + if current_run_length > 0 && current_value != assign1 { + run_lengths.push(current_run_length); + current_run_length = 1; + current_value = assign1; + } else { + current_run_length += 1; + } + } + } + run_lengths.push(current_run_length); + + Ok(TwoDeltaFrame::from_run_lengths( + (delta_pair[0], delta_pair[1]), + run_lengths, + )) } diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs new file mode 100644 index 0000000..5c562a1 --- /dev/null +++ b/ben/src/codec/encode/errors.rs @@ -0,0 +1,14 @@ +use std::io; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum BenEncodeError { + #[error("Encountered a repeated sample when encoding.")] + RepeatedSample, +} + +impl From for io::Error { + fn from(error: BenEncodeError) -> Self { + io::Error::new(io::ErrorKind::Other, error) + } +} diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index c6fba90..0b2c249 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -1,15 +1,14 @@ //! Encoding routines for BEN and XBEN formats. mod ben; +mod errors; mod jsonl; mod types; mod xz; #[cfg(test)] pub(crate) use ben::encode_ben32_line; -pub(crate) use ben::{ - build_twodelta_runs_with_hint, encode_ben32_assignments, encode_twodelta_vec_with_hint, -}; +pub(crate) use ben::{encode_ben32_assignments, encode_twodelta_vec_with_hint}; pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; diff --git a/ben/src/codec/encode/types.rs b/ben/src/codec/encode/types.rs index 49f4639..2613522 100644 --- a/ben/src/codec/encode/types.rs +++ b/ben/src/codec/encode/types.rs @@ -156,10 +156,10 @@ fn pack_fixed_width_items(items: &[u16], item_bits: u8) -> (u32, Vec) { /// length. #[derive(Debug, Clone, PartialEq, Eq)] pub struct BenFrame { - runs: Vec<(u16, u16)>, - max_val_bits: u8, - max_len_bits: u8, - n_bytes: u32, + pub runs: Vec<(u16, u16)>, + pub max_val_bits: u8, + pub max_len_bits: u8, + pub n_bytes: u32, bytes: Vec, } @@ -226,26 +226,6 @@ impl BenFrame { Self::from_rle(crate::util::rle::assign_to_rle(assignments)) } - /// Borrow the canonical RLE runs. - pub fn runs(&self) -> &[(u16, u16)] { - &self.runs - } - - /// Return the number of bits used to store each value. - pub fn max_val_bits(&self) -> u8 { - self.max_val_bits - } - - /// Return the number of bits used to store each run length. - pub fn max_len_bits(&self) -> u8 { - self.max_len_bits - } - - /// Return the payload length in bytes. - pub fn n_bytes(&self) -> u32 { - self.n_bytes - } - /// Borrow the serialized BEN frame bytes. pub fn as_slice(&self) -> &[u8] { &self.bytes @@ -295,9 +275,9 @@ impl PartialEq for Vec { /// just those two ids. The first run always corresponds to `pair.0`. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TwoDeltaFrame { - pair: (u16, u16), - max_len_bits: u8, - n_bytes: u32, + pub pair: (u16, u16), + pub max_len_bits: u8, + pub n_bytes: u32, bytes: Vec, } @@ -341,21 +321,6 @@ impl TwoDeltaFrame { } } - /// Return the ordered pair of ids used by the delta frame. - pub fn pair(&self) -> (u16, u16) { - self.pair - } - - /// Return the bit width of each encoded run length. - pub fn max_len_bits(&self) -> u8 { - self.max_len_bits - } - - /// Return the packed payload length in bytes. - pub fn n_bytes(&self) -> u32 { - self.n_bytes - } - /// Borrow just the packed payload bytes. pub fn payload(&self) -> &[u8] { &self.bytes[9..] diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index e032856..4c5b478 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -459,7 +459,7 @@ impl BenDecoder { })?; let run_lengths = decode_twodelta_run_lengths(&frame)?; let assignment = - apply_twodelta_runs_to_assignment(assignment, frame.pair(), &run_lengths)?; + apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths)?; let keep_going = f(&assignment, count)?; self.previous_assignment = Some(assignment); if !keep_going { @@ -514,10 +514,10 @@ pub(crate) fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> n_bits_in_buff; n_bits_in_buff += 8; - if n_bits_in_buff >= frame.max_len_bits() as u16 && current.is_none() { - current = Some((buffer >> (32 - frame.max_len_bits())) as u16); - buffer <<= frame.max_len_bits(); - n_bits_in_buff -= frame.max_len_bits() as u16; + if n_bits_in_buff >= frame.max_len_bits as u16 && current.is_none() { + current = Some((buffer >> (32 - frame.max_len_bits)) as u16); + buffer <<= frame.max_len_bits; + n_bits_in_buff -= frame.max_len_bits as u16; } if let Some(item) = current.take() { @@ -526,10 +526,10 @@ pub(crate) fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result= frame.max_len_bits() as u16 { - let item = (buffer >> (32 - frame.max_len_bits())) as u16; - buffer <<= frame.max_len_bits(); - n_bits_in_buff -= frame.max_len_bits() as u16; + while n_bits_in_buff >= frame.max_len_bits as u16 { + let item = (buffer >> (32 - frame.max_len_bits)) as u16; + buffer <<= frame.max_len_bits; + n_bits_in_buff -= frame.max_len_bits as u16; if item > 0 { items.push(item); } @@ -607,7 +607,7 @@ fn decode_twodelta_frame_to_assignment( frame: &TwoDeltaFrame, ) -> io::Result> { let run_lengths = decode_twodelta_run_lengths(frame)?; - apply_twodelta_runs_to_assignment(assignment, frame.pair(), &run_lengths) + apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths) } /// Decode a stored BEN frame into a full assignment vector. @@ -707,10 +707,10 @@ impl Iterator for BenFrameDecoeder { let encoded = encode_ben_vec_from_assign(&assignment); let raw_data = encoded.as_slice()[6..].to_vec(); Some(Ok(BenFrame { - max_val_bits: encoded.max_val_bits(), - max_len_bits: encoded.max_len_bits(), + max_val_bits: encoded.max_val_bits, + max_len_bits: encoded.max_len_bits, count, - n_bytes: encoded.n_bytes(), + n_bytes: encoded.n_bytes, raw_data, })) } From 713d3036e803eba5ad8a7a5b4cbfae5908682afc Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:59:58 -0600 Subject: [PATCH 039/221] Start stubbing out bigger reorg --- ben/src/codec/encode/ben.rs | 480 ++----- ben/src/codec/encode/errors.rs | 2 +- ben/src/codec/encode/mod.rs | 10 +- ben/src/codec/encode/tests.rs | 5 +- ben/src/codec/encode/traits.rs | 7 + ben/src/codec/encode/twodelta.rs | 569 ++++++++ ben/src/codec/encode/types.rs | 357 ----- ben/src/codec/frames.rs | 241 ++++ ben/src/codec/mod.rs | 4 + ben/src/codec/translate/mod.rs | 5 +- ben/src/io/reader.rs | 152 +-- ben/src/io/reader/ben.rs | 1613 +++++++++++++++++++++++ ben/src/io/reader/errors.rs | 0 ben/src/io/reader/mod.rs | 1 + ben/src/io/reader/tests.rs | 0 ben/src/io/reader/twodelta.rs | 0 ben/src/io/{writer.rs => writer/ben.rs} | 50 +- ben/src/io/writer/frames.rs | 0 ben/src/io/writer/mod.rs | 3 + ben/src/io/writer/tests.rs | 0 ben/src/io/writer/twodelta.rs | 5 + ben/src/io/writer/utils.rs | 0 ben/src/ops/relabel/mod.rs | 6 +- ben/src/ops/relabel/tests.rs | 16 +- 24 files changed, 2670 insertions(+), 856 deletions(-) create mode 100644 ben/src/codec/encode/traits.rs create mode 100644 ben/src/codec/encode/twodelta.rs delete mode 100644 ben/src/codec/encode/types.rs create mode 100644 ben/src/codec/frames.rs create mode 100644 ben/src/io/reader/ben.rs create mode 100644 ben/src/io/reader/errors.rs create mode 100644 ben/src/io/reader/mod.rs create mode 100644 ben/src/io/reader/tests.rs create mode 100644 ben/src/io/reader/twodelta.rs rename ben/src/io/{writer.rs => writer/ben.rs} (96%) create mode 100644 ben/src/io/writer/frames.rs create mode 100644 ben/src/io/writer/mod.rs create mode 100644 ben/src/io/writer/tests.rs create mode 100644 ben/src/io/writer/twodelta.rs create mode 100644 ben/src/io/writer/utils.rs diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 141969d..4289efb 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,11 +1,9 @@ -use super::errors::BenEncodeError; -use super::types::{BenFrame, IdVec, TwoDeltaFrame}; +use crate::codec::encode::traits::{FromAssign, FromRLE}; +use crate::codec::frames::{BenEncodeFrame, MkvBenEncodeFrame}; +use crate::util::rle::assign_to_rle; use serde_json::Value; -use std::collections::HashMap; use std::io::{Error, ErrorKind, Result}; -pub(crate) type TwoDeltaRuns = ((u16, u16), Vec); - /// Encode a JSON assignment record into the ben32 frame representation used by /// XBEN streams. /// @@ -20,35 +18,42 @@ pub(crate) type TwoDeltaRuns = ((u16, u16), Vec); /// Returns the encoded ben32 frame bytes terminated by the four-byte `0` /// sentinel. #[cfg_attr(not(test), allow(dead_code))] -pub(crate) fn encode_ben32_line(data: Value) -> Result { - let assign_vec = data["assignment"].as_array().ok_or_else(|| { - Error::new( - ErrorKind::InvalidData, - "'assignment' field either missing or is not an array of integers", - ) - })?; - encode_ben32_assignments( - assign_vec - .iter() - .map(|assignment| { - let assign_u64 = assignment.as_u64().ok_or_else(|| { - Error::new( - ErrorKind::InvalidData, - format!( - "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", - assignment - ), - ) - })?; - u16::try_from(assign_u64).map_err(|_| { - Error::new( - ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", assign_u64), - ) - }) +pub(crate) fn encode_ben32_line(data: Value) -> Result> { + let json_value_assign_vec = match data["assignment"].as_array() { + Some(vec) => vec, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "'assignment' field either missing or is not an array of integers", + )) + } + }; + + let possible_assign_vec = json_value_assign_vec + .iter() + .map(|assignment| { + let assign_u64 = assignment.as_u64().ok_or_else(|| { + Error::new( + ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + assignment + ), + ) + })?; + u16::try_from(assign_u64).map_err(|_| { + Error::new( + ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", assign_u64), + ) }) - .collect::>>()?, - ) + }) + .collect::>>(); + + match possible_assign_vec { + Ok(vec) => encode_ben32_assignments(vec), + Err(e) => Err(e), + } } /// Encode an assignment vector into a Ben32 vector @@ -60,7 +65,7 @@ pub(crate) fn encode_ben32_line(data: Value) -> Result { /// # Returns /// /// Returns the encoded BEN32 frame byte vector. -pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result { +pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result> { let assign_vec = assign_vec.as_ref(); let mut prev_assign: u16 = 0; let mut count: u16 = 0; @@ -91,338 +96,123 @@ pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result< } ret.extend([0, 0, 0, 0]); - Ok(IdVec::U8(ret)) -} - -/// Encode a full assignment vector into a single BEN frame. -/// -/// # Arguments -/// -/// * `assign_vec` - The full assignment vector to encode. -/// -/// # Returns -/// -/// Returns the encoded BEN frame bytes, including the per-frame header. -pub fn encode_ben_vec_from_assign(assign_vec: impl AsRef<[u16]>) -> BenFrame { - BenFrame::from_assignment(assign_vec) -} - -/// Encode a run-length encoded assignment vector into a BEN frame. -/// -/// The returned byte vector contains the per-frame BEN header followed by the -/// packed `(value, run_length)` payload. -/// -/// # Arguments -/// -/// * `rle_vec` - The run-length encoded assignment vector as `(value, count)` -/// pairs. -/// -/// # Returns -/// -/// Returns the encoded BEN frame bytes, including the per-frame header. -pub fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> BenFrame { - BenFrame::from_rle(rle_vec) + Ok(ret) } -/// Encode a sample transition as a TwoDelta frame. -/// -/// The transition is valid only when all changed positions involve exactly two -/// assignment ids and positions outside that pair remain unchanged. -/// -/// # Arguments -/// -/// * `previous_assignment` - The previous full assignment vector. -/// * `new_assignment` - The next full assignment vector. -/// -/// # Returns -/// -/// Returns a serialized TwoDelta frame describing the transition. -pub fn encode_twodelta_vec( - previous_assignment: impl AsRef<[u16]>, - new_assignment: impl AsRef<[u16]>, -) -> Result { - encode_twodelta_vec_with_hint(previous_assignment, new_assignment, None, None) -} - -/// Encode a sample transition as a TwoDelta frame, using hints to help speed up the -/// encoding process. -/// -/// In the case that the delta pair exists, we will take it as gospel that the pairs that were -/// swapped were the ones in the delta pair. This is a hyper optimization included to improve -/// encoding speed of recombination algorithms, in particular. -/// -/// # Arguments -/// -/// * `previous_assignment` - The previous full assignment vector. -/// * `new_assignment` - The next full assignment vector. -/// * `delta_pair` - An optional pair of assignment ids that are expected to be involved in -/// the transition. If provided, the function will check that only these two ids are involved in -/// the changes between the previous and new assignments, and that they occupy the same positions. -/// * `masks` - An optional mapping from assignment ids to their positions in the previous -/// assignment vector. If provided, the function will use these masks to efficiently compute the -/// positions of the changed ids, and will validate that they are consistent with the actual -/// changes between the previous and new assignments. -/// -/// # Returns -/// A serialized TwoDelta frame describing the transition, or an error if the hints are -/// invalid -pub(crate) fn encode_twodelta_vec_with_hint( - previous_assignment: impl AsRef<[u16]>, - new_assignment: impl AsRef<[u16]>, - delta_pair: Option<(u16, u16)>, - masks: Option<&mut HashMap>>, -) -> Result { - let previous_assignment = previous_assignment.as_ref(); - let new_assignment = new_assignment.as_ref(); +/// Compresses a Run-length encoded vector into a BEN-bytes vector. +fn compress_rle_to_bytes( + max_val_bit_count: u8, + max_len_bit_count: u8, + n_bytes: u32, + runs: &Vec<(u16, u16)>, +) -> Vec { + let mut bytes = Vec::with_capacity(6 + n_bytes as usize); + bytes.push(max_val_bit_count); + bytes.push(max_len_bit_count); + bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &(val, len) in runs { + let mut packed = (remainder << max_val_bit_count) | (val as u32); + let mut bits_left = remainder_bits + max_val_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } - if previous_assignment.len() != new_assignment.len() { - return Err(Error::new( - ErrorKind::InvalidData, - format!( - "TwoDelta requires previous and new assignment vectors to be of \ - equal length, but got lengths {} and {}", - previous_assignment.len(), - new_assignment.len() - ), - )); - } + packed = (packed << max_len_bit_count) | (len as u32); + bits_left += max_len_bit_count; - if delta_pair.is_some() { - if masks.is_none() { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair hint provided without corresponding masks", - )); - } - let pair = delta_pair.unwrap(); - if pair.0 == pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair hint cannot have identical values for the two ids", - )); + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); } + + remainder = packed; + remainder_bits = bits_left; } - match (delta_pair, masks) { - (Some(pair), Some(masks)) => determine_twodelta_frame_from_pair_and_mask_hints( - previous_assignment, - new_assignment, - pair, - masks, - ), - (None, Some(masks)) => { - determine_twodelta_frame_from_mask_hint(previous_assignment, new_assignment, masks) - } - _ => determine_twodelta_frame_from_scratch(previous_assignment, new_assignment), + if remainder_bits > 0 { + bytes.push((remainder << (8 - remainder_bits)) as u8); } - // Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) + bytes } -fn validate_masks_and_order_pairs( - pair: (u16, u16), - masks: &HashMap>, -) -> Result<(u16, u16)> { - let mask_a = match masks.get(&pair.0) { - Some(m) => m, - None => { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask is missing for the previous assignment", - )) - } - }; - - let mask_b = match masks.get(&pair.1) { - Some(m) => m, - None => { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask is missing for the current assignment", - )) +impl FromRLE for BenEncodeFrame { + /// Build a frame from an RLE run vector. + fn from_rle(runs: Vec<(u16, u16)>, _count: Option) -> Self { + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let raw_bytes = compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + + Self { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, } - }; - - if mask_a.len() == 0 { - return Err(Error::new( - ErrorKind::InvalidData, - format!("TwoDelta pair mask for the id {} is empty", pair.0), - )); - }; - - if mask_b.len() == 0 { - return Err(Error::new( - ErrorKind::InvalidData, - format!("TwoDelta pair mask for the id {} is empty", pair.1), - )); - }; - - if mask_a[0] < mask_b[0] { - Ok((pair.0, pair.1)) - } else { - Ok((pair.1, pair.0)) } } -fn determine_twodelta_frame_from_pair_and_mask_hints( - previous: &[u16], - current: &[u16], - delta_pair: (u16, u16), - masks: &mut HashMap>, -) -> Result { - let pair = match validate_masks_and_order_pairs(delta_pair, masks) { - Ok(pair) => pair, - Err(e) => { - return Err(Error::new( - ErrorKind::InvalidData, - format!( - "Encountered when validating masks and ordering pairs in \ - `determine_twodelta_run_from_pair_and_mask_hints`:\n{}", - e - ), - )); - } - }; - - let mask_a = masks - .get(&pair.0) - .expect("Failed to get mask for pair.0 after validation"); - let mask_b = masks - .get(&pair.1) - .expect("Failed to get mask for pair.1 after validation"); - - let new_capacity = mask_a.len() + mask_b.len(); - let mut run_lengths = Vec::with_capacity(new_capacity); - let mut new_mask_a = Vec::with_capacity(new_capacity); - let mut new_mask_b = Vec::with_capacity(new_capacity); - - let (mut i, mut j) = (0usize, 0usize); - let mut current_mask_count = 0u16; - let mut current_value = pair.0; - - let mut found_assignment_change = false; - - while i < mask_a.len() || j < mask_b.len() { - let idx = if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { - if current_value != pair.0 { - run_lengths.push(current_mask_count); - current_mask_count = 1; - current_value = pair.0; - } else { - current_mask_count += 1; - } - i += 1; - mask_a[i - 1] - } else { - if current_value != pair.1 { - run_lengths.push(current_mask_count); - current_mask_count = 1; - current_value = pair.1; - } else { - current_mask_count += 1; - } - j += 1; - mask_b[j - 1] - }; - - let previous_value = previous[idx]; - let current_value = current[idx]; - - if previous_value != pair.0 && previous_value != pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask referenced an index outside the selected id pair", - )); - } - if current_value != pair.0 && current_value != pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta payload encountered an assignment outside the selected id pair", - )); - } - if current_value != previous_value { - found_assignment_change = true; - } - - if current_value == pair.0 { - new_mask_a.push(idx); - } else { - new_mask_b.push(idx); - } - } - run_lengths.push(current_mask_count); - - // Special error that signals that we can reuse the last TwoDelta frame - if !found_assignment_change { - return Err(BenEncodeError::RepeatedSample.into()); +impl FromAssign for BenEncodeFrame { + /// Build a frame from a full assignment vector. + fn from_assignment(assignments: impl AsRef<[u16]>, _count: Option) -> Self { + Self::from_rle(assign_to_rle(assignments), _count) } - - masks.insert(pair.0, new_mask_a); - masks.insert(pair.1, new_mask_b); - Ok(TwoDeltaFrame::from_run_lengths(pair, run_lengths)) } -fn determine_twodelta_frame_from_mask_hint( - previous: &[u16], - current: &[u16], - masks: &mut HashMap>, -) -> Result { - for (&assign0, &assign1) in previous.iter().zip(current.iter()) { - if assign0 != assign1 { - return determine_twodelta_frame_from_pair_and_mask_hints( - previous, - current, - (assign0, assign1), - masks, - ); +impl FromRLE for MkvBenEncodeFrame { + /// Build a frame from an RLE run vector. + fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self { + let count = match count { + Some(v) => v, + None => 1, + }; + + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let mut raw_bytes = + compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + + raw_bytes.extend(count.to_be_bytes()); + + Self { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, } } - - return Err(BenEncodeError::RepeatedSample.into()); } -fn determine_twodelta_frame_from_scratch( - previous: &[u16], - current: &[u16], -) -> Result { - let mut delta_pair = [0u16; 2]; - let mut pair_len = 0usize; - - let mut run_lengths = Vec::new(); - let mut current_value = 0u16; - let mut current_run_length = 0u16; - - for (&assign0, &assign1) in previous.iter().zip(current.iter()) { - if assign0 != assign1 { - // We are encoding the current, so the first value we encounter in the current should - // be added to the front of the pair - for value in [assign1, assign0] { - if !delta_pair[..pair_len].contains(&value) { - // We have found both values for the pair and yet encountered a third value - // so this is not a valid TwoDelta transition. - if pair_len == 2 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta transitions may involve at most two assignment ids", - )); - } - delta_pair[pair_len] = value; - pair_len += 1; - } - } - if current_run_length > 0 && current_value != assign1 { - run_lengths.push(current_run_length); - current_run_length = 1; - current_value = assign1; - } else { - current_run_length += 1; - } - } +impl FromAssign for MkvBenEncodeFrame { + /// Build a frame from a full assignment vector. + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { + Self::from_rle(assign_to_rle(assignments), count) } - run_lengths.push(current_run_length); - - Ok(TwoDeltaFrame::from_run_lengths( - (delta_pair[0], delta_pair[1]), - run_lengths, - )) } diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index 5c562a1..c1027ca 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -2,7 +2,7 @@ use std::io; use thiserror::Error; #[derive(Debug, Error)] -pub enum BenEncodeError { +pub(crate) enum BenEncodeError { #[error("Encountered a repeated sample when encoding.")] RepeatedSample, } diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 0b2c249..c32bc7e 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -3,15 +3,17 @@ mod ben; mod errors; mod jsonl; -mod types; +mod traits; +mod twodelta; mod xz; +pub(crate) use ben::encode_ben32_assignments; +pub use traits::{FromAssign, FromRLE}; +pub(crate) use twodelta::encode_twodelta_frame_with_hint; + #[cfg(test)] pub(crate) use ben::encode_ben32_line; -pub(crate) use ben::{encode_ben32_assignments, encode_twodelta_vec_with_hint}; -pub use ben::{encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_twodelta_vec}; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; -pub use types::{BenFrame, IdItem, IdVec, TwoDeltaFrame}; pub use xz::{encode_ben_to_xben, xz_compress}; #[cfg(test)] diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index cc07a3d..6e6d5fb 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,4 +1,5 @@ use super::*; +use crate::codec::frames::BenEncodeFrame; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; @@ -261,8 +262,8 @@ fn test_encode_jsonl_to_ben_len_65535() { #[test] fn test_encode_ben_vec_from_assign_matches_rle_entrypoint() { let assign_vec = vec![4u16, 4, 4, 1, 1, 3, 3, 3, 2]; - let direct = encode_ben_vec_from_assign(assign_vec.clone()); - let via_rle = encode_ben_vec_from_rle(crate::util::rle::assign_to_rle(assign_vec)); + let direct = BenEncodeFrame::from_assignment(assign_vec.clone(), None); + let via_rle = BenEncodeFrame::from_rle(crate::util::rle::assign_to_rle(assign_vec), None); assert_eq!(direct, via_rle); } diff --git a/ben/src/codec/encode/traits.rs b/ben/src/codec/encode/traits.rs new file mode 100644 index 0000000..0380056 --- /dev/null +++ b/ben/src/codec/encode/traits.rs @@ -0,0 +1,7 @@ +pub trait FromRLE { + fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; +} + +pub trait FromAssign { + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self; +} diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs new file mode 100644 index 0000000..b8aa774 --- /dev/null +++ b/ben/src/codec/encode/twodelta.rs @@ -0,0 +1,569 @@ +use super::errors::BenEncodeError; +use crate::codec::frames::TwoDeltaFrame; +use std::collections::HashMap; +use std::io::{Error, ErrorKind, Result}; + +impl TwoDeltaFrame { + /// Build a TwoDelta frame by packing a run-length vector into the binary format. + /// + /// Run lengths are packed at `max_len_bit_count` bits per value (the minimum + /// bit width needed to represent the largest run length), MSB-first with no + /// padding between values. If the total bit count is not a multiple of 8, the + /// final byte is zero-padded on the right. + /// + /// The serialized layout is: + /// ```text + /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...] + /// ``` + /// where the payload is the bit-packed run lengths. + /// + /// # Arguments + /// + /// * `pair` - The ordered pair of assignment ids. `pair.0` corresponds to the first run. + /// * `run_length_vector` - The lengths of alternating runs of `pair.0` and `pair.1` + /// over the positions occupied by the pair, in position order. + /// + /// # Returns + /// + /// A fully serialized `TwoDeltaFrame` with both the packed `raw_bytes` and the + /// original `run_length_vector` stored on the struct. + pub fn from_run_lengths(pair: (u16, u16), run_length_vector: Vec) -> Self { + let max_len = run_length_vector.iter().copied().max().unwrap_or(0); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + + let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + + // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) + let mut raw_bytes = Vec::with_capacity((n_bytes + 9) as usize); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &item in &run_length_vector { + let mut packed = (remainder << max_len_bit_count) | item as u32; + let mut bits_left = remainder_bits + max_len_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + raw_bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + raw_bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + Self { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + } + } + + /// Reconstruct a TwoDelta frame from already-parsed header fields and a raw payload. + /// + /// This is the inverse of `from_run_lengths`: it re-assembles the serialized bytes + /// and decodes the bit-packed payload back into the run-length vector so that both + /// representations are available on the resulting frame. + /// + /// The decoding reads `max_len_bit_count` bits at a time from the payload, MSB-first, + /// and discards any trailing zero-valued items produced by right-padding in the final byte. + /// + /// # Arguments + /// + /// * `pair` - The ordered pair of assignment ids as read from the frame header. + /// * `max_len_bit_count` - The bit width of each packed run length, as read from the + /// frame header. + /// * `payload` - The raw packed payload bytes, not including the 9-byte header. + /// + /// # Returns + /// + /// A `TwoDeltaFrame` with both `raw_bytes` (header + payload) and the decoded + /// `run_length_vector` populated. + pub fn from_parts(pair: (u16, u16), max_len_bit_count: u8, payload: Vec) -> Self { + let n_bytes = payload.len() as u32; + let mut raw_bytes = Vec::with_capacity(9 + payload.len()); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + raw_bytes.extend_from_slice(&payload); + + let mut run_length_vector = Vec::new(); + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + let mut current: Option = None; + + for byte in payload { + buffer |= (byte as u32).to_be() >> n_bits_in_buff; + n_bits_in_buff += 8; + + if n_bits_in_buff >= max_len_bit_count as u16 && current.is_none() { + current = Some((buffer >> (32 - max_len_bit_count)) as u16); + buffer <<= max_len_bit_count; + n_bits_in_buff -= max_len_bit_count as u16; + } + + if let Some(item) = current.take() { + if item > 0 { + run_length_vector.push(item); + } + } + + while n_bits_in_buff >= max_len_bit_count as u16 { + let item = (buffer >> (32 - max_len_bit_count)) as u16; + buffer <<= max_len_bit_count; + n_bits_in_buff -= max_len_bit_count as u16; + if item > 0 { + run_length_vector.push(item); + } + } + } + + Self { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + } + } +} + +/// Encode a transition between two assignment vectors as a TwoDelta frame, optionally +/// using caller-supplied hints to accelerate encoding. +/// +/// # Arguments +/// +/// * `previous_assignment` - The full assignment vector from the preceding sample. +/// * `new_assignment` - The full assignment vector for the sample being encoded. +/// * `delta_pair` - An optional hint asserting which pair of ids is involved in the +/// transition. Must be provided together with `masks`, and the two ids must be distinct. +/// * `masks` - An optional mutable map from assignment id to the sorted list of positions +/// it occupies in `previous_assignment`. When provided, the map is updated in-place to +/// reflect `new_assignment` before returning. +/// +/// # Returns +/// +/// A `TwoDeltaFrame` describing the transition from `previous_assignment` to +/// `new_assignment`. +/// +/// # TwoDelta encoding +/// +/// A TwoDelta frame is valid only when every position that changes between +/// `previous_assignment` and `new_assignment` involves exactly two assignment ids +/// (call them A and B), and no position outside that pair changes. The frame stores +/// the pair and the lengths of alternating runs of A and B over the positions +/// occupied by the pair, ordered by position. The first run always corresponds to +/// whichever id occupies the lowest-indexed position. +/// +/// # Hints +/// +/// Two optional hints can be provided to avoid scanning the full assignment vector: +/// +/// - `delta_pair`: The caller asserts that exactly this pair of ids is involved in +/// the transition. Must be provided together with `masks`. The pair must have two +/// distinct ids — passing `(x, x)` is an error. +/// +/// - `masks`: A mutable map from assignment id to the sorted list of positions it +/// occupies in `previous_assignment`. When provided, the function reads positions +/// directly from the map instead of scanning the assignment vector, and updates +/// the map in-place to reflect `new_assignment` before returning. The masks must +/// cover every id that appears in the pair; a missing or empty entry is an error. +/// +/// The hints are not independent: `delta_pair` requires `masks`. Providing `masks` +/// without `delta_pair` is allowed — the function will infer the pair from the first +/// differing position and then use the masks from there. +/// +/// When no hints are provided the function falls back to a full scan of both +/// assignment vectors. +/// +/// # Errors +/// +/// Returns an error if: +/// - The assignment vectors have different lengths. +/// - `delta_pair` is provided without `masks`. +/// - `delta_pair` contains two identical ids. +/// - A mask entry required by the pair is absent or empty. +/// - A position referenced by a mask holds a value outside the pair. +/// - The transition involves more than two distinct ids. +/// - The two assignments are identical (returns `BenEncodeError::RepeatedSample`). +pub(crate) fn encode_twodelta_frame_with_hint( + previous_assignment: impl AsRef<[u16]>, + new_assignment: impl AsRef<[u16]>, + delta_pair: Option<(u16, u16)>, + masks: Option<&mut HashMap>>, +) -> Result { + let previous_assignment = previous_assignment.as_ref(); + let new_assignment = new_assignment.as_ref(); + + if previous_assignment.len() != new_assignment.len() { + return Err(Error::new( + ErrorKind::InvalidData, + format!( + "TwoDelta requires previous and new assignment vectors to be of \ + equal length, but got lengths {} and {}", + previous_assignment.len(), + new_assignment.len() + ), + )); + } + + if delta_pair.is_some() { + if masks.is_none() { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair hint provided without corresponding masks", + )); + } + let pair = delta_pair.unwrap(); + if pair.0 == pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair hint cannot have identical values for the two ids", + )); + } + } + + match (delta_pair, masks) { + (Some(pair), Some(masks)) => construct_twodelta_frame_from_pair_and_mask_hints( + previous_assignment, + new_assignment, + pair, + masks, + ), + (None, Some(masks)) => { + construct_twodelta_frame_from_mask_hint(previous_assignment, new_assignment, masks) + } + _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment), + } + + // Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) +} + +/// Validate that `masks` contains non-empty entries for both ids in `pair` and return +/// the pair ordered so that `pair.0` occupies a lower index than `pair.1`. +/// +/// Ordering by first position ensures that the run-length sequence produced during +/// encoding always begins with the id whose positions come first in the assignment +/// vector, which is required for deterministic round-trip decoding. +/// +/// # Arguments +/// +/// * `pair` - The two assignment ids to validate and order. +/// * `masks` - The position mask map to look up entries in. +/// +/// # Returns +/// +/// The pair reordered so that `pair.0` has a smaller first position than `pair.1`, +/// or an error if either id is absent from `masks` or has an empty position list. +fn validate_masks_and_order_pairs_for_twodelta( + pair: (u16, u16), + masks: &HashMap>, +) -> Result<(u16, u16)> { + let mask_a = match masks.get(&pair.0) { + Some(m) => m, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair mask is missing for the previous assignment", + )) + } + }; + + let mask_b = match masks.get(&pair.1) { + Some(m) => m, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair mask is missing for the current assignment", + )) + } + }; + + if mask_a.len() == 0 { + return Err(Error::new( + ErrorKind::InvalidData, + format!("TwoDelta pair mask for the id {} is empty", pair.0), + )); + }; + + if mask_b.len() == 0 { + return Err(Error::new( + ErrorKind::InvalidData, + format!("TwoDelta pair mask for the id {} is empty", pair.1), + )); + }; + + if mask_a[0] < mask_b[0] { + Ok((pair.0, pair.1)) + } else { + Ok((pair.1, pair.0)) + } +} + +/// Build a TwoDelta frame using both a known pair and pre-computed position masks. +/// +/// This is the fast path used during recombination-aware encoding, where the caller +/// already knows which two ids are swapping and has maintained a mask for each id. +/// +/// The function merges the two sorted position lists from `masks` to produce the +/// interleaved sequence of positions, validates that every referenced position in +/// `previous` and `current` belongs to the pair, computes the run lengths over +/// `current`, and then updates `masks` in-place to reflect the new positions of +/// each id in `current`. +/// +/// # Arguments +/// +/// * `previous` - The full assignment vector from the preceding sample. +/// * `current` - The full assignment vector for the sample being encoded. +/// * `delta_pair` - The pair of ids asserted to be involved in the transition. +/// * `masks` - Mutable position mask map for both ids in the pair. Updated in-place +/// to reflect `current` before returning. +/// +/// # Returns +/// +/// A `TwoDeltaFrame` for the transition, or `BenEncodeError::RepeatedSample` if no +/// position actually changed value (signalling the frame can be deduplicated), or +/// another error if a mask entry is inconsistent with the assignment data. +fn construct_twodelta_frame_from_pair_and_mask_hints( + previous: &[u16], + current: &[u16], + delta_pair: (u16, u16), + masks: &mut HashMap>, +) -> Result { + let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, masks) { + Ok(pair) => pair, + Err(e) => { + return Err(Error::new( + ErrorKind::InvalidData, + format!( + "Encountered when validating masks and ordering pairs in \ + `determine_twodelta_run_from_pair_and_mask_hints`:\n{}", + e + ), + )); + } + }; + + let mask_a = masks + .get(&pair.0) + .expect("Failed to get mask for pair.0 after validation"); + let mask_b = masks + .get(&pair.1) + .expect("Failed to get mask for pair.1 after validation"); + + let new_capacity = mask_a.len() + mask_b.len(); + let mut run_lengths = Vec::with_capacity(new_capacity); + // Accumulate updated masks reflecting positions in `current`. + let mut new_mask_a = Vec::with_capacity(new_capacity); + let mut new_mask_b = Vec::with_capacity(new_capacity); + + // Two-pointer merge over the sorted position lists. `current_value` tracks + // which id owns the active run; `current_mask_count` is the length of that run. + let (mut i, mut j) = (0usize, 0usize); + let mut current_mask_count = 0u16; + let mut current_value = pair.0; + + let mut found_assignment_change = false; + + while i < mask_a.len() || j < mask_b.len() { + // Pick the next position from whichever mask is lower, mirroring the + // merge step used when building pair_positions from two masks. + let idx = if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { + if current_value != pair.0 { + run_lengths.push(current_mask_count); + current_mask_count = 1; + current_value = pair.0; + } else { + current_mask_count += 1; + } + i += 1; + mask_a[i - 1] + } else { + if current_value != pair.1 { + run_lengths.push(current_mask_count); + current_mask_count = 1; + current_value = pair.1; + } else { + current_mask_count += 1; + } + j += 1; + mask_b[j - 1] + }; + + let previous_value = previous[idx]; + let current_value = current[idx]; + + if previous_value != pair.0 && previous_value != pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta pair mask referenced an index outside the selected id pair", + )); + } + if current_value != pair.0 && current_value != pair.1 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta payload encountered an assignment outside the selected id pair", + )); + } + if current_value != previous_value { + found_assignment_change = true; + } + + if current_value == pair.0 { + new_mask_a.push(idx); + } else { + new_mask_b.push(idx); + } + } + run_lengths.push(current_mask_count); + + // Special error that signals that we can reuse the last TwoDelta frame + if !found_assignment_change { + return Err(BenEncodeError::RepeatedSample.into()); + } + + masks.insert(pair.0, new_mask_a); + masks.insert(pair.1, new_mask_b); + Ok(TwoDeltaFrame::from_run_lengths(pair, run_lengths)) +} + +/// Build a TwoDelta frame using only pre-computed position masks, inferring the pair +/// from the first differing position between `previous` and `current`. +/// +/// Scans until it finds a position where the two assignments differ, then delegates +/// to `construct_twodelta_frame_from_pair_and_mask_hints` with that pair. If no +/// difference is found the assignments are identical and +/// `BenEncodeError::RepeatedSample` is returned. +/// +/// # Arguments +/// +/// * `previous` - The full assignment vector from the preceding sample. +/// * `current` - The full assignment vector for the sample being encoded. +/// * `masks` - Mutable position mask map covering all ids that may appear in the pair. +/// Updated in-place to reflect `current` before returning. +/// +/// # Returns +/// +/// A `TwoDeltaFrame` for the transition, or `BenEncodeError::RepeatedSample` if the +/// two assignments are identical. +fn construct_twodelta_frame_from_mask_hint( + previous: &[u16], + current: &[u16], + masks: &mut HashMap>, +) -> Result { + for (&assign0, &assign1) in previous.iter().zip(current.iter()) { + if assign0 != assign1 { + return construct_twodelta_frame_from_pair_and_mask_hints( + previous, + current, + (assign0, assign1), + masks, + ); + } + } + + return Err(BenEncodeError::RepeatedSample.into()); +} + +/// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no +/// hints from the caller. +/// +/// Simultaneously discovers the pair and computes run lengths in a single pass over +/// the zipped assignments. Only positions where the two assignments differ are +/// considered; unchanged positions are skipped entirely. The pair is ordered so that +/// the first id encountered in `current` at a changed position becomes `pair.0`, +/// which ensures the run-length sequence begins with the id that appears first. +/// +/// # Arguments +/// +/// * `previous` - The full assignment vector from the preceding sample. +/// * `current` - The full assignment vector for the sample being encoded. +/// +/// # Returns +/// +/// A `TwoDeltaFrame` for the transition, or an error if more than two distinct ids +/// appear across all changed positions. +fn construct_twodelta_frame_from_scratch( + previous: &[u16], + current: &[u16], +) -> Result { + let mut delta_pair = [0u16; 2]; + let mut pair_len = 0usize; + + let mut run_lengths = Vec::new(); + let mut current_value = 0u16; + let mut current_run_length = 0u16; + + for (&assign0, &assign1) in previous.iter().zip(current.iter()) { + if assign0 != assign1 { + // We are encoding the current, so the first value we encounter in the current should + // be added to the front of the pair + for value in [assign1, assign0] { + if !delta_pair[..pair_len].contains(&value) { + // We have found both values for the pair and yet encountered a third value + // so this is not a valid TwoDelta transition. + if pair_len == 2 { + return Err(Error::new( + ErrorKind::InvalidData, + "TwoDelta transitions may involve at most two assignment ids", + )); + } + delta_pair[pair_len] = value; + pair_len += 1; + } + } + if current_run_length > 0 && current_value != assign1 { + run_lengths.push(current_run_length); + current_run_length = 1; + current_value = assign1; + } else { + current_run_length += 1; + } + } + } + run_lengths.push(current_run_length); + + Ok(TwoDeltaFrame::from_run_lengths( + (delta_pair[0], delta_pair[1]), + run_lengths, + )) +} + +/// Encode a transition between two assignment vectors as a TwoDelta frame. +/// +/// This is the unhinted entry point. It falls back to a full scan of both +/// assignment vectors to discover the pair and compute run lengths. Prefer +/// `encode_twodelta_frame_with_hint` when masks are available, as it avoids +/// the scan entirely. +/// +/// The transition is valid only when all changed positions involve exactly two +/// assignment ids and positions outside that pair remain unchanged. +/// +/// # Arguments +/// +/// * `previous_assignment` - The previous full assignment vector. +/// * `new_assignment` - The next full assignment vector. +/// +/// # Returns +/// +/// Returns a TwoDelta frame describing the transition, or an error if the +/// transition involves more than two ids or the assignments are identical. +pub fn encode_twodelta_frame( + previous_assignment: impl AsRef<[u16]>, + new_assignment: impl AsRef<[u16]>, +) -> Result { + encode_twodelta_frame_with_hint(previous_assignment, new_assignment, None, None) +} diff --git a/ben/src/codec/encode/types.rs b/ben/src/codec/encode/types.rs deleted file mode 100644 index 2613522..0000000 --- a/ben/src/codec/encode/types.rs +++ /dev/null @@ -1,357 +0,0 @@ -use std::io; - -/// Typed identifier storage used by experimental delta encoders. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum IdVec { - U8(Vec), - U16(Vec), -} - -/// A single typed identifier item. -#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)] -pub enum IdItem { - U8(u8), - U16(u16), -} - -impl IdVec { - /// Borrow the inner `u8` bytes. - pub fn as_slice(&self) -> &[u8] { - self.as_u8_slice().expect("expected U8-encoded payload") - } - - /// Borrow the inner `u8` bytes, returning an error on variant mismatch. - pub fn as_u8_slice(&self) -> io::Result<&[u8]> { - match self { - IdVec::U8(v) => Ok(v.as_slice()), - IdVec::U16(_) => Err(io::Error::new( - io::ErrorKind::InvalidData, - "expected U8-encoded payload", - )), - } - } - - /// Consume into raw `u8` bytes. - pub fn into_u8_vec(self) -> io::Result> { - match self { - IdVec::U8(v) => Ok(v), - IdVec::U16(_) => Err(io::Error::new( - io::ErrorKind::InvalidData, - "expected U8-encoded payload", - )), - } - } - - /// Return the logical element count. - pub fn len(&self) -> usize { - match self { - IdVec::U8(v) => v.len(), - IdVec::U16(v) => v.len(), - } - } - - /// Return whether the container is empty. - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Iterate over items while preserving the original scalar type. - pub fn iter(&self) -> impl Iterator + '_ { - match self { - IdVec::U8(v) => { - Box::new(v.iter().copied().map(IdItem::U8)) as Box> - } - IdVec::U16(v) => Box::new(v.iter().copied().map(IdItem::U16)), - } - } - - /// Return the item at index `i`, if any. - pub fn get(&self, i: usize) -> Option { - match self { - IdVec::U8(v) => v.get(i).copied().map(IdItem::U8), - IdVec::U16(v) => v.get(i).copied().map(IdItem::U16), - } - } -} - -impl<'a> IntoIterator for &'a IdVec { - type Item = IdItem; - type IntoIter = Box + 'a>; - - fn into_iter(self) -> Self::IntoIter { - Box::new(self.iter()) - } -} - -impl AsRef<[u8]> for IdVec { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for IdVec { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for IdVec { - fn eq(&self, other: &Vec) -> bool { - matches!(self, IdVec::U8(v) if v == other) - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &IdVec) -> bool { - matches!(other, IdVec::U8(v) if self == v) - } -} - -/// Pack a slice of items into a byte vector using a fixed bit width per item. -/// -/// # Arguments -/// -/// * `items` - The values to pack. -/// * `item_bits` - The number of bits used to encode each item. -/// -/// # Returns -/// -/// Returns the payload length in bytes and the packed byte vector. -fn pack_fixed_width_items(items: &[u16], item_bits: u8) -> (u32, Vec) { - let payload_bits = item_bits as u32 * items.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - let mut bytes = Vec::with_capacity(n_bytes as usize); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for &item in items { - let mut packed = (remainder << item_bits) | item as u32; - let mut bits_left = remainder_bits + item_bits; - - while bits_left >= 8 { - bits_left -= 8; - bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - remainder = packed; - remainder_bits = bits_left; - } - - if remainder_bits > 0 { - bytes.push((remainder << (8 - remainder_bits)) as u8); - } - - (n_bytes, bytes) -} - -/// Canonical representation of a BEN frame. -/// -/// The frame stores the semantic RLE runs together with the derived header -/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN -/// frame, including the two one-byte bit-width fields and the four-byte payload -/// length. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BenFrame { - pub runs: Vec<(u16, u16)>, - pub max_val_bits: u8, - pub max_len_bits: u8, - pub n_bytes: u32, - bytes: Vec, -} - -impl BenFrame { - /// Build a frame from an RLE run vector. - pub fn from_rle(runs: Vec<(u16, u16)>) -> Self { - let (max_val, max_len) = runs - .iter() - .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { - (max_val.max(val), max_len.max(len)) - }); - let max_val_bits = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bits = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bits + max_len_bits) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - - let mut bytes = Vec::with_capacity(6 + n_bytes as usize); - bytes.push(max_val_bits); - bytes.push(max_len_bits); - bytes.extend_from_slice(&n_bytes.to_be_bytes()); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for &(val, len) in &runs { - let mut packed = (remainder << max_val_bits) | (val as u32); - let mut bits_left = remainder_bits + max_val_bits; - - while bits_left >= 8 { - bits_left -= 8; - bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - packed = (packed << max_len_bits) | (len as u32); - bits_left += max_len_bits; - - while bits_left >= 8 { - bits_left -= 8; - bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - remainder = packed; - remainder_bits = bits_left; - } - - if remainder_bits > 0 { - bytes.push((remainder << (8 - remainder_bits)) as u8); - } - - Self { - runs, - max_val_bits, - max_len_bits, - n_bytes, - bytes, - } - } - - /// Build a frame from a full assignment vector. - pub fn from_assignment(assignments: impl AsRef<[u16]>) -> Self { - Self::from_rle(crate::util::rle::assign_to_rle(assignments)) - } - - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.bytes - } -} - -impl AsRef<[u8]> for BenFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for BenFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for BenFrame { - fn eq(&self, other: &Vec) -> bool { - self.bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &BenFrame) -> bool { - *self == other.bytes - } -} - -/// Canonical representation of a TwoDelta frame. -/// -/// A TwoDelta frame stores the two assignment ids that may change relative to -/// the previous sample and then encodes the lengths of alternating runs over -/// just those two ids. The first run always corresponds to `pair.0`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TwoDeltaFrame { - pub pair: (u16, u16), - pub max_len_bits: u8, - pub n_bytes: u32, - bytes: Vec, -} - -impl TwoDeltaFrame { - /// Build a TwoDelta frame from a pair ordering and run lengths. - pub fn from_run_lengths(pair: (u16, u16), run_lengths: Vec) -> Self { - let max_len = run_lengths.iter().copied().max().unwrap_or(0); - let max_len_bits = (16 - max_len.leading_zeros() as u8).max(1); - let (n_bytes, payload_bytes) = pack_fixed_width_items(&run_lengths, max_len_bits); - - let mut bytes = Vec::with_capacity(9 + payload_bytes.len()); - bytes.extend_from_slice(&pair.0.to_be_bytes()); - bytes.extend_from_slice(&pair.1.to_be_bytes()); - bytes.push(max_len_bits); - bytes.extend_from_slice(&n_bytes.to_be_bytes()); - bytes.extend_from_slice(&payload_bytes); - - Self { - pair, - max_len_bits, - n_bytes, - bytes, - } - } - - /// Rebuild a TwoDelta frame from already-parsed header fields and payload bytes. - pub fn from_parts(pair: (u16, u16), max_len_bits: u8, payload: Vec) -> Self { - let n_bytes = payload.len() as u32; - let mut bytes = Vec::with_capacity(9 + payload.len()); - bytes.extend_from_slice(&pair.0.to_be_bytes()); - bytes.extend_from_slice(&pair.1.to_be_bytes()); - bytes.push(max_len_bits); - bytes.extend_from_slice(&n_bytes.to_be_bytes()); - bytes.extend_from_slice(&payload); - - Self { - pair, - max_len_bits, - n_bytes, - bytes, - } - } - - /// Borrow just the packed payload bytes. - pub fn payload(&self) -> &[u8] { - &self.bytes[9..] - } - - /// Borrow the serialized TwoDelta frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.bytes - } - - /// Clone out the serialized TwoDelta frame bytes. - pub fn to_bytes(&self) -> Vec { - self.bytes.clone() - } - - /// Consume the frame and return the serialized bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.bytes - } -} - -impl AsRef<[u8]> for TwoDeltaFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for TwoDeltaFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} diff --git a/ben/src/codec/frames.rs b/ben/src/codec/frames.rs new file mode 100644 index 0000000..bceaba8 --- /dev/null +++ b/ben/src/codec/frames.rs @@ -0,0 +1,241 @@ +/// Canonical representation of a BEN frame. +/// +/// The frame stores the semantic RLE runs together with the derived header +/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN +/// frame, including the two one-byte bit-width fields and the four-byte payload +/// length. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BenEncodeFrame { + // The RLE runs that were encoded into this frame, stored here for reference + pub runs: Vec<(u16, u16)>, + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized BEN frame bytes, including the header and payload. + pub raw_bytes: Vec, +} + +impl BenEncodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for BenEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenEncodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenEncodeFrame) -> bool { + *self == other.raw_bytes + } +} + +/// Canonical representation of a BEN frame. +/// +/// The frame stores the semantic RLE runs together with the derived header +/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN +/// frame, including the two one-byte bit-width fields and the four-byte payload +/// length. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MkvBenEncodeFrame { + // The RLE runs that were encoded into this frame, stored here for reference + pub runs: Vec<(u16, u16)>, + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized MKVBEN frame bytes, including the header and payload. + pub raw_bytes: Vec, + // The number of times that this frame was repeated + pub count: u16, +} + +impl MkvBenEncodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for MkvBenEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for MkvBenEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for MkvBenEncodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &MkvBenEncodeFrame) -> bool { + *self == other.raw_bytes + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BenDecodeFrame { + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized BEN frame bytes, including the header and payload. + pub raw_bytes: Vec, + // The number of times this frame was repeated + pub count: usize, +} + +impl BenDecodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for BenDecodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenDecodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenDecodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenDecodeFrame) -> bool { + *self == other.raw_bytes + } +} + +/// Canonical representation of a TwoDelta frame. +/// +/// A TwoDelta frame stores the two assignment ids that may change relative to +/// the previous sample and then encodes the lengths of alternating runs over +/// just those two ids. The first run always corresponds to `pair.0`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TwoDeltaFrame { + // The pair of assignment ids that are encoded in this frame, stored here for reference. + // Canonically, `pair.0` is the id for the first run in the run-length vector and `pair.1` + // is the id for the second run. + pub pair: (u16, u16), + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The run-length vector that was encoded into this frame, stored here for reference. + pub run_length_vector: Vec, + // The full serialized TwoDelta frame bytes, including the header and payload. + pub raw_bytes: Vec, +} + +impl TwoDeltaFrame { + /// Borrow just the packed payload bytes. + pub fn payload(&self) -> &[u8] { + &self.raw_bytes[9..] + } + + /// Borrow the serialized TwoDelta frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized TwoDelta frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for TwoDeltaFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for TwoDeltaFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index 9903d3b..b37ecb2 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -7,4 +7,8 @@ pub mod decode; pub mod encode; +pub mod frames; pub mod translate; + +pub use encode::{FromAssign, FromRLE}; +pub use frames::{BenDecodeFrame, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaFrame}; diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 08346b5..a40a849 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -1,10 +1,11 @@ //! Translation helpers between BEN and ben32 representations. +use crate::codec::{FromAssign, FromRLE}; use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Error, Read, Write}; use crate::codec::decode::decode_ben_line; -use crate::codec::encode::encode_ben_vec_from_rle; +use crate::codec::BenEncodeFrame; use crate::{progress, BenVariant}; /// Convert a single ben32 frame into a BEN frame payload. @@ -47,7 +48,7 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { )); } - Ok(encode_ben_vec_from_rle(ben32_rle).into_bytes()) + Ok(BenEncodeFrame::from_rle(ben32_rle, None).into_bytes()) } /// Translate a stream of ben32 frames into BEN frames. diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs index 4c5b478..dc3acc6 100644 --- a/ben/src/io/reader.rs +++ b/ben/src/io/reader.rs @@ -1,5 +1,5 @@ use crate::codec::decode::{decode_ben32_line, decode_ben_line}; -use crate::codec::encode::{encode_ben32_assignments, encode_ben_vec_from_assign, TwoDeltaFrame}; +use crate::codec::{BenDecodeFrame, TwoDeltaFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; @@ -20,7 +20,7 @@ pub type MkvRecord = (Vec, u16); /// A raw ben32 frame together with the number of times it repeats. pub type Ben32Frame = (Vec, u16); /// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. -pub type FrameIter = Box> + Send>; +pub type FrameIter = Box> + Send>; #[derive(Debug)] /// Errors produced while validating the header of a decoder input stream. @@ -128,26 +128,8 @@ pub struct BenDecoder { silent: bool, } -#[derive(Clone)] -/// A single raw BEN frame. -/// -/// `raw_data` contains only the packed `(value, run_length)` payload and does -/// not include the outer frame header fields. -pub struct BenFrame { - /// Number of bits used to encode each label value in `raw_data`. - pub max_val_bits: u8, - /// Number of bits used to encode each run length in `raw_data`. - pub max_len_bits: u8, - /// Number of repeated samples represented by this frame. - pub count: u16, - /// Length in bytes of the packed payload stored in `raw_data`. - pub n_bytes: u32, - /// Packed BEN payload for this frame. - pub raw_data: Vec, -} - enum StoredBenFrame { - Ben(BenFrame), + Ben(BenDecodeFrame), TwoDelta { frame: TwoDeltaFrame, count: u16 }, } @@ -255,9 +237,12 @@ impl BenDecoder { /// /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read /// failure, or `None` at a clean end of stream. - fn pop_standard_frame_from_reader(&mut self, with_count: bool) -> Option> { + fn pop_standard_frame_from_reader( + &mut self, + with_count: bool, + ) -> Option> { let mut b1 = [0u8; 1]; - let max_val_bits = match self.reader.read_exact(&mut b1) { + let max_val_bit_count = match self.reader.read_exact(&mut b1) { Ok(()) => b1[0], Err(e) => { if e.kind() == io::ErrorKind::UnexpectedEof { @@ -273,7 +258,7 @@ impl BenDecoder { if let Err(e) = self.reader.read_exact(&mut b2) { return Some(Err(e)); } - let max_len_bits = b2[0]; + let max_len_bit_count = b2[0]; let n_bytes = match self.reader.read_u32::() { Ok(n) => n, @@ -294,11 +279,11 @@ impl BenDecoder { 1 }; - Some(Ok(BenFrame { - max_val_bits, - max_len_bits, + Some(Ok(BenDecodeFrame { + max_val_bit_count, + max_len_bit_count, n_bytes, - raw_data: raw_assignment, + raw_bytes: raw_assignment, count, })) } @@ -457,7 +442,7 @@ impl BenDecoder { "TwoDelta frame encountered before an initial BEN frame", ) })?; - let run_lengths = decode_twodelta_run_lengths(&frame)?; + let run_lengths = frame.run_length_vector; let assignment = apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths)?; let keep_going = f(&assignment, count)?; @@ -485,9 +470,9 @@ impl BenDecoder { /// # Returns /// /// Returns the expanded assignment vector. -fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { +fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { decode_ben_line( - Cursor::new(&frame.raw_data), + Cursor::new(&frame.raw_bytes), frame.max_val_bits, frame.max_len_bits, frame.n_bytes, @@ -495,50 +480,6 @@ fn decode_ben_frame_to_assignment(frame: &BenFrame) -> io::Result> { .map(rle_to_vec) } -/// Decode the run-length payload of a TwoDelta frame. -/// -/// # Arguments -/// -/// * `frame` - The TwoDelta frame whose packed payload is decoded. -/// -/// # Returns -/// -/// Returns the sequence of non-zero run lengths extracted from the payload. -pub(crate) fn decode_twodelta_run_lengths(frame: &TwoDeltaFrame) -> io::Result> { - let mut items = Vec::new(); - let mut buffer: u32 = 0; - let mut n_bits_in_buff: u16 = 0; - let mut current: Option = None; - - for &byte in frame.payload() { - buffer |= (byte as u32).to_be() >> n_bits_in_buff; - n_bits_in_buff += 8; - - if n_bits_in_buff >= frame.max_len_bits as u16 && current.is_none() { - current = Some((buffer >> (32 - frame.max_len_bits)) as u16); - buffer <<= frame.max_len_bits; - n_bits_in_buff -= frame.max_len_bits as u16; - } - - if let Some(item) = current.take() { - if item > 0 { - items.push(item); - } - } - - while n_bits_in_buff >= frame.max_len_bits as u16 { - let item = (buffer >> (32 - frame.max_len_bits)) as u16; - buffer <<= frame.max_len_bits; - n_bits_in_buff -= frame.max_len_bits as u16; - if item > 0 { - items.push(item); - } - } - } - - Ok(items) -} - /// Apply decoded TwoDelta run lengths to produce a new assignment vector. /// /// Positions in `previous_assignment` that hold either value of `pair` are @@ -606,8 +547,7 @@ fn decode_twodelta_frame_to_assignment( assignment: Vec, frame: &TwoDeltaFrame, ) -> io::Result> { - let run_lengths = decode_twodelta_run_lengths(frame)?; - apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths) + apply_twodelta_runs_to_assignment(assignment, frame.pair, &frame.run_length_vector) } /// Decode a stored BEN frame into a full assignment vector. @@ -687,7 +627,7 @@ impl BenFrameDecoeder { } impl Iterator for BenFrameDecoeder { - type Item = io::Result; + type Item = io::Result; /// Return the next raw BEN frame from the input stream. fn next(&mut self) -> Option { @@ -704,14 +644,14 @@ impl Iterator for BenFrameDecoeder { }, BenVariant::TwoDelta => match self.inner.next() { Some(Ok((assignment, count))) => { - let encoded = encode_ben_vec_from_assign(&assignment); + let encoded = BenDecodeFrame::from_assignment(&assignment); let raw_data = encoded.as_slice()[6..].to_vec(); - Some(Ok(BenFrame { - max_val_bits: encoded.max_val_bits, - max_len_bits: encoded.max_len_bits, + Some(Ok(BenDecodeFrame { + max_val_bits: encoded.max_val_bit_count, + max_len_bits: encoded.max_len_bit_count, count, n_bytes: encoded.n_bytes, - raw_data, + raw_bytes: raw_data, })) } Some(Err(err)) => Some(Err(err)), @@ -1243,9 +1183,9 @@ impl Iterator for XBenFrameDecoder { #[derive(Clone)] /// A generalized frame type used by the subsampling machinery. -pub enum Frame { +pub enum DecodeFrame { /// A raw BEN frame. - Ben(BenFrame), + Ben(BenDecodeFrame), /// A raw ben32 frame from an XBEN stream together with its variant. XBen(Vec, BenVariant), } @@ -1269,17 +1209,17 @@ pub enum Selection { /// # Returns /// /// Returns the expanded assignment vector. -fn decode_frame_to_assignment(frame: &Frame) -> io::Result> { +fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { match frame { - Frame::Ben(f) => decode_ben_frame_to_assignment(f), - Frame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), + DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), + DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), } } /// Iterator adaptor that decodes only selected samples from a frame stream. pub struct SubsampleFrameDecoder where - I: Iterator>, + I: Iterator>, { inner: I, selection: Selection, @@ -1288,7 +1228,7 @@ where impl SubsampleFrameDecoder where - I: Iterator>, + I: Iterator>, { /// Create a subsampling iterator from a lower-level frame iterator. /// @@ -1420,7 +1360,7 @@ where impl Iterator for SubsampleFrameDecoder where - I: Iterator>, + I: Iterator>, { type Item = io::Result; @@ -1482,7 +1422,7 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result io::Result Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), @@ -1513,14 +1453,14 @@ impl BenDecoder { pub fn into_subsample_by_indices( self, indices: T, - ) -> SubsampleFrameDecoder> + Send> + ) -> SubsampleFrameDecoder> + Send> where T: IntoIterator, { let frames = self.into_frames().map(|res| { res.map(|f| { let count = f.count; - (Frame::Ben(f), count) + (DecodeFrame::Ben(f), count) }) }); SubsampleFrameDecoder::by_indices(frames, indices) @@ -1541,11 +1481,11 @@ impl BenDecoder { self, start: usize, end: usize, - ) -> SubsampleFrameDecoder> + Send> { + ) -> SubsampleFrameDecoder> + Send> { let frames = self.into_frames().map(|res| { res.map(|f| { let cnt = f.count; - (Frame::Ben(f), cnt) + (DecodeFrame::Ben(f), cnt) }) }); SubsampleFrameDecoder::by_range(frames, start, end) @@ -1566,11 +1506,11 @@ impl BenDecoder { self, step: usize, offset: usize, - ) -> SubsampleFrameDecoder> + Send> { + ) -> SubsampleFrameDecoder> + Send> { let frames = self.into_frames().map(|res| { res.map(|f| { let cnt = f.count; - (Frame::Ben(f), cnt) + (DecodeFrame::Ben(f), cnt) }) }); SubsampleFrameDecoder::every(frames, step, offset) @@ -1591,14 +1531,14 @@ impl XBenDecoder { pub fn into_subsample_by_indices( self, indices: T, - ) -> SubsampleFrameDecoder> + Send> + ) -> SubsampleFrameDecoder> + Send> where T: IntoIterator, { let variant = self.variant; let frames = self .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); SubsampleFrameDecoder::by_indices(Box::new(frames), indices) } @@ -1617,11 +1557,11 @@ impl XBenDecoder { self, start: usize, end: usize, - ) -> SubsampleFrameDecoder> + Send> { + ) -> SubsampleFrameDecoder> + Send> { let variant = self.variant; let frames = self .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); SubsampleFrameDecoder::by_range(Box::new(frames), start, end) } @@ -1640,11 +1580,11 @@ impl XBenDecoder { self, step: usize, offset: usize, - ) -> SubsampleFrameDecoder> + Send> { + ) -> SubsampleFrameDecoder> + Send> { let variant = self.variant; let frames = self .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (Frame::XBen(bytes, variant), cnt))); + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); SubsampleFrameDecoder::every(Box::new(frames), step, offset) } } diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs new file mode 100644 index 0000000..dc3acc6 --- /dev/null +++ b/ben/src/io/reader/ben.rs @@ -0,0 +1,1613 @@ +use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::codec::{BenDecodeFrame, TwoDeltaFrame}; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::util::rle::rle_to_vec; +use crate::{progress, BenVariant}; +use byteorder::{BigEndian, ReadBytesExt}; +use serde_json::json; +use std::fs::File; +use std::io::{self, BufReader, Cursor, Read, Write}; +use std::iter::Peekable; +use std::path::{Path, PathBuf}; +use xz2::read::XzDecoder; + +const XBEN_TWODELTA_FULL_TAG: u8 = 0; +const XBEN_TWODELTA_DELTA_TAG: u8 = 1; +const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; + +/// A decoded assignment together with the number of times it repeats. +pub type MkvRecord = (Vec, u16); +/// A raw ben32 frame together with the number of times it repeats. +pub type Ben32Frame = (Vec, u16); +/// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. +pub type FrameIter = Box> + Send>; + +#[derive(Debug)] +/// Errors produced while validating the header of a decoder input stream. +pub enum DecoderInitError { + /// The leading bytes did not match any supported BEN banner. + InvalidFileFormat(Vec), + /// An I/O error occurred while reading the header. + Io(io::Error), +} + +/// Check whether a header prefix matches the XZ file signature. +/// +/// # Arguments +/// +/// * `h` - The bytes to inspect. +/// +/// # Returns +/// +/// Returns `true` when `h` begins with the standard XZ magic bytes. +fn is_xz_header(h: &[u8]) -> bool { + h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" +} + +/// Convert a byte slice into a space-separated uppercase hex string. +/// +/// # Arguments +/// +/// * `bytes` - The bytes to render. +/// +/// # Returns +/// +/// Returns the formatted hex string. +fn to_hex(bytes: &[u8]) -> String { + bytes + .iter() + .map(|b| format!("{:02X}", b)) + .collect::>() + .join(" ") +} + +impl std::fmt::Display for DecoderInitError { + /// Format the decoder initialization error for display. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(e) => write!(f, "IO error: {e}"), + Self::InvalidFileFormat(header) => { + if is_xz_header(header) { + write!( + f, + "Invalid file format: Compressed header detected (hex: {}). \ + This reader expects an uncompressed .ben file. \ + Decompress this file using the BEN cli `ben -m decode .xben` tool \ + or the `decode_xben_to_ben` function in this library.", + to_hex(header) + ) + } else { + let lossy = String::from_utf8_lossy(header); + write!( + f, + "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", + to_hex(header) + ) + } + } + } + } +} + +impl std::error::Error for DecoderInitError { + /// Return the underlying source error when one exists. + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + DecoderInitError::Io(e) => Some(e), + DecoderInitError::InvalidFileFormat(_) => None, + } + } +} + +impl From for DecoderInitError { + /// Wrap a plain I/O error as a decoder initialization error. + fn from(error: io::Error) -> Self { + DecoderInitError::Io(error) + } +} + +impl From for io::Error { + /// Convert a decoder initialization error into a plain I/O error. + fn from(error: DecoderInitError) -> Self { + match error { + DecoderInitError::Io(e) => e, + DecoderInitError::InvalidFileFormat(msg) => { + io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) + } + } + } +} + +/// Iterator over decoded assignments in an uncompressed BEN stream. +pub struct BenDecoder { + reader: R, + sample_count: usize, + variant: BenVariant, + previous_assignment: Option>, + twodelta_consumed_first_frame: bool, + silent: bool, +} + +enum StoredBenFrame { + Ben(BenDecodeFrame), + TwoDelta { frame: TwoDeltaFrame, count: u16 }, +} + +enum XBenTwoDeltaFrame { + Full { + runs: Vec<(u16, u16)>, + }, + Delta { + pair: (u16, u16), + run_lengths: Vec, + }, +} + +impl StoredBenFrame { + fn count(&self) -> u16 { + match self { + Self::Ben(frame) => frame.count, + Self::TwoDelta { count, .. } => *count, + } + } +} + +impl BenDecoder { + /// Create a decoder for an uncompressed BEN stream. + /// + /// The reader must begin with one of the BEN banners such as + /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. + /// + /// # Arguments + /// + /// * `reader` - The input BEN stream, including its 17-byte banner. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first BEN frame. + pub fn new(mut reader: R) -> Result { + let mut check_buffer = [0u8; BANNER_LEN]; + + if let Err(e) = reader.read_exact(&mut check_buffer) { + return Err(DecoderInitError::Io(e)); + } + + match variant_from_banner(&check_buffer) { + Some(variant) => Ok(BenDecoder { + reader, + sample_count: 0, + variant, + previous_assignment: None, + twodelta_consumed_first_frame: false, + silent: false, + }), + None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), + } + } + + /// Suppress progress output from this decoder's iterator. + /// + /// # Arguments + /// + /// * `silent` - When `true`, the decoder will not emit progress messages. + /// + /// # Returns + /// + /// Returns `self` for method chaining. + pub fn silent(mut self, silent: bool) -> Self { + self.silent = silent; + self + } + + /// Decode the remaining BEN stream and write it as JSONL. + /// + /// # Arguments + /// + /// * `writer` - The destination that will receive one JSON object per + /// decoded sample. + /// + /// # Returns + /// + /// Returns `Ok(())` after the remaining stream has been fully decoded. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + Ok(true) + }) + } + + /// Read and return the next raw BEN frame stored in standard BEN layout. + /// + /// # Arguments + /// + /// * `with_count` - When `true`, read a trailing `u16` repetition count; + /// otherwise the count defaults to `1`. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. + fn pop_standard_frame_from_reader( + &mut self, + with_count: bool, + ) -> Option> { + let mut b1 = [0u8; 1]; + let max_val_bit_count = match self.reader.read_exact(&mut b1) { + Ok(()) => b1[0], + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + tracing::trace!(""); + tracing::trace!("Done!"); + return None; + } + return Some(Err(e)); + } + }; + + let mut b2 = [0u8; 1]; + if let Err(e) = self.reader.read_exact(&mut b2) { + return Some(Err(e)); + } + let max_len_bit_count = b2[0]; + + let n_bytes = match self.reader.read_u32::() { + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + + let mut raw_assignment = vec![0u8; n_bytes as usize]; + if let Err(e) = self.reader.read_exact(&mut raw_assignment) { + return Some(Err(e)); + } + + let count = if with_count { + match self.reader.read_u16::() { + Ok(c) => c, + Err(e) => return Some(Err(e)), + } + } else { + 1 + }; + + Some(Ok(BenDecodeFrame { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes: raw_assignment, + count, + })) + } + + /// Read and return the next raw TwoDelta frame from the underlying stream. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next TwoDelta frame, `Some(Err(...))` + /// for a read failure, or `None` at a clean end of stream. + fn pop_twodelta_frame_from_reader(&mut self) -> Option> { + let pair_a = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + tracing::trace!(""); + tracing::trace!("Done!"); + return None; + } + return Some(Err(e)); + } + }; + + let pair_b = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + let mut bits = [0u8; 1]; + if let Err(e) = self.reader.read_exact(&mut bits) { + return Some(Err(e)); + } + let max_len_bits = bits[0]; + + let n_bytes = match self.reader.read_u32::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + let mut payload = vec![0u8; n_bytes as usize]; + if let Err(e) = self.reader.read_exact(&mut payload) { + return Some(Err(e)); + } + + let count = match self.reader.read_u16::() { + Ok(value) => value, + Err(e) => return Some(Err(e)), + }; + + Some(Ok(StoredBenFrame::TwoDelta { + frame: TwoDeltaFrame::from_parts((pair_a, pair_b), max_len_bits, payload), + count, + })) + } + + /// Read and return the next stored frame from the underlying BEN stream. + /// + /// # Arguments + /// + /// * `&mut self` - The decoder whose internal reader is advanced. + /// + /// # Returns + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. + fn pop_frame_from_reader(&mut self) -> Option> { + match self.variant { + BenVariant::Standard => self + .pop_standard_frame_from_reader(false) + .map(|res| res.map(StoredBenFrame::Ben)), + BenVariant::MkvChain => self + .pop_standard_frame_from_reader(true) + .map(|res| res.map(StoredBenFrame::Ben)), + BenVariant::TwoDelta => { + if !self.twodelta_consumed_first_frame { + self.twodelta_consumed_first_frame = true; + self.pop_standard_frame_from_reader(true) + .map(|res| res.map(StoredBenFrame::Ben)) + } else { + self.pop_twodelta_frame_from_reader() + } + } + } + } + + /// Consume this decoder and iterate over raw BEN frames instead of + /// materialized assignments. + /// + /// # Returns + /// + /// Returns an iterator that yields raw BEN frames from the remaining input. + pub fn into_frames(self) -> BenFrameDecoeder { + BenFrameDecoeder { inner: self } + } + + /// Count the number of samples remaining in the BEN stream. + /// + /// This consumes the decoder but only walks frame boundaries rather than + /// expanding every assignment into a full vector. + /// + /// # Returns + /// + /// Returns the number of remaining samples in the stream. + pub fn count_samples(self) -> io::Result { + let mut this = self; + let mut total = 0usize; + while let Some(frame_res) = this.pop_frame_from_reader() { + total += frame_res?.count() as usize; + } + Ok(total) + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// Unlike the `Iterator` implementation, this avoids cloning the assignment + /// buffer on every frame. The decoder owns a single buffer, mutates it in + /// place for TwoDelta frames, and lends `&[u16]` to the callback. This + /// eliminates one full-length memcpy per frame. + /// + /// The callback receives a borrowed assignment slice and its repetition + /// count. Return `true` to continue decoding or `false` to stop early. + /// + /// # Arguments + /// + /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. + /// + /// # Returns + /// + /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + loop { + let frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + + let count = frame.count(); + + match frame { + StoredBenFrame::Ben(ben_frame) => { + let assignment = decode_ben_frame_to_assignment(&ben_frame)?; + let keep_going = f(&assignment, count)?; + self.previous_assignment = Some(assignment); + if !keep_going { + return Ok(()); + } + } + StoredBenFrame::TwoDelta { frame, count } => { + let assignment = self.previous_assignment.take().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta frame encountered before an initial BEN frame", + ) + })?; + let run_lengths = frame.run_length_vector; + let assignment = + apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths)?; + let keep_going = f(&assignment, count)?; + self.previous_assignment = Some(assignment); + if !keep_going { + return Ok(()); + } + } + } + + self.sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } + } + } +} + +/// Decode a raw BEN frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame` - The raw BEN frame to decode. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { + decode_ben_line( + Cursor::new(&frame.raw_bytes), + frame.max_val_bits, + frame.max_len_bits, + frame.n_bytes, + ) + .map(rle_to_vec) +} + +/// Apply decoded TwoDelta run lengths to produce a new assignment vector. +/// +/// Positions in `previous_assignment` that hold either value of `pair` are +/// overwritten according to the alternating run-length encoding. +/// +/// # Arguments +/// +/// * `assignment` - The assignment from the preceding frame (mutated in place). +/// * `pair` - The two label values that participate in the delta. +/// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. +/// +/// # Returns +/// +/// Returns the updated assignment vector. +fn apply_twodelta_runs_to_assignment( + mut assignment: Vec, + pair: (u16, u16), + run_lengths: &[u16], +) -> io::Result> { + let (first, second) = pair; + + let mut run_idx = 0usize; + let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); + let mut current_value = first; + + for val in assignment.iter_mut() { + if *val == first || *val == second { + if remaining_in_run == 0 { + run_idx += 1; + if run_idx >= run_lengths.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta payload exhausted before all pair positions were covered", + )); + } + remaining_in_run = run_lengths[run_idx]; + current_value = if current_value == first { + second + } else { + first + }; + } + *val = current_value; + remaining_in_run -= 1; + } + } + + Ok(assignment) +} + +/// Decode a raw TwoDelta frame into a full assignment vector. +/// +/// Unpacks the bitpacked run lengths from the frame payload, then applies +/// them in a single pass over the assignment. +/// +/// # Arguments +/// +/// * `assignment` - The assignment from the preceding frame (mutated in place). +/// * `frame` - The TwoDelta frame whose packed payload is decoded and applied. +/// +/// # Returns +/// +/// Returns the updated assignment vector. +fn decode_twodelta_frame_to_assignment( + assignment: Vec, + frame: &TwoDeltaFrame, +) -> io::Result> { + apply_twodelta_runs_to_assignment(assignment, frame.pair, &frame.run_length_vector) +} + +/// Decode a stored BEN frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `previous_assignment` - The assignment from the preceding frame, required +/// for TwoDelta frames. +/// * `frame` - The stored frame to decode. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +fn decode_stored_frame_to_assignment( + previous_assignment: &mut Option>, + frame: &StoredBenFrame, +) -> io::Result> { + match frame { + StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), + StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( + previous_assignment.take().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta frame encountered before an initial BEN frame", + ) + })?, + frame, + ), + } +} + +impl Iterator for BenDecoder { + type Item = io::Result; + + /// Decode and return the next assignment from the BEN stream. + fn next(&mut self) -> Option> { + let frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Some(Err(e)), + None => return None, + }; + let assignment = + match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { + Ok(assgn) => assgn, + Err(e) => return Some(Err(e)), + }; + let count = frame.count(); + self.previous_assignment = Some(assignment.clone()); + self.sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } + Some(Ok((assignment, count))) + } +} + +/// Iterator over raw BEN frames. +pub struct BenFrameDecoeder { + inner: BenDecoder, +} + +impl BenFrameDecoeder { + /// Create a raw BEN frame iterator from a reader. + /// + /// # Arguments + /// + /// * `reader` - The input BEN stream, including its 17-byte banner. + /// + /// # Returns + /// + /// Returns an iterator over raw BEN frames. + pub fn new(reader: R) -> io::Result { + Ok(Self { + inner: BenDecoder::new(reader)?, + }) + } +} + +impl Iterator for BenFrameDecoeder { + type Item = io::Result; + + /// Return the next raw BEN frame from the input stream. + fn next(&mut self) -> Option { + match self.inner.variant { + BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() + { + Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), + Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + "unexpected TwoDelta frame in non-TwoDelta BEN stream", + ))), + Some(Err(err)) => Some(Err(err)), + None => None, + }, + BenVariant::TwoDelta => match self.inner.next() { + Some(Ok((assignment, count))) => { + let encoded = BenDecodeFrame::from_assignment(&assignment); + let raw_data = encoded.as_slice()[6..].to_vec(); + Some(Ok(BenDecodeFrame { + max_val_bits: encoded.max_val_bit_count, + max_len_bits: encoded.max_len_bit_count, + count, + n_bytes: encoded.n_bytes, + raw_bytes: raw_data, + })) + } + Some(Err(err)) => Some(Err(err)), + None => None, + }, + } + } +} + +/// Iterator over decoded assignments in an XBEN stream. +pub struct XBenDecoder { + xz: BufReader>, + /// Variant encoded in the XBEN banner. + pub variant: BenVariant, + overflow: Vec, + buf: Box<[u8]>, + previous_assignment: Option>, + chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, +} + +impl XBenDecoder { + /// Create an XBEN decoder from an already-opened decompressed stream. + /// + /// # Arguments + /// + /// * `xz` - A buffered XZ decompression reader positioned past the banner. + /// * `variant` - The BEN variant indicated by the banner. + /// + /// # Returns + /// + /// Returns a new decoder ready to yield frames from the stream. + pub(crate) fn from_decompressed_stream( + xz: BufReader>, + variant: BenVariant, + ) -> Self { + Self { + xz, + variant, + overflow: Vec::with_capacity(1 << 20), + buf: vec![0u8; 1 << 20].into_boxed_slice(), + previous_assignment: None, + chunk_queue: std::collections::VecDeque::new(), + } + } + + /// Create a decoder for an XBEN stream. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first ben32 frame in the + /// decompressed payload. + pub fn new(reader: R) -> io::Result { + let xz = XzDecoder::new(reader); + let mut xz = BufReader::with_capacity(1 << 20, xz); + + let mut first = [0u8; BANNER_LEN]; + xz.read_exact(&mut first)?; + let variant = variant_from_banner(&first).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", + ) + })?; + + Ok(Self::from_decompressed_stream(xz, variant)) + } + + /// Try to extract one complete ben32 frame from the buffered overflow. + /// + /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 + /// frame and, for MkvChain streams, reads the trailing repetition count. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain one or more + /// complete ben32 frames. + /// + /// # Returns + /// + /// Returns the frame bytes, the number of consumed bytes, and the decoded + /// repetition count when a complete frame is available. + fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { + match self.variant { + BenVariant::Standard => { + if overflow.len() < 4 { + return None; + } + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let end = i + 1; + let frame = &overflow[..end]; + return Some((frame, end, 1)); + } + } + None + } + BenVariant::MkvChain => { + if overflow.len() < 6 { + return None; + } + for i in (3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let count_hi = overflow[i + 1]; + let count_lo = overflow[i + 2]; + let count = u16::from_be_bytes([count_hi, count_lo]); + let end = i + 3; + let frame = &overflow[..end]; + return Some((frame, end, count)); + } + } + None + } + BenVariant::TwoDelta => None, + } + } + + /// Try to extract one complete TwoDelta frame from the buffered overflow. + /// + /// Inspects the leading tag byte to determine whether the frame is a full + /// RLE frame or a delta frame, then reads the corresponding payload. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain a complete + /// TwoDelta frame. + /// + /// # Returns + /// + /// Returns the parsed frame, the number of consumed bytes, and the + /// repetition count when a complete frame is available. + fn pop_twodelta_frame_from_overflow( + &self, + overflow: &[u8], + ) -> Option> { + let tag = *overflow.first()?; + match tag { + XBEN_TWODELTA_FULL_TAG => { + if overflow.len() < 7 { + return None; + } + let run_count = + u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) + as usize; + let payload_len = run_count.checked_mul(4)?; + let total_len = 1usize + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut runs = Vec::with_capacity(run_count); + let mut cursor = 5usize; + for _ in 0..run_count { + let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); + runs.push((value, len)); + cursor += 4; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) + } + XBEN_TWODELTA_DELTA_TAG => { + if overflow.len() < 11 { + return None; + } + let pair = ( + u16::from_be_bytes([overflow[1], overflow[2]]), + u16::from_be_bytes([overflow[3], overflow[4]]), + ); + let run_count = + u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) + as usize; + let payload_len = run_count.checked_mul(2)?; + let total_len = 1usize + .checked_add(2)? + .checked_add(2)? + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut run_lengths = Vec::with_capacity(run_count); + let mut cursor = 9usize; + for _ in 0..run_count { + run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); + cursor += 2; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok(( + XBenTwoDeltaFrame::Delta { pair, run_lengths }, + total_len, + count, + ))) + } + XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. + _ => Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid TwoDelta XBEN frame tag", + ))), + } + } + + /// Try to parse a columnar TwoDelta chunk from the overflow buffer. + /// + /// If the overflow starts with the chunk tag and contains enough bytes for + /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. + /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, + /// or `None` when the overflow is incomplete. + fn try_parse_twodelta_chunk(&mut self) -> Option> { + if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { + return None; + } + if self.overflow.len() < 5 { + return None; + } + + let n_frames = u32::from_be_bytes([ + self.overflow[1], + self.overflow[2], + self.overflow[3], + self.overflow[4], + ]) as usize; + + // Calculate total chunk size: tag(1) + n_frames(4) + // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) + let header_len = 5; + let pairs_len = n_frames * 4; + let counts_len = n_frames * 2; + let run_counts_len = n_frames * 4; + let fixed_len = header_len + pairs_len + counts_len + run_counts_len; + + if self.overflow.len() < fixed_len { + return None; + } + + // Read run-length counts to determine total run data size. + let run_counts_start = header_len + pairs_len + counts_len; + let mut total_runs = 0usize; + let mut run_counts = Vec::with_capacity(n_frames); + for i in 0..n_frames { + let offset = run_counts_start + i * 4; + let rc = u32::from_be_bytes([ + self.overflow[offset], + self.overflow[offset + 1], + self.overflow[offset + 2], + self.overflow[offset + 3], + ]) as usize; + run_counts.push(rc); + total_runs += rc; + } + + let run_data_len = total_runs * 2; + let total_len = fixed_len + run_data_len; + if self.overflow.len() < total_len { + return None; + } + + // Parse pairs channel. + let pairs_start = header_len; + // Parse counts channel. + let counts_start = pairs_start + pairs_len; + // Run data starts after run counts. + let run_data_start = run_counts_start + run_counts_len; + + let mut run_cursor = run_data_start; + for i in 0..n_frames { + let po = pairs_start + i * 4; + let pair = ( + u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), + u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), + ); + let co = counts_start + i * 2; + let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); + + let rc = run_counts[i]; + let mut run_lengths = Vec::with_capacity(rc); + for _ in 0..rc { + run_lengths.push(u16::from_be_bytes([ + self.overflow[run_cursor], + self.overflow[run_cursor + 1], + ])); + run_cursor += 2; + } + + self.chunk_queue + .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); + } + + self.overflow.drain(..total_len); + Some(Ok(())) + } + + /// Consume this decoder and iterate over raw ben32 frames instead of + /// materialized assignments. + /// + /// # Returns + /// + /// Returns an iterator that yields raw ben32 frames from the remaining + /// input. + pub fn into_frames(self) -> XBenFrameDecoder { + XBenFrameDecoder { inner: self } + } + + /// Count the number of samples remaining in the XBEN stream. + /// + /// # Returns + /// + /// Returns the number of remaining samples in the stream. + pub fn count_samples(self) -> io::Result { + let mut total = 0usize; + for frame_res in self.into_frames() { + let (_bytes, cnt) = frame_res?; + total += cnt as usize; + } + Ok(total) + } +} + +/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame_bytes` - The ben32 frame bytes. +/// * `variant` - The BEN variant used to interpret the frame tail. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +fn decode_xben_frame_to_assignment( + frame_bytes: &[u8], + variant: BenVariant, +) -> io::Result> { + let cursor = Cursor::new(frame_bytes); + let (assignment, _) = decode_ben32_line(cursor, variant)?; + Ok(assignment) +} + +impl Iterator for XBenDecoder { + type Item = io::Result; + + /// Decode and return the next assignment from the XBEN stream. + fn next(&mut self) -> Option { + loop { + match self.variant { + BenVariant::Standard | BenVariant::MkvChain => { + if let Some((frame_bytes, consumed, count)) = + self.pop_frame_from_overflow(&self.overflow) + { + let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + Ok((assignment, count)) + } + Err(e) => Err(e), + }; + self.overflow.drain(..consumed); + return Some(res); + } + } + BenVariant::TwoDelta => { + // Drain frames from a previously parsed chunk first. + if let Some((frame, count)) = self.chunk_queue.pop_front() { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.take() { + Some(prev) => { + apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) + } + None => Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN frame encountered before an initial BEN frame", + )), + } + } + }; + return Some(match assignment { + Ok(a) => { + self.previous_assignment = Some(a.clone()); + Ok((a, count)) + } + Err(e) => Err(e), + }); + } + + // Try to parse a columnar chunk. + if let Some(result) = self.try_parse_twodelta_chunk() { + match result { + Ok(()) => continue, // Loop to drain chunk_queue. + Err(e) => return Some(Err(e)), + } + } + + // Try a single legacy frame (tag 0 or 1). + if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { + let res = match parsed { + Ok((frame, consumed, count)) => { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.take() { + Some(previous_assignment) => { + apply_twodelta_runs_to_assignment( + previous_assignment, + pair, + &run_lengths, + ) + } + None => Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN frame encountered before an initial BEN frame", + )), + } + } + }; + match assignment { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + self.overflow.drain(..consumed); + Ok((assignment, count)) + } + Err(err) => { + self.overflow.drain(..consumed); + Err(err) + } + } + } + Err(err) => { + self.overflow.clear(); + Err(err) + } + }; + return Some(res); + } + } + } + + let read = match self.xz.read(&mut self.buf) { + Ok(0) => { + if self.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated .xben stream (partial frame at EOF)", + ))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.overflow.extend_from_slice(&self.buf[..read]); + } + } +} + +/// Iterator over raw ben32 frames inside an XBEN stream. +pub struct XBenFrameDecoder { + inner: XBenDecoder, +} + +impl XBenFrameDecoder { + /// Create a raw XBEN frame iterator from a reader. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns an iterator over raw ben32 frames. + pub fn new(reader: R) -> io::Result { + Ok(Self { + inner: XBenDecoder::new(reader)?, + }) + } +} + +impl Iterator for XBenFrameDecoder { + type Item = io::Result; + + /// Return the next raw ben32 frame from the input stream. + fn next(&mut self) -> Option { + if self.inner.variant == BenVariant::TwoDelta { + return self.inner.next().map(|result| { + result.and_then(|(assignment, count)| { + Ok((encode_ben32_assignments(&assignment)?.into_u8_vec()?, count)) + }) + }); + } + + loop { + if let Some((frame, consumed, count)) = + self.inner.pop_frame_from_overflow(&self.inner.overflow) + { + let out = frame.to_vec(); + self.inner.overflow.drain(..consumed); + return Some(Ok((out, count))); + } + + let read = match self.inner.xz.read(&mut self.inner.buf) { + Ok(0) => { + if self.inner.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated .xben stream (partial frame at EOF)", + ))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.inner + .overflow + .extend_from_slice(&self.inner.buf[..read]); + } + } +} + +#[derive(Clone)] +/// A generalized frame type used by the subsampling machinery. +pub enum DecodeFrame { + /// A raw BEN frame. + Ben(BenDecodeFrame), + /// A raw ben32 frame from an XBEN stream together with its variant. + XBen(Vec, BenVariant), +} + +/// A selection strategy for extracting only part of a frame stream. +pub enum Selection { + /// Select explicit 1-based indices. + Indices(Peekable>), + /// Select every `step` samples starting at the 1-based `offset`. + Every { step: usize, offset: usize }, + /// Select the inclusive 1-based range `[start, end]`. + Range { start: usize, end: usize }, +} + +/// Decode a generic frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame` - Either a BEN frame or an XBEN ben32 frame. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { + match frame { + DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), + DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), + } +} + +/// Iterator adaptor that decodes only selected samples from a frame stream. +pub struct SubsampleFrameDecoder +where + I: Iterator>, +{ + inner: I, + selection: Selection, + sample: usize, +} + +impl SubsampleFrameDecoder +where + I: Iterator>, +{ + /// Create a subsampling iterator from a lower-level frame iterator. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `selection` - The sample-selection rule to apply. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn new(inner: I, selection: Selection) -> Self { + Self { + inner, + selection, + sample: 0, + } + } + + /// Select a set of 1-based sample indices. + /// + /// Indices are sorted and deduplicated before iteration begins. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn by_indices(inner: I, indices: T) -> Self + where + T: IntoIterator, + { + let mut v: Vec = indices.into_iter().collect(); + v.sort_unstable(); + v.dedup(); + Self::new(inner, Selection::Indices(v.into_iter().peekable())) + } + + /// Select the inclusive 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn by_range(inner: I, start: usize, end: usize) -> Self { + assert!( + start >= 1 && end >= start, + "range must be 1-based and end >= start" + ); + Self::new(inner, Selection::Range { start, end }) + } + + /// Select every `step` samples beginning from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn every(inner: I, step: usize, offset: usize) -> Self { + assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); + Self::new(inner, Selection::Every { step, offset }) + } + + /// Count how many selected samples fall within an inclusive sample interval. + /// + /// # Arguments + /// + /// * `lo` - The first 1-based sample index covered by the current frame. + /// * `hi` - The last 1-based sample index covered by the current frame. + /// + /// # Returns + /// + /// Returns the number of selected samples represented by the frame. + fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { + match &mut self.selection { + Selection::Indices(iter) => { + let mut taken = 0u16; + while let Some(&next) = iter.peek() { + if next < lo { + iter.next(); + continue; + } + if next > hi { + break; + } + iter.next(); + taken = taken.saturating_add(1); + } + taken + } + Selection::Every { step, offset } => { + let start = lo.max(*offset); + if start > hi { + return 0; + } + let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; + let first = start + ((*step - r) % *step); + if first > hi { + 0 + } else { + (1 + (hi - first) / *step) as u16 + } + } + Selection::Range { start, end } => { + if hi < *start || lo > *end { + 0 + } else { + let a = lo.max(*start); + let b = hi.min(*end); + (b - a + 1) as u16 + } + } + } + } +} + +impl Iterator for SubsampleFrameDecoder +where + I: Iterator>, +{ + type Item = io::Result; + + /// Return the next decoded sample selected by the subsampling rule. + fn next(&mut self) -> Option { + loop { + if let Selection::Range { end, .. } = self.selection { + if self.sample >= end { + return None; + } + } + if let Selection::Indices(ref mut it) = self.selection { + if it.peek().is_none() { + return None; + } + } + + let (frame, count) = match self.inner.next()? { + Ok(x) => x, + Err(e) => return Some(Err(e)), + }; + + let lo = self.sample + 1; + let hi = self.sample + count as usize; + let selected = self.count_selected_in(lo, hi); + + self.sample = hi; + + if selected > 0 { + match decode_frame_to_assignment(&frame) { + Ok(assignment) => return Some(Ok((assignment, selected))), + Err(e) => return Some(Err(e)), + } + } + } + } +} + +/// Build a generic frame iterator from a BEN or XBEN file path. +/// +/// Frame iteration is useful for subsampling and counting because it avoids +/// decoding every sample into a full assignment vector. +/// +/// # Arguments +/// +/// * `file_path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns a boxed iterator over generic frames and their repetition counts. +pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { + let file = File::options().read(true).open(file_path)?; + let reader = BufReader::new(file); + + match mode { + "ben" => { + let frames = BenFrameDecoeder::new(reader)?; + let mapped = frames.map(|res| { + res.map(|f| { + let cnt = f.count; + (DecodeFrame::Ben(f), cnt) + }) + }); + Ok(Box::new(mapped)) + } + "xben" => { + let x = XBenDecoder::new(reader)?; + let variant = x.variant; + let frames = x.into_frames(); + let mapped = frames + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + Ok(Box::new(mapped)) + } + _ => Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), + } +} + +impl BenDecoder { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + /// + /// # Arguments + /// + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + Send> + where + T: IntoIterator, + { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let count = f.count; + (DecodeFrame::Ben(f), count) + }) + }); + SubsampleFrameDecoder::by_indices(frames, indices) + } + + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> + Send> { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let cnt = f.count; + (DecodeFrame::Ben(f), cnt) + }) + }); + SubsampleFrameDecoder::by_range(frames, start, end) + } + + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> + Send> { + let frames = self.into_frames().map(|res| { + res.map(|f| { + let cnt = f.count; + (DecodeFrame::Ben(f), cnt) + }) + }); + SubsampleFrameDecoder::every(frames, step, offset) + } +} + +impl XBenDecoder { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + /// + /// # Arguments + /// + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + Send> + where + T: IntoIterator, + { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_indices(Box::new(frames), indices) + } + + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_range(Box::new(frames), start, end) + } + + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::every(Box::new(frames), step, offset) + } +} + +/// Count the number of samples in a BEN or XBEN file on disk. +/// +/// The file is walked frame-by-frame, so this is linear in file size but avoids +/// materializing full assignment vectors. +/// +/// # Arguments +/// +/// * `path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns the number of samples in the file. +pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { + let iter = build_frame_iter(&path.to_path_buf(), mode)?; + let mut total = 0usize; + for item in iter { + let (_frame, cnt) = item?; + total += cnt as usize; + } + Ok(total) +} diff --git a/ben/src/io/reader/errors.rs b/ben/src/io/reader/errors.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs new file mode 100644 index 0000000..1a01ab4 --- /dev/null +++ b/ben/src/io/reader/mod.rs @@ -0,0 +1 @@ +pub mod ben; diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/writer.rs b/ben/src/io/writer/ben.rs similarity index 96% rename from ben/src/io/writer.rs rename to ben/src/io/writer/ben.rs index 2f89cf5..a61f940 100644 --- a/ben/src/io/writer.rs +++ b/ben/src/io/writer/ben.rs @@ -1,11 +1,8 @@ use crate::codec::decode::decode_ben_line; -use crate::codec::encode::{ - build_twodelta_runs_with_hint, encode_ben32_assignments, encode_ben_vec_from_assign, - encode_twodelta_vec_with_hint, BenFrame, TwoDeltaFrame, -}; +use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; +use crate::codec::{BenEncodeFrame, FromAssign, TwoDeltaFrame}; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::io::reader::decode_twodelta_run_lengths; use crate::util::rle::assign_to_rle; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -28,7 +25,7 @@ struct BufferedDeltaFrame { } enum BufferedBenFrame { - Ben(BenFrame), + Ben(BenEncodeFrame), TwoDelta(TwoDeltaFrame), } @@ -352,7 +349,7 @@ impl BenEncoder { } } - let encoded = encode_ben_vec_from_assign(&assign_vec); + let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); self.writer.write_all(encoded.as_slice())?; self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 0); Ok(()) @@ -367,13 +364,13 @@ impl BenEncoder { self.flush_pending_frame()?; } - let encoded = encode_ben_vec_from_assign(&assign_vec); + let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); Ok(()) } BenVariant::TwoDelta => { if self.previous_sample.is_empty() { - let encoded = encode_ben_vec_from_assign(&assign_vec); + let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); return Ok(()); } @@ -383,11 +380,11 @@ impl BenEncoder { return Ok(()); } - let encoded = encode_twodelta_vec_with_hint( + let encoded = encode_twodelta_frame_with_hint( &self.previous_sample, &assign_vec, hints.delta_pair, - Some(&self.previous_masks), + Some(&mut self.previous_masks), )?; self.flush_pending_frame()?; @@ -779,7 +776,7 @@ impl XBenEncoder { pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { match self.variant { BenVariant::Standard => { - let encoded = encode_ben32_assignments(&assign_vec)?.into_u8_vec()?; + let encoded = encode_ben32_assignments(&assign_vec)?; self.encoder.write_all(&encoded)?; self.previous_assignment = assign_vec; self.previous_frame = encoded; @@ -792,7 +789,7 @@ impl XBenEncoder { } self.flush_pending_frame()?; - let encoded = encode_ben32_assignments(&assign_vec)?.into_u8_vec()?; + let encoded = encode_ben32_assignments(&assign_vec)?; self.set_previous_assignment(assign_vec, encoded, 1); Ok(()) } @@ -824,27 +821,24 @@ impl XBenEncoder { self.flush_pending_frame()?; } - let (ordered_pair, run_lengths) = build_twodelta_runs_with_hint( + let encoded_frame: TwoDeltaFrame = match encode_twodelta_frame_with_hint( &self.previous_assignment, &assign_vec, hints.delta_pair, - Some(&self.previous_masks), - )?; + Some(&mut self.previous_masks), + ) { + Ok(frame) => frame, + Err(e) => { + return Err(e); + } + }; self.chunk_buffer.push(BufferedDeltaFrame { - pair: ordered_pair, - run_lengths, + pair: encoded_frame.pair, + run_lengths: encoded_frame.run_length_vector, count: 1, }); - if let Some(pair) = hints.delta_pair { - self.update_masks_for_delta(&assign_vec, pair); - self.previous_assignment = assign_vec; - } else { - self.previous_assignment = assign_vec; - self.rebuild_previous_masks(); - } - if self.chunk_buffer.len() >= self.chunk_size { self.flush_chunk()?; } @@ -931,7 +925,7 @@ impl XBenEncoder { // Unpack bitpacked run lengths. let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); - let run_lengths = decode_twodelta_run_lengths(&frame)?; + let run_lengths = frame.run_length_vector; // Flush the initial full frame before the first delta chunk. if self.chunk_buffer.is_empty() && self.count > 0 { @@ -939,7 +933,7 @@ impl XBenEncoder { } self.chunk_buffer.push(BufferedDeltaFrame { - pair: frame.pair(), + pair: frame.pair, run_lengths, count, }); diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs new file mode 100644 index 0000000..43aa94d --- /dev/null +++ b/ben/src/io/writer/mod.rs @@ -0,0 +1,3 @@ +pub mod ben; + +pub use ben::{BenEncoder, XBenEncoder}; diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs new file mode 100644 index 0000000..2cc0f34 --- /dev/null +++ b/ben/src/io/writer/twodelta.rs @@ -0,0 +1,5 @@ +const XBEN_TWODELTA_FULL_TAG: u8 = 0; +const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; + +/// Default number of delta frames per columnar chunk in XBEN TwoDelta. +pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 37a763b..af51f60 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -1,7 +1,7 @@ //! Relabeling operations for BEN files. use crate::codec::decode::decode_ben_line; -use crate::codec::encode::encode_ben_vec_from_rle; +use crate::codec::BenEncodeFrame; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::io::reader::BenDecoder; use crate::io::writer::BenEncoder; @@ -358,7 +358,7 @@ fn relabel_ben_lines_impl( 1 }; - let relabeled = encode_ben_vec_from_rle(ben_line); + let relabeled = BenEncodeFrame::from_rle(ben_line); writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; @@ -584,7 +584,7 @@ fn relabel_ben_lines_with_map_impl( 1 }; - let relabeled = encode_ben_vec_from_rle(new_rle.clone()); + let relabeled = BenEncodeFrame::from_rle(new_rle.clone()); writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 98ccefc..a3fcf07 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -29,10 +29,10 @@ where fn test_relabel_ben_line_simple() { let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; - let input = encode_ben_vec_from_rle(in_rle); + let input = BenFrame::from_rle(in_rle); let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; - let expected = encode_ben_vec_from_rle(out_rle); + let expected = BenFrame::from_rle(out_rle); let mut buf = Vec::new(); relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); @@ -203,11 +203,11 @@ fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; let in_rle = assign_to_rle(in_assign); - let input = encode_ben_vec_from_rle(in_rle); + let input = BenFrame::from_rle(in_rle); let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); + let expected = BenFrame::from_rle(out_rle); let mut new_to_old_map = HashMap::new(); new_to_old_map.insert(0, 2); @@ -238,11 +238,11 @@ fn test_relabel_ben_line_with_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign); - let input = encode_ben_vec_from_rle(in_rle); + let input = BenFrame::from_rle(in_rle); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); + let expected = BenFrame::from_rle(out_rle); let mut buf = Vec::new(); relabel_ben_lines_with_map( @@ -269,11 +269,11 @@ fn test_relabel_ben_line_with_large_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign.to_vec()); - let input = encode_ben_vec_from_rle(in_rle); + let input = BenFrame::from_rle(in_rle); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = encode_ben_vec_from_rle(out_rle); + let expected = BenFrame::from_rle(out_rle); let mut buf = Vec::new(); relabel_ben_lines_with_map( From 7dd91fe6c6ae5617f375e1cacf1f088eeaa0e9f5 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 19 Mar 2026 21:13:17 -0600 Subject: [PATCH 040/221] More reorg --- ben/src/codec/encode/mod.rs | 1 + ben/src/codec/frames.rs | 2 +- ben/src/io/reader.rs | 1613 --------------------------------- ben/src/io/reader/ben.rs | 133 +-- ben/src/io/reader/errors.rs | 97 ++ ben/src/io/reader/mod.rs | 10 + ben/src/io/reader/twodelta.rs | 13 + ben/src/io/writer/ben.rs | 7 +- ben/src/io/writer/mod.rs | 5 + ben/src/io/writer/twodelta.rs | 4 +- ben/src/ops/relabel/mod.rs | 6 +- ben/src/ops/relabel/tests.rs | 19 +- 12 files changed, 157 insertions(+), 1753 deletions(-) delete mode 100644 ben/src/io/reader.rs diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index c32bc7e..3a3138b 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -10,6 +10,7 @@ mod xz; pub(crate) use ben::encode_ben32_assignments; pub use traits::{FromAssign, FromRLE}; pub(crate) use twodelta::encode_twodelta_frame_with_hint; +pub use twodelta::encode_twodelta_frame; #[cfg(test)] pub(crate) use ben::encode_ben32_line; diff --git a/ben/src/codec/frames.rs b/ben/src/codec/frames.rs index bceaba8..be86690 100644 --- a/ben/src/codec/frames.rs +++ b/ben/src/codec/frames.rs @@ -137,7 +137,7 @@ pub struct BenDecodeFrame { // The full serialized BEN frame bytes, including the header and payload. pub raw_bytes: Vec, // The number of times this frame was repeated - pub count: usize, + pub count: u16, } impl BenDecodeFrame { diff --git a/ben/src/io/reader.rs b/ben/src/io/reader.rs deleted file mode 100644 index dc3acc6..0000000 --- a/ben/src/io/reader.rs +++ /dev/null @@ -1,1613 +0,0 @@ -use crate::codec::decode::{decode_ben32_line, decode_ben_line}; -use crate::codec::{BenDecodeFrame, TwoDeltaFrame}; -use crate::format::banners::{variant_from_banner, BANNER_LEN}; -use crate::util::rle::rle_to_vec; -use crate::{progress, BenVariant}; -use byteorder::{BigEndian, ReadBytesExt}; -use serde_json::json; -use std::fs::File; -use std::io::{self, BufReader, Cursor, Read, Write}; -use std::iter::Peekable; -use std::path::{Path, PathBuf}; -use xz2::read::XzDecoder; - -const XBEN_TWODELTA_FULL_TAG: u8 = 0; -const XBEN_TWODELTA_DELTA_TAG: u8 = 1; -const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; - -/// A decoded assignment together with the number of times it repeats. -pub type MkvRecord = (Vec, u16); -/// A raw ben32 frame together with the number of times it repeats. -pub type Ben32Frame = (Vec, u16); -/// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. -pub type FrameIter = Box> + Send>; - -#[derive(Debug)] -/// Errors produced while validating the header of a decoder input stream. -pub enum DecoderInitError { - /// The leading bytes did not match any supported BEN banner. - InvalidFileFormat(Vec), - /// An I/O error occurred while reading the header. - Io(io::Error), -} - -/// Check whether a header prefix matches the XZ file signature. -/// -/// # Arguments -/// -/// * `h` - The bytes to inspect. -/// -/// # Returns -/// -/// Returns `true` when `h` begins with the standard XZ magic bytes. -fn is_xz_header(h: &[u8]) -> bool { - h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" -} - -/// Convert a byte slice into a space-separated uppercase hex string. -/// -/// # Arguments -/// -/// * `bytes` - The bytes to render. -/// -/// # Returns -/// -/// Returns the formatted hex string. -fn to_hex(bytes: &[u8]) -> String { - bytes - .iter() - .map(|b| format!("{:02X}", b)) - .collect::>() - .join(" ") -} - -impl std::fmt::Display for DecoderInitError { - /// Format the decoder initialization error for display. - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Io(e) => write!(f, "IO error: {e}"), - Self::InvalidFileFormat(header) => { - if is_xz_header(header) { - write!( - f, - "Invalid file format: Compressed header detected (hex: {}). \ - This reader expects an uncompressed .ben file. \ - Decompress this file using the BEN cli `ben -m decode .xben` tool \ - or the `decode_xben_to_ben` function in this library.", - to_hex(header) - ) - } else { - let lossy = String::from_utf8_lossy(header); - write!( - f, - "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", - to_hex(header) - ) - } - } - } - } -} - -impl std::error::Error for DecoderInitError { - /// Return the underlying source error when one exists. - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - DecoderInitError::Io(e) => Some(e), - DecoderInitError::InvalidFileFormat(_) => None, - } - } -} - -impl From for DecoderInitError { - /// Wrap a plain I/O error as a decoder initialization error. - fn from(error: io::Error) -> Self { - DecoderInitError::Io(error) - } -} - -impl From for io::Error { - /// Convert a decoder initialization error into a plain I/O error. - fn from(error: DecoderInitError) -> Self { - match error { - DecoderInitError::Io(e) => e, - DecoderInitError::InvalidFileFormat(msg) => { - io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) - } - } - } -} - -/// Iterator over decoded assignments in an uncompressed BEN stream. -pub struct BenDecoder { - reader: R, - sample_count: usize, - variant: BenVariant, - previous_assignment: Option>, - twodelta_consumed_first_frame: bool, - silent: bool, -} - -enum StoredBenFrame { - Ben(BenDecodeFrame), - TwoDelta { frame: TwoDeltaFrame, count: u16 }, -} - -enum XBenTwoDeltaFrame { - Full { - runs: Vec<(u16, u16)>, - }, - Delta { - pair: (u16, u16), - run_lengths: Vec, - }, -} - -impl StoredBenFrame { - fn count(&self) -> u16 { - match self { - Self::Ben(frame) => frame.count, - Self::TwoDelta { count, .. } => *count, - } - } -} - -impl BenDecoder { - /// Create a decoder for an uncompressed BEN stream. - /// - /// The reader must begin with one of the BEN banners such as - /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. - /// - /// # Arguments - /// - /// * `reader` - The input BEN stream, including its 17-byte banner. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first BEN frame. - pub fn new(mut reader: R) -> Result { - let mut check_buffer = [0u8; BANNER_LEN]; - - if let Err(e) = reader.read_exact(&mut check_buffer) { - return Err(DecoderInitError::Io(e)); - } - - match variant_from_banner(&check_buffer) { - Some(variant) => Ok(BenDecoder { - reader, - sample_count: 0, - variant, - previous_assignment: None, - twodelta_consumed_first_frame: false, - silent: false, - }), - None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), - } - } - - /// Suppress progress output from this decoder's iterator. - /// - /// # Arguments - /// - /// * `silent` - When `true`, the decoder will not emit progress messages. - /// - /// # Returns - /// - /// Returns `self` for method chaining. - pub fn silent(mut self, silent: bool) -> Self { - self.silent = silent; - self - } - - /// Decode the remaining BEN stream and write it as JSONL. - /// - /// # Arguments - /// - /// * `writer` - The destination that will receive one JSON object per - /// decoded sample. - /// - /// # Returns - /// - /// Returns `Ok(())` after the remaining stream has been fully decoded. - pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - let mut sample_number = 0usize; - self.for_each_assignment(|assignment, count| { - for _ in 0..count { - sample_number += 1; - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - } - Ok(true) - }) - } - - /// Read and return the next raw BEN frame stored in standard BEN layout. - /// - /// # Arguments - /// - /// * `with_count` - When `true`, read a trailing `u16` repetition count; - /// otherwise the count defaults to `1`. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_standard_frame_from_reader( - &mut self, - with_count: bool, - ) -> Option> { - let mut b1 = [0u8; 1]; - let max_val_bit_count = match self.reader.read_exact(&mut b1) { - Ok(()) => b1[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - tracing::trace!(""); - tracing::trace!("Done!"); - return None; - } - return Some(Err(e)); - } - }; - - let mut b2 = [0u8; 1]; - if let Err(e) = self.reader.read_exact(&mut b2) { - return Some(Err(e)); - } - let max_len_bit_count = b2[0]; - - let n_bytes = match self.reader.read_u32::() { - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - - let mut raw_assignment = vec![0u8; n_bytes as usize]; - if let Err(e) = self.reader.read_exact(&mut raw_assignment) { - return Some(Err(e)); - } - - let count = if with_count { - match self.reader.read_u16::() { - Ok(c) => c, - Err(e) => return Some(Err(e)), - } - } else { - 1 - }; - - Some(Ok(BenDecodeFrame { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes: raw_assignment, - count, - })) - } - - /// Read and return the next raw TwoDelta frame from the underlying stream. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next TwoDelta frame, `Some(Err(...))` - /// for a read failure, or `None` at a clean end of stream. - fn pop_twodelta_frame_from_reader(&mut self) -> Option> { - let pair_a = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - tracing::trace!(""); - tracing::trace!("Done!"); - return None; - } - return Some(Err(e)); - } - }; - - let pair_b = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - let mut bits = [0u8; 1]; - if let Err(e) = self.reader.read_exact(&mut bits) { - return Some(Err(e)); - } - let max_len_bits = bits[0]; - - let n_bytes = match self.reader.read_u32::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - let mut payload = vec![0u8; n_bytes as usize]; - if let Err(e) = self.reader.read_exact(&mut payload) { - return Some(Err(e)); - } - - let count = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - Some(Ok(StoredBenFrame::TwoDelta { - frame: TwoDeltaFrame::from_parts((pair_a, pair_b), max_len_bits, payload), - count, - })) - } - - /// Read and return the next stored frame from the underlying BEN stream. - /// - /// # Arguments - /// - /// * `&mut self` - The decoder whose internal reader is advanced. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_frame_from_reader(&mut self) -> Option> { - match self.variant { - BenVariant::Standard => self - .pop_standard_frame_from_reader(false) - .map(|res| res.map(StoredBenFrame::Ben)), - BenVariant::MkvChain => self - .pop_standard_frame_from_reader(true) - .map(|res| res.map(StoredBenFrame::Ben)), - BenVariant::TwoDelta => { - if !self.twodelta_consumed_first_frame { - self.twodelta_consumed_first_frame = true; - self.pop_standard_frame_from_reader(true) - .map(|res| res.map(StoredBenFrame::Ben)) - } else { - self.pop_twodelta_frame_from_reader() - } - } - } - } - - /// Consume this decoder and iterate over raw BEN frames instead of - /// materialized assignments. - /// - /// # Returns - /// - /// Returns an iterator that yields raw BEN frames from the remaining input. - pub fn into_frames(self) -> BenFrameDecoeder { - BenFrameDecoeder { inner: self } - } - - /// Count the number of samples remaining in the BEN stream. - /// - /// This consumes the decoder but only walks frame boundaries rather than - /// expanding every assignment into a full vector. - /// - /// # Returns - /// - /// Returns the number of remaining samples in the stream. - pub fn count_samples(self) -> io::Result { - let mut this = self; - let mut total = 0usize; - while let Some(frame_res) = this.pop_frame_from_reader() { - total += frame_res?.count() as usize; - } - Ok(total) - } - - /// Decode assignments and pass each one to a callback by reference. - /// - /// Unlike the `Iterator` implementation, this avoids cloning the assignment - /// buffer on every frame. The decoder owns a single buffer, mutates it in - /// place for TwoDelta frames, and lends `&[u16]` to the callback. This - /// eliminates one full-length memcpy per frame. - /// - /// The callback receives a borrowed assignment slice and its repetition - /// count. Return `true` to continue decoding or `false` to stop early. - /// - /// # Arguments - /// - /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. - /// - /// # Returns - /// - /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. - pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> - where - F: FnMut(&[u16], u16) -> io::Result, - { - loop { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Err(e), - None => return Ok(()), - }; - - let count = frame.count(); - - match frame { - StoredBenFrame::Ben(ben_frame) => { - let assignment = decode_ben_frame_to_assignment(&ben_frame)?; - let keep_going = f(&assignment, count)?; - self.previous_assignment = Some(assignment); - if !keep_going { - return Ok(()); - } - } - StoredBenFrame::TwoDelta { frame, count } => { - let assignment = self.previous_assignment.take().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta frame encountered before an initial BEN frame", - ) - })?; - let run_lengths = frame.run_length_vector; - let assignment = - apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths)?; - let keep_going = f(&assignment, count)?; - self.previous_assignment = Some(assignment); - if !keep_going { - return Ok(()); - } - } - } - - self.sample_count += count as usize; - if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); - } - } - } -} - -/// Decode a raw BEN frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame` - The raw BEN frame to decode. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { - decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bits, - frame.max_len_bits, - frame.n_bytes, - ) - .map(rle_to_vec) -} - -/// Apply decoded TwoDelta run lengths to produce a new assignment vector. -/// -/// Positions in `previous_assignment` that hold either value of `pair` are -/// overwritten according to the alternating run-length encoding. -/// -/// # Arguments -/// -/// * `assignment` - The assignment from the preceding frame (mutated in place). -/// * `pair` - The two label values that participate in the delta. -/// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. -/// -/// # Returns -/// -/// Returns the updated assignment vector. -fn apply_twodelta_runs_to_assignment( - mut assignment: Vec, - pair: (u16, u16), - run_lengths: &[u16], -) -> io::Result> { - let (first, second) = pair; - - let mut run_idx = 0usize; - let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); - let mut current_value = first; - - for val in assignment.iter_mut() { - if *val == first || *val == second { - if remaining_in_run == 0 { - run_idx += 1; - if run_idx >= run_lengths.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta payload exhausted before all pair positions were covered", - )); - } - remaining_in_run = run_lengths[run_idx]; - current_value = if current_value == first { - second - } else { - first - }; - } - *val = current_value; - remaining_in_run -= 1; - } - } - - Ok(assignment) -} - -/// Decode a raw TwoDelta frame into a full assignment vector. -/// -/// Unpacks the bitpacked run lengths from the frame payload, then applies -/// them in a single pass over the assignment. -/// -/// # Arguments -/// -/// * `assignment` - The assignment from the preceding frame (mutated in place). -/// * `frame` - The TwoDelta frame whose packed payload is decoded and applied. -/// -/// # Returns -/// -/// Returns the updated assignment vector. -fn decode_twodelta_frame_to_assignment( - assignment: Vec, - frame: &TwoDeltaFrame, -) -> io::Result> { - apply_twodelta_runs_to_assignment(assignment, frame.pair, &frame.run_length_vector) -} - -/// Decode a stored BEN frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `previous_assignment` - The assignment from the preceding frame, required -/// for TwoDelta frames. -/// * `frame` - The stored frame to decode. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_stored_frame_to_assignment( - previous_assignment: &mut Option>, - frame: &StoredBenFrame, -) -> io::Result> { - match frame { - StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), - StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( - previous_assignment.take().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta frame encountered before an initial BEN frame", - ) - })?, - frame, - ), - } -} - -impl Iterator for BenDecoder { - type Item = io::Result; - - /// Decode and return the next assignment from the BEN stream. - fn next(&mut self) -> Option> { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; - let assignment = - match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { - Ok(assgn) => assgn, - Err(e) => return Some(Err(e)), - }; - let count = frame.count(); - self.previous_assignment = Some(assignment.clone()); - self.sample_count += count as usize; - if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); - } - Some(Ok((assignment, count))) - } -} - -/// Iterator over raw BEN frames. -pub struct BenFrameDecoeder { - inner: BenDecoder, -} - -impl BenFrameDecoeder { - /// Create a raw BEN frame iterator from a reader. - /// - /// # Arguments - /// - /// * `reader` - The input BEN stream, including its 17-byte banner. - /// - /// # Returns - /// - /// Returns an iterator over raw BEN frames. - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: BenDecoder::new(reader)?, - }) - } -} - -impl Iterator for BenFrameDecoeder { - type Item = io::Result; - - /// Return the next raw BEN frame from the input stream. - fn next(&mut self) -> Option { - match self.inner.variant { - BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() - { - Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), - Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - "unexpected TwoDelta frame in non-TwoDelta BEN stream", - ))), - Some(Err(err)) => Some(Err(err)), - None => None, - }, - BenVariant::TwoDelta => match self.inner.next() { - Some(Ok((assignment, count))) => { - let encoded = BenDecodeFrame::from_assignment(&assignment); - let raw_data = encoded.as_slice()[6..].to_vec(); - Some(Ok(BenDecodeFrame { - max_val_bits: encoded.max_val_bit_count, - max_len_bits: encoded.max_len_bit_count, - count, - n_bytes: encoded.n_bytes, - raw_bytes: raw_data, - })) - } - Some(Err(err)) => Some(Err(err)), - None => None, - }, - } - } -} - -/// Iterator over decoded assignments in an XBEN stream. -pub struct XBenDecoder { - xz: BufReader>, - /// Variant encoded in the XBEN banner. - pub variant: BenVariant, - overflow: Vec, - buf: Box<[u8]>, - previous_assignment: Option>, - chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, -} - -impl XBenDecoder { - /// Create an XBEN decoder from an already-opened decompressed stream. - /// - /// # Arguments - /// - /// * `xz` - A buffered XZ decompression reader positioned past the banner. - /// * `variant` - The BEN variant indicated by the banner. - /// - /// # Returns - /// - /// Returns a new decoder ready to yield frames from the stream. - pub(crate) fn from_decompressed_stream( - xz: BufReader>, - variant: BenVariant, - ) -> Self { - Self { - xz, - variant, - overflow: Vec::with_capacity(1 << 20), - buf: vec![0u8; 1 << 20].into_boxed_slice(), - previous_assignment: None, - chunk_queue: std::collections::VecDeque::new(), - } - } - - /// Create a decoder for an XBEN stream. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first ben32 frame in the - /// decompressed payload. - pub fn new(reader: R) -> io::Result { - let xz = XzDecoder::new(reader); - let mut xz = BufReader::with_capacity(1 << 20, xz); - - let mut first = [0u8; BANNER_LEN]; - xz.read_exact(&mut first)?; - let variant = variant_from_banner(&first).ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", - ) - })?; - - Ok(Self::from_decompressed_stream(xz, variant)) - } - - /// Try to extract one complete ben32 frame from the buffered overflow. - /// - /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 - /// frame and, for MkvChain streams, reads the trailing repetition count. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain one or more - /// complete ben32 frames. - /// - /// # Returns - /// - /// Returns the frame bytes, the number of consumed bytes, and the decoded - /// repetition count when a complete frame is available. - fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { - match self.variant { - BenVariant::Standard => { - if overflow.len() < 4 { - return None; - } - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let end = i + 1; - let frame = &overflow[..end]; - return Some((frame, end, 1)); - } - } - None - } - BenVariant::MkvChain => { - if overflow.len() < 6 { - return None; - } - for i in (3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let count_hi = overflow[i + 1]; - let count_lo = overflow[i + 2]; - let count = u16::from_be_bytes([count_hi, count_lo]); - let end = i + 3; - let frame = &overflow[..end]; - return Some((frame, end, count)); - } - } - None - } - BenVariant::TwoDelta => None, - } - } - - /// Try to extract one complete TwoDelta frame from the buffered overflow. - /// - /// Inspects the leading tag byte to determine whether the frame is a full - /// RLE frame or a delta frame, then reads the corresponding payload. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain a complete - /// TwoDelta frame. - /// - /// # Returns - /// - /// Returns the parsed frame, the number of consumed bytes, and the - /// repetition count when a complete frame is available. - fn pop_twodelta_frame_from_overflow( - &self, - overflow: &[u8], - ) -> Option> { - let tag = *overflow.first()?; - match tag { - XBEN_TWODELTA_FULL_TAG => { - if overflow.len() < 7 { - return None; - } - let run_count = - u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) - as usize; - let payload_len = run_count.checked_mul(4)?; - let total_len = 1usize - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; - if overflow.len() < total_len { - return None; - } - - let mut runs = Vec::with_capacity(run_count); - let mut cursor = 5usize; - for _ in 0..run_count { - let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); - runs.push((value, len)); - cursor += 4; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) - } - XBEN_TWODELTA_DELTA_TAG => { - if overflow.len() < 11 { - return None; - } - let pair = ( - u16::from_be_bytes([overflow[1], overflow[2]]), - u16::from_be_bytes([overflow[3], overflow[4]]), - ); - let run_count = - u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) - as usize; - let payload_len = run_count.checked_mul(2)?; - let total_len = 1usize - .checked_add(2)? - .checked_add(2)? - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; - if overflow.len() < total_len { - return None; - } - - let mut run_lengths = Vec::with_capacity(run_count); - let mut cursor = 9usize; - for _ in 0..run_count { - run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); - cursor += 2; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok(( - XBenTwoDeltaFrame::Delta { pair, run_lengths }, - total_len, - count, - ))) - } - XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. - _ => Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - "invalid TwoDelta XBEN frame tag", - ))), - } - } - - /// Try to parse a columnar TwoDelta chunk from the overflow buffer. - /// - /// If the overflow starts with the chunk tag and contains enough bytes for - /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. - /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, - /// or `None` when the overflow is incomplete. - fn try_parse_twodelta_chunk(&mut self) -> Option> { - if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { - return None; - } - if self.overflow.len() < 5 { - return None; - } - - let n_frames = u32::from_be_bytes([ - self.overflow[1], - self.overflow[2], - self.overflow[3], - self.overflow[4], - ]) as usize; - - // Calculate total chunk size: tag(1) + n_frames(4) - // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) - let header_len = 5; - let pairs_len = n_frames * 4; - let counts_len = n_frames * 2; - let run_counts_len = n_frames * 4; - let fixed_len = header_len + pairs_len + counts_len + run_counts_len; - - if self.overflow.len() < fixed_len { - return None; - } - - // Read run-length counts to determine total run data size. - let run_counts_start = header_len + pairs_len + counts_len; - let mut total_runs = 0usize; - let mut run_counts = Vec::with_capacity(n_frames); - for i in 0..n_frames { - let offset = run_counts_start + i * 4; - let rc = u32::from_be_bytes([ - self.overflow[offset], - self.overflow[offset + 1], - self.overflow[offset + 2], - self.overflow[offset + 3], - ]) as usize; - run_counts.push(rc); - total_runs += rc; - } - - let run_data_len = total_runs * 2; - let total_len = fixed_len + run_data_len; - if self.overflow.len() < total_len { - return None; - } - - // Parse pairs channel. - let pairs_start = header_len; - // Parse counts channel. - let counts_start = pairs_start + pairs_len; - // Run data starts after run counts. - let run_data_start = run_counts_start + run_counts_len; - - let mut run_cursor = run_data_start; - for i in 0..n_frames { - let po = pairs_start + i * 4; - let pair = ( - u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), - u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), - ); - let co = counts_start + i * 2; - let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); - - let rc = run_counts[i]; - let mut run_lengths = Vec::with_capacity(rc); - for _ in 0..rc { - run_lengths.push(u16::from_be_bytes([ - self.overflow[run_cursor], - self.overflow[run_cursor + 1], - ])); - run_cursor += 2; - } - - self.chunk_queue - .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); - } - - self.overflow.drain(..total_len); - Some(Ok(())) - } - - /// Consume this decoder and iterate over raw ben32 frames instead of - /// materialized assignments. - /// - /// # Returns - /// - /// Returns an iterator that yields raw ben32 frames from the remaining - /// input. - pub fn into_frames(self) -> XBenFrameDecoder { - XBenFrameDecoder { inner: self } - } - - /// Count the number of samples remaining in the XBEN stream. - /// - /// # Returns - /// - /// Returns the number of remaining samples in the stream. - pub fn count_samples(self) -> io::Result { - let mut total = 0usize; - for frame_res in self.into_frames() { - let (_bytes, cnt) = frame_res?; - total += cnt as usize; - } - Ok(total) - } -} - -/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame_bytes` - The ben32 frame bytes. -/// * `variant` - The BEN variant used to interpret the frame tail. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_xben_frame_to_assignment( - frame_bytes: &[u8], - variant: BenVariant, -) -> io::Result> { - let cursor = Cursor::new(frame_bytes); - let (assignment, _) = decode_ben32_line(cursor, variant)?; - Ok(assignment) -} - -impl Iterator for XBenDecoder { - type Item = io::Result; - - /// Decode and return the next assignment from the XBEN stream. - fn next(&mut self) -> Option { - loop { - match self.variant { - BenVariant::Standard | BenVariant::MkvChain => { - if let Some((frame_bytes, consumed, count)) = - self.pop_frame_from_overflow(&self.overflow) - { - let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - Ok((assignment, count)) - } - Err(e) => Err(e), - }; - self.overflow.drain(..consumed); - return Some(res); - } - } - BenVariant::TwoDelta => { - // Drain frames from a previously parsed chunk first. - if let Some((frame, count)) = self.chunk_queue.pop_front() { - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(prev) => { - apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) - } - None => Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN frame encountered before an initial BEN frame", - )), - } - } - }; - return Some(match assignment { - Ok(a) => { - self.previous_assignment = Some(a.clone()); - Ok((a, count)) - } - Err(e) => Err(e), - }); - } - - // Try to parse a columnar chunk. - if let Some(result) = self.try_parse_twodelta_chunk() { - match result { - Ok(()) => continue, // Loop to drain chunk_queue. - Err(e) => return Some(Err(e)), - } - } - - // Try a single legacy frame (tag 0 or 1). - if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { - let res = match parsed { - Ok((frame, consumed, count)) => { - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(previous_assignment) => { - apply_twodelta_runs_to_assignment( - previous_assignment, - pair, - &run_lengths, - ) - } - None => Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN frame encountered before an initial BEN frame", - )), - } - } - }; - match assignment { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - self.overflow.drain(..consumed); - Ok((assignment, count)) - } - Err(err) => { - self.overflow.drain(..consumed); - Err(err) - } - } - } - Err(err) => { - self.overflow.clear(); - Err(err) - } - }; - return Some(res); - } - } - } - - let read = match self.xz.read(&mut self.buf) { - Ok(0) => { - if self.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.overflow.extend_from_slice(&self.buf[..read]); - } - } -} - -/// Iterator over raw ben32 frames inside an XBEN stream. -pub struct XBenFrameDecoder { - inner: XBenDecoder, -} - -impl XBenFrameDecoder { - /// Create a raw XBEN frame iterator from a reader. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns an iterator over raw ben32 frames. - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: XBenDecoder::new(reader)?, - }) - } -} - -impl Iterator for XBenFrameDecoder { - type Item = io::Result; - - /// Return the next raw ben32 frame from the input stream. - fn next(&mut self) -> Option { - if self.inner.variant == BenVariant::TwoDelta { - return self.inner.next().map(|result| { - result.and_then(|(assignment, count)| { - Ok((encode_ben32_assignments(&assignment)?.into_u8_vec()?, count)) - }) - }); - } - - loop { - if let Some((frame, consumed, count)) = - self.inner.pop_frame_from_overflow(&self.inner.overflow) - { - let out = frame.to_vec(); - self.inner.overflow.drain(..consumed); - return Some(Ok((out, count))); - } - - let read = match self.inner.xz.read(&mut self.inner.buf) { - Ok(0) => { - if self.inner.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.inner - .overflow - .extend_from_slice(&self.inner.buf[..read]); - } - } -} - -#[derive(Clone)] -/// A generalized frame type used by the subsampling machinery. -pub enum DecodeFrame { - /// A raw BEN frame. - Ben(BenDecodeFrame), - /// A raw ben32 frame from an XBEN stream together with its variant. - XBen(Vec, BenVariant), -} - -/// A selection strategy for extracting only part of a frame stream. -pub enum Selection { - /// Select explicit 1-based indices. - Indices(Peekable>), - /// Select every `step` samples starting at the 1-based `offset`. - Every { step: usize, offset: usize }, - /// Select the inclusive 1-based range `[start, end]`. - Range { start: usize, end: usize }, -} - -/// Decode a generic frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame` - Either a BEN frame or an XBEN ben32 frame. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { - match frame { - DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), - DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), - } -} - -/// Iterator adaptor that decodes only selected samples from a frame stream. -pub struct SubsampleFrameDecoder -where - I: Iterator>, -{ - inner: I, - selection: Selection, - sample: usize, -} - -impl SubsampleFrameDecoder -where - I: Iterator>, -{ - /// Create a subsampling iterator from a lower-level frame iterator. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `selection` - The sample-selection rule to apply. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn new(inner: I, selection: Selection) -> Self { - Self { - inner, - selection, - sample: 0, - } - } - - /// Select a set of 1-based sample indices. - /// - /// Indices are sorted and deduplicated before iteration begins. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn by_indices(inner: I, indices: T) -> Self - where - T: IntoIterator, - { - let mut v: Vec = indices.into_iter().collect(); - v.sort_unstable(); - v.dedup(); - Self::new(inner, Selection::Indices(v.into_iter().peekable())) - } - - /// Select the inclusive 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn by_range(inner: I, start: usize, end: usize) -> Self { - assert!( - start >= 1 && end >= start, - "range must be 1-based and end >= start" - ); - Self::new(inner, Selection::Range { start, end }) - } - - /// Select every `step` samples beginning from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn every(inner: I, step: usize, offset: usize) -> Self { - assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); - Self::new(inner, Selection::Every { step, offset }) - } - - /// Count how many selected samples fall within an inclusive sample interval. - /// - /// # Arguments - /// - /// * `lo` - The first 1-based sample index covered by the current frame. - /// * `hi` - The last 1-based sample index covered by the current frame. - /// - /// # Returns - /// - /// Returns the number of selected samples represented by the frame. - fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { - match &mut self.selection { - Selection::Indices(iter) => { - let mut taken = 0u16; - while let Some(&next) = iter.peek() { - if next < lo { - iter.next(); - continue; - } - if next > hi { - break; - } - iter.next(); - taken = taken.saturating_add(1); - } - taken - } - Selection::Every { step, offset } => { - let start = lo.max(*offset); - if start > hi { - return 0; - } - let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; - let first = start + ((*step - r) % *step); - if first > hi { - 0 - } else { - (1 + (hi - first) / *step) as u16 - } - } - Selection::Range { start, end } => { - if hi < *start || lo > *end { - 0 - } else { - let a = lo.max(*start); - let b = hi.min(*end); - (b - a + 1) as u16 - } - } - } - } -} - -impl Iterator for SubsampleFrameDecoder -where - I: Iterator>, -{ - type Item = io::Result; - - /// Return the next decoded sample selected by the subsampling rule. - fn next(&mut self) -> Option { - loop { - if let Selection::Range { end, .. } = self.selection { - if self.sample >= end { - return None; - } - } - if let Selection::Indices(ref mut it) = self.selection { - if it.peek().is_none() { - return None; - } - } - - let (frame, count) = match self.inner.next()? { - Ok(x) => x, - Err(e) => return Some(Err(e)), - }; - - let lo = self.sample + 1; - let hi = self.sample + count as usize; - let selected = self.count_selected_in(lo, hi); - - self.sample = hi; - - if selected > 0 { - match decode_frame_to_assignment(&frame) { - Ok(assignment) => return Some(Ok((assignment, selected))), - Err(e) => return Some(Err(e)), - } - } - } - } -} - -/// Build a generic frame iterator from a BEN or XBEN file path. -/// -/// Frame iteration is useful for subsampling and counting because it avoids -/// decoding every sample into a full assignment vector. -/// -/// # Arguments -/// -/// * `file_path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns a boxed iterator over generic frames and their repetition counts. -pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { - let file = File::options().read(true).open(file_path)?; - let reader = BufReader::new(file); - - match mode { - "ben" => { - let frames = BenFrameDecoeder::new(reader)?; - let mapped = frames.map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - Ok(Box::new(mapped)) - } - "xben" => { - let x = XBenDecoder::new(reader)?; - let variant = x.variant; - let frames = x.into_frames(); - let mapped = frames - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - Ok(Box::new(mapped)) - } - _ => Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), - } -} - -impl BenDecoder { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let count = f.count; - (DecodeFrame::Ben(f), count) - }) - }); - SubsampleFrameDecoder::by_indices(frames, indices) - } - - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::by_range(frames, start, end) - } - - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::every(frames, step, offset) - } -} - -impl XBenDecoder { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_indices(Box::new(frames), indices) - } - - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_range(Box::new(frames), start, end) - } - - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::every(Box::new(frames), step, offset) - } -} - -/// Count the number of samples in a BEN or XBEN file on disk. -/// -/// The file is walked frame-by-frame, so this is linear in file size but avoids -/// materializing full assignment vectors. -/// -/// # Arguments -/// -/// * `path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns the number of samples in the file. -pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { - let iter = build_frame_iter(&path.to_path_buf(), mode)?; - let mut total = 0usize; - for item in iter { - let (_frame, cnt) = item?; - total += cnt as usize; - } - Ok(total) -} diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs index dc3acc6..b4cbc4f 100644 --- a/ben/src/io/reader/ben.rs +++ b/ben/src/io/reader/ben.rs @@ -1,5 +1,11 @@ +use super::errors::DecoderInitError; +use super::twodelta::{ + XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_DELTA_TAG, XBEN_TWODELTA_FULL_TAG, +}; use crate::codec::decode::{decode_ben32_line, decode_ben_line}; -use crate::codec::{BenDecodeFrame, TwoDeltaFrame}; +use crate::codec::encode::encode_ben32_assignments; +use crate::codec::{BenDecodeFrame, BenEncodeFrame, TwoDeltaFrame}; +use crate::codec::encode::FromAssign; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; @@ -11,10 +17,6 @@ use std::iter::Peekable; use std::path::{Path, PathBuf}; use xz2::read::XzDecoder; -const XBEN_TWODELTA_FULL_TAG: u8 = 0; -const XBEN_TWODELTA_DELTA_TAG: u8 = 1; -const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; - /// A decoded assignment together with the number of times it repeats. pub type MkvRecord = (Vec, u16); /// A raw ben32 frame together with the number of times it repeats. @@ -22,102 +24,6 @@ pub type Ben32Frame = (Vec, u16); /// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. pub type FrameIter = Box> + Send>; -#[derive(Debug)] -/// Errors produced while validating the header of a decoder input stream. -pub enum DecoderInitError { - /// The leading bytes did not match any supported BEN banner. - InvalidFileFormat(Vec), - /// An I/O error occurred while reading the header. - Io(io::Error), -} - -/// Check whether a header prefix matches the XZ file signature. -/// -/// # Arguments -/// -/// * `h` - The bytes to inspect. -/// -/// # Returns -/// -/// Returns `true` when `h` begins with the standard XZ magic bytes. -fn is_xz_header(h: &[u8]) -> bool { - h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" -} - -/// Convert a byte slice into a space-separated uppercase hex string. -/// -/// # Arguments -/// -/// * `bytes` - The bytes to render. -/// -/// # Returns -/// -/// Returns the formatted hex string. -fn to_hex(bytes: &[u8]) -> String { - bytes - .iter() - .map(|b| format!("{:02X}", b)) - .collect::>() - .join(" ") -} - -impl std::fmt::Display for DecoderInitError { - /// Format the decoder initialization error for display. - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Io(e) => write!(f, "IO error: {e}"), - Self::InvalidFileFormat(header) => { - if is_xz_header(header) { - write!( - f, - "Invalid file format: Compressed header detected (hex: {}). \ - This reader expects an uncompressed .ben file. \ - Decompress this file using the BEN cli `ben -m decode .xben` tool \ - or the `decode_xben_to_ben` function in this library.", - to_hex(header) - ) - } else { - let lossy = String::from_utf8_lossy(header); - write!( - f, - "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", - to_hex(header) - ) - } - } - } - } -} - -impl std::error::Error for DecoderInitError { - /// Return the underlying source error when one exists. - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - DecoderInitError::Io(e) => Some(e), - DecoderInitError::InvalidFileFormat(_) => None, - } - } -} - -impl From for DecoderInitError { - /// Wrap a plain I/O error as a decoder initialization error. - fn from(error: io::Error) -> Self { - DecoderInitError::Io(error) - } -} - -impl From for io::Error { - /// Convert a decoder initialization error into a plain I/O error. - fn from(error: DecoderInitError) -> Self { - match error { - DecoderInitError::Io(e) => e, - DecoderInitError::InvalidFileFormat(msg) => { - io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) - } - } - } -} - /// Iterator over decoded assignments in an uncompressed BEN stream. pub struct BenDecoder { reader: R, @@ -133,16 +39,6 @@ enum StoredBenFrame { TwoDelta { frame: TwoDeltaFrame, count: u16 }, } -enum XBenTwoDeltaFrame { - Full { - runs: Vec<(u16, u16)>, - }, - Delta { - pair: (u16, u16), - run_lengths: Vec, - }, -} - impl StoredBenFrame { fn count(&self) -> u16 { match self { @@ -473,8 +369,8 @@ impl BenDecoder { fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { decode_ben_line( Cursor::new(&frame.raw_bytes), - frame.max_val_bits, - frame.max_len_bits, + frame.max_val_bit_count, + frame.max_len_bit_count, frame.n_bytes, ) .map(rle_to_vec) @@ -644,14 +540,13 @@ impl Iterator for BenFrameDecoeder { }, BenVariant::TwoDelta => match self.inner.next() { Some(Ok((assignment, count))) => { - let encoded = BenDecodeFrame::from_assignment(&assignment); - let raw_data = encoded.as_slice()[6..].to_vec(); + let encoded = BenEncodeFrame::from_assignment(&assignment, None); Some(Ok(BenDecodeFrame { - max_val_bits: encoded.max_val_bit_count, - max_len_bits: encoded.max_len_bit_count, + max_val_bit_count: encoded.max_val_bit_count, + max_len_bit_count: encoded.max_len_bit_count, count, n_bytes: encoded.n_bytes, - raw_bytes: raw_data, + raw_bytes: encoded.raw_bytes[6..].to_vec(), })) } Some(Err(err)) => Some(Err(err)), @@ -1146,7 +1041,7 @@ impl Iterator for XBenFrameDecoder { if self.inner.variant == BenVariant::TwoDelta { return self.inner.next().map(|result| { result.and_then(|(assignment, count)| { - Ok((encode_ben32_assignments(&assignment)?.into_u8_vec()?, count)) + Ok((encode_ben32_assignments(&assignment)?, count)) }) }); } diff --git a/ben/src/io/reader/errors.rs b/ben/src/io/reader/errors.rs index e69de29..bbefac8 100644 --- a/ben/src/io/reader/errors.rs +++ b/ben/src/io/reader/errors.rs @@ -0,0 +1,97 @@ +use std::io; + +#[derive(Debug)] +/// Errors produced while validating the header of a decoder input stream. +pub enum DecoderInitError { + /// The leading bytes did not match any supported BEN banner. + InvalidFileFormat(Vec), + /// An I/O error occurred while reading the header. + Io(io::Error), +} + +/// Check whether a header prefix matches the XZ file signature. +/// +/// # Arguments +/// +/// * `h` - The bytes to inspect. +/// +/// # Returns +/// +/// Returns `true` when `h` begins with the standard XZ magic bytes. +fn is_xz_header(h: &[u8]) -> bool { + h.len() >= 6 && &h[..6] == b"\xFD\x37\x7A\x58\x5A\x00" +} + +/// Convert a byte slice into a space-separated uppercase hex string. +/// +/// # Arguments +/// +/// * `bytes` - The bytes to render. +/// +/// # Returns +/// +/// Returns the formatted hex string. +fn to_hex(bytes: &[u8]) -> String { + bytes + .iter() + .map(|b| format!("{:02X}", b)) + .collect::>() + .join(" ") +} + +impl std::fmt::Display for DecoderInitError { + /// Format the decoder initialization error for display. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(e) => write!(f, "IO error: {e}"), + Self::InvalidFileFormat(header) => { + if is_xz_header(header) { + write!( + f, + "Invalid file format: Compressed header detected (hex: {}). \ + This reader expects an uncompressed .ben file. \ + Decompress this file using the BEN cli `ben -m decode .xben` tool \ + or the `decode_xben_to_ben` function in this library.", + to_hex(header) + ) + } else { + let lossy = String::from_utf8_lossy(header); + write!( + f, + "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", + to_hex(header) + ) + } + } + } + } +} + +impl std::error::Error for DecoderInitError { + /// Return the underlying source error when one exists. + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + DecoderInitError::Io(e) => Some(e), + DecoderInitError::InvalidFileFormat(_) => None, + } + } +} + +impl From for DecoderInitError { + /// Wrap a plain I/O error as a decoder initialization error. + fn from(error: io::Error) -> Self { + DecoderInitError::Io(error) + } +} + +impl From for io::Error { + /// Convert a decoder initialization error into a plain I/O error. + fn from(error: DecoderInitError) -> Self { + match error { + DecoderInitError::Io(e) => e, + DecoderInitError::InvalidFileFormat(msg) => { + io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) + } + } + } +} diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs index 1a01ab4..01ff258 100644 --- a/ben/src/io/reader/mod.rs +++ b/ben/src/io/reader/mod.rs @@ -1 +1,11 @@ pub mod ben; +pub mod errors; +pub(crate) mod tests; +pub(crate) mod twodelta; + +pub use ben::{ + build_frame_iter, count_samples_from_file, Ben32Frame, BenDecoder, BenFrameDecoeder, + DecodeFrame, FrameIter, MkvRecord, Selection, SubsampleFrameDecoder, XBenDecoder, + XBenFrameDecoder, +}; +pub use errors::DecoderInitError; diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs index e69de29..2fb31b6 100644 --- a/ben/src/io/reader/twodelta.rs +++ b/ben/src/io/reader/twodelta.rs @@ -0,0 +1,13 @@ +pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; +pub(super) const XBEN_TWODELTA_DELTA_TAG: u8 = 1; +pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; + +pub(super) enum XBenTwoDeltaFrame { + Full { + runs: Vec<(u16, u16)>, + }, + Delta { + pair: (u16, u16), + run_lengths: Vec, + }, +} diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index a61f940..559f725 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -1,3 +1,4 @@ +use super::twodelta::{DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; @@ -11,12 +12,6 @@ use std::collections::HashMap; use std::io::{self, BufRead, Read, Result, Write}; use xz2::write::XzEncoder; -const XBEN_TWODELTA_FULL_TAG: u8 = 0; -const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; - -/// Default number of delta frames per columnar chunk in XBEN TwoDelta. -pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; - /// A buffered delta frame awaiting chunk serialization. struct BufferedDeltaFrame { pair: (u16, u16), diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 43aa94d..a5a426f 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -1,3 +1,8 @@ pub mod ben; +pub(crate) mod frames; +pub(crate) mod tests; +pub(crate) mod twodelta; +pub(crate) mod utils; pub use ben::{BenEncoder, XBenEncoder}; +pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 2cc0f34..6901077 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -1,5 +1,5 @@ -const XBEN_TWODELTA_FULL_TAG: u8 = 0; -const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; +pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; +pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// Default number of delta frames per columnar chunk in XBEN TwoDelta. pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index af51f60..a7dd4ad 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -1,7 +1,7 @@ //! Relabeling operations for BEN files. use crate::codec::decode::decode_ben_line; -use crate::codec::BenEncodeFrame; +use crate::codec::{BenEncodeFrame, FromRLE}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::io::reader::BenDecoder; use crate::io::writer::BenEncoder; @@ -358,7 +358,7 @@ fn relabel_ben_lines_impl( 1 }; - let relabeled = BenEncodeFrame::from_rle(ben_line); + let relabeled = BenEncodeFrame::from_rle(ben_line, None); writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; @@ -584,7 +584,7 @@ fn relabel_ben_lines_with_map_impl( 1 }; - let relabeled = BenEncodeFrame::from_rle(new_rle.clone()); + let relabeled = BenEncodeFrame::from_rle(new_rle.clone(), None); writer.write_all(relabeled.as_slice())?; if variant == BenVariant::MkvChain { writer.write_all(&count_occurrences.to_be_bytes())?; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index a3fcf07..f2d3259 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -1,6 +1,7 @@ use super::*; use crate::codec::decode::decode_ben_to_jsonl; -use crate::codec::encode::{encode_ben_vec_from_rle, encode_jsonl_to_ben}; +use crate::codec::encode::encode_jsonl_to_ben; +use crate::codec::{BenEncodeFrame, FromRLE}; use crate::util::rle::assign_to_rle; use rand::seq::SliceRandom; use rand::SeedableRng; @@ -29,10 +30,10 @@ where fn test_relabel_ben_line_simple() { let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; - let input = BenFrame::from_rle(in_rle); + let input = BenEncodeFrame::from_rle(in_rle, None); let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; - let expected = BenFrame::from_rle(out_rle); + let expected = BenEncodeFrame::from_rle(out_rle, None); let mut buf = Vec::new(); relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); @@ -203,11 +204,11 @@ fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; let in_rle = assign_to_rle(in_assign); - let input = BenFrame::from_rle(in_rle); + let input = BenEncodeFrame::from_rle(in_rle, None); let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; let out_rle = assign_to_rle(out_assign); - let expected = BenFrame::from_rle(out_rle); + let expected = BenEncodeFrame::from_rle(out_rle, None); let mut new_to_old_map = HashMap::new(); new_to_old_map.insert(0, 2); @@ -238,11 +239,11 @@ fn test_relabel_ben_line_with_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign); - let input = BenFrame::from_rle(in_rle); + let input = BenEncodeFrame::from_rle(in_rle, None); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenFrame::from_rle(out_rle); + let expected = BenEncodeFrame::from_rle(out_rle, None); let mut buf = Vec::new(); relabel_ben_lines_with_map( @@ -269,11 +270,11 @@ fn test_relabel_ben_line_with_large_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign.to_vec()); - let input = BenFrame::from_rle(in_rle); + let input = BenEncodeFrame::from_rle(in_rle, None); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenFrame::from_rle(out_rle); + let expected = BenEncodeFrame::from_rle(out_rle, None); let mut buf = Vec::new(); relabel_ben_lines_with_map( From a0e8f737f6567906b20bd36e7ba33e56f12ac3cd Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 19 Mar 2026 22:32:15 -0600 Subject: [PATCH 041/221] Make a better error system --- ben/src/cli/pben.rs | 4 +- ben/src/codec/decode/ben32.rs | 4 +- ben/src/codec/decode/errors.rs | 37 ++++ ben/src/codec/decode/mod.rs | 2 + ben/src/codec/decode/xz.rs | 19 +- ben/src/codec/encode/errors.rs | 55 ++++- ben/src/codec/encode/jsonl.rs | 7 +- ben/src/codec/encode/mod.rs | 1 + ben/src/codec/encode/twodelta.rs | 86 +++----- ben/src/codec/encode/xz.rs | 15 +- ben/src/codec/translate/errors.rs | 33 +++ ben/src/codec/translate/mod.rs | 28 +-- ben/src/codec/translate/tests.rs | 4 +- ben/src/format/errors.rs | 24 +++ ben/src/format/mod.rs | 2 + ben/src/io/reader/ben.rs | 72 +++---- ben/src/io/reader/errors.rs | 83 +++----- ben/src/io/writer/ben.rs | 329 +++--------------------------- ben/src/io/writer/frames.rs | 28 +++ ben/src/io/writer/utils.rs | 210 +++++++++++++++++++ ben/src/ops/extract/mod.rs | 114 +++-------- ben/src/ops/extract/tests.rs | 16 +- ben/src/ops/relabel/errors.rs | 32 +++ ben/src/ops/relabel/mod.rs | 64 +++--- ben/src/ops/relabel/tests.rs | 4 +- pyben/src/encode/mod.rs | 3 +- 26 files changed, 655 insertions(+), 621 deletions(-) create mode 100644 ben/src/codec/decode/errors.rs create mode 100644 ben/src/codec/translate/errors.rs create mode 100644 ben/src/format/errors.rs create mode 100644 ben/src/ops/relabel/errors.rs diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 9a2b819..3dfe31e 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -223,7 +223,7 @@ fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { /// Read zero-based assignment vectors and encode them as BEN. fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { - let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain); + let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) @@ -239,7 +239,7 @@ fn assignment_encode_ben(reader: R, writer: W) -> i /// Read zero-based assignment vectors and encode them as XBEN. fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { let encoder = XzEncoder::new(writer, 9); - let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain); + let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs index b6b1b92..e630300 100644 --- a/ben/src/codec/decode/ben32.rs +++ b/ben/src/codec/decode/ben32.rs @@ -45,9 +45,7 @@ pub(crate) fn decode_ben32_line( } let count = if variant == BenVariant::MkvChain { - reader - .read_u16::() - .expect("Error when reading sample.") + reader.read_u16::()? } else { 1 }; diff --git a/ben/src/codec/decode/errors.rs b/ben/src/codec/decode/errors.rs new file mode 100644 index 0000000..d2d280b --- /dev/null +++ b/ben/src/codec/decode/errors.rs @@ -0,0 +1,37 @@ +use crate::BenVariant; +use std::io; +use thiserror::Error; + +/// Errors produced while decoding BEN or XBEN streams. +#[derive(Debug, Error)] +pub enum DecodeError { + #[error("TwoDelta run-length vector exhausted after {run_idx} runs \ + before position {pos} was covered")] + TwoDeltaRunsExhausted { run_idx: usize, pos: usize }, + + #[error("unknown XBEN frame tag byte {tag:#04x}")] + XBenUnknownFrameTag { tag: u8 }, + + #[error("truncated XBEN stream: partial frame at end of input")] + XBenTruncated, + + #[error("TwoDelta frame encountered before an initial full-assignment frame")] + TwoDeltaNoAnchorFrame, + + #[error( + "unexpected TwoDelta frame in a non-TwoDelta BEN stream (variant: {variant:?})" + )] + UnexpectedTwoDeltaFrame { variant: BenVariant }, + + #[error("IO error: {0}")] + Io(#[from] io::Error), +} + +impl From for io::Error { + fn from(e: DecodeError) -> Self { + match e { + DecodeError::Io(e) => e, + other => io::Error::new(io::ErrorKind::InvalidData, other), + } + } +} diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index 2036898..9b5cce7 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -2,6 +2,8 @@ mod ben; mod ben32; +pub(crate) mod errors; +pub(crate) use errors::DecodeError; mod xz; pub use ben::{decode_ben_line, decode_ben_to_jsonl}; diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index d6a52cb..aef9793 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,11 +1,12 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::codec::translate::ben32_to_ben_lines; use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; +use crate::format::FormatError; use crate::io::reader::XBenDecoder; use crate::io::writer::BenEncoder; use crate::{progress, BenVariant}; use serde_json::json; -use std::io::{self, BufRead, BufReader, Error, Read, Write}; +use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; /// Decode an XBEN stream into an equivalent BEN stream. @@ -44,7 +45,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BufReader::new(decoder), BenVariant::TwoDelta, ); - let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta); + let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta)?; for record in &mut xben { let (assignment, count) = record?; ben.write_assignment(assignment.clone())?; @@ -55,10 +56,9 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: return Ok(()); } None => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); + return Err(io::Error::from(FormatError::UnknownBanner { + actual: first_buffer.to_vec(), + })); } }; @@ -183,10 +183,9 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i return Ok(()); } None => { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )); + return Err(io::Error::from(FormatError::UnknownBanner { + actual: first_buffer.to_vec(), + })); } }; diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index c1027ca..39e38fc 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -2,13 +2,56 @@ use std::io; use thiserror::Error; #[derive(Debug, Error)] -pub(crate) enum BenEncodeError { - #[error("Encountered a repeated sample when encoding.")] - RepeatedSample, +pub enum EncodeError { + #[error("line {line}: JSON parse error: {source}")] + JsonParse { + line: usize, + #[source] + source: serde_json::Error, + }, + + #[error("line {line}: `assignment` field missing or not an array")] + MissingAssignment { line: usize }, + + #[error("line {line}: value `{value}` cannot be represented as u16")] + InvalidAssignmentValue { line: usize, value: u64 }, + + #[error("TwoDelta transition involves more than two distinct assignment ids")] + TwoDeltaTooManyIds, + + #[error("TwoDelta received identical assignment to previous frame")] + TwoDeltaIdentical, + + #[error("TwoDelta requires equal-length assignment vectors, got {prev_len} vs {new_len}")] + TwoDeltaLengthMismatch { prev_len: usize, new_len: usize }, + + #[error("TwoDelta delta_pair hint provided without corresponding masks")] + TwoDeltaHintWithoutMasks, + + #[error("TwoDelta pair hint has identical values for both ids (value: {value})")] + TwoDeltaIdenticalPairHint { value: u16 }, + + #[error("TwoDelta mask for id {id} is missing from the position map")] + TwoDeltaMissingMask { id: u16 }, + + #[error("TwoDelta mask for id {id} is empty")] + TwoDeltaEmptyMask { id: u16 }, + + #[error("TwoDelta mask referenced position {pos} whose value {actual} is outside the pair ({a}, {b})")] + TwoDeltaMaskOutOfPair { pos: usize, actual: u16, a: u16, b: u16 }, + + #[error("XZ encoder initialization failed: {0}")] + XzInit(#[source] xz2::stream::Error), + + #[error(transparent)] + Io(#[from] io::Error), } -impl From for io::Error { - fn from(error: BenEncodeError) -> Self { - io::Error::new(io::ErrorKind::Other, error) +impl From for io::Error { + fn from(e: EncodeError) -> Self { + match e { + EncodeError::Io(e) => e, + other => io::Error::new(io::ErrorKind::InvalidData, other), + } } } diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 22696aa..f03d11a 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,3 +1,4 @@ +use crate::codec::encode::errors::EncodeError; use crate::io::writer::{BenEncoder, XBenEncoder}; use crate::{progress, BenVariant}; use serde_json::Value; @@ -47,9 +48,9 @@ pub fn encode_jsonl_to_xben( .preset(level) .block_size(0) .encoder() - .expect("init MT encoder"); + .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let encoder = XzEncoder::new_stream(writer, mt); - let mut ben_encoder = XBenEncoder::new(encoder, variant); + let mut ben_encoder = XBenEncoder::new(encoder, variant)?; if let Some(cs) = chunk_size { ben_encoder = ben_encoder.with_chunk_size(cs); } @@ -97,7 +98,7 @@ pub fn encode_jsonl_to_ben( variant: BenVariant, ) -> Result<()> { let mut line_num = 1; - let mut ben_encoder = BenEncoder::new(writer, variant); + let mut ben_encoder = BenEncoder::new(writer, variant)?; for line_result in reader.lines() { progress!("Encoding line: {}\r", line_num); line_num += 1; diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 3a3138b..67aab9c 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -8,6 +8,7 @@ mod twodelta; mod xz; pub(crate) use ben::encode_ben32_assignments; +pub use errors::EncodeError; pub use traits::{FromAssign, FromRLE}; pub(crate) use twodelta::encode_twodelta_frame_with_hint; pub use twodelta::encode_twodelta_frame; diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index b8aa774..2641f86 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -1,4 +1,4 @@ -use super::errors::BenEncodeError; +use super::errors::EncodeError; use crate::codec::frames::TwoDeltaFrame; use std::collections::HashMap; use std::io::{Error, ErrorKind, Result}; @@ -209,30 +209,21 @@ pub(crate) fn encode_twodelta_frame_with_hint( let new_assignment = new_assignment.as_ref(); if previous_assignment.len() != new_assignment.len() { - return Err(Error::new( - ErrorKind::InvalidData, - format!( - "TwoDelta requires previous and new assignment vectors to be of \ - equal length, but got lengths {} and {}", - previous_assignment.len(), - new_assignment.len() - ), - )); + return Err(Error::from(EncodeError::TwoDeltaLengthMismatch { + prev_len: previous_assignment.len(), + new_len: new_assignment.len(), + })); } if delta_pair.is_some() { if masks.is_none() { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair hint provided without corresponding masks", - )); + return Err(Error::from(EncodeError::TwoDeltaHintWithoutMasks)); } let pair = delta_pair.unwrap(); if pair.0 == pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair hint cannot have identical values for the two ids", - )); + return Err(Error::from(EncodeError::TwoDeltaIdenticalPairHint { + value: pair.0, + })); } } @@ -274,36 +265,20 @@ fn validate_masks_and_order_pairs_for_twodelta( ) -> Result<(u16, u16)> { let mask_a = match masks.get(&pair.0) { Some(m) => m, - None => { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask is missing for the previous assignment", - )) - } + None => return Err(Error::from(EncodeError::TwoDeltaMissingMask { id: pair.0 })), }; let mask_b = match masks.get(&pair.1) { Some(m) => m, - None => { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask is missing for the current assignment", - )) - } + None => return Err(Error::from(EncodeError::TwoDeltaMissingMask { id: pair.1 })), }; if mask_a.len() == 0 { - return Err(Error::new( - ErrorKind::InvalidData, - format!("TwoDelta pair mask for the id {} is empty", pair.0), - )); + return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.0 })); }; if mask_b.len() == 0 { - return Err(Error::new( - ErrorKind::InvalidData, - format!("TwoDelta pair mask for the id {} is empty", pair.1), - )); + return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.1 })); }; if mask_a[0] < mask_b[0] { @@ -407,16 +382,20 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let current_value = current[idx]; if previous_value != pair.0 && previous_value != pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta pair mask referenced an index outside the selected id pair", - )); + return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { + pos: idx, + actual: previous_value, + a: pair.0, + b: pair.1, + })); } if current_value != pair.0 && current_value != pair.1 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta payload encountered an assignment outside the selected id pair", - )); + return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { + pos: idx, + actual: current_value, + a: pair.0, + b: pair.1, + })); } if current_value != previous_value { found_assignment_change = true; @@ -432,7 +411,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( // Special error that signals that we can reuse the last TwoDelta frame if !found_assignment_change { - return Err(BenEncodeError::RepeatedSample.into()); + return Err(Error::from(EncodeError::TwoDeltaIdentical)); } masks.insert(pair.0, new_mask_a); @@ -475,7 +454,7 @@ fn construct_twodelta_frame_from_mask_hint( } } - return Err(BenEncodeError::RepeatedSample.into()); + return Err(Error::from(EncodeError::TwoDeltaIdentical)); } /// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no @@ -506,9 +485,11 @@ fn construct_twodelta_frame_from_scratch( let mut run_lengths = Vec::new(); let mut current_value = 0u16; let mut current_run_length = 0u16; + let mut found_assignment_change = false; for (&assign0, &assign1) in previous.iter().zip(current.iter()) { if assign0 != assign1 { + found_assignment_change = true; // We are encoding the current, so the first value we encounter in the current should // be added to the front of the pair for value in [assign1, assign0] { @@ -516,10 +497,7 @@ fn construct_twodelta_frame_from_scratch( // We have found both values for the pair and yet encountered a third value // so this is not a valid TwoDelta transition. if pair_len == 2 { - return Err(Error::new( - ErrorKind::InvalidData, - "TwoDelta transitions may involve at most two assignment ids", - )); + return Err(Error::from(EncodeError::TwoDeltaTooManyIds)); } delta_pair[pair_len] = value; pair_len += 1; @@ -534,6 +512,10 @@ fn construct_twodelta_frame_from_scratch( } } } + + if !found_assignment_change { + return Err(Error::from(EncodeError::TwoDeltaIdentical)); + } run_lengths.push(current_run_length); Ok(TwoDeltaFrame::from_run_lengths( diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 58761da..92f8f92 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -1,4 +1,6 @@ +use crate::codec::encode::errors::EncodeError; use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::format::FormatError; use crate::io::writer::XBenEncoder; use std::io::{self, BufRead, Cursor, Read, Result, Write}; use xz2::stream::MtStreamBuilder; @@ -44,7 +46,7 @@ pub fn xz_compress( .preset(level) .block_size(0) .encoder() - .expect("init MT encoder"); + .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let mut encoder = XzEncoder::new_stream(writer, mt); while let Ok(count) = reader.read(&mut buff) { @@ -99,12 +101,15 @@ pub fn encode_ben_to_xben( .preset(level) .block_size(0) .encoder() - .expect("init MT encoder"); + .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let encoder = XzEncoder::new_stream(writer, mt); - let variant = variant_from_banner(&check_buffer) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; - let mut ben_encoder = XBenEncoder::new(encoder, variant); + let variant = variant_from_banner(&check_buffer).ok_or_else(|| { + io::Error::from(FormatError::UnknownBanner { + actual: check_buffer.to_vec(), + }) + })?; + let mut ben_encoder = XBenEncoder::new(encoder, variant)?; if let Some(cs) = chunk_size { ben_encoder = ben_encoder.with_chunk_size(cs); } diff --git a/ben/src/codec/translate/errors.rs b/ben/src/codec/translate/errors.rs new file mode 100644 index 0000000..c26501d --- /dev/null +++ b/ben/src/codec/translate/errors.rs @@ -0,0 +1,33 @@ +use std::io; +use thiserror::Error; + +/// Errors produced while translating between BEN and ben32 wire formats. +#[derive(Debug, Error)] +pub enum TranslateError { + #[error("ben32 frame payload length {len} is not a multiple of 4")] + Ben32BadLength { len: usize }, + + #[error( + "ben32 frame missing 4-byte zero end-of-line sentinel at offset {offset} \ + (got {actual:?})" + )] + Ben32MissingTerminator { actual: [u8; 4], offset: usize }, + + #[error("TwoDelta BEN streams cannot be translated to ben32/XBEN via this path")] + TwoDeltaUnsupported, + + #[error("IO error: {0}")] + Io(#[from] io::Error), +} + +impl From for io::Error { + fn from(e: TranslateError) -> Self { + match e { + TranslateError::Io(e) => e, + TranslateError::TwoDeltaUnsupported => { + io::Error::new(io::ErrorKind::Unsupported, e.to_string()) + } + other => io::Error::new(io::ErrorKind::InvalidData, other), + } + } +} diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index a40a849..c4e277c 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -1,8 +1,11 @@ //! Translation helpers between BEN and ben32 representations. -use crate::codec::{FromAssign, FromRLE}; +mod errors; +use errors::TranslateError; + +use crate::codec::FromRLE; use byteorder::{BigEndian, ReadBytesExt}; -use std::io::{self, Error, Read, Write}; +use std::io::{self, Read, Write}; use crate::codec::decode::decode_ben_line; use crate::codec::BenEncodeFrame; @@ -24,10 +27,9 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { let mut reader = ben32_vec.as_slice(); if !ben32_vec.len().is_multiple_of(4) { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid ben32 data length", - )); + return Err(io::Error::from(TranslateError::Ben32BadLength { + len: ben32_vec.len(), + })); } for _ in 0..((ben32_vec.len() / 4) - 1) { @@ -40,12 +42,13 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { ben32_rle.push((value, count)); } + let eol_offset = ben32_vec.len(); reader.read_exact(&mut buffer)?; if buffer != [0u8; 4] { - return Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid ben32 data format. Missing end of line separator.", - )); + return Err(io::Error::from(TranslateError::Ben32MissingTerminator { + actual: buffer, + offset: eol_offset, + })); } Ok(BenEncodeFrame::from_rle(ben32_rle, None).into_bytes()) @@ -194,10 +197,7 @@ pub fn ben_to_ben32_lines( writer.write_all(&n_reps.to_be_bytes())?; } BenVariant::TwoDelta => { - return Err(io::Error::new( - io::ErrorKind::Unsupported, - "TwoDelta BEN streams cannot yet be translated to ben32/XBEN", - )); + return Err(io::Error::from(TranslateError::TwoDeltaUnsupported)); } } } diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index d396bc5..185bfb5 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -238,7 +238,7 @@ fn test_random_translation_ben_to_ben32() { fn test_ben32_to_ben_line_rejects_invalid_length() { let err = ben32_to_ben_line(vec![1, 2, 3]).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert_eq!(err.to_string(), "Invalid ben32 data length"); + assert_eq!(err.to_string(), "ben32 frame payload length 3 is not a multiple of 4"); } #[test] @@ -247,7 +247,7 @@ fn test_ben32_to_ben_line_rejects_missing_terminator() { assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), - "Invalid ben32 data format. Missing end of line separator." + "ben32 frame missing 4-byte zero end-of-line sentinel at offset 8 (got [0, 0, 0, 1])" ); } diff --git a/ben/src/format/errors.rs b/ben/src/format/errors.rs new file mode 100644 index 0000000..0c8d3e6 --- /dev/null +++ b/ben/src/format/errors.rs @@ -0,0 +1,24 @@ +use std::io; +use thiserror::Error; + +/// Errors produced while parsing or validating a BEN file header/banner. +#[derive(Debug, Error)] +pub enum FormatError { + #[error( + "unrecognized BEN banner (got {actual:?}; expected one of \ + \"STANDARD BEN FILE\", \"MKVCHAIN BEN FILE\", or \"TWODELTA BEN FILE\")" + )] + UnknownBanner { actual: Vec }, + + #[error("IO error reading banner: {0}")] + Io(#[from] io::Error), +} + +impl From for io::Error { + fn from(e: FormatError) -> Self { + match e { + FormatError::Io(e) => e, + other => io::Error::new(io::ErrorKind::InvalidData, other), + } + } +} diff --git a/ben/src/format/mod.rs b/ben/src/format/mod.rs index 8811025..c0ca8cf 100644 --- a/ben/src/format/mod.rs +++ b/ben/src/format/mod.rs @@ -1,3 +1,5 @@ //! Shared on-disk format metadata for BEN and XBEN streams. pub mod banners; +pub mod errors; +pub use errors::FormatError; diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs index b4cbc4f..ced58db 100644 --- a/ben/src/io/reader/ben.rs +++ b/ben/src/io/reader/ben.rs @@ -2,11 +2,12 @@ use super::errors::DecoderInitError; use super::twodelta::{ XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_DELTA_TAG, XBEN_TWODELTA_FULL_TAG, }; -use crate::codec::decode::{decode_ben32_line, decode_ben_line}; +use crate::codec::decode::{decode_ben32_line, decode_ben_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; use crate::codec::{BenDecodeFrame, BenEncodeFrame, TwoDeltaFrame}; use crate::codec::encode::FromAssign; use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::format::FormatError; use crate::util::rle::rle_to_vec; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -333,10 +334,7 @@ impl BenDecoder { } StoredBenFrame::TwoDelta { frame, count } => { let assignment = self.previous_assignment.take().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta frame encountered before an initial BEN frame", - ) + io::Error::from(DecodeError::TwoDeltaNoAnchorFrame) })?; let run_lengths = frame.run_length_vector; let assignment = @@ -401,15 +399,15 @@ fn apply_twodelta_runs_to_assignment( let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); let mut current_value = first; - for val in assignment.iter_mut() { + for (pos, val) in assignment.iter_mut().enumerate() { if *val == first || *val == second { if remaining_in_run == 0 { run_idx += 1; if run_idx >= run_lengths.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta payload exhausted before all pair positions were covered", - )); + return Err(io::Error::from(DecodeError::TwoDeltaRunsExhausted { + run_idx, + pos, + })); } remaining_in_run = run_lengths[run_idx]; current_value = if current_value == first { @@ -464,12 +462,9 @@ fn decode_stored_frame_to_assignment( match frame { StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( - previous_assignment.take().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta frame encountered before an initial BEN frame", - ) - })?, + previous_assignment + .take() + .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?, frame, ), } @@ -531,9 +526,10 @@ impl Iterator for BenFrameDecoeder { BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() { Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), - Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - "unexpected TwoDelta frame in non-TwoDelta BEN stream", + Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::from( + DecodeError::UnexpectedTwoDeltaFrame { + variant: self.inner.variant, + }, ))), Some(Err(err)) => Some(Err(err)), None => None, @@ -609,10 +605,9 @@ impl XBenDecoder { let mut first = [0u8; BANNER_LEN]; xz.read_exact(&mut first)?; let variant = variant_from_banner(&first).ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "Invalid .xben header (expecting STANDARD/MKVCHAIN/TWODELTA BEN FILE)", - ) + io::Error::from(FormatError::UnknownBanner { + actual: first.to_vec(), + }) })?; Ok(Self::from_decompressed_stream(xz, variant)) @@ -750,10 +745,7 @@ impl XBenDecoder { ))) } XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. - _ => Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - "invalid TwoDelta XBEN frame tag", - ))), + _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { tag }))), } } @@ -924,10 +916,9 @@ impl Iterator for XBenDecoder { Some(prev) => { apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) } - None => Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN frame encountered before an initial BEN frame", - )), + None => { + Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)) + } } } }; @@ -963,9 +954,8 @@ impl Iterator for XBenDecoder { &run_lengths, ) } - None => Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN frame encountered before an initial BEN frame", + None => Err(io::Error::from( + DecodeError::TwoDeltaNoAnchorFrame, )), } } @@ -997,10 +987,7 @@ impl Iterator for XBenDecoder { if self.overflow.is_empty() { return None; } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); } } Ok(n) => n, @@ -1060,10 +1047,7 @@ impl Iterator for XBenFrameDecoder { if self.inner.overflow.is_empty() { return None; } else { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "truncated .xben stream (partial frame at EOF)", - ))); + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); } } Ok(n) => n, @@ -1330,7 +1314,9 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result Err(io::Error::new(io::ErrorKind::InvalidInput, "Unknown mode")), + _ => Err(io::Error::from(DecoderInitError::UnknownMode { + mode: mode.to_string(), + })), } } diff --git a/ben/src/io/reader/errors.rs b/ben/src/io/reader/errors.rs index bbefac8..8abbb2f 100644 --- a/ben/src/io/reader/errors.rs +++ b/ben/src/io/reader/errors.rs @@ -1,13 +1,5 @@ use std::io; - -#[derive(Debug)] -/// Errors produced while validating the header of a decoder input stream. -pub enum DecoderInitError { - /// The leading bytes did not match any supported BEN banner. - InvalidFileFormat(Vec), - /// An I/O error occurred while reading the header. - Io(io::Error), -} +use thiserror::Error; /// Check whether a header prefix matches the XZ file signature. /// @@ -39,49 +31,39 @@ fn to_hex(bytes: &[u8]) -> String { .join(" ") } -impl std::fmt::Display for DecoderInitError { - /// Format the decoder initialization error for display. - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Io(e) => write!(f, "IO error: {e}"), - Self::InvalidFileFormat(header) => { - if is_xz_header(header) { - write!( - f, - "Invalid file format: Compressed header detected (hex: {}). \ - This reader expects an uncompressed .ben file. \ - Decompress this file using the BEN cli `ben -m decode .xben` tool \ - or the `decode_xben_to_ben` function in this library.", - to_hex(header) - ) - } else { - let lossy = String::from_utf8_lossy(header); - write!( - f, - "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", - to_hex(header) - ) - } - } - } +/// Format an `InvalidFileFormat` byte header into a human-readable error message. +fn format_invalid_file_format(header: &Vec) -> String { + if is_xz_header(header) { + format!( + "Invalid file format: Compressed header detected (hex: {}). \ + This reader expects an uncompressed .ben file. \ + Decompress this file using the BEN cli `ben -m decode .xben` tool \ + or the `decode_xben_to_ben` function in this library.", + to_hex(header) + ) + } else { + let lossy = String::from_utf8_lossy(header); + format!( + "Invalid file format. Found header (utf8-lossy: {lossy:?}, hex: {})", + to_hex(header) + ) } } -impl std::error::Error for DecoderInitError { - /// Return the underlying source error when one exists. - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - DecoderInitError::Io(e) => Some(e), - DecoderInitError::InvalidFileFormat(_) => None, - } - } -} +#[derive(Debug, Error)] +/// Errors produced while validating the header of a decoder input stream. +pub enum DecoderInitError { + /// The leading bytes did not match any supported BEN banner. + #[error("{}", format_invalid_file_format(.0))] + InvalidFileFormat(Vec), -impl From for DecoderInitError { - /// Wrap a plain I/O error as a decoder initialization error. - fn from(error: io::Error) -> Self { - DecoderInitError::Io(error) - } + /// The file mode string was not recognised. + #[error("unknown BEN file mode {mode:?}; expected \"ben\" or \"xben\"")] + UnknownMode { mode: String }, + + /// An I/O error occurred while reading the header. + #[error("IO error: {0}")] + Io(#[from] io::Error), } impl From for io::Error { @@ -89,9 +71,10 @@ impl From for io::Error { fn from(error: DecoderInitError) -> Self { match error { DecoderInitError::Io(e) => e, - DecoderInitError::InvalidFileFormat(msg) => { - io::Error::new(io::ErrorKind::InvalidData, format!("{msg:?}")) + DecoderInitError::UnknownMode { .. } => { + io::Error::new(io::ErrorKind::InvalidInput, error.to_string()) } + other => io::Error::new(io::ErrorKind::InvalidData, other.to_string()), } } } diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index 559f725..c718203 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -1,10 +1,16 @@ -use super::twodelta::{DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; +use super::frames::{AssignmentHints, BufferedBenFrame, BufferedDeltaFrame}; +use super::twodelta::{ + DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, +}; +use super::utils::{ + analyze_twodelta_transition, encode_xben_twodelta_full_frame, is_repeated_assignment, + parse_json_assignment, +}; use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; use crate::codec::{BenEncodeFrame, FromAssign, TwoDeltaFrame}; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::util::rle::assign_to_rle; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use serde_json::Value; @@ -12,237 +18,6 @@ use std::collections::HashMap; use std::io::{self, BufRead, Read, Result, Write}; use xz2::write::XzEncoder; -/// A buffered delta frame awaiting chunk serialization. -struct BufferedDeltaFrame { - pair: (u16, u16), - run_lengths: Vec, - count: u16, -} - -enum BufferedBenFrame { - Ben(BenEncodeFrame), - TwoDelta(TwoDeltaFrame), -} - -impl BufferedBenFrame { - fn as_slice(&self) -> &[u8] { - match self { - Self::Ben(frame) => frame.as_slice(), - Self::TwoDelta(frame) => frame.as_slice(), - } - } -} - -#[derive(Clone, Copy, Debug, Default)] -struct AssignmentHints { - is_repeated: bool, - delta_pair: Option<(u16, u16)>, -} - -/// Check whether two assignment vectors are identical element-by-element. -/// -/// # Arguments -/// -/// * `previous_sample` - The previous assignment vector. -/// * `assign_vec` - The current assignment vector. -/// -/// # Returns -/// -/// Returns `true` if both vectors have the same length and every element matches. -fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return false; - } - - for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { - if previous != current { - return false; - } - } - - true -} - -/// Analyze the transition between two assignment vectors for two-delta encoding. -/// -/// Determines whether the assignments are identical (repeated) or differ by -/// exactly one swapped pair of values, which qualifies for delta encoding. -/// -/// When `masks` are available the pair is detected in O(K) where K is the -/// number of unique label values, by checking each label's mask positions for -/// changes rather than scanning the full assignment array. -/// -/// # Arguments -/// -/// * `previous_sample` - The previous assignment vector. -/// * `assign_vec` - The current assignment vector. -/// * `masks` - An optional index map from each label value to its sorted -/// positions in the previous assignment. -/// -/// # Returns -/// -/// Returns an `AssignmentHints` with `is_repeated` set if the vectors match, -/// or `delta_pair` set if all differences involve exactly two values. -fn analyze_twodelta_transition( - previous_sample: &[u16], - assign_vec: &[u16], - masks: Option<&HashMap>>, -) -> AssignmentHints { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return AssignmentHints::default(); - } - - // Fast path: use masks to find the pair in O(K) instead of O(N). - if let Some(masks) = masks { - if previous_sample == assign_vec { - return AssignmentHints { - is_repeated: true, - delta_pair: None, - }; - } - - // Check each label's mask positions. Only labels involved in the swap - // will have any changed positions; all others short-circuit immediately. - let mut pair: Option<(u16, u16)> = None; - for (&label, positions) in masks { - for &pos in positions { - if assign_vec[pos] != label { - let other = assign_vec[pos]; - match pair { - None => { - pair = Some((label, other)); - break; - } - Some((a, b)) => { - if (label == a || label == b) && (other == a || other == b) { - break; - } - // More than two values involved. - return AssignmentHints { - is_repeated: false, - delta_pair: None, - }; - } - } - } - } - } - - return AssignmentHints { - is_repeated: false, - delta_pair: pair, - }; - } - - // Slow path: full O(N) scan when masks are not available. - let Some(first_mismatch) = previous_sample - .iter() - .zip(assign_vec.iter()) - .position(|(&previous, ¤t)| previous != current) - else { - return AssignmentHints { - is_repeated: true, - delta_pair: None, - }; - }; - - let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); - - for (&previous, ¤t) in previous_sample - .iter() - .zip(assign_vec.iter()) - .skip(first_mismatch + 1) - { - if previous == current { - continue; - } - - if previous != pair.0 && previous != pair.1 { - return AssignmentHints { - is_repeated: false, - delta_pair: None, - }; - } - - if current != pair.0 && current != pair.1 { - return AssignmentHints { - is_repeated: false, - delta_pair: None, - }; - } - } - - AssignmentHints { - is_repeated: false, - delta_pair: Some(pair), - } -} - -/// Extract and validate the `assignment` array from a JSON object. -/// -/// # Arguments -/// -/// * `data` - A JSON value expected to contain an `assignment` array of integers. -/// -/// # Returns -/// -/// Returns a `Vec` of assignment values, or an error if the field is -/// missing, not an array, or contains values that do not fit in a `u16`. -fn parse_json_assignment(data: Value) -> Result> { - let assign_vec = data["assignment"].as_array().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "'assignment' field either missing or is not an array of integers", - ) - })?; - - assign_vec - .iter() - .map(|x| { - let u = x.as_u64().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!( - "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", - x - ), - ) - })?; - - u16::try_from(u).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("The value '{}' is too large to fit in a u16.", u), - ) - }) - }) - .collect() -} - -/// Encode an assignment vector as a full XBEN two-delta frame. -/// -/// The frame begins with a full-frame tag byte followed by RLE-encoded -/// assignment runs in big-endian format. -/// -/// # Arguments -/// -/// * `assignments` - The full assignment vector to encode. -/// -/// # Returns -/// -/// Returns the encoded frame as a byte vector. -fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { - let runs = assign_to_rle(assignments); - let mut bytes = Vec::with_capacity(1 + 4 + runs.len() * 4); - bytes.push(XBEN_TWODELTA_FULL_TAG); - bytes.extend_from_slice(&(runs.len() as u32).to_be_bytes()); - for (value, len) in runs { - bytes.extend_from_slice(&value.to_be_bytes()); - bytes.extend_from_slice(&len.to_be_bytes()); - } - bytes -} - /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { writer: W, @@ -265,10 +40,10 @@ impl BenEncoder { /// # Returns /// /// Returns a new encoder ready to accept assignments or RLE frames. - pub fn new(mut writer: W, variant: BenVariant) -> Self { - writer.write_all(banner_for_variant(variant)).unwrap(); + pub fn new(mut writer: W, variant: BenVariant) -> io::Result { + writer.write_all(banner_for_variant(variant))?; - BenEncoder { + Ok(BenEncoder { writer, previous_sample: Vec::new(), previous_masks: HashMap::new(), @@ -276,7 +51,7 @@ impl BenEncoder { sample_count: 0, complete: false, variant, - } + }) } /// Rebuild the value-to-position index map from the current previous sample. @@ -383,13 +158,8 @@ impl BenEncoder { )?; self.flush_pending_frame()?; - if let Some(pair) = hints.delta_pair { - self.update_masks_for_delta(&assign_vec, pair); - self.previous_sample = assign_vec; - } else { - self.previous_sample = assign_vec; - self.rebuild_previous_masks(); - } + self.previous_sample = assign_vec; + self.rebuild_previous_masks(); self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); self.sample_count = 1; Ok(()) @@ -413,7 +183,7 @@ impl BenEncoder { let encoded = self .previous_encoded_sample .as_ref() - .expect("missing previous BEN frame"); + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing previous BEN frame"))?; self.writer.write_all(encoded.as_slice())?; if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { @@ -452,54 +222,6 @@ impl BenEncoder { Ok(()) } - /// Update the value-to-position masks incrementally for a two-delta transition. - /// - /// Instead of rebuilding the entire mask HashMap, only the positions belonging - /// to the two swapped values are repartitioned. This is O(pair_positions) - /// rather than O(assignment_length). - /// - /// # Arguments - /// - /// * `new_sample` - The new assignment vector after the transition. - /// * `pair` - The two values involved in the delta swap. - fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { - if pair.0 == pair.1 { - return; - } - - let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); - let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); - - let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); - let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); - - let (mut i, mut j) = (0, 0); - while i < pos_a.len() || j < pos_b.len() { - let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { - let p = pos_a[i]; - i += 1; - p - } else { - let p = pos_b[j]; - j += 1; - p - }; - - if new_sample[pos] == pair.0 { - new_a.push(pos); - } else { - new_b.push(pos); - } - } - - if !new_a.is_empty() { - self.previous_masks.insert(pair.0, new_a); - } - if !new_b.is_empty() { - self.previous_masks.insert(pair.1, new_b); - } - } - /// Encode and write a full assignment vector. /// /// # Arguments @@ -536,7 +258,8 @@ impl BenEncoder { /// /// Returns `Ok(())` after the record has been validated and encoded. pub fn write_json_value(&mut self, data: Value) -> Result<()> { - self.write_assignment(parse_json_assignment(data)?) + let new_assign = parse_json_assignment(data)?; + self.write_assignment(new_assign) } /// Flush any buffered repetition state to the underlying writer. @@ -551,8 +274,7 @@ impl BenEncoder { if self.complete { return Ok(()); } - self.flush_pending_frame() - .expect("Error while flushing trailing BEN frame"); + self.flush_pending_frame()?; self.complete = true; Ok(()) } @@ -610,6 +332,7 @@ impl XBenEncoder { /// /// * `new_sample` - The new assignment vector after the transition. /// * `pair` - The two values involved in the delta swap. + #[allow(dead_code)] fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { if pair.0 == pair.1 { return; @@ -728,9 +451,9 @@ impl XBenEncoder { /// # Returns /// /// Returns a new XBEN encoder ready to accept assignments or BEN frames. - pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> Self { - encoder.write_all(banner_for_variant(variant)).unwrap(); - XBenEncoder { + pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> io::Result { + encoder.write_all(banner_for_variant(variant))?; + Ok(XBenEncoder { encoder, previous_assignment: Vec::new(), previous_masks: HashMap::new(), @@ -739,7 +462,7 @@ impl XBenEncoder { variant, chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, chunk_buffer: Vec::new(), - } + }) } /// Set the number of delta frames per columnar chunk. @@ -976,12 +699,10 @@ impl Drop for XBenEncoder { /// Flush any buffered XBEN repetition state during drop. fn drop(&mut self) { if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { - self.flush_pending_frame() - .expect("Error writing last XBEN frame to file"); + let _ = self.flush_pending_frame(); } if !self.chunk_buffer.is_empty() { - self.flush_chunk() - .expect("Error writing last XBEN TwoDelta chunk"); + let _ = self.flush_chunk(); } } } diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs index e69de29..3d32222 100644 --- a/ben/src/io/writer/frames.rs +++ b/ben/src/io/writer/frames.rs @@ -0,0 +1,28 @@ +use crate::codec::{BenEncodeFrame, TwoDeltaFrame}; + +/// A buffered delta frame awaiting chunk serialization. +pub(super) struct BufferedDeltaFrame { + pub pair: (u16, u16), + pub run_lengths: Vec, + pub count: u16, +} + +pub(super) enum BufferedBenFrame { + Ben(BenEncodeFrame), + TwoDelta(TwoDeltaFrame), +} + +impl BufferedBenFrame { + pub fn as_slice(&self) -> &[u8] { + match self { + Self::Ben(frame) => frame.as_slice(), + Self::TwoDelta(frame) => frame.as_slice(), + } + } +} + +#[derive(Clone, Copy, Debug, Default)] +pub(super) struct AssignmentHints { + pub is_repeated: bool, + pub delta_pair: Option<(u16, u16)>, +} diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index e69de29..ebd71bf 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -0,0 +1,210 @@ +use super::frames::AssignmentHints; +use super::twodelta::XBEN_TWODELTA_FULL_TAG; +use crate::util::rle::assign_to_rle; +use serde_json::Value; +use std::collections::HashMap; +use std::io::{self, Result}; + +/// Check whether two assignment vectors are identical element-by-element. +/// +/// # Arguments +/// +/// * `previous_sample` - The previous assignment vector. +/// * `assign_vec` - The current assignment vector. +/// +/// # Returns +/// +/// Returns `true` if both vectors have the same length and every element matches. +pub(super) fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return false; + } + + for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { + if previous != current { + return false; + } + } + + true +} + +/// Analyze the transition between two assignment vectors for two-delta encoding. +/// +/// Determines whether the assignments are identical (repeated) or differ by +/// exactly one swapped pair of values, which qualifies for delta encoding. +/// +/// When `masks` are available the pair is detected in O(K) where K is the +/// number of unique label values, by checking each label's mask positions for +/// changes rather than scanning the full assignment array. +/// +/// # Arguments +/// +/// * `previous_sample` - The previous assignment vector. +/// * `assign_vec` - The current assignment vector. +/// * `masks` - An optional index map from each label value to its sorted +/// positions in the previous assignment. +/// +/// # Returns +/// +/// Returns an `AssignmentHints` with `is_repeated` set if the vectors match, +/// or `delta_pair` set if all differences involve exactly two values. +pub(super) fn analyze_twodelta_transition( + previous_sample: &[u16], + assign_vec: &[u16], + masks: Option<&HashMap>>, +) -> AssignmentHints { + if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { + return AssignmentHints::default(); + } + + // Fast path: use masks to find the pair in O(K) instead of O(N). + if let Some(masks) = masks { + if previous_sample == assign_vec { + return AssignmentHints { + is_repeated: true, + delta_pair: None, + }; + } + + // Check each label's mask positions. Only labels involved in the swap + // will have any changed positions; all others short-circuit immediately. + let mut pair: Option<(u16, u16)> = None; + for (&label, positions) in masks { + for &pos in positions { + if assign_vec[pos] != label { + let other = assign_vec[pos]; + match pair { + None => { + pair = Some((label, other)); + break; + } + Some((a, b)) => { + if (label == a || label == b) && (other == a || other == b) { + break; + } + // More than two values involved. + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + } + } + } + } + + return AssignmentHints { + is_repeated: false, + delta_pair: pair, + }; + } + + // Slow path: full O(N) scan when masks are not available. + let Some(first_mismatch) = previous_sample + .iter() + .zip(assign_vec.iter()) + .position(|(&previous, ¤t)| previous != current) + else { + return AssignmentHints { + is_repeated: true, + delta_pair: None, + }; + }; + + let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); + + for (&previous, ¤t) in previous_sample + .iter() + .zip(assign_vec.iter()) + .skip(first_mismatch + 1) + { + if previous == current { + continue; + } + + if previous != pair.0 && previous != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + + if current != pair.0 && current != pair.1 { + return AssignmentHints { + is_repeated: false, + delta_pair: None, + }; + } + } + + AssignmentHints { + is_repeated: false, + delta_pair: Some(pair), + } +} + +/// Extract and validate the `assignment` array from a JSON object. +/// +/// # Arguments +/// +/// * `data` - A JSON value expected to contain an `assignment` array of integers. +/// +/// # Returns +/// +/// Returns a `Vec` of assignment values, or an error if the field is +/// missing, not an array, or contains values that do not fit in a `u16`. +pub(super) fn parse_json_assignment(data: Value) -> Result> { + let assign_vec = data["assignment"].as_array().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "'assignment' field either missing or is not an array of integers", + ) + })?; + + assign_vec + .iter() + .map(|x| { + let u = x.as_u64().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "The value '{}' could not be unwrapped as an unsigned 64 bit integer.", + x + ), + ) + })?; + + u16::try_from(u).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("The value '{}' is too large to fit in a u16.", u), + ) + }) + }) + .collect() +} + +/// Encode an assignment vector as a full XBEN two-delta frame. +/// +/// The frame begins with a full-frame tag byte followed by RLE-encoded +/// assignment runs in big-endian format. +/// +/// # Arguments +/// +/// * `assignments` - The full assignment vector to encode. +/// +/// # Returns +/// +/// Returns the encoded frame as a byte vector. +pub(super) fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { + let runs = assign_to_rle(assignments); + let mut bytes = Vec::with_capacity(1 + 4 + runs.len() * 4); + bytes.push(XBEN_TWODELTA_FULL_TAG); + bytes.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for (value, len) in runs { + bytes.extend_from_slice(&value.to_be_bytes()); + bytes.extend_from_slice(&len.to_be_bytes()); + } + bytes +} diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 1e3c11a..9de303d 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -3,24 +3,32 @@ use crate::codec::decode::decode_ben32_line; use crate::io::reader::{BenDecoder, XBenDecoder}; use serde_json::Error as SerdeError; -use std::fmt; use std::io::Cursor; use std::io::{self, Read}; +use thiserror::Error; -#[derive(Debug)] -/// Error categories returned when extracting an individual sample from a file. -pub enum SampleErrorKind { +#[derive(Debug, Error)] +/// Error returned by sample extraction helpers. +pub enum SampleError { + /// The provided sample number was zero, which is invalid. + #[error("Invalid sample number. Sample number must be greater than 0")] InvalidSampleNumber, + + /// The requested sample index was not found in the file. + #[error( + "Sample number not found in file. Failed to find sample '{sample_number}'. \ + Last sample seems to be '{}'", + sample_number - 1 + )] SampleNotFound { sample_number: usize }, - IoError(io::Error), - JsonError(SerdeError), -} -#[derive(Debug)] -/// Error returned by sample extraction helpers. -pub struct SampleError { - /// The underlying extraction failure category. - pub kind: SampleErrorKind, + /// An I/O error occurred during extraction. + #[error("IO Error: {0}")] + IoError(#[from] io::Error), + + /// A JSON parsing error occurred during extraction. + #[error("JSON Error: {0}")] + JsonError(#[from] SerdeError), } impl SampleError { @@ -32,63 +40,9 @@ impl SampleError { /// /// # Returns /// - /// Returns a new [`SampleError`] with [`SampleErrorKind::IoError`]. + /// Returns a new [`SampleError`] with [`SampleError::IoError`]. pub fn new_io_error(error: io::Error) -> Self { - SampleError { - kind: SampleErrorKind::IoError(error), - } - } -} - -impl fmt::Display for SampleError { - /// Format the sample extraction error for display. - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match &self.kind { - SampleErrorKind::InvalidSampleNumber => { - write!( - f, - "Invalid sample number. Sample number must be greater than 0" - ) - } - SampleErrorKind::SampleNotFound { sample_number } => { - write!( - f, - "Sample number not found in file. Failed to find sample '{}'. Last sample seems to be '{}'", - sample_number, - sample_number - 1 - ) - } - SampleErrorKind::IoError(e) => write!(f, "IO Error: {}", e), - SampleErrorKind::JsonError(e) => write!(f, "JSON Error: {}", e), - } - } -} - -impl std::error::Error for SampleError { - /// Return the underlying source error when one exists. - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match &self.kind { - SampleErrorKind::InvalidSampleNumber => None, - SampleErrorKind::SampleNotFound { .. } => None, - SampleErrorKind::IoError(e) => Some(e), - SampleErrorKind::JsonError(e) => Some(e), - } - } -} - -impl From for SampleError { - /// Wrap a plain I/O error as a sample extraction error. - fn from(error: io::Error) -> Self { - SampleError::new_io_error(error) - } -} - -impl From for SampleError { - /// Wrap a JSON parsing error as a sample extraction error. - fn from(error: SerdeError) -> Self { - SampleError { - kind: SampleErrorKind::JsonError(error), - } + SampleError::IoError(error) } } @@ -107,13 +61,11 @@ pub fn extract_assignment_ben( sample_number: usize, ) -> Result, SampleError> { if sample_number == 0 { - return Err(SampleError { - kind: SampleErrorKind::InvalidSampleNumber, - }); + return Err(SampleError::InvalidSampleNumber); } let mut current_sample = 1; - let inner_decoder = BenDecoder::new(&mut reader).expect("Failed to create XBenDecoder"); + let inner_decoder = BenDecoder::new(&mut reader).map_err(io::Error::from)?; for record in inner_decoder { let (assignment, count) = record.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + count as usize > sample_number { @@ -122,10 +74,8 @@ pub fn extract_assignment_ben( current_sample += count as usize; } - Err(SampleError { - kind: SampleErrorKind::SampleNotFound { - sample_number: current_sample, - }, + Err(SampleError::SampleNotFound { + sample_number: current_sample, }) } @@ -144,12 +94,10 @@ pub fn extract_assignment_xben( sample_number: usize, ) -> Result, SampleError> { if sample_number == 0 { - return Err(SampleError { - kind: SampleErrorKind::InvalidSampleNumber, - }); + return Err(SampleError::InvalidSampleNumber); } - let inner_decoder = XBenDecoder::new(&mut reader).expect("Failed to create XBenDecoder"); + let inner_decoder = XBenDecoder::new(&mut reader).map_err(SampleError::new_io_error)?; let variant = inner_decoder.variant; let frame_iterator = inner_decoder.into_frames(); @@ -165,10 +113,8 @@ pub fn extract_assignment_xben( current_sample += frame.1 as usize; } - Err(SampleError { - kind: SampleErrorKind::SampleNotFound { - sample_number: current_sample, - }, + Err(SampleError::SampleNotFound { + sample_number: current_sample, }) } diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 3f769ad..89ce057 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -107,9 +107,7 @@ fn test_extract_assignment_sample_too_large() { let result = extract_assignment_ben(&mut reader, sample_number); match result { - Err(SampleError { - kind: SampleErrorKind::SampleNotFound { sample_number: 4 }, - }) => (), + Err(SampleError::SampleNotFound { sample_number: 4 }) => (), _ => panic!( "{}", format!("Expected SampleError::SampleNotFound, got {:?}", result) @@ -120,7 +118,7 @@ fn test_extract_assignment_sample_too_large() { #[test] fn test_extract_assignment_ben_rejects_zero_sample_number() { let err = extract_assignment_ben([].as_slice(), 0).unwrap_err(); - assert!(matches!(err.kind, SampleErrorKind::InvalidSampleNumber)); + assert!(matches!(err, SampleError::InvalidSampleNumber)); assert_eq!( err.to_string(), "Invalid sample number. Sample number must be greater than 0" @@ -154,8 +152,8 @@ fn test_extract_assignment_xben_roundtrip_and_errors() { let missing = extract_assignment_xben(xben.as_slice(), 4).unwrap_err(); assert!(matches!( - missing.kind, - SampleErrorKind::SampleNotFound { sample_number: 4 } + missing, + SampleError::SampleNotFound { sample_number: 4 } )); assert_eq!( missing.to_string(), @@ -164,20 +162,20 @@ fn test_extract_assignment_xben_roundtrip_and_errors() { assert!(missing.source().is_none()); let zero = extract_assignment_xben(xben.as_slice(), 0).unwrap_err(); - assert!(matches!(zero.kind, SampleErrorKind::InvalidSampleNumber)); + assert!(matches!(zero, SampleError::InvalidSampleNumber)); } #[test] fn test_sample_error_conversion_and_sources() { let io_err = io::Error::other("boom"); let sample_err = SampleError::from(io_err); - assert!(matches!(sample_err.kind, SampleErrorKind::IoError(_))); + assert!(matches!(sample_err, SampleError::IoError(_))); assert_eq!(sample_err.to_string(), "IO Error: boom"); assert!(sample_err.source().is_some()); let json_err = serde_json::from_str::("{").unwrap_err(); let sample_err = SampleError::from(json_err); - assert!(matches!(sample_err.kind, SampleErrorKind::JsonError(_))); + assert!(matches!(sample_err, SampleError::JsonError(_))); assert!(sample_err.to_string().starts_with("JSON Error: ")); assert!(sample_err.source().is_some()); } diff --git a/ben/src/ops/relabel/errors.rs b/ben/src/ops/relabel/errors.rs new file mode 100644 index 0000000..d8d52fb --- /dev/null +++ b/ben/src/ops/relabel/errors.rs @@ -0,0 +1,32 @@ +use std::io; +use thiserror::Error; + +/// Errors produced by BEN relabeling operations. +#[derive(Debug, Error)] +pub enum RelabelError { + #[error( + "relabel map must cover a contiguous range of new indices \ + (max index: {max_key}, but {missing} entries are missing)" + )] + NonContiguousMap { max_key: usize, missing: usize }, + + #[error( + "relabel map length {map_len} does not match assignment length {assignment_len}" + )] + LengthMismatch { + map_len: usize, + assignment_len: usize, + }, + + #[error("IO error: {0}")] + Io(#[from] io::Error), +} + +impl From for io::Error { + fn from(e: RelabelError) -> Self { + match e { + RelabelError::Io(e) => e, + other => io::Error::new(io::ErrorKind::InvalidInput, other), + } + } +} diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index a7dd4ad..b9ad465 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -1,15 +1,19 @@ //! Relabeling operations for BEN files. +mod errors; +use errors::RelabelError; + use crate::codec::decode::decode_ben_line; use crate::codec::{BenEncodeFrame, FromRLE}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::format::FormatError; use crate::io::reader::BenDecoder; use crate::io::writer::BenEncoder; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; -use std::io::{self, Cursor, Error, Read, Write}; +use std::io::{self, Cursor, Read, Write}; /// Convert a sparse permutation map into a dense index vector. /// @@ -30,11 +34,12 @@ fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result< permutation[new_idx] = old_idx; } - if permutation.iter().any(|&old_idx| old_idx == usize::MAX) { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Relabel map must contain a contiguous set of new indices", - )); + let missing = permutation.iter().filter(|&&x| x == usize::MAX).count(); + if missing > 0 { + return Err(io::Error::from(RelabelError::NonContiguousMap { + max_key, + missing, + })); } Ok(permutation) @@ -83,14 +88,10 @@ fn canonicalize_assignment(assignment: &[u16]) -> Vec { /// or an error if the lengths do not match. fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result> { if assignment.len() != permutation.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!( - "Relabel map length {} does not match assignment length {}", - permutation.len(), - assignment.len() - ), - )); + return Err(io::Error::from(RelabelError::LengthMismatch { + map_len: permutation.len(), + assignment_len: assignment.len(), + })); } let mut out = vec![0u16; permutation.len()]; @@ -125,7 +126,7 @@ where F: FnMut(&[u16]) -> io::Result>, { let mut decoder = BenDecoder::new(reader)?.silent(true); - let mut encoder = BenEncoder::new(writer, variant); + let mut encoder = BenEncoder::new(writer, variant)?; let mut sample_number = 0usize; decoder.for_each_assignment(|assignment, count| { @@ -168,10 +169,9 @@ fn detect_ben_variant(header: &[u8; 17]) -> io::Result { b"STANDARD BEN FILE" => Ok(BenVariant::Standard), b"MKVCHAIN BEN FILE" => Ok(BenVariant::MkvChain), b"TWODELTA BEN FILE" => Ok(BenVariant::TwoDelta), - _ => Err(Error::new( - io::ErrorKind::InvalidData, - "Invalid file format", - )), + _ => Err(io::Error::from(FormatError::UnknownBanner { + actual: header.to_vec(), + })), } } @@ -427,8 +427,11 @@ fn relabel_ben_file_impl( let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; - let variant = variant_from_banner(&check_buffer) - .ok_or_else(|| Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; + let variant = variant_from_banner(&check_buffer).ok_or_else(|| { + io::Error::from(FormatError::UnknownBanner { + actual: check_buffer.to_vec(), + }) + })?; match variant { BenVariant::Standard | BenVariant::MkvChain => { @@ -558,14 +561,10 @@ fn relabel_ben_lines_with_map_impl( rle_to_vec_in_place(&ben_line, &mut assignment_vec); if assignment_vec.len() != permutation.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!( - "Relabel map length {} does not match assignment length {}", - permutation.len(), - assignment_vec.len() - ), - )); + return Err(io::Error::from(RelabelError::LengthMismatch { + map_len: permutation.len(), + assignment_len: assignment_vec.len(), + })); } for (new_idx, &old_idx) in permutation.iter().enumerate() { @@ -669,8 +668,11 @@ fn relabel_ben_file_with_map_impl( let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; - let variant = variant_from_banner(&check_buffer) - .ok_or_else(|| Error::new(io::ErrorKind::InvalidData, "Invalid file format"))?; + let variant = variant_from_banner(&check_buffer).ok_or_else(|| { + io::Error::from(FormatError::UnknownBanner { + actual: check_buffer.to_vec(), + }) + })?; match variant { BenVariant::Standard | BenVariant::MkvChain => { diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index f2d3259..442e005 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -498,7 +498,7 @@ fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { fn test_relabel_file_rejects_invalid_header() { let err = relabel_ben_file(b"not a valid banner".as_slice(), Vec::new()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert_eq!(err.to_string(), "Invalid file format"); + assert_eq!(err.to_string(), "unrecognized BEN banner (got [110, 111, 116, 32, 97, 32, 118, 97, 108, 105, 100, 32, 98, 97, 110, 110, 101]; expected one of \"STANDARD BEN FILE\", \"MKVCHAIN BEN FILE\", or \"TWODELTA BEN FILE\")"); } #[test] @@ -507,7 +507,7 @@ fn test_relabel_file_with_map_rejects_invalid_header() { relabel_ben_file_with_map(b"not a valid banner".as_slice(), Vec::new(), HashMap::new()) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert_eq!(err.to_string(), "Invalid file format"); + assert_eq!(err.to_string(), "unrecognized BEN banner (got [110, 111, 116, 32, 97, 32, 118, 97, 108, 105, 100, 32, 98, 97, 110, 110, 101]; expected one of \"STANDARD BEN FILE\", \"MKVCHAIN BEN FILE\", or \"TWODELTA BEN FILE\")"); } #[test] diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index df82964..43153c4 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -24,7 +24,8 @@ impl PyBenEncoder { let ben_var = parse_variant(variant.as_deref())?; let writer = open_output(&file_path, overwrite)?; - let encoder = BenEncoder::new(writer, ben_var); + let encoder = BenEncoder::new(writer, ben_var) + .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {}", e)))?; Ok(PyBenEncoder { encoder: Some(encoder), }) From 2ce7fbf6b546b9715ceb11a6431cae7f7f1e76fb Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 00:37:38 -0600 Subject: [PATCH 042/221] Fix up twodelta so that the tests all work again --- ben/src/codec/encode/twodelta.rs | 152 +- ben/src/io/writer/ben.rs | 2 + ben/tests/test_coverage.rs | 2079 +++++++++++++++++ .../test_impls_pipeline.proptest-regressions | 2 + ben/tests/test_impls_pipeline.rs | 49 +- 5 files changed, 2181 insertions(+), 103 deletions(-) create mode 100644 ben/tests/test_coverage.rs diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 2641f86..f73847f 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -103,24 +103,11 @@ impl TwoDeltaFrame { let mut run_length_vector = Vec::new(); let mut buffer: u32 = 0; let mut n_bits_in_buff: u16 = 0; - let mut current: Option = None; for byte in payload { buffer |= (byte as u32).to_be() >> n_bits_in_buff; n_bits_in_buff += 8; - if n_bits_in_buff >= max_len_bit_count as u16 && current.is_none() { - current = Some((buffer >> (32 - max_len_bit_count)) as u16); - buffer <<= max_len_bit_count; - n_bits_in_buff -= max_len_bit_count as u16; - } - - if let Some(item) = current.take() { - if item > 0 { - run_length_vector.push(item); - } - } - while n_bits_in_buff >= max_len_bit_count as u16 { let item = (buffer >> (32 - max_len_bit_count)) as u16; buffer <<= max_len_bit_count; @@ -257,11 +244,12 @@ pub(crate) fn encode_twodelta_frame_with_hint( /// /// # Returns /// -/// The pair reordered so that `pair.0` has a smaller first position than `pair.1`, -/// or an error if either id is absent from `masks` or has an empty position list. +/// The pair reordered so that `pair.0` has a smaller first position in the current vector than +/// `pair.1`, or an error if either id is absent from `masks` or has an empty position list. fn validate_masks_and_order_pairs_for_twodelta( pair: (u16, u16), masks: &HashMap>, + current: &[u16], ) -> Result<(u16, u16)> { let mask_a = match masks.get(&pair.0) { Some(m) => m, @@ -281,7 +269,11 @@ fn validate_masks_and_order_pairs_for_twodelta( return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.1 })); }; - if mask_a[0] < mask_b[0] { + // Order so that pair.0 is the value the new assignment places at the first + // pair position (the lowest index held by either mask). This guarantees + // run_lengths[0] >= 1 with no leading-zero sentinel. + let first_pos = mask_a[0].min(mask_b[0]); + if current[first_pos] == pair.0 { Ok((pair.0, pair.1)) } else { Ok((pair.1, pair.0)) @@ -318,7 +310,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( delta_pair: (u16, u16), masks: &mut HashMap>, ) -> Result { - let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, masks) { + let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, masks, current) { Ok(pair) => pair, Err(e) => { return Err(Error::new( @@ -345,41 +337,26 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let mut new_mask_a = Vec::with_capacity(new_capacity); let mut new_mask_b = Vec::with_capacity(new_capacity); - // Two-pointer merge over the sorted position lists. `current_value` tracks - // which id owns the active run; `current_mask_count` is the length of that run. let (mut i, mut j) = (0usize, 0usize); + // pair.0 is guaranteed to equal current[first_pos] by validate_masks_and_order_pairs_for_twodelta, + // so the first iteration always hits the `new_val == run_value` branch and increments + // the count — no special-case initialization needed. + let mut run_value = pair.0; let mut current_mask_count = 0u16; - let mut current_value = pair.0; - let mut found_assignment_change = false; while i < mask_a.len() || j < mask_b.len() { - // Pick the next position from whichever mask is lower, mirroring the - // merge step used when building pair_positions from two masks. + // Pick the next position from whichever mask is lower. let idx = if j == mask_b.len() || (i < mask_a.len() && mask_a[i] < mask_b[j]) { - if current_value != pair.0 { - run_lengths.push(current_mask_count); - current_mask_count = 1; - current_value = pair.0; - } else { - current_mask_count += 1; - } i += 1; mask_a[i - 1] } else { - if current_value != pair.1 { - run_lengths.push(current_mask_count); - current_mask_count = 1; - current_value = pair.1; - } else { - current_mask_count += 1; - } j += 1; mask_b[j - 1] }; let previous_value = previous[idx]; - let current_value = current[idx]; + let new_val = current[idx]; if previous_value != pair.0 && previous_value != pair.1 { return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { @@ -389,19 +366,27 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( b: pair.1, })); } - if current_value != pair.0 && current_value != pair.1 { + if new_val != pair.0 && new_val != pair.1 { return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { pos: idx, - actual: current_value, + actual: new_val, a: pair.0, b: pair.1, })); } - if current_value != previous_value { + if new_val != previous_value { found_assignment_change = true; } - if current_value == pair.0 { + if new_val == run_value { + current_mask_count += 1; + } else { + run_lengths.push(current_mask_count); + run_value = new_val; + current_mask_count = 1; + } + + if new_val == pair.0 { new_mask_a.push(idx); } else { new_mask_b.push(idx); @@ -460,11 +445,11 @@ fn construct_twodelta_frame_from_mask_hint( /// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no /// hints from the caller. /// -/// Simultaneously discovers the pair and computes run lengths in a single pass over -/// the zipped assignments. Only positions where the two assignments differ are -/// considered; unchanged positions are skipped entirely. The pair is ordered so that -/// the first id encountered in `current` at a changed position becomes `pair.0`, -/// which ensures the run-length sequence begins with the id that appears first. +/// Scans to the first changed position to discover the raw pair values, then makes +/// a second pass from position 0 to build run lengths over all pair positions. +/// `enc_pair.0` is determined lazily at the first pair position encountered in the +/// second pass (which may precede the first changed position), guaranteeing +/// `run_lengths[0] >= 1` with no leading zero. /// /// # Arguments /// @@ -479,49 +464,48 @@ fn construct_twodelta_frame_from_scratch( previous: &[u16], current: &[u16], ) -> Result { - let mut delta_pair = [0u16; 2]; - let mut pair_len = 0usize; - - let mut run_lengths = Vec::new(); - let mut current_value = 0u16; - let mut current_run_length = 0u16; - let mut found_assignment_change = false; - - for (&assign0, &assign1) in previous.iter().zip(current.iter()) { - if assign0 != assign1 { - found_assignment_change = true; - // We are encoding the current, so the first value we encounter in the current should - // be added to the front of the pair - for value in [assign1, assign0] { - if !delta_pair[..pair_len].contains(&value) { - // We have found both values for the pair and yet encountered a third value - // so this is not a valid TwoDelta transition. - if pair_len == 2 { - return Err(Error::from(EncodeError::TwoDeltaTooManyIds)); - } - delta_pair[pair_len] = value; - pair_len += 1; - } + // Find the pair at the first changed position. + let first_change = previous + .iter() + .zip(current.iter()) + .position(|(&p, &c)| p != c) + .ok_or_else(|| Error::from(EncodeError::TwoDeltaIdentical))?; + + let (a, b) = (previous[first_change], current[first_change]); + + // Scan all positions: build run lengths for pair positions in previous. + // enc_pair ordering is determined lazily at the first pair position encountered: + // curr_val there is enc_pair.0, which may precede first_change for unchanged pair positions. + let mut enc_pair = (0u16, 0u16); + let mut enc_pair_known = false; + let mut run_lengths: Vec = Vec::new(); + let mut run_value = 0u16; + let mut run_count = 0u16; + + for (&prev_val, &curr_val) in previous.iter().zip(current.iter()) { + if prev_val == a || prev_val == b { + if curr_val != a && curr_val != b { + return Err(Error::from(EncodeError::TwoDeltaTooManyIds)); } - if current_run_length > 0 && current_value != assign1 { - run_lengths.push(current_run_length); - current_run_length = 1; - current_value = assign1; + if !enc_pair_known { + enc_pair = (curr_val, if curr_val == a { b } else { a }); + run_value = enc_pair.0; + enc_pair_known = true; + } + if curr_val == run_value { + run_count += 1; } else { - current_run_length += 1; + run_lengths.push(run_count); + run_value = curr_val; + run_count = 1; } + } else if prev_val != curr_val { + return Err(Error::from(EncodeError::TwoDeltaTooManyIds)); } } + run_lengths.push(run_count); - if !found_assignment_change { - return Err(Error::from(EncodeError::TwoDeltaIdentical)); - } - run_lengths.push(current_run_length); - - Ok(TwoDeltaFrame::from_run_lengths( - (delta_pair[0], delta_pair[1]), - run_lengths, - )) + Ok(TwoDeltaFrame::from_run_lengths(enc_pair, run_lengths)) } /// Encode a transition between two assignment vectors as a TwoDelta frame. diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index c718203..b91f280 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -557,6 +557,8 @@ impl XBenEncoder { count: 1, }); + self.previous_assignment = assign_vec; + if self.chunk_buffer.len() >= self.chunk_size { self.flush_chunk()?; } diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs new file mode 100644 index 0000000..38e0f9b --- /dev/null +++ b/ben/tests/test_coverage.rs @@ -0,0 +1,2079 @@ +//! Rigorous coverage tests for the binary-ensemble `ben` library. +//! +//! These tests target code paths and edge-cases that are not covered by the +//! existing integration / property-based suites. They are deliberately strict: +//! if the implementation behaves in an unexpected way the test should fail +//! rather than silently accept wrong output. + +use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; +use binary_ensemble::codec::encode::{ + encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, encode_twodelta_frame, +}; +use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE, TwoDeltaFrame}; +use binary_ensemble::format::banners::{ + banner_for_variant, has_known_banner_prefix, variant_from_banner, BANNER_LEN, + MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, +}; +use binary_ensemble::io::reader::{ + BenDecoder, BenFrameDecoeder, DecoderInitError, XBenDecoder, XBenFrameDecoder, +}; +use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::json::graph::{ + sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, +}; +use binary_ensemble::ops::relabel::{ + convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, + relabel_ben_file_as_variant_limit, relabel_ben_file_with_map_as_variant, + relabel_ben_file_with_map_as_variant_limit, relabel_ben_lines_limit, +}; +use binary_ensemble::util::rle::{assign_to_rle, rle_to_vec}; +use binary_ensemble::BenVariant; + +use serde_json::json; +use std::collections::HashMap; +use std::io::{self, BufReader, Cursor, Write}; + +// ────────────────────────────────────────────────────────────────────────────── +// Helpers +// ────────────────────────────────────────────────────────────────────────────── + +/// Build a minimal JSONL payload for the given list of assignment vectors. +fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { + let mut buf = Vec::new(); + for (i, a) in assignments.iter().enumerate() { + writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); + } + buf +} + +/// Encode assignments as a Standard BEN byte vector (including the 17-byte banner). +fn encode_standard_ben(assignments: &[Vec]) -> Vec { + let jsonl = jsonl_from_assignments(assignments); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::Standard).unwrap(); + ben +} + +/// Decode a BEN byte vector back to JSONL. +fn decode_ben_to_string(ben: &[u8]) -> String { + let mut out = Vec::new(); + decode_ben_to_jsonl(ben, &mut out).unwrap(); + String::from_utf8(out).unwrap() +} + +/// Encode assignments as an XBEN (compressed) byte vector. +fn encode_xben(assignments: &[Vec], variant: BenVariant) -> Vec { + let jsonl = jsonl_from_assignments(assignments); + let mut xben = Vec::new(); + encode_jsonl_to_xben( + Cursor::new(jsonl), + &mut xben, + variant, + Some(1), + Some(1), + None, + ) + .unwrap(); + xben +} + +/// Build a ring-graph JSON string with `n` nodes (0-based ids). +/// Each node i is connected to (i-1) mod n and (i+1) mod n. +fn make_ring_graph_json(n: usize) -> String { + let nodes: Vec = (0..n).map(|i| json!({"id": i})).collect(); + let adjacency: Vec = (0..n) + .map(|i| { + let prev = (i + n - 1) % n; + let next = (i + 1) % n; + json!([{"id": prev}, {"id": next}]) + }) + .collect(); + serde_json::to_string(&json!({"nodes": nodes, "adjacency": adjacency})).unwrap() +} + +// ────────────────────────────────────────────────────────────────────────────── +// format::banners +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn banner_constants_have_correct_length() { + assert_eq!(STANDARD_BEN_BANNER.len(), BANNER_LEN); + assert_eq!(MKVCHAIN_BEN_BANNER.len(), BANNER_LEN); + assert_eq!(TWODELTA_BEN_BANNER.len(), BANNER_LEN); + assert_eq!(BANNER_LEN, 17); +} + +#[test] +fn banner_constants_have_correct_content() { + assert_eq!(STANDARD_BEN_BANNER, b"STANDARD BEN FILE"); + assert_eq!(MKVCHAIN_BEN_BANNER, b"MKVCHAIN BEN FILE"); + assert_eq!(TWODELTA_BEN_BANNER, b"TWODELTA BEN FILE"); +} + +#[test] +fn banner_for_variant_returns_correct_banners() { + assert_eq!( + banner_for_variant(BenVariant::Standard), + STANDARD_BEN_BANNER + ); + assert_eq!( + banner_for_variant(BenVariant::MkvChain), + MKVCHAIN_BEN_BANNER + ); + assert_eq!( + banner_for_variant(BenVariant::TwoDelta), + TWODELTA_BEN_BANNER + ); +} + +#[test] +fn variant_from_banner_round_trips_all_variants() { + assert_eq!( + variant_from_banner(STANDARD_BEN_BANNER), + Some(BenVariant::Standard) + ); + assert_eq!( + variant_from_banner(MKVCHAIN_BEN_BANNER), + Some(BenVariant::MkvChain) + ); + assert_eq!( + variant_from_banner(TWODELTA_BEN_BANNER), + Some(BenVariant::TwoDelta) + ); +} + +#[test] +fn variant_from_banner_returns_none_for_unknown_banner() { + let bad: [u8; 17] = *b"BAD BAD BAD BAD!!"; + assert_eq!(variant_from_banner(&bad), None); +} + +#[test] +fn variant_from_banner_returns_none_for_all_zeros() { + let zeros = [0u8; 17]; + assert_eq!(variant_from_banner(&zeros), None); +} + +#[test] +fn variant_from_banner_returns_none_for_partial_match() { + // First 16 bytes match STANDARD BEN FILE but last byte is wrong. + let mut partial = *STANDARD_BEN_BANNER; + partial[16] = b'X'; + assert_eq!(variant_from_banner(&partial), None); +} + +#[test] +fn has_known_banner_prefix_recognises_all_variants() { + assert!(has_known_banner_prefix(STANDARD_BEN_BANNER)); + assert!(has_known_banner_prefix(MKVCHAIN_BEN_BANNER)); + assert!(has_known_banner_prefix(TWODELTA_BEN_BANNER)); +} + +#[test] +fn has_known_banner_prefix_recognises_prefixed_bytes() { + // Extra bytes after the banner should still match. + let mut extended = STANDARD_BEN_BANNER.to_vec(); + extended.extend_from_slice(b"\x01\x02\x03"); + assert!(has_known_banner_prefix(&extended)); +} + +#[test] +fn has_known_banner_prefix_rejects_garbage() { + assert!(!has_known_banner_prefix(b"NOT A BEN FILE!!!")); + assert!(!has_known_banner_prefix(b"")); + assert!(!has_known_banner_prefix(b"\x00")); +} + +// ────────────────────────────────────────────────────────────────────────────── +// util::rle +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn assign_to_rle_empty_vec() { + let v: Vec = vec![]; + assert_eq!(assign_to_rle(&v), vec![]); +} + +#[test] +fn rle_to_vec_empty_vec() { + let rle: Vec<(u16, u16)> = vec![]; + assert_eq!(rle_to_vec(rle), Vec::::new()); +} + +#[test] +fn assign_to_rle_single_element() { + assert_eq!(assign_to_rle(&[42u16]), vec![(42, 1)]); +} + +#[test] +fn assign_to_rle_all_same() { + let v = vec![7u16; 100]; + assert_eq!(assign_to_rle(&v), vec![(7, 100)]); +} + +#[test] +fn assign_to_rle_all_different() { + let v = vec![1u16, 2, 3, 4, 5]; + let expected = vec![(1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]; + assert_eq!(assign_to_rle(&v), expected); +} + +#[test] +fn assign_to_rle_alternating() { + let v = vec![1u16, 2, 1, 2, 1, 2]; + let expected = vec![(1, 1), (2, 1), (1, 1), (2, 1), (1, 1), (2, 1)]; + assert_eq!(assign_to_rle(&v), expected); +} + +#[test] +fn assign_to_rle_with_zero_values() { + let v = vec![0u16, 0, 1, 0, 0]; + let expected = vec![(0, 2), (1, 1), (0, 2)]; + assert_eq!(assign_to_rle(&v), expected); +} + +#[test] +fn assign_to_rle_max_u16_value() { + let v = vec![65535u16; 3]; + assert_eq!(assign_to_rle(&v), vec![(65535, 3)]); +} + +#[test] +fn rle_to_vec_single_long_run() { + let rle = vec![(99u16, 1000u16)]; + let result = rle_to_vec(rle); + assert_eq!(result.len(), 1000); + assert!(result.iter().all(|&v| v == 99)); +} + +#[test] +fn rle_roundtrip_preserves_data() { + let original = vec![3u16, 3, 3, 1, 1, 4, 4, 4, 4, 2]; + let rle = assign_to_rle(&original); + let recovered = rle_to_vec(rle); + assert_eq!(recovered, original); +} + +#[test] +fn rle_roundtrip_with_max_values() { + let original = vec![0u16, 65535, 65535, 0, 1, 65534]; + let rle = assign_to_rle(&original); + let recovered = rle_to_vec(rle); + assert_eq!(recovered, original); +} + +// ────────────────────────────────────────────────────────────────────────────── +// io::reader – DecoderInitError +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn decoder_init_error_display_io_variant() { + let e = DecoderInitError::Io(io::Error::other("disk on fire")); + let msg = e.to_string(); + assert!(msg.contains("disk on fire"), "got: {msg}"); +} + +#[test] +fn decoder_init_error_display_invalid_format_non_xz() { + let header = b"NOT A BEN FILE!!!".to_vec(); + let e = DecoderInitError::InvalidFileFormat(header); + let msg = e.to_string(); + assert!( + msg.contains("Invalid file format"), + "message should mention invalid file format, got: {msg}" + ); +} + +#[test] +fn decoder_init_error_display_xz_header_mentions_compressed() { + // XZ magic bytes: FD 37 7A 58 5A 00 + let mut header = vec![0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00]; + header.extend_from_slice(b" "); + let e = DecoderInitError::InvalidFileFormat(header); + let msg = e.to_string(); + assert!( + msg.to_lowercase().contains("compress"), + "should mention compressed file, got: {msg}" + ); +} + +#[test] +fn decoder_init_error_source_io_variant_has_source() { + use std::error::Error as _; + let e = DecoderInitError::Io(io::Error::other("boom")); + assert!(e.source().is_some()); +} + +#[test] +fn decoder_init_error_source_invalid_format_has_no_source() { + use std::error::Error as _; + let e = DecoderInitError::InvalidFileFormat(b"bad".to_vec()); + assert!(e.source().is_none()); +} + +#[test] +fn decoder_init_error_converts_from_io_error() { + let io_err = io::Error::other("wrapped"); + let init_err = DecoderInitError::from(io_err); + assert!(matches!(init_err, DecoderInitError::Io(_))); +} + +#[test] +fn decoder_init_error_converts_to_io_error_from_io() { + let init_err = DecoderInitError::Io(io::Error::other("pass-through")); + let io_err: io::Error = init_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::Other); +} + +#[test] +fn decoder_init_error_converts_to_io_error_from_invalid_format() { + let init_err = DecoderInitError::InvalidFileFormat(b"garbage".to_vec()); + let io_err: io::Error = init_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); +} + +// ────────────────────────────────────────────────────────────────────────────── +// io::reader – BenDecoder +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_decoder_rejects_empty_input() { + match BenDecoder::new(io::empty()) { + Err(DecoderInitError::Io(_)) => {} + Ok(_) => panic!("expected Io error"), + Err(e) => panic!("unexpected error variant: {e}"), + } +} + +#[test] +fn ben_decoder_rejects_wrong_banner() { + match BenDecoder::new(b"BAD BAD BAD BAD!!".as_slice()) { + Err(DecoderInitError::InvalidFileFormat(_)) => {} + Ok(_) => panic!("expected InvalidFileFormat error"), + Err(e) => panic!("unexpected error variant: {e}"), + } +} + +#[test] +fn ben_decoder_rejects_xz_data_with_helpful_message() { + // Manufacture a valid XZ header prefix. + let xz_magic = b"\xFD\x37\x7A\x58\x5A\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00"; + match BenDecoder::new(xz_magic.as_slice()) { + Err(DecoderInitError::InvalidFileFormat(ref header)) => { + let e = DecoderInitError::InvalidFileFormat(header.clone()); + let msg = e.to_string(); + assert!(msg.to_lowercase().contains("compress"), "got: {msg}"); + } + Ok(_) => panic!("expected InvalidFileFormat error"), + Err(e) => panic!("unexpected error variant: {e}"), + } +} + +#[test] +fn ben_decoder_standard_single_assignment_round_trip() { + let assignment = vec![1u16, 1, 2, 3, 3, 3]; + let ben = encode_standard_ben(&[assignment.clone()]); + + let mut decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 1); + assert_eq!(decoded, assignment); + assert!(decoder.next().is_none()); +} + +#[test] +fn ben_decoder_standard_multiple_assignments_round_trip() { + let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![1u16, 1, 1]]; + let ben = encode_standard_ben(&assignments); + + let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + for expected in &assignments { + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 1); + assert_eq!(&decoded, expected); + } + assert!(decoder.next().is_none()); +} + +#[test] +fn ben_decoder_mkv_preserves_repetition_counts() { + // Three identical lines followed by one different line. + let jsonl = concat!( + r#"{"assignment":[1,2,3],"sample":1}"#, + "\n", + r#"{"assignment":[1,2,3],"sample":2}"#, + "\n", + r#"{"assignment":[1,2,3],"sample":3}"#, + "\n", + r#"{"assignment":[3,2,1],"sample":4}"#, + "\n", + ); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + + let (a1, c1) = decoder.next().unwrap().unwrap(); + assert_eq!(a1, vec![1u16, 2, 3]); + assert_eq!(c1, 3, "expected repetition count of 3, got {c1}"); + + let (a2, c2) = decoder.next().unwrap().unwrap(); + assert_eq!(a2, vec![3u16, 2, 1]); + assert_eq!(c2, 1); + + assert!(decoder.next().is_none()); +} + +#[test] +fn ben_decoder_count_samples_standard() { + let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(ben.as_slice()).unwrap(); + assert_eq!(decoder.count_samples().unwrap(), 3); +} + +#[test] +fn ben_decoder_count_samples_mkv_with_repetitions() { + let jsonl = concat!( + r#"{"assignment":[1],"sample":1}"#, + "\n", + r#"{"assignment":[1],"sample":2}"#, + "\n", + r#"{"assignment":[1],"sample":3}"#, + "\n", + r#"{"assignment":[2],"sample":4}"#, + "\n", + ); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let decoder = BenDecoder::new(ben.as_slice()).unwrap(); + assert_eq!(decoder.count_samples().unwrap(), 4); +} + +#[test] +fn ben_decoder_write_all_jsonl_produces_correct_output() { + let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; + let ben = encode_standard_ben(&assignments); + + let mut decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let mut out = Vec::new(); + decoder.write_all_jsonl(&mut out).unwrap(); + + let expected = concat!( + r#"{"assignment":[1,2,3],"sample":1}"#, + "\n", + r#"{"assignment":[4,5,6],"sample":2}"#, + "\n", + ); + assert_eq!(String::from_utf8(out).unwrap(), expected); +} + +#[test] +fn ben_decoder_for_each_assignment_early_stop() { + let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; + let ben = encode_standard_ben(&assignments); + + let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let mut seen = Vec::new(); + decoder + .for_each_assignment(|a, _count| { + seen.push(a.to_vec()); + Ok(seen.len() < 2) // Stop after second frame. + }) + .unwrap(); + + assert_eq!(seen.len(), 2); + assert_eq!(seen[0], vec![1u16, 2]); + assert_eq!(seen[1], vec![3u16, 4]); +} + +// ────────────────────────────────────────────────────────────────────────────── +// io::reader – XBenDecoder +// ────────────────────────────────────────────────────────────────────────────── + +fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { + let jsonl = jsonl_from_assignments(assignments); + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + variant, + Some(1), + Some(1), + None, + ) + .unwrap(); + xben +} + +#[test] +fn xben_decoder_reads_variant_from_banner_standard() { + let assignments = vec![vec![1u16, 2, 3]]; + let xben = make_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant, BenVariant::Standard); +} + +#[test] +fn xben_decoder_reads_variant_from_banner_mkvchain() { + let assignments = vec![vec![1u16, 2, 3]]; + let xben = make_xben(&assignments, BenVariant::MkvChain); + let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant, BenVariant::MkvChain); +} + +#[test] +fn xben_decoder_reads_variant_from_banner_twodelta() { + // TwoDelta requires an initial "base" sample then transitions. + let base = vec![1u16, 1, 2, 2]; + let second = vec![1u16, 2, 2, 1]; // swap positions 1 & 3 + let xben = make_xben(&[base, second], BenVariant::TwoDelta); + let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant, BenVariant::TwoDelta); +} + +// ────────────────────────────────────────────────────────────────────────────── +// io::writer – BenEncoder +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_encoder_writes_correct_banner_standard() { + let mut out = Vec::new(); + let encoder = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + drop(encoder); + assert!(out.starts_with(STANDARD_BEN_BANNER)); +} + +#[test] +fn ben_encoder_writes_correct_banner_mkvchain() { + let mut out = Vec::new(); + let encoder = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + drop(encoder); + assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); +} + +#[test] +fn ben_encoder_writes_correct_banner_twodelta() { + let mut out = Vec::new(); + let encoder = BenEncoder::new(&mut out, BenVariant::TwoDelta).unwrap(); + drop(encoder); + assert!(out.starts_with(TWODELTA_BEN_BANNER)); +} + +#[test] +fn ben_encoder_standard_single_assignment_round_trip() { + let assignment = vec![1u16, 2, 3, 3, 2, 1]; + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.finish().unwrap(); + } + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let decoded_str = String::from_utf8(decoded).unwrap(); + assert!(decoded_str.contains("\"assignment\":[1,2,3,3,2,1]")); +} + +#[test] +fn ben_encoder_standard_repeat_previous_writes_frames() { + let assignment = vec![5u16, 5, 5]; + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.repeat_previous(2).unwrap(); // 2 extra copies → 3 total + enc.finish().unwrap(); + } + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let decoded_str = String::from_utf8(decoded).unwrap(); + // Three lines expected + assert_eq!(decoded_str.lines().count(), 3, "decoded:\n{decoded_str}"); +} + +#[test] +fn ben_encoder_mkv_repeat_previous_increments_count() { + let assignment = vec![9u16, 8, 7]; + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.repeat_previous(4).unwrap(); // 4 extra → count = 5 + enc.finish().unwrap(); + } + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 5); +} + +#[test] +fn ben_encoder_finish_is_idempotent() { + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + enc.write_assignment(vec![1u16, 2]).unwrap(); + enc.finish().unwrap(); + let len_after_first_finish = enc.finish().unwrap(); // second call + let _ = len_after_first_finish; + } + // The output should decode to exactly one sample (not duplicated). + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 1); +} + +#[test] +fn ben_encoder_write_json_value_valid_input() { + let data = json!({"assignment": [1, 2, 3], "sample": 1}); + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + enc.write_json_value(data).unwrap(); + enc.finish().unwrap(); + } + let decoded_str = decode_ben_to_string(&out); + assert!(decoded_str.contains("\"assignment\":[1,2,3]")); +} + +#[test] +fn ben_encoder_write_json_value_missing_assignment_field_errors() { + let data = json!({"sample": 1}); // no "assignment" + let mut out = Vec::new(); + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let result = enc.write_json_value(data); + assert!( + result.is_err(), + "expected error for missing assignment field" + ); +} + +#[test] +fn ben_encoder_write_json_value_value_too_large_errors() { + // 65536 doesn't fit in u16. + let data = json!({"assignment": [65536], "sample": 1}); + let mut out = Vec::new(); + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let result = enc.write_json_value(data); + assert!(result.is_err(), "expected error for value out of u16 range"); +} + +#[test] +fn ben_encoder_write_json_value_negative_value_errors() { + let data = json!({"assignment": [-1], "sample": 1}); + let mut out = Vec::new(); + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let result = enc.write_json_value(data); + assert!( + result.is_err(), + "expected error for negative assignment value" + ); +} + +#[test] +fn ben_encoder_standard_identical_assignments_still_written() { + // For Standard variant, repeated identical assignments are each written. + let assignment = vec![2u16, 2, 2]; + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.finish().unwrap(); + } + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn ben_encoder_mkv_identical_assignments_deduplicated() { + // MkvChain compresses runs of identical assignments. + let assignment = vec![2u16, 2, 2]; + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.write_assignment(assignment.clone()).unwrap(); + enc.finish().unwrap(); + } + + // The BEN payload should be much smaller than 3 independent frames. + // More importantly, decoding must give back 3 lines. + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { + // Two assignments differing only in two values: valid TwoDelta transition. + let base = vec![1u16, 1, 2, 2, 1, 2]; + let next = vec![2u16, 2, 1, 1, 2, 1]; // all 1s→2s and 2s→1s + let mut out = Vec::new(); + { + let mut enc = BenEncoder::new(&mut out, BenVariant::TwoDelta).unwrap(); + enc.write_assignment(base.clone()).unwrap(); + enc.write_assignment(next.clone()).unwrap(); + enc.finish().unwrap(); + } + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + let lines: Vec<&str> = s.lines().collect(); + assert_eq!(lines.len(), 2, "decoded:\n{s}"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// codec::encode – encode_ben_vec_from_rle and encode_ben_vec_from_assign +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_ben_vec_from_rle_empty_rle() { + // Empty RLE produces a minimal frame with zero payload bytes. + let frame = BenEncodeFrame::from_rle(vec![], None); + // 1 byte max_val_bits + 1 byte max_len_bits + 4 bytes n_bytes = 6 bytes + assert_eq!(frame.as_slice().len(), 6); +} + +#[test] +fn encode_ben_vec_from_assign_and_rle_are_equivalent() { + let assign = vec![3u16, 3, 3, 1, 2, 2]; + let rle = assign_to_rle(&assign); + let via_assign = BenEncodeFrame::from_assignment(&assign, None); + let via_rle = BenEncodeFrame::from_rle(rle, None); + assert_eq!(via_assign.as_slice(), via_rle.as_slice()); +} + +#[test] +fn encode_ben_vec_from_assign_single_element() { + let frame = BenEncodeFrame::from_assignment(&[42u16], None); + assert!(!frame.as_slice().is_empty()); +} + +#[test] +fn encode_ben_vec_from_assign_all_same() { + let assign = vec![7u16; 500]; + let frame = BenEncodeFrame::from_assignment(&assign, None); + // Should encode efficiently — the payload compresses a single run. + assert!(!frame.as_slice().is_empty()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// codec::encode – encode_ben_to_xben +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_ben_to_xben_and_back_standard() { + let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; + let ben = encode_standard_ben(&assignments); + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, None, None, None).unwrap(); + + let mut ben2 = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben2.as_slice(), &mut decoded).unwrap(); + + let expected = concat!( + r#"{"assignment":[1,2,3],"sample":1}"#, + "\n", + r#"{"assignment":[4,5,6],"sample":2}"#, + "\n", + ); + assert_eq!(String::from_utf8(decoded).unwrap(), expected); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – convert_ben_file and convert_ben_file_limit +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn convert_ben_file_standard_to_standard_identity() { + let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + convert_ben_file(ben.as_slice(), &mut out, BenVariant::Standard).unwrap(); + + // Decoding the converted file must match the original assignments. + let original_jsonl = jsonl_from_assignments(&assignments); + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded, original_jsonl); +} + +#[test] +fn convert_ben_file_standard_to_mkvchain() { + let assignments = vec![ + vec![1u16, 2, 3], + vec![1u16, 2, 3], // duplicate + vec![3u16, 2, 1], + ]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + convert_ben_file(ben.as_slice(), &mut out, BenVariant::MkvChain).unwrap(); + + assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + + let expected = jsonl_from_assignments(&assignments); + assert_eq!(decoded, expected); +} + +#[test] +fn convert_ben_file_rejects_invalid_header() { + let err = convert_ben_file( + b"BAD HEADER!!!!!!!!".as_slice(), + Vec::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn convert_ben_file_limit_truncates_to_max_samples() { + let assignments: Vec> = (0..10u16).map(|i| vec![i, i + 1]).collect(); + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::Standard, 4).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 4); +} + +#[test] +fn convert_ben_file_limit_zero_produces_banner_only() { + let assignments = vec![vec![1u16, 2, 3]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::Standard, 0).unwrap(); + + // Banner must be present; no frames. + assert!(out.starts_with(STANDARD_BEN_BANNER)); + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert!(decoded.is_empty()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – relabel_ben_lines_limit +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_lines_limit_truncates_standard() { + let assignments: Vec> = vec![ + vec![3u16, 1, 2], + vec![2u16, 3, 1], + vec![1u16, 2, 3], + vec![3u16, 3, 1], + ]; + let ben = encode_standard_ben(&assignments); + + // Relabel only the payload (strip the 17-byte banner first). + let payload = &ben[BANNER_LEN..]; + let mut relabeled_payload = Vec::new(); + relabel_ben_lines_limit(payload, &mut relabeled_payload, BenVariant::Standard, 2).unwrap(); + + // Reconstruct a full BEN file so we can decode it. + let mut full = STANDARD_BEN_BANNER.to_vec(); + full.extend_from_slice(&relabeled_payload); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(full.as_slice(), &mut decoded).unwrap(); + assert_eq!( + decoded.iter().filter(|&&b| b == b'\n').count(), + 2, + "expected 2 decoded samples" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – relabel_ben_file_as_variant +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_file_as_variant_standard_to_standard() { + let assignments = vec![vec![5u16, 5, 1], vec![1u16, 5, 5]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_as_variant(ben.as_slice(), &mut out, BenVariant::Standard).unwrap(); + + assert!(out.starts_with(STANDARD_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + + // Each frame is canonicalized independently (first-seen within the frame → 1, etc.). + // Frame 1: [5,5,1] → first 5→1, then 1→2 → [1,1,2] + // Frame 2: [1,5,5] → first 1→1, then 5→2 → [1,2,2] + assert!( + s.contains("\"assignment\":[1,1,2]"), + "frame1 mismatch, got: {s}" + ); + assert!( + s.contains("\"assignment\":[1,2,2]"), + "frame2 mismatch, got: {s}" + ); +} + +#[test] +fn relabel_ben_file_as_variant_standard_to_mkvchain() { + let assignments = vec![ + vec![3u16, 1, 2], + vec![3u16, 1, 2], // duplicate + vec![1u16, 3, 2], + ]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_as_variant(ben.as_slice(), &mut out, BenVariant::MkvChain).unwrap(); + + assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn relabel_ben_file_as_variant_rejects_invalid_header() { + let err = relabel_ben_file_as_variant( + b"TOTALLY WRONG!!!!!!".as_slice(), + Vec::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn relabel_ben_file_as_variant_limit_truncates_output() { + let assignments: Vec> = (1u16..=8).map(|i| vec![i, i + 1]).collect(); + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_as_variant_limit(ben.as_slice(), &mut out, BenVariant::Standard, 3).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn relabel_ben_file_as_variant_limit_zero_gives_empty() { + let assignments = vec![vec![1u16, 2, 3]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_as_variant_limit(ben.as_slice(), &mut out, BenVariant::Standard, 0).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert!(decoded.is_empty(), "expected empty output for limit=0"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – relabel_ben_file_with_map_as_variant +// ────────────────────────────────────────────────────────────────────────────── + +/// Build a map that reverses a 3-element assignment: new[0]←old[2], etc. +fn reverse_map_3() -> HashMap { + [(0, 2), (1, 1), (2, 0)].iter().cloned().collect() +} + +#[test] +fn relabel_ben_file_with_map_as_variant_standard_to_standard() { + let assignments = vec![vec![10u16, 20, 30]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_with_map_as_variant( + ben.as_slice(), + &mut out, + reverse_map_3(), + BenVariant::Standard, + ) + .unwrap(); + + assert!(out.starts_with(STANDARD_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + // Reversed: [30, 20, 10] + assert!(s.contains("\"assignment\":[30,20,10]"), "got: {s}"); +} + +#[test] +fn relabel_ben_file_with_map_as_variant_standard_to_mkvchain() { + let assignments = vec![ + vec![1u16, 2, 3], + vec![1u16, 2, 3], // duplicate + vec![3u16, 2, 1], + ]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_with_map_as_variant( + ben.as_slice(), + &mut out, + reverse_map_3(), + BenVariant::MkvChain, + ) + .unwrap(); + + assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn relabel_ben_file_with_map_as_variant_rejects_invalid_header() { + let err = relabel_ben_file_with_map_as_variant( + b"NOT A VALID BEN!!".as_slice(), + Vec::new(), + reverse_map_3(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn relabel_ben_file_with_map_as_variant_limit_truncates() { + let assignments = vec![ + vec![1u16, 2, 3], + vec![3u16, 2, 1], + vec![2u16, 1, 3], + vec![1u16, 3, 2], + vec![2u16, 3, 1], + ]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_with_map_as_variant_limit( + ben.as_slice(), + &mut out, + reverse_map_3(), + BenVariant::Standard, + 3, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +#[test] +fn relabel_ben_file_with_map_as_variant_limit_zero_gives_empty() { + let assignments = vec![vec![1u16, 2, 3]]; + let ben = encode_standard_ben(&assignments); + + let mut out = Vec::new(); + relabel_ben_file_with_map_as_variant_limit( + ben.as_slice(), + &mut out, + reverse_map_3(), + BenVariant::Standard, + 0, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert!(decoded.is_empty()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – dense_permutation edge cases (tested indirectly) +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_file_with_map_detects_gap_in_permutation() { + // new_to_old_map skips index 1 → should fail with InvalidInput. + let assignments = vec![vec![1u16, 2, 3]]; + let ben = encode_standard_ben(&assignments); + + // Map {0→0, 2→2} – index 1 is missing. + let bad_map: HashMap = [(0, 0), (2, 2)].iter().cloned().collect(); + + use binary_ensemble::ops::relabel::relabel_ben_file_with_map; + let err = relabel_ben_file_with_map(ben.as_slice(), Vec::new(), bad_map).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – convert_ben_file with MkvChain truncation +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn convert_ben_file_limit_with_mkvchain_repetitions() { + // 5 copies of the same assignment encoded as one MkvChain frame with count=5. + let jsonl = concat!( + r#"{"assignment":[1,2],"sample":1}"#, + "\n", + r#"{"assignment":[1,2],"sample":2}"#, + "\n", + r#"{"assignment":[1,2],"sample":3}"#, + "\n", + r#"{"assignment":[1,2],"sample":4}"#, + "\n", + r#"{"assignment":[1,2],"sample":5}"#, + "\n", + r#"{"assignment":[3,4],"sample":6}"#, + "\n", + ); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut out = Vec::new(); + convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::MkvChain, 3).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); +} + +// ────────────────────────────────────────────────────────────────────────────── +// ops::relabel – relabel_ben_file TwoDelta (canonicalization path) +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_file_twodelta_canonicalizes_labels() { + // Start with high label values; after canonicalization they should map to 1,2,3. + let file = concat!( + r#"{"assignment":[100,100,200,200,300,300],"sample":1}"#, + "\n", + r#"{"assignment":[100,100,200,200,300,300],"sample":2}"#, + "\n", + r#"{"assignment":[100,200,200,100,300,300],"sample":3}"#, + "\n", + ); + let mut ben = Vec::new(); + encode_jsonl_to_ben(file.as_bytes(), &mut ben, BenVariant::TwoDelta).unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_file(ben.as_slice(), &mut relabeled).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(relabeled.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + + // Canonical: first-seen is 1, second is 2, third is 3. + assert!(s.contains("\"assignment\":[1,1,2,2,3,3]"), "got: {s}"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Encoding – empty assignment vectors +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_and_decode_empty_assignment_standard() { + // An empty assignment is a valid (if unusual) edge case. + let data = json!({"assignment": [], "sample": 1}).to_string() + "\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(data.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + assert!(s.contains("\"assignment\":[]"), "got: {s}"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Encoding – large u16 values +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_and_decode_max_u16_values_standard() { + let assignment = vec![0u16, 65535, 32768, 1, 65534]; + let ben = encode_standard_ben(&[assignment.clone()]); + let decoded_str = decode_ben_to_string(&ben); + assert!( + decoded_str.contains("\"assignment\":[0,65535,32768,1,65534]"), + "got: {decoded_str}" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Encoding – single-sample files +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn single_sample_standard_round_trip() { + let assignment = vec![42u16; 1000]; + let ben = encode_standard_ben(&[assignment.clone()]); + let decoded_str = decode_ben_to_string(&ben); + assert_eq!(decoded_str.lines().count(), 1); + assert!(decoded_str.contains("\"sample\":1")); +} + +#[test] +fn single_sample_mkvchain_round_trip() { + let data = json!({"assignment": [1, 2, 3], "sample": 1}).to_string() + "\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(data.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + assert_eq!(s.lines().count(), 1); + assert!(s.contains("\"assignment\":[1,2,3]"), "got: {s}"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Decode error paths +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn decode_ben_to_jsonl_rejects_empty_input() { + let err = decode_ben_to_jsonl([].as_slice(), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn decode_ben_to_jsonl_rejects_wrong_banner() { + let err = decode_ben_to_jsonl(b"THIS IS NOT BEN!!".as_slice(), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn decode_ben_to_jsonl_rejects_truncated_frame_header() { + // Banner is correct but frame header is incomplete (only 2 bytes of the 6). + let mut data = STANDARD_BEN_BANNER.to_vec(); + data.extend_from_slice(&[0x02, 0x03]); // only 2 bytes of 6-byte header + let err = decode_ben_to_jsonl(data.as_slice(), Vec::new()).unwrap_err(); + assert_ne!(err.kind(), io::ErrorKind::Other); // not just "ok" +} + +// ────────────────────────────────────────────────────────────────────────────── +// XBEN round-trip with various compression levels +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn xben_round_trip_with_level_0_compression() { + let assignments = vec![vec![1u16, 2, 3, 4], vec![4u16, 3, 2, 1]]; + let jsonl = jsonl_from_assignments(&assignments); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::Standard, + Some(1), + Some(0), // compression level 0 + None, + ) + .unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + assert_eq!(decoded, jsonl); +} + +#[test] +fn xben_mkvchain_round_trip_preserves_all_samples() { + let jsonl = concat!( + r#"{"assignment":[1,2,3],"sample":1}"#, + "\n", + r#"{"assignment":[1,2,3],"sample":2}"#, + "\n", + r#"{"assignment":[1,2,3],"sample":3}"#, + "\n", + r#"{"assignment":[3,2,1],"sample":4}"#, + "\n", + ); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + assert_eq!( + decoded.iter().filter(|&&b| b == b'\n').count(), + 4, + "expected 4 decoded lines" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Relabel – file_as_variant with MkvChain source +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_file_as_variant_mkvchain_to_standard() { + // Build a MkvChain file with repetitions. + let jsonl = concat!( + r#"{"assignment":[5,5,3],"sample":1}"#, + "\n", + r#"{"assignment":[5,5,3],"sample":2}"#, + "\n", + r#"{"assignment":[3,3,5],"sample":3}"#, + "\n", + ); + let mut mkv_ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut mkv_ben, BenVariant::MkvChain).unwrap(); + + let mut out = Vec::new(); + relabel_ben_file_as_variant(mkv_ben.as_slice(), &mut out, BenVariant::Standard).unwrap(); + + assert!(out.starts_with(STANDARD_BEN_BANNER)); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + // Canonical labels: 5→1, 3→2 + assert!(s.contains("\"assignment\":[1,1,2]"), "got: {s}"); + assert_eq!(s.lines().count(), 3); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Relabel – with_map_as_variant permutation correctness +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_file_with_map_as_variant_permutes_correctly() { + // Assignment [a, b, c, d] with map {0→3, 1→2, 2→1, 3→0} → [d, c, b, a] + let assignments = vec![vec![10u16, 20, 30, 40]]; + let ben = encode_standard_ben(&assignments); + + let map: HashMap = [(0, 3), (1, 2), (2, 1), (3, 0)].iter().cloned().collect(); + + let mut out = Vec::new(); + relabel_ben_file_with_map_as_variant(ben.as_slice(), &mut out, map, BenVariant::Standard) + .unwrap(); + + let decoded_str = decode_ben_to_string(&out); + assert!( + decoded_str.contains("\"assignment\":[40,30,20,10]"), + "got: {decoded_str}" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// BenDecoder – iterator interface +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_decoder_iterator_collects_all_frames() { + let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6], vec![7u16, 8, 9]]; + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let frames: Vec<_> = decoder.collect::>>().unwrap(); + assert_eq!(frames.len(), 3); + for (i, (a, count)) in frames.iter().enumerate() { + assert_eq!(*count, 1); + assert_eq!(a, &assignments[i]); + } +} + +#[test] +fn ben_decoder_iterator_on_empty_payload_yields_nothing() { + let ben = STANDARD_BEN_BANNER.to_vec(); // banner only, no frames + let decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let frames: Vec<_> = decoder.collect::>>().unwrap(); + assert!(frames.is_empty()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Relabeling – idempotence of canonicalization +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn relabel_ben_file_standard_is_idempotent() { + let assignments = vec![vec![7u16, 3, 5, 1], vec![3u16, 7, 1, 5]]; + let ben = encode_standard_ben(&assignments); + + // First relabeling. + let mut relabeled1 = Vec::new(); + relabel_ben_file(ben.as_slice(), &mut relabeled1).unwrap(); + + // Second relabeling on already-canonical output. + let mut relabeled2 = Vec::new(); + relabel_ben_file(relabeled1.as_slice(), &mut relabeled2).unwrap(); + + // The decoded output of both should be identical. + let mut decoded1 = Vec::new(); + decode_ben_to_jsonl(relabeled1.as_slice(), &mut decoded1).unwrap(); + + let mut decoded2 = Vec::new(); + decode_ben_to_jsonl(relabeled2.as_slice(), &mut decoded2).unwrap(); + + assert_eq!(decoded1, decoded2, "relabeling is not idempotent"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Edge case: assignment with a single unique label +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn single_unique_label_assignment_round_trips() { + let assignment = vec![42u16; 50]; + let ben = encode_standard_ben(&[assignment.clone()]); + let decoded_str = decode_ben_to_string(&ben); + assert!( + decoded_str.contains("\"assignment\":[42,42,42"), + "got: {decoded_str}" + ); +} + +#[test] +fn single_unique_label_relabeled_to_one() { + let assignment = vec![99u16; 10]; + let ben = encode_standard_ben(&[assignment]); + + let mut relabeled = Vec::new(); + relabel_ben_file(ben.as_slice(), &mut relabeled).unwrap(); + + let decoded_str = decode_ben_to_string(&relabeled); + // All 99s should become 1s. + assert!( + decoded_str.contains("\"assignment\":[1,1,1"), + "got: {decoded_str}" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Edge case: frame with maximum run-length value +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_decode_max_run_length_standard() { + // A run of 65535 identical values. + let assignment = vec![7u16; 65535]; + let ben = encode_standard_ben(&[assignment.clone()]); + + let decoded_str = decode_ben_to_string(&ben); + assert!(decoded_str.contains("\"sample\":1")); + // Parse and verify the assignment length. + let parsed: serde_json::Value = serde_json::from_str(decoded_str.trim()).unwrap(); + assert_eq!( + parsed["assignment"].as_array().unwrap().len(), + 65535, + "wrong decoded length" + ); +} + +// ────────────────────────────────────────────────────────────────────────────── +// BenVariant debug / clone / copy +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_variant_clone_and_copy() { + let v = BenVariant::MkvChain; + let v2 = v; // Copy + let v3 = v.clone(); // Clone + assert_eq!(v2, v3); + assert_eq!(v, BenVariant::MkvChain); +} + +#[test] +fn ben_variant_debug() { + let s = format!("{:?}", BenVariant::TwoDelta); + assert_eq!(s, "TwoDelta"); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Cursor::new round trips for Cursor-based readers +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_decoder_accepts_cursor_reader() { + let assignment = vec![1u16, 2, 3]; + let ben = encode_standard_ben(&[assignment.clone()]); + let cursor = Cursor::new(ben); + let mut decoder = BenDecoder::new(cursor).unwrap().silent(true); + let (decoded, _) = decoder.next().unwrap().unwrap(); + assert_eq!(decoded, assignment); +} + +// ────────────────────────────────────────────────────────────────────────────── +// encode_twodelta_frame error paths +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_twodelta_frame_different_lengths_errors() { + let prev = vec![1u16, 2, 3]; + let next = vec![1u16, 2]; + let err = encode_twodelta_frame(&prev, &next).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("equal-length")); +} + +#[test] +fn encode_twodelta_frame_identical_assignments_errors() { + let assign = vec![1u16, 2, 3]; + let err = encode_twodelta_frame(&assign, &assign).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("identical")); +} + +#[test] +fn encode_twodelta_frame_more_than_two_values_errors() { + // prev = [1,2,3], next = [3,1,2]: positions 0,1,2 all change and involve ids 1,2,3 → 3 ids + let prev = vec![1u16, 2, 3]; + let next = vec![3u16, 1, 2]; + let err = encode_twodelta_frame(&prev, &next).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("two distinct assignment ids")); +} + +#[test] +fn encode_twodelta_frame_valid_two_value_transition() { + let prev = vec![1u16, 1, 2, 2]; + let next = vec![2u16, 2, 1, 1]; + let frame = encode_twodelta_frame(&prev, &next).unwrap(); + // All 4 positions belong to the pair, and all flip + assert_eq!(frame.n_bytes as usize, frame.payload().len()); +} + +#[test] +fn encode_twodelta_frame_single_value_swap() { + // Only one position changes: prev[3]=2 → next[3]=1; pair is (new_val, old_val) = (1, 2) + let prev = vec![1u16, 1, 1, 2]; + let next = vec![1u16, 1, 1, 1]; + let frame = encode_twodelta_frame(&prev, &next).unwrap(); + assert_eq!(frame.pair, (1, 2)); +} + +// ────────────────────────────────────────────────────────────────────────────── +// TwoDeltaFrame accessors +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn twodelta_frame_pair_accessor() { + let pair = (3u16, 7u16); + let run_lengths = vec![2u16, 3, 1]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + assert_eq!(frame.pair, pair); +} + +#[test] +fn twodelta_frame_max_len_bits_accessor() { + // max run length = 4 = 0b100 → 3 bits + let pair = (1u16, 2u16); + let run_lengths = vec![4u16, 4]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + assert_eq!(frame.max_len_bit_count, 3); +} + +#[test] +fn twodelta_frame_n_bytes_and_payload_consistent() { + let pair = (5u16, 10u16); + let run_lengths = vec![1u16, 2, 3]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + assert_eq!(frame.n_bytes as usize, frame.payload().len()); +} + +#[test] +fn twodelta_frame_to_bytes_and_as_slice_same() { + let pair = (1u16, 2u16); + let run_lengths = vec![3u16, 2]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + assert_eq!(frame.to_bytes(), frame.as_slice()); +} + +#[test] +fn twodelta_frame_into_bytes_consumes() { + let pair = (1u16, 2u16); + let run_lengths = vec![3u16, 2]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let expected = frame.to_bytes(); + let actual = frame.into_bytes(); + assert_eq!(actual, expected); +} + +#[test] +fn twodelta_frame_from_parts_round_trip() { + let pair = (10u16, 20u16); + let run_lengths = vec![2u16, 5, 1]; + let original = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let reconstructed = TwoDeltaFrame::from_parts( + pair, + original.max_len_bit_count, + original.payload().to_vec(), + ); + assert_eq!(original.as_slice(), reconstructed.as_slice()); + assert_eq!(original.pair, reconstructed.pair); + assert_eq!(original.max_len_bit_count, reconstructed.max_len_bit_count); + assert_eq!(original.n_bytes, reconstructed.n_bytes); +} + +#[test] +fn twodelta_frame_asref_and_deref() { + let pair = (1u16, 2u16); + let run_lengths = vec![3u16]; + let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let as_ref: &[u8] = frame.as_ref(); + let deref: &[u8] = &*frame; + assert_eq!(as_ref, deref); + assert_eq!(as_ref, frame.as_slice()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// EncodeBenFrame (BenFrame from codec::encode) accessors +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn encode_ben_frame_from_rle_runs_accessor() { + let runs = vec![(3u16, 2u16), (5u16, 4u16)]; + let frame = BenEncodeFrame::from_rle(runs.clone(), None); + assert_eq!(frame.runs.as_slice(), runs.as_slice()); +} + +#[test] +fn encode_ben_frame_max_val_bits() { + // max value = 5 = 0b101 → 3 bits + let runs = vec![(1u16, 3u16), (5u16, 2u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + assert_eq!(frame.max_val_bit_count, 3); +} + +#[test] +fn encode_ben_frame_max_len_bits() { + // max run length = 7 = 0b111 → 3 bits + let runs = vec![(1u16, 7u16), (2u16, 1u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + assert_eq!(frame.max_len_bit_count, 3); +} + +#[test] +fn encode_ben_frame_n_bytes_consistent() { + // Frame layout: 1 byte (max_val_bits) + 1 byte (max_len_bits) + 4 bytes (n_bytes header) + n_bytes payload + let runs = vec![(1u16, 5u16), (2u16, 3u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + assert_eq!(frame.n_bytes as usize + 6, frame.as_slice().len()); +} + +#[test] +fn encode_ben_frame_to_bytes_and_as_slice_same() { + let runs = vec![(1u16, 2u16), (3u16, 4u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + assert_eq!(frame.to_bytes(), frame.as_slice()); +} + +#[test] +fn encode_ben_frame_into_bytes_consumes() { + let runs = vec![(1u16, 2u16), (3u16, 4u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + let expected = frame.to_bytes(); + let actual = frame.into_bytes(); + assert_eq!(actual, expected); +} + +#[test] +fn encode_ben_frame_eq_with_vec_u8() { + let runs = vec![(1u16, 2u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + let bytes = frame.to_bytes(); + assert!(frame == bytes); + assert!(bytes == frame); +} + +#[test] +fn encode_ben_frame_asref_and_deref() { + let runs = vec![(1u16, 1u16)]; + let frame = BenEncodeFrame::from_rle(runs, None); + let as_ref: &[u8] = frame.as_ref(); + let deref: &[u8] = &*frame; + assert_eq!(as_ref, deref); + assert_eq!(as_ref, frame.as_slice()); +} + +#[test] +fn encode_ben_frame_from_assignment() { + let assignment = vec![1u16, 1, 2, 2, 3]; + let frame = BenEncodeFrame::from_assignment(&assignment, None); + // Frame from assignment should produce runs + let runs = &frame.runs[..]; + assert_eq!(runs, &[(1u16, 2u16), (2u16, 2u16), (3u16, 1u16)]); +} + +// ────────────────────────────────────────────────────────────────────────────── +// Graph ordering with >8 nodes (triggers multilevel clustering recursion) +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn sort_by_ordering_multilevel_cluster_large_ring_graph() { + // 20-node ring → component > 8 nodes → triggers recursive greedy_cluster_partition path + let graph_json = make_ring_graph_json(20); + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + graph_json.as_bytes(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + + assert_eq!(mapping.len(), 20); + let mut new_ids: Vec = mapping.values().copied().collect(); + new_ids.sort_unstable(); + assert_eq!(new_ids, (0..20).collect::>()); + + let output_json: serde_json::Value = serde_json::from_slice(&output).unwrap(); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 20); +} + +#[test] +fn sort_by_ordering_rcm_large_ring_graph() { + let graph_json = make_ring_graph_json(20); + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + graph_json.as_bytes(), + &mut output, + GraphOrderingMethod::ReverseCuthillMckee, + ) + .unwrap(); + + assert_eq!(mapping.len(), 20); + let mut new_ids: Vec = mapping.values().copied().collect(); + new_ids.sort_unstable(); + assert_eq!(new_ids, (0..20).collect::>()); +} + +#[test] +fn sort_by_ordering_disconnected_graph_multilevel() { + // Two triangles (two disconnected components) + let input = r#"{ + "nodes": [ + {"id": 0}, {"id": 1}, {"id": 2}, + {"id": 3}, {"id": 4}, {"id": 5} + ], + "adjacency": [ + [{"id": 1}, {"id": 2}], + [{"id": 0}, {"id": 2}], + [{"id": 0}, {"id": 1}], + [{"id": 4}, {"id": 5}], + [{"id": 3}, {"id": 5}], + [{"id": 3}, {"id": 4}] + ] + }"#; + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + input.as_bytes(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + assert_eq!(mapping.len(), 6); + let output_json: serde_json::Value = serde_json::from_slice(&output).unwrap(); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 6); +} + +#[test] +fn sort_by_ordering_disconnected_graph_rcm() { + let input = r#"{ + "nodes": [ + {"id": 0}, {"id": 1}, {"id": 2}, + {"id": 3}, {"id": 4}, {"id": 5} + ], + "adjacency": [ + [{"id": 1}, {"id": 2}], + [{"id": 0}, {"id": 2}], + [{"id": 0}, {"id": 1}], + [{"id": 4}, {"id": 5}], + [{"id": 3}, {"id": 5}], + [{"id": 3}, {"id": 4}] + ] + }"#; + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + input.as_bytes(), + &mut output, + GraphOrderingMethod::ReverseCuthillMckee, + ) + .unwrap(); + assert_eq!(mapping.len(), 6); +} + +#[test] +fn graph_invalid_node_id_errors() { + // Negative node id cannot be parsed as usize + let input = r#"{ + "nodes": [{"id": -1}, {"id": 1}], + "adjacency": [[{"id": 1}], [{"id": 0}]] + }"#; + let mut output = Vec::new(); + let result = sort_json_file_by_ordering( + input.as_bytes(), + &mut output, + GraphOrderingMethod::ReverseCuthillMckee, + ); + assert!(result.is_err()); +} + +#[test] +fn graph_unknown_adjacency_node_errors() { + // Edge target id 99 does not exist in node list + let input = r#"{ + "nodes": [{"id": 0}, {"id": 1}], + "adjacency": [[{"id": 99}], [{"id": 0}]] + }"#; + let mut output = Vec::new(); + let result = sort_json_file_by_key(input.as_bytes(), &mut output, "id"); + assert!(result.is_err()); +} + +#[test] +fn graph_invalid_link_id_errors() { + // Edge target id is negative → parse_link_id fails + let input = r#"{ + "nodes": [{"id": 0}, {"id": 1}], + "adjacency": [[{"id": -1}], [{"id": 0}]] + }"#; + let mut output = Vec::new(); + let result = sort_json_file_by_key(input.as_bytes(), &mut output, "id"); + assert!(result.is_err()); +} + +#[test] +fn sort_by_ordering_large_graph_multilevel_verifies_permutation() { + // 30-node ring — large enough that greedy_cluster_partition produces multiple clusters + // and the coarse graph recursion fires + let graph_json = make_ring_graph_json(30); + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + graph_json.as_bytes(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + + assert_eq!(mapping.len(), 30); + // All 30 new ids must be a valid permutation of 0..29 + let mut new_ids: Vec = mapping.values().copied().collect(); + new_ids.sort_unstable(); + assert_eq!(new_ids, (0..30).collect::>()); +} + +// ────────────────────────────────────────────────────────────────────────────── +// XBenDecoder / XBenFrameDecoder +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn xben_decoder_iterator_standard_collects_all() { + let assignments = vec![vec![1u16, 1, 2, 2], vec![3u16, 3, 3, 3]]; + let xben = encode_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + assert_eq!(decoder.variant, BenVariant::Standard); + let results: Vec> = decoder.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn xben_decoder_count_samples_standard() { + let assignments = vec![ + vec![1u16, 2, 1, 2], + vec![3u16, 4, 3, 4], + vec![5u16, 6, 5, 6], + ]; + let xben = encode_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + assert_eq!(decoder.count_samples().unwrap(), 3); +} + +#[test] +fn xben_decoder_count_samples_mkvchain() { + let assignments: Vec> = (0..5u16).map(|i| vec![i, i + 1]).collect(); + let xben = encode_xben(&assignments, BenVariant::MkvChain); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + assert_eq!(decoder.count_samples().unwrap(), 5); +} + +#[test] +fn xben_frame_decoder_new_and_iterate() { + let assignments = vec![vec![1u16, 1, 2], vec![2u16, 2, 1]]; + let xben = encode_xben(&assignments, BenVariant::Standard); + let frame_iter = XBenFrameDecoder::new(Cursor::new(xben)).unwrap(); + let frames: Vec<(Vec, u16)> = frame_iter.map(|r| r.unwrap()).collect(); + assert_eq!(frames.len(), 2); + for (frame_bytes, count) in &frames { + assert_eq!(*count, 1u16); + // Every standard ben32 frame ends with the 4-zero sentinel + assert!(frame_bytes.ends_with(&[0u8, 0, 0, 0])); + } +} + +// ────────────────────────────────────────────────────────────────────────────── +// BenFrameDecoeder (note: typo in source name is intentional) +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_frame_decoder_standard_iterates() { + let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; + let ben = encode_standard_ben(&assignments); + let frame_iter = BenFrameDecoeder::new(Cursor::new(ben)).unwrap(); + let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); + assert_eq!(frames.len(), 2); + assert_eq!(frames[0].count, 1); + assert_eq!(frames[1].count, 1); +} + +#[test] +fn ben_frame_decoder_twodelta_yields_standard_frames() { + let prev = vec![1u16, 1, 2, 2]; + let next = vec![2u16, 2, 1, 1]; + let assignments = vec![prev, next]; + let jsonl = jsonl_from_assignments(&assignments); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::TwoDelta).unwrap(); + + // BenFrameDecoeder should re-encode TwoDelta frames back to standard BEN frames + let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + let frame_iter = decoder.into_frames(); + let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); + assert_eq!(frames.len(), 2); +} + +// ────────────────────────────────────────────────────────────────────────────── +// SubsampleFrameDecoder — BenDecoder subsample methods +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn ben_decoder_subsample_by_indices() { + let assignments: Vec> = (0u16..10).map(|i| vec![i; 4]).collect(); + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + // 1-based indices: 2, 5, 8 + let selected: Vec> = decoder + .into_subsample_by_indices(vec![2usize, 5, 8]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 3); + assert_eq!(selected[0], assignments[1]); // 1-based 2 = 0-based 1 + assert_eq!(selected[1], assignments[4]); // 1-based 5 = 0-based 4 + assert_eq!(selected[2], assignments[7]); // 1-based 8 = 0-based 7 +} + +#[test] +fn ben_decoder_subsample_by_range() { + let assignments: Vec> = (0u16..10).map(|i| vec![i; 3]).collect(); + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + // Inclusive 1-based range [3, 6] + let selected: Vec> = decoder + .into_subsample_by_range(3, 6) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 4); + assert_eq!(selected[0], assignments[2]); // 3rd sample + assert_eq!(selected[3], assignments[5]); // 6th sample +} + +#[test] +fn ben_decoder_subsample_every_nth() { + let assignments: Vec> = (0u16..10).map(|i| vec![i; 2]).collect(); + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + // Every 3rd sample starting at 1-based offset 1: samples 1, 4, 7, 10 + let selected: Vec> = decoder + .into_subsample_every(3, 1) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 4); + assert_eq!(selected[0], assignments[0]); + assert_eq!(selected[1], assignments[3]); + assert_eq!(selected[2], assignments[6]); + assert_eq!(selected[3], assignments[9]); +} + +#[test] +fn ben_decoder_subsample_by_indices_dedup() { + let assignments: Vec> = (0u16..5).map(|i| vec![i; 2]).collect(); + let ben = encode_standard_ben(&assignments); + let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + // Duplicate index 2 → after dedup only samples 2 and 3 are selected + let selected: Vec> = decoder + .into_subsample_by_indices(vec![2usize, 2, 3]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 2); + assert_eq!(selected[0], assignments[1]); + assert_eq!(selected[1], assignments[2]); +} + +// ────────────────────────────────────────────────────────────────────────────── +// SubsampleFrameDecoder — XBenDecoder subsample methods +// ────────────────────────────────────────────────────────────────────────────── + +#[test] +fn xben_decoder_subsample_by_indices() { + let assignments: Vec> = (1u16..=5).map(|i| vec![i; 4]).collect(); + let xben = encode_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let selected: Vec> = decoder + .into_subsample_by_indices(vec![1usize, 3, 5]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 3); + assert_eq!(selected[0], assignments[0]); + assert_eq!(selected[1], assignments[2]); + assert_eq!(selected[2], assignments[4]); +} + +#[test] +fn xben_decoder_subsample_by_range() { + let assignments: Vec> = (0u16..6).map(|i| vec![i; 3]).collect(); + let xben = encode_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let selected: Vec> = decoder + .into_subsample_by_range(2, 4) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 3); + assert_eq!(selected[0], assignments[1]); + assert_eq!(selected[2], assignments[3]); +} + +#[test] +fn xben_decoder_subsample_every() { + let assignments: Vec> = (0u16..6).map(|i| vec![i; 2]).collect(); + let xben = encode_xben(&assignments, BenVariant::Standard); + let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + // Every 2nd sample starting from offset 1: samples 1, 3, 5 + let selected: Vec> = decoder + .into_subsample_every(2, 1) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(selected.len(), 3); + assert_eq!(selected[0], assignments[0]); + assert_eq!(selected[1], assignments[2]); + assert_eq!(selected[2], assignments[4]); +} diff --git a/ben/tests/test_impls_pipeline.proptest-regressions b/ben/tests/test_impls_pipeline.proptest-regressions index 1ee2b23..25d1a98 100644 --- a/ben/tests/test_impls_pipeline.proptest-regressions +++ b/ben/tests/test_impls_pipeline.proptest-regressions @@ -5,3 +5,5 @@ # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc adbda176b74e4d9dd494b3996cef6dfd43dce6735177b3b056ed7085d827a7e6 # shrinks to seq = [[1], [1]], params = (1, 0), step = 1, offset = 2 +cc 4093c67f6d4a6b6872ca5c1a89743be6078e4f5deb392d259a69378a49e484af # shrinks to seq = [[1, 2], [1, 1], [1, 1], [1, 1]] +cc f078f2580e251a6f2209658989ac64238bcdd0e8cd44f9b3ec8a14914bd05db3 # shrinks to seq = [[13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], params = (1, 0) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index e2a1ec2..5be0a65 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -4,11 +4,11 @@ use binary_ensemble::codec::decode::{ decode_ben_line, decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, }; use binary_ensemble::codec::encode::{ - encode_ben_to_xben, encode_ben_vec_from_assign, encode_ben_vec_from_rle, encode_jsonl_to_ben, - encode_jsonl_to_xben, xz_compress, + encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; +use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE}; use binary_ensemble::io::reader::{ - build_frame_iter, count_samples_from_file, BenDecoder, DecoderInitError, Frame, + build_frame_iter, count_samples_from_file, BenDecoder, DecodeFrame, DecoderInitError, SubsampleFrameDecoder, XBenDecoder, }; use binary_ensemble::io::writer::BenEncoder; @@ -74,9 +74,9 @@ where Ok(out) } -fn collect_frames(it: I) -> std::io::Result> +fn collect_frames(it: I) -> std::io::Result> where - I: IntoIterator>, + I: IntoIterator>, { let mut out = Vec::new(); for rec in it { @@ -654,7 +654,7 @@ fn benencoder_finish_flushes_once() { let mut ben_vec = Vec::new(); { - let mut enc = BenEncoder::new(&mut ben_vec, BenVariant::MkvChain); + let mut enc = BenEncoder::new(&mut ben_vec, BenVariant::MkvChain).unwrap(); for line in lines.lines() { let v: serde_json::Value = serde_json::from_str(line).unwrap(); enc.write_json_value(v).unwrap(); @@ -778,7 +778,8 @@ fn xben_truncated_frame_reports_unexpected_eof() { fn encode_decode_ben32_odd_bit_packing_roundtrip() { // values up to 3 (2 bits), lengths big to make non-byte boundary let rle = vec![(1u16, 3u16), (2, 5), (3, 7)]; - let ben = encode_ben_vec_from_rle(rle.clone()); + let ben_frame = BenEncodeFrame::from_rle(rle.clone(), None); + let ben = ben_frame.as_slice(); // ben layout: [max_val_bits, max_len_bits, n_bytes, payload...] let max_val_bits = ben[0]; let max_len_bits = ben[1]; @@ -882,8 +883,14 @@ fn ben_encode_xben_respects_existing_ben_header() { encode_jsonl_to_ben(BufReader::new(jsonl.as_bytes()), &mut ben, variant).unwrap(); let mut xz = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xz, Some(1), Some(0), None) - .expect("ben->xben failed"); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .expect("ben->xben failed"); let mut ben_back = Vec::new(); decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut ben_back).unwrap(); @@ -919,7 +926,7 @@ fn xz_mt_params_are_capped_and_safe() { fn ben_encoder_write_assignment_path_roundtrips() { let mut ben = Vec::new(); { - let mut enc = BenEncoder::new(&mut ben, BenVariant::Standard); + let mut enc = BenEncoder::new(&mut ben, BenVariant::Standard).unwrap(); enc.write_assignment(vec![9u16, 9, 2, 2, 2]).unwrap(); enc.finish().unwrap(); } @@ -1001,7 +1008,7 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { let mut payload_only = Vec::new(); { - let mut enc = BenEncoder::new(&mut payload_only, BenVariant::Standard); + let mut enc = BenEncoder::new(&mut payload_only, BenVariant::Standard).unwrap(); enc.write_assignment(vec![5u16, 5, 7]).unwrap(); enc.finish().unwrap(); } @@ -1016,7 +1023,7 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { .encoder() .unwrap(); let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); - let mut xben = binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard); + let mut xben = binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard).unwrap(); xben.write_ben_file(BufReader::new(payload_only.as_slice())) .unwrap(); } @@ -1123,7 +1130,11 @@ fn subsample_frame_decoder_propagates_inner_and_decode_errors() { assert_eq!(err.kind(), std::io::ErrorKind::Other); let mut malformed = SubsampleFrameDecoder::by_indices( - vec![Ok((Frame::XBen(vec![1, 2, 3], BenVariant::Standard), 1))].into_iter(), + vec![Ok(( + DecodeFrame::XBen(vec![1, 2, 3], BenVariant::Standard), + 1, + ))] + .into_iter(), vec![1], ); let err = malformed.next().unwrap().unwrap_err(); @@ -1334,7 +1345,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let mut ben = Vec::new(); { - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); for assignment in &assignments { encoder.write_assignment(assignment.clone()).unwrap(); } @@ -1357,7 +1368,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let frames = BenDecoder::new(ben.as_slice()).unwrap().into_frames(); assert_eq!( - collect_frames(frames.map(|res| res.map(|f| (Frame::Ben(f.clone()), f.count)))) + collect_frames(frames.map(|res| res.map(|f| (DecodeFrame::Ben(f.clone()), f.count as u16)))) .unwrap() .len(), 3 @@ -1371,14 +1382,14 @@ fn twodelta_first_frame_carries_repeat_trailer() { let mut ben = Vec::new(); { - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(second).unwrap(); encoder.finish().unwrap(); } - let expected_first = encode_ben_vec_from_assign(&first); + let expected_first = BenEncodeFrame::from_assignment(&first, None); assert_eq!(&ben[..17], b"TWODELTA BEN FILE"); assert_eq!( &ben[17..17 + expected_first.as_slice().len()], @@ -1394,7 +1405,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { #[test] fn twodelta_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); let err = encoder.write_assignment(vec![1u16, 3, 2, 4]).err().unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -1403,7 +1414,7 @@ fn twodelta_rejects_non_pair_transition() { #[test] fn twodelta_write_json_value_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta); + let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) .unwrap(); From a911ff6b76c9c812a1e09b953125cac205089e1f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 08:28:58 -0600 Subject: [PATCH 043/221] More org --- ben/src/codec/encode/twodelta.rs | 77 +++--- ben/src/io/writer/ben.rs | 459 ++----------------------------- ben/src/io/writer/frames.rs | 6 - ben/src/io/writer/mod.rs | 4 +- ben/src/io/writer/utils.rs | 2 +- ben/src/io/writer/xben.rs | 441 +++++++++++++++++++++++++++++ 6 files changed, 500 insertions(+), 489 deletions(-) create mode 100644 ben/src/io/writer/xben.rs diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index f73847f..939f094 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -136,8 +136,8 @@ impl TwoDeltaFrame { /// * `previous_assignment` - The full assignment vector from the preceding sample. /// * `new_assignment` - The full assignment vector for the sample being encoded. /// * `delta_pair` - An optional hint asserting which pair of ids is involved in the -/// transition. Must be provided together with `masks`, and the two ids must be distinct. -/// * `masks` - An optional mutable map from assignment id to the sorted list of positions +/// transition. Must be provided together with `previous_masks`, and the two ids must be distinct. +/// * `previous_masks` - An optional mutable map from assignment id to the sorted list of positions /// it occupies in `previous_assignment`. When provided, the map is updated in-place to /// reflect `new_assignment` before returning. /// @@ -160,18 +160,18 @@ impl TwoDeltaFrame { /// Two optional hints can be provided to avoid scanning the full assignment vector: /// /// - `delta_pair`: The caller asserts that exactly this pair of ids is involved in -/// the transition. Must be provided together with `masks`. The pair must have two +/// the transition. Must be provided together with `previous_masks`. The pair must have two /// distinct ids — passing `(x, x)` is an error. /// -/// - `masks`: A mutable map from assignment id to the sorted list of positions it +/// - `previous_masks`: A mutable map from assignment id to the sorted list of positions it /// occupies in `previous_assignment`. When provided, the function reads positions /// directly from the map instead of scanning the assignment vector, and updates -/// the map in-place to reflect `new_assignment` before returning. The masks must +/// the map in-place to reflect `new_assignment` before returning. The previous_masks must /// cover every id that appears in the pair; a missing or empty entry is an error. /// -/// The hints are not independent: `delta_pair` requires `masks`. Providing `masks` +/// The hints are not independent: `delta_pair` requires `previous_masks`. Providing `previous_masks` /// without `delta_pair` is allowed — the function will infer the pair from the first -/// differing position and then use the masks from there. +/// differing position and then use the previous_masks from there. /// /// When no hints are provided the function falls back to a full scan of both /// assignment vectors. @@ -180,7 +180,7 @@ impl TwoDeltaFrame { /// /// Returns an error if: /// - The assignment vectors have different lengths. -/// - `delta_pair` is provided without `masks`. +/// - `delta_pair` is provided without `previous_masks`. /// - `delta_pair` contains two identical ids. /// - A mask entry required by the pair is absent or empty. /// - A position referenced by a mask holds a value outside the pair. @@ -190,7 +190,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, delta_pair: Option<(u16, u16)>, - masks: Option<&mut HashMap>>, + previous_masks: Option<&mut HashMap>>, ) -> Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); @@ -203,7 +203,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( } if delta_pair.is_some() { - if masks.is_none() { + if previous_masks.is_none() { return Err(Error::from(EncodeError::TwoDeltaHintWithoutMasks)); } let pair = delta_pair.unwrap(); @@ -214,7 +214,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( } } - match (delta_pair, masks) { + match (delta_pair, previous_masks) { (Some(pair), Some(masks)) => construct_twodelta_frame_from_pair_and_mask_hints( previous_assignment, new_assignment, @@ -230,7 +230,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( // Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) } -/// Validate that `masks` contains non-empty entries for both ids in `pair` and return +/// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return /// the pair ordered so that `pair.0` occupies a lower index than `pair.1`. /// /// Ordering by first position ensures that the run-length sequence produced during @@ -240,12 +240,12 @@ pub(crate) fn encode_twodelta_frame_with_hint( /// # Arguments /// /// * `pair` - The two assignment ids to validate and order. -/// * `masks` - The position mask map to look up entries in. +/// * `previous_masks` - The position mask map to look up entries in. /// /// # Returns /// /// The pair reordered so that `pair.0` has a smaller first position in the current vector than -/// `pair.1`, or an error if either id is absent from `masks` or has an empty position list. +/// `pair.1`, or an error if either id is absent from `previous_masks` or has an empty position list. fn validate_masks_and_order_pairs_for_twodelta( pair: (u16, u16), masks: &HashMap>, @@ -285,10 +285,10 @@ fn validate_masks_and_order_pairs_for_twodelta( /// This is the fast path used during recombination-aware encoding, where the caller /// already knows which two ids are swapping and has maintained a mask for each id. /// -/// The function merges the two sorted position lists from `masks` to produce the +/// The function merges the two sorted position lists from `previous_masks` to produce the /// interleaved sequence of positions, validates that every referenced position in /// `previous` and `current` belongs to the pair, computes the run lengths over -/// `current`, and then updates `masks` in-place to reflect the new positions of +/// `current`, and then updates `previous_masks` in-place to reflect the new positions of /// each id in `current`. /// /// # Arguments @@ -296,7 +296,7 @@ fn validate_masks_and_order_pairs_for_twodelta( /// * `previous` - The full assignment vector from the preceding sample. /// * `current` - The full assignment vector for the sample being encoded. /// * `delta_pair` - The pair of ids asserted to be involved in the transition. -/// * `masks` - Mutable position mask map for both ids in the pair. Updated in-place +/// * `previous_masks` - Mutable position mask map for both ids in the pair. Updated in-place /// to reflect `current` before returning. /// /// # Returns @@ -308,26 +308,27 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous: &[u16], current: &[u16], delta_pair: (u16, u16), - masks: &mut HashMap>, + previous_masks: &mut HashMap>, ) -> Result { - let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, masks, current) { - Ok(pair) => pair, - Err(e) => { - return Err(Error::new( - ErrorKind::InvalidData, - format!( - "Encountered when validating masks and ordering pairs in \ + let pair = + match validate_masks_and_order_pairs_for_twodelta(delta_pair, previous_masks, current) { + Ok(pair) => pair, + Err(e) => { + return Err(Error::new( + ErrorKind::InvalidData, + format!( + "Encountered when validating previous_masks and ordering pairs in \ `determine_twodelta_run_from_pair_and_mask_hints`:\n{}", - e - ), - )); - } - }; + e + ), + )); + } + }; - let mask_a = masks + let mask_a = previous_masks .get(&pair.0) .expect("Failed to get mask for pair.0 after validation"); - let mask_b = masks + let mask_b = previous_masks .get(&pair.1) .expect("Failed to get mask for pair.1 after validation"); @@ -399,8 +400,8 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( return Err(Error::from(EncodeError::TwoDeltaIdentical)); } - masks.insert(pair.0, new_mask_a); - masks.insert(pair.1, new_mask_b); + previous_masks.insert(pair.0, new_mask_a); + previous_masks.insert(pair.1, new_mask_b); Ok(TwoDeltaFrame::from_run_lengths(pair, run_lengths)) } @@ -416,7 +417,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( /// /// * `previous` - The full assignment vector from the preceding sample. /// * `current` - The full assignment vector for the sample being encoded. -/// * `masks` - Mutable position mask map covering all ids that may appear in the pair. +/// * `previous_masks` - Mutable position mask map covering all ids that may appear in the pair. /// Updated in-place to reflect `current` before returning. /// /// # Returns @@ -426,7 +427,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( fn construct_twodelta_frame_from_mask_hint( previous: &[u16], current: &[u16], - masks: &mut HashMap>, + previous_masks: &mut HashMap>, ) -> Result { for (&assign0, &assign1) in previous.iter().zip(current.iter()) { if assign0 != assign1 { @@ -434,7 +435,7 @@ fn construct_twodelta_frame_from_mask_hint( previous, current, (assign0, assign1), - masks, + previous_masks, ); } } @@ -512,7 +513,7 @@ fn construct_twodelta_frame_from_scratch( /// /// This is the unhinted entry point. It falls back to a full scan of both /// assignment vectors to discover the pair and compute run lengths. Prefer -/// `encode_twodelta_frame_with_hint` when masks are available, as it avoids +/// `encode_twodelta_frame_with_hint` when previous_masks are available, as it avoids /// the scan entirely. /// /// The transition is valid only when all changed positions involve exactly two diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index b91f280..1b7c716 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -1,22 +1,12 @@ -use super::frames::{AssignmentHints, BufferedBenFrame, BufferedDeltaFrame}; -use super::twodelta::{ - DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, -}; -use super::utils::{ - analyze_twodelta_transition, encode_xben_twodelta_full_frame, is_repeated_assignment, - parse_json_assignment, -}; -use crate::codec::decode::decode_ben_line; -use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; -use crate::codec::translate::ben_to_ben32_lines; -use crate::codec::{BenEncodeFrame, FromAssign, TwoDeltaFrame}; -use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::{progress, BenVariant}; -use byteorder::{BigEndian, ReadBytesExt}; +use super::frames::BufferedBenFrame; +use super::utils::{analyze_twodelta_transition, is_repeated_assignment, parse_json_assignment}; +use crate::codec::encode::encode_twodelta_frame_with_hint; +use crate::codec::{BenEncodeFrame, FromAssign}; +use crate::format::banners::banner_for_variant; +use crate::BenVariant; use serde_json::Value; use std::collections::HashMap; -use std::io::{self, BufRead, Read, Result, Write}; -use xz2::write::XzEncoder; +use std::io::{self, Result, Write}; /// A struct to make the writing of BEN files easier and more ergonomic. pub struct BenEncoder { @@ -29,6 +19,12 @@ pub struct BenEncoder { complete: bool, } +#[derive(Clone, Copy, Debug, Default)] +pub(super) struct AssignmentHints { + pub is_repeated: bool, + pub delta_pair: Option<(u16, u16)>, +} + impl BenEncoder { /// Create a new BEN writer and immediately emit the BEN banner. /// @@ -180,10 +176,9 @@ impl BenEncoder { return Ok(()); } - let encoded = self - .previous_encoded_sample - .as_ref() - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing previous BEN frame"))?; + let encoded = self.previous_encoded_sample.as_ref().ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "missing previous BEN frame") + })?; self.writer.write_all(encoded.as_slice())?; if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { @@ -286,425 +281,3 @@ impl Drop for BenEncoder { let _ = self.finish(); } } - -/// A struct to make the writing of XBEN files easier and more ergonomic. -pub struct XBenEncoder { - encoder: XzEncoder, - previous_assignment: Vec, - previous_masks: HashMap>, - previous_frame: Vec, - count: u16, - variant: BenVariant, - chunk_size: usize, - chunk_buffer: Vec, -} - -impl XBenEncoder { - /// Rebuild the value-to-position index map from the current previous assignment. - fn rebuild_previous_masks(&mut self) { - self.previous_masks.clear(); - for (idx, &assignment) in self.previous_assignment.iter().enumerate() { - self.previous_masks.entry(assignment).or_default().push(idx); - } - } - - /// Store a new previous assignment along with its encoded frame and repetition count. - /// - /// # Arguments - /// - /// * `assignment` - The assignment vector to cache. - /// * `frame` - The already-encoded frame bytes for this assignment. - /// * `count` - The initial repetition count for this assignment. - fn set_previous_assignment(&mut self, assignment: Vec, frame: Vec, count: u16) { - self.previous_assignment = assignment; - self.rebuild_previous_masks(); - self.previous_frame = frame; - self.count = count; - } - - /// Update the value-to-position masks incrementally for a two-delta transition. - /// - /// Instead of rebuilding the entire mask HashMap, only the positions belonging - /// to the two swapped values are repartitioned. This is O(pair_positions) - /// rather than O(assignment_length). - /// - /// # Arguments - /// - /// * `new_sample` - The new assignment vector after the transition. - /// * `pair` - The two values involved in the delta swap. - #[allow(dead_code)] - fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { - if pair.0 == pair.1 { - return; - } - - let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); - let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); - - let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); - let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); - - let (mut i, mut j) = (0, 0); - while i < pos_a.len() || j < pos_b.len() { - let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { - let p = pos_a[i]; - i += 1; - p - } else { - let p = pos_b[j]; - j += 1; - p - }; - if new_sample[pos] == pair.0 { - new_a.push(pos); - } else { - new_b.push(pos); - } - } - - if !new_a.is_empty() { - self.previous_masks.insert(pair.0, new_a); - } - if !new_b.is_empty() { - self.previous_masks.insert(pair.1, new_b); - } - } - - /// Flush the buffered frame and its repetition count to the XZ encoder. - /// - /// For MkvChain and TwoDelta variants, the repetition count is appended - /// after the encoded frame. This is a no-op when no samples are pending. - /// - /// # Returns - /// - /// Returns `Ok(())` once the pending frame has been written. - fn flush_pending_frame(&mut self) -> Result<()> { - if self.count == 0 { - return Ok(()); - } - - self.encoder.write_all(&self.previous_frame)?; - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { - self.encoder.write_all(&self.count.to_be_bytes())?; - } - self.count = 0; - Ok(()) - } - - /// Write all buffered delta frames as a single columnar chunk. - /// - /// The chunk layout groups same-type fields together so XZ's dictionary - /// compression can exploit the resulting byte-level regularity: - /// - /// ```text - /// [chunk_tag=2] [n_frames: u32] - /// [pairs channel: (pair_a u16, pair_b u16) × n_frames] - /// [counts channel: count u16 × n_frames] - /// [run-length counts: n_runs u32 × n_frames] - /// [run-length data: u16 × total_runs] - /// ``` - fn flush_chunk(&mut self) -> Result<()> { - if self.chunk_buffer.is_empty() { - return Ok(()); - } - - let n = self.chunk_buffer.len() as u32; - self.encoder.write_all(&[XBEN_TWODELTA_CHUNK_TAG])?; - self.encoder.write_all(&n.to_be_bytes())?; - - // Pairs channel. - for frame in &self.chunk_buffer { - self.encoder.write_all(&frame.pair.0.to_be_bytes())?; - self.encoder.write_all(&frame.pair.1.to_be_bytes())?; - } - - // Counts channel. - for frame in &self.chunk_buffer { - self.encoder.write_all(&frame.count.to_be_bytes())?; - } - - // Run-length counts channel. - for frame in &self.chunk_buffer { - self.encoder - .write_all(&(frame.run_lengths.len() as u32).to_be_bytes())?; - } - - // Run-length data channel. - for frame in &self.chunk_buffer { - for &rl in &frame.run_lengths { - self.encoder.write_all(&rl.to_be_bytes())?; - } - } - - self.chunk_buffer.clear(); - Ok(()) - } - - /// Create a new XBEN writer around an already-configured XZ encoder. - /// - /// # Arguments - /// - /// * `encoder` - The configured XZ encoder that will receive the ben32 - /// payload. - /// * `variant` - The BEN variant to encode inside the compressed stream. - /// - /// # Returns - /// - /// Returns a new XBEN encoder ready to accept assignments or BEN frames. - pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> io::Result { - encoder.write_all(banner_for_variant(variant))?; - Ok(XBenEncoder { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - previous_frame: Vec::new(), - count: 0, - variant, - chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, - chunk_buffer: Vec::new(), - }) - } - - /// Set the number of delta frames per columnar chunk. - /// - /// Only affects TwoDelta variant encoding. Larger chunks give XZ more - /// same-type data to compress together; smaller chunks reduce peak memory. - /// - /// # Arguments - /// - /// * `size` - Number of delta frames per chunk. - /// - /// # Returns - /// - /// Returns `self` for method chaining. - pub fn with_chunk_size(mut self, size: usize) -> Self { - self.chunk_size = size.max(1); - self - } - - /// Encode and write a full assignment vector into the compressed XBEN stream. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - match self.variant { - BenVariant::Standard => { - let encoded = encode_ben32_assignments(&assign_vec)?; - self.encoder.write_all(&encoded)?; - self.previous_assignment = assign_vec; - self.previous_frame = encoded; - Ok(()) - } - BenVariant::MkvChain => { - if is_repeated_assignment(&self.previous_assignment, &assign_vec) { - self.count += 1; - return Ok(()); - } - - self.flush_pending_frame()?; - let encoded = encode_ben32_assignments(&assign_vec)?; - self.set_previous_assignment(assign_vec, encoded, 1); - Ok(()) - } - BenVariant::TwoDelta => { - if self.previous_assignment.is_empty() { - let encoded = encode_xben_twodelta_full_frame(&assign_vec); - self.set_previous_assignment(assign_vec, encoded, 1); - return Ok(()); - } - - let masks = if self.previous_masks.is_empty() { - None - } else { - Some(&self.previous_masks) - }; - let hints = - analyze_twodelta_transition(&self.previous_assignment, &assign_vec, masks); - if hints.is_repeated { - if self.chunk_buffer.is_empty() { - self.count += 1; - } else { - self.chunk_buffer.last_mut().unwrap().count += 1; - } - return Ok(()); - } - - // Flush the initial full frame before the first delta. - if self.chunk_buffer.is_empty() { - self.flush_pending_frame()?; - } - - let encoded_frame: TwoDeltaFrame = match encode_twodelta_frame_with_hint( - &self.previous_assignment, - &assign_vec, - hints.delta_pair, - Some(&mut self.previous_masks), - ) { - Ok(frame) => frame, - Err(e) => { - return Err(e); - } - }; - - self.chunk_buffer.push(BufferedDeltaFrame { - pair: encoded_frame.pair, - run_lengths: encoded_frame.run_length_vector, - count: 1, - }); - - self.previous_assignment = assign_vec; - - if self.chunk_buffer.len() >= self.chunk_size { - self.flush_chunk()?; - } - Ok(()) - } - } - } - - /// Encode and write a JSON assignment record into the compressed XBEN stream. - /// - /// # Arguments - /// - /// * `data` - A JSON object containing an `assignment` array. - /// - /// # Returns - /// - /// Returns `Ok(())` after the record has been validated and encoded. - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - self.write_assignment(parse_json_assignment(data)?) - } - - /// Read BEN frames from `reader` and write them into this XBEN stream. - /// - /// If the source still contains the 17-byte BEN banner, it is consumed and - /// replaced by the banner already written by this encoder. - /// - /// # Arguments - /// - /// * `reader` - The BEN input stream, with or without its banner. - /// - /// # Returns - /// - /// Returns `Ok(())` after the BEN stream has been translated into XBEN. - /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without - /// materializing full assignment vectors. - /// - /// The first frame (standard BEN RLE) is decoded to RLE runs and written as - /// an XBEN full frame. Subsequent delta frames have their bitpacked run - /// lengths unpacked and written as XBEN delta frames with raw u16 runs. - /// This avoids O(N) assignment reconstruction per frame entirely. - /// - /// # Arguments - /// - /// * `reader` - The BEN TwoDelta stream positioned after the banner. - /// - /// # Returns - /// - /// Returns `Ok(())` after the stream has been fully translated. - fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> Result<()> { - // First frame: standard BEN RLE → XBEN full frame. - let max_val_bits = reader.read_u8()?; - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - let first_count = reader.read_u16::()?; - - let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); - encoded.push(XBEN_TWODELTA_FULL_TAG); - encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); - for &(value, len) in &runs { - encoded.extend_from_slice(&value.to_be_bytes()); - encoded.extend_from_slice(&len.to_be_bytes()); - } - self.previous_frame = encoded; - self.count = first_count; - - let mut sample_count = first_count as usize; - progress!("Encoding line: {}\r", sample_count); - - // Delta frames: unpack bitpacked run lengths and buffer into chunks. - loop { - let pair_a = match reader.read_u16::() { - Ok(v) => v, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, - Err(e) => return Err(e), - }; - let pair_b = reader.read_u16::()?; - let delta_max_len_bits = reader.read_u8()?; - let delta_n_bytes = reader.read_u32::()?; - - let mut payload = vec![0u8; delta_n_bytes as usize]; - reader.read_exact(&mut payload)?; - let count = reader.read_u16::()?; - - // Unpack bitpacked run lengths. - let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); - let run_lengths = frame.run_length_vector; - - // Flush the initial full frame before the first delta chunk. - if self.chunk_buffer.is_empty() && self.count > 0 { - self.flush_pending_frame()?; - } - - self.chunk_buffer.push(BufferedDeltaFrame { - pair: frame.pair, - run_lengths, - count, - }); - - if self.chunk_buffer.len() >= self.chunk_size { - self.flush_chunk()?; - } - - sample_count += count as usize; - progress!("Encoding line: {}\r", sample_count); - } - - // Flush remaining partial chunk (Drop will also catch this, but be explicit). - self.flush_chunk()?; - - tracing::trace!(""); - tracing::trace!("Done!"); - Ok(()) - } - - pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { - let peek = reader.fill_buf()?; - let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); - - if has_banner { - if self.variant == BenVariant::TwoDelta { - reader.consume(BANNER_LEN); - return self.translate_ben_twodelta_to_xben(reader); - } - reader.consume(BANNER_LEN); - } - - if self.variant == BenVariant::TwoDelta { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN translation requires a BEN stream with its banner", - )); - } - - ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) - } -} - -impl Drop for XBenEncoder { - /// Flush any buffered XBEN repetition state during drop. - fn drop(&mut self) { - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { - let _ = self.flush_pending_frame(); - } - if !self.chunk_buffer.is_empty() { - let _ = self.flush_chunk(); - } - } -} diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs index 3d32222..1680d6f 100644 --- a/ben/src/io/writer/frames.rs +++ b/ben/src/io/writer/frames.rs @@ -20,9 +20,3 @@ impl BufferedBenFrame { } } } - -#[derive(Clone, Copy, Debug, Default)] -pub(super) struct AssignmentHints { - pub is_repeated: bool, - pub delta_pair: Option<(u16, u16)>, -} diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index a5a426f..9f7116c 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -3,6 +3,8 @@ pub(crate) mod frames; pub(crate) mod tests; pub(crate) mod twodelta; pub(crate) mod utils; +pub mod xben; -pub use ben::{BenEncoder, XBenEncoder}; +pub use ben::BenEncoder; pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; +pub use xben::XBenEncoder; diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index ebd71bf..18c45a6 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -1,4 +1,4 @@ -use super::frames::AssignmentHints; +use super::ben::AssignmentHints; use super::twodelta::XBEN_TWODELTA_FULL_TAG; use crate::util::rle::assign_to_rle; use serde_json::Value; diff --git a/ben/src/io/writer/xben.rs b/ben/src/io/writer/xben.rs new file mode 100644 index 0000000..95efcb7 --- /dev/null +++ b/ben/src/io/writer/xben.rs @@ -0,0 +1,441 @@ +use super::frames::BufferedDeltaFrame; +use super::twodelta::{ + DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, +}; +use super::utils::{ + analyze_twodelta_transition, encode_xben_twodelta_full_frame, is_repeated_assignment, + parse_json_assignment, +}; +use crate::codec::decode::decode_ben_line; +use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; +use crate::codec::translate::ben_to_ben32_lines; +use crate::codec::TwoDeltaFrame; +use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; +use crate::{progress, BenVariant}; +use byteorder::{BigEndian, ReadBytesExt}; +use serde_json::Value; +use std::collections::HashMap; +use std::io::{self, BufRead, Read, Result, Write}; +use xz2::write::XzEncoder; + +/// A struct to make the writing of XBEN files easier and more ergonomic. +pub struct XBenEncoder { + encoder: XzEncoder, + previous_assignment: Vec, + previous_masks: HashMap>, + previous_frame: Vec, + count: u16, + variant: BenVariant, + chunk_size: usize, + chunk_buffer: Vec, +} + +impl XBenEncoder { + /// Rebuild the value-to-position index map from the current previous assignment. + fn rebuild_previous_masks(&mut self) { + self.previous_masks.clear(); + for (idx, &assignment) in self.previous_assignment.iter().enumerate() { + self.previous_masks.entry(assignment).or_default().push(idx); + } + } + + /// Store a new previous assignment along with its encoded frame and repetition count. + /// + /// # Arguments + /// + /// * `assignment` - The assignment vector to cache. + /// * `frame` - The already-encoded frame bytes for this assignment. + /// * `count` - The initial repetition count for this assignment. + fn set_previous_assignment(&mut self, assignment: Vec, frame: Vec, count: u16) { + self.previous_assignment = assignment; + self.rebuild_previous_masks(); + self.previous_frame = frame; + self.count = count; + } + + /// Update the value-to-position masks incrementally for a two-delta transition. + /// + /// Instead of rebuilding the entire mask HashMap, only the positions belonging + /// to the two swapped values are repartitioned. This is O(pair_positions) + /// rather than O(assignment_length). + /// + /// # Arguments + /// + /// * `new_sample` - The new assignment vector after the transition. + /// * `pair` - The two values involved in the delta swap. + #[allow(dead_code)] + fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { + if pair.0 == pair.1 { + return; + } + + let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); + let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); + + let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); + let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); + + let (mut i, mut j) = (0, 0); + while i < pos_a.len() || j < pos_b.len() { + let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { + let p = pos_a[i]; + i += 1; + p + } else { + let p = pos_b[j]; + j += 1; + p + }; + if new_sample[pos] == pair.0 { + new_a.push(pos); + } else { + new_b.push(pos); + } + } + + if !new_a.is_empty() { + self.previous_masks.insert(pair.0, new_a); + } + if !new_b.is_empty() { + self.previous_masks.insert(pair.1, new_b); + } + } + + /// Flush the buffered frame and its repetition count to the XZ encoder. + /// + /// For MkvChain and TwoDelta variants, the repetition count is appended + /// after the encoded frame. This is a no-op when no samples are pending. + /// + /// # Returns + /// + /// Returns `Ok(())` once the pending frame has been written. + fn flush_pending_frame(&mut self) -> Result<()> { + if self.count == 0 { + return Ok(()); + } + + self.encoder.write_all(&self.previous_frame)?; + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { + self.encoder.write_all(&self.count.to_be_bytes())?; + } + self.count = 0; + Ok(()) + } + + /// Write all buffered delta frames as a single columnar chunk. + /// + /// The chunk layout groups same-type fields together so XZ's dictionary + /// compression can exploit the resulting byte-level regularity: + /// + /// ```text + /// [chunk_tag=2] [n_frames: u32] + /// [pairs channel: (pair_a u16, pair_b u16) × n_frames] + /// [counts channel: count u16 × n_frames] + /// [run-length counts: n_runs u32 × n_frames] + /// [run-length data: u16 × total_runs] + /// ``` + fn flush_chunk(&mut self) -> Result<()> { + if self.chunk_buffer.is_empty() { + return Ok(()); + } + + let n = self.chunk_buffer.len() as u32; + self.encoder.write_all(&[XBEN_TWODELTA_CHUNK_TAG])?; + self.encoder.write_all(&n.to_be_bytes())?; + + // Pairs channel. + for frame in &self.chunk_buffer { + self.encoder.write_all(&frame.pair.0.to_be_bytes())?; + self.encoder.write_all(&frame.pair.1.to_be_bytes())?; + } + + // Counts channel. + for frame in &self.chunk_buffer { + self.encoder.write_all(&frame.count.to_be_bytes())?; + } + + // Run-length counts channel. + for frame in &self.chunk_buffer { + self.encoder + .write_all(&(frame.run_lengths.len() as u32).to_be_bytes())?; + } + + // Run-length data channel. + for frame in &self.chunk_buffer { + for &rl in &frame.run_lengths { + self.encoder.write_all(&rl.to_be_bytes())?; + } + } + + self.chunk_buffer.clear(); + Ok(()) + } + + /// Create a new XBEN writer around an already-configured XZ encoder. + /// + /// # Arguments + /// + /// * `encoder` - The configured XZ encoder that will receive the ben32 + /// payload. + /// * `variant` - The BEN variant to encode inside the compressed stream. + /// + /// # Returns + /// + /// Returns a new XBEN encoder ready to accept assignments or BEN frames. + pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> io::Result { + encoder.write_all(banner_for_variant(variant))?; + Ok(XBenEncoder { + encoder, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + previous_frame: Vec::new(), + count: 0, + variant, + chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, + chunk_buffer: Vec::new(), + }) + } + + /// Set the number of delta frames per columnar chunk. + /// + /// Only affects TwoDelta variant encoding. Larger chunks give XZ more + /// same-type data to compress together; smaller chunks reduce peak memory. + /// + /// # Arguments + /// + /// * `size` - Number of delta frames per chunk. + /// + /// # Returns + /// + /// Returns `self` for method chaining. + pub fn with_chunk_size(mut self, size: usize) -> Self { + self.chunk_size = size.max(1); + self + } + + /// Encode and write a full assignment vector into the compressed XBEN stream. + /// + /// # Arguments + /// + /// * `assign_vec` - The full assignment vector to encode. + /// + /// # Returns + /// + /// Returns `Ok(())` after the assignment has been queued or written. + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { + match self.variant { + BenVariant::Standard => { + let encoded = encode_ben32_assignments(&assign_vec)?; + self.encoder.write_all(&encoded)?; + self.previous_assignment = assign_vec; + self.previous_frame = encoded; + Ok(()) + } + BenVariant::MkvChain => { + if is_repeated_assignment(&self.previous_assignment, &assign_vec) { + self.count += 1; + return Ok(()); + } + + self.flush_pending_frame()?; + let encoded = encode_ben32_assignments(&assign_vec)?; + self.set_previous_assignment(assign_vec, encoded, 1); + Ok(()) + } + BenVariant::TwoDelta => { + if self.previous_assignment.is_empty() { + let encoded = encode_xben_twodelta_full_frame(&assign_vec); + self.set_previous_assignment(assign_vec, encoded, 1); + return Ok(()); + } + + let masks = if self.previous_masks.is_empty() { + None + } else { + Some(&self.previous_masks) + }; + let hints = + analyze_twodelta_transition(&self.previous_assignment, &assign_vec, masks); + if hints.is_repeated { + if self.chunk_buffer.is_empty() { + self.count += 1; + } else { + self.chunk_buffer.last_mut().unwrap().count += 1; + } + return Ok(()); + } + + // Flush the initial full frame before the first delta. + if self.chunk_buffer.is_empty() { + self.flush_pending_frame()?; + } + + let encoded_frame: TwoDeltaFrame = match encode_twodelta_frame_with_hint( + &self.previous_assignment, + &assign_vec, + hints.delta_pair, + Some(&mut self.previous_masks), + ) { + Ok(frame) => frame, + Err(e) => { + return Err(e); + } + }; + + self.chunk_buffer.push(BufferedDeltaFrame { + pair: encoded_frame.pair, + run_lengths: encoded_frame.run_length_vector, + count: 1, + }); + + self.previous_assignment = assign_vec; + + if self.chunk_buffer.len() >= self.chunk_size { + self.flush_chunk()?; + } + Ok(()) + } + } + } + + /// Encode and write a JSON assignment record into the compressed XBEN stream. + /// + /// # Arguments + /// + /// * `data` - A JSON object containing an `assignment` array. + /// + /// # Returns + /// + /// Returns `Ok(())` after the record has been validated and encoded. + pub fn write_json_value(&mut self, data: Value) -> Result<()> { + self.write_assignment(parse_json_assignment(data)?) + } + + /// Read BEN frames from `reader` and write them into this XBEN stream. + /// + /// If the source still contains the 17-byte BEN banner, it is consumed and + /// replaced by the banner already written by this encoder. + /// + /// # Arguments + /// + /// * `reader` - The BEN input stream, with or without its banner. + /// + /// # Returns + /// + /// Returns `Ok(())` after the BEN stream has been translated into XBEN. + /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without + /// materializing full assignment vectors. + /// + /// The first frame (standard BEN RLE) is decoded to RLE runs and written as + /// an XBEN full frame. Subsequent delta frames have their bitpacked run + /// lengths unpacked and written as XBEN delta frames with raw u16 runs. + /// This avoids O(N) assignment reconstruction per frame entirely. + /// + /// # Arguments + /// + /// * `reader` - The BEN TwoDelta stream positioned after the banner. + /// + /// # Returns + /// + /// Returns `Ok(())` after the stream has been fully translated. + fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> Result<()> { + // First frame: standard BEN RLE → XBEN full frame. + let max_val_bits = reader.read_u8()?; + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + let first_count = reader.read_u16::()?; + + let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); + encoded.push(XBEN_TWODELTA_FULL_TAG); + encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for &(value, len) in &runs { + encoded.extend_from_slice(&value.to_be_bytes()); + encoded.extend_from_slice(&len.to_be_bytes()); + } + self.previous_frame = encoded; + self.count = first_count; + + let mut sample_count = first_count as usize; + progress!("Encoding line: {}\r", sample_count); + + // Delta frames: unpack bitpacked run lengths and buffer into chunks. + loop { + let pair_a = match reader.read_u16::() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + }; + let pair_b = reader.read_u16::()?; + let delta_max_len_bits = reader.read_u8()?; + let delta_n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; delta_n_bytes as usize]; + reader.read_exact(&mut payload)?; + let count = reader.read_u16::()?; + + // Unpack bitpacked run lengths. + let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); + let run_lengths = frame.run_length_vector; + + // Flush the initial full frame before the first delta chunk. + if self.chunk_buffer.is_empty() && self.count > 0 { + self.flush_pending_frame()?; + } + + self.chunk_buffer.push(BufferedDeltaFrame { + pair: frame.pair, + run_lengths, + count, + }); + + if self.chunk_buffer.len() >= self.chunk_size { + self.flush_chunk()?; + } + + sample_count += count as usize; + progress!("Encoding line: {}\r", sample_count); + } + + // Flush remaining partial chunk (Drop will also catch this, but be explicit). + self.flush_chunk()?; + + tracing::trace!(""); + tracing::trace!("Done!"); + Ok(()) + } + + pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { + let peek = reader.fill_buf()?; + let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); + + if has_banner { + if self.variant == BenVariant::TwoDelta { + reader.consume(BANNER_LEN); + return self.translate_ben_twodelta_to_xben(reader); + } + reader.consume(BANNER_LEN); + } + + if self.variant == BenVariant::TwoDelta { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN translation requires a BEN stream with its banner", + )); + } + + ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) + } +} + +impl Drop for XBenEncoder { + /// Flush any buffered XBEN repetition state during drop. + fn drop(&mut self) { + if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { + let _ = self.flush_pending_frame(); + } + if !self.chunk_buffer.is_empty() { + let _ = self.flush_chunk(); + } + } +} From 1199d505b06696c2fb54e8ab5d3c05eeffc8948c Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 08:36:36 -0600 Subject: [PATCH 044/221] Modify responsibilities of the encoder --- ben/src/io/writer/ben.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index 1b7c716..970e5e0 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -98,7 +98,6 @@ impl BenEncoder { ) -> Result<()> { match self.variant { BenVariant::Standard => { - let repeated = is_repeated_assignment(&self.previous_sample, &assign_vec); if hints.is_repeated { if let Some(encoded) = self.previous_encoded_sample.as_ref() { self.writer.write_all(encoded.as_slice())?; @@ -107,21 +106,13 @@ impl BenEncoder { } } - if repeated { - if let Some(encoded) = self.previous_encoded_sample.as_ref() { - self.writer.write_all(encoded.as_slice())?; - self.previous_sample = assign_vec; - return Ok(()); - } - } - let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); self.writer.write_all(encoded.as_slice())?; self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 0); Ok(()) } BenVariant::MkvChain => { - if is_repeated_assignment(&self.previous_sample, &assign_vec) { + if hints.is_repeated { self.sample_count += 1; return Ok(()); } @@ -235,7 +226,10 @@ impl BenEncoder { }; analyze_twodelta_transition(&self.previous_sample, &assign_vec, masks) } else { - AssignmentHints::default() + AssignmentHints { + is_repeated: is_repeated_assignment(&self.previous_sample, &assign_vec), + delta_pair: None, + } }; self.write_assignment_with_hints(assign_vec, hints) } From d352036ed4c57fcfa5568c5747c94f29dd19ad5d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 08:56:44 -0600 Subject: [PATCH 045/221] Remove redundant code in io/writer/ben --- ben/src/io/writer/ben.rs | 42 +++----------------------------------- ben/src/ops/relabel/mod.rs | 6 +++--- ben/tests/test_coverage.rs | 34 ------------------------------ 3 files changed, 6 insertions(+), 76 deletions(-) diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index 970e5e0..993b47f 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -50,14 +50,6 @@ impl BenEncoder { }) } - /// Rebuild the value-to-position index map from the current previous sample. - fn rebuild_previous_masks(&mut self) { - self.previous_masks.clear(); - for (idx, &assignment) in self.previous_sample.iter().enumerate() { - self.previous_masks.entry(assignment).or_default().push(idx); - } - } - /// Store a new previous sample along with its encoded frame and repetition count. /// /// # Arguments @@ -72,7 +64,6 @@ impl BenEncoder { sample_count: u16, ) { self.previous_sample = sample; - self.rebuild_previous_masks(); self.previous_encoded_sample = Some(encoded); self.sample_count = sample_count; } @@ -128,6 +119,9 @@ impl BenEncoder { BenVariant::TwoDelta => { if self.previous_sample.is_empty() { let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); + for (idx, &val) in assign_vec.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); return Ok(()); } @@ -146,7 +140,6 @@ impl BenEncoder { self.flush_pending_frame()?; self.previous_sample = assign_vec; - self.rebuild_previous_masks(); self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); self.sample_count = 1; Ok(()) @@ -179,35 +172,6 @@ impl BenEncoder { Ok(()) } - /// Record additional repetitions of the most recently written assignment. - /// - /// For MkvChain and TwoDelta variants the repetition count is incremented - /// directly. For Standard, the cached encoded frame is re-emitted once per - /// additional repeat. - /// - /// # Arguments - /// - /// * `additional` - The number of extra copies beyond the one already written. - /// - /// # Returns - /// - /// Returns `Ok(())` after all additional repeats have been recorded. - pub fn repeat_previous(&mut self, additional: u16) -> Result<()> { - match self.variant { - BenVariant::Standard => { - if let Some(encoded) = self.previous_encoded_sample.as_ref() { - for _ in 0..additional { - self.writer.write_all(encoded.as_slice())?; - } - } - } - BenVariant::MkvChain | BenVariant::TwoDelta => { - self.sample_count += additional; - } - } - Ok(()) - } - /// Encode and write a full assignment vector. /// /// # Arguments diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index b9ad465..e39ccf2 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -139,10 +139,10 @@ where .map(|limit| (limit - sample_number).min(count as usize)) .unwrap_or(count as usize); - encoder.write_assignment(relabeled)?; - if out_count > 1 { - encoder.repeat_previous((out_count - 1) as u16)?; + for _ in 1..out_count { + encoder.write_assignment(relabeled.clone())?; } + encoder.write_assignment(relabeled)?; sample_number += out_count; progress!("Relabelling line: {}\r", sample_number); diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 38e0f9b..f34deb2 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -577,40 +577,6 @@ fn ben_encoder_standard_single_assignment_round_trip() { assert!(decoded_str.contains("\"assignment\":[1,2,3,3,2,1]")); } -#[test] -fn ben_encoder_standard_repeat_previous_writes_frames() { - let assignment = vec![5u16, 5, 5]; - let mut out = Vec::new(); - { - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); - enc.write_assignment(assignment.clone()).unwrap(); - enc.repeat_previous(2).unwrap(); // 2 extra copies → 3 total - enc.finish().unwrap(); - } - - let mut decoded = Vec::new(); - decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); - let decoded_str = String::from_utf8(decoded).unwrap(); - // Three lines expected - assert_eq!(decoded_str.lines().count(), 3, "decoded:\n{decoded_str}"); -} - -#[test] -fn ben_encoder_mkv_repeat_previous_increments_count() { - let assignment = vec![9u16, 8, 7]; - let mut out = Vec::new(); - { - let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); - enc.write_assignment(assignment.clone()).unwrap(); - enc.repeat_previous(4).unwrap(); // 4 extra → count = 5 - enc.finish().unwrap(); - } - - let mut decoded = Vec::new(); - decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); - assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 5); -} - #[test] fn ben_encoder_finish_is_idempotent() { let mut out = Vec::new(); From 0f009b24ff7e5fde7235f01b70057097f708320b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:18:46 -0600 Subject: [PATCH 046/221] Reorg (I will figure out the design I like eventually) --- ben/src/codec/encode/ben.rs | 121 --------------- ben/src/codec/encode/mod.rs | 2 - ben/src/codec/encode/tests.rs | 2 +- ben/src/codec/encode/traits.rs | 7 - ben/src/codec/encode/twodelta.rs | 151 ++---------------- ben/src/codec/frames.rs | 241 ----------------------------- ben/src/codec/frames/ben_decode.rs | 56 +++++++ ben/src/codec/frames/ben_encode.rs | 97 ++++++++++++ ben/src/codec/frames/mkv_encode.rs | 108 +++++++++++++ ben/src/codec/frames/mod.rs | 62 ++++++++ ben/src/codec/frames/twodelta.rs | 179 +++++++++++++++++++++ ben/src/codec/mod.rs | 3 +- ben/src/io/reader/ben.rs | 9 +- ben/src/io/writer/frames.rs | 4 +- ben/src/io/writer/xben.rs | 6 +- ben/tests/test_coverage.rs | 20 +-- 16 files changed, 536 insertions(+), 532 deletions(-) delete mode 100644 ben/src/codec/encode/traits.rs delete mode 100644 ben/src/codec/frames.rs create mode 100644 ben/src/codec/frames/ben_decode.rs create mode 100644 ben/src/codec/frames/ben_encode.rs create mode 100644 ben/src/codec/frames/mkv_encode.rs create mode 100644 ben/src/codec/frames/mod.rs create mode 100644 ben/src/codec/frames/twodelta.rs diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 4289efb..14f37f9 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,6 +1,3 @@ -use crate::codec::encode::traits::{FromAssign, FromRLE}; -use crate::codec::frames::{BenEncodeFrame, MkvBenEncodeFrame}; -use crate::util::rle::assign_to_rle; use serde_json::Value; use std::io::{Error, ErrorKind, Result}; @@ -98,121 +95,3 @@ pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result< ret.extend([0, 0, 0, 0]); Ok(ret) } - -/// Compresses a Run-length encoded vector into a BEN-bytes vector. -fn compress_rle_to_bytes( - max_val_bit_count: u8, - max_len_bit_count: u8, - n_bytes: u32, - runs: &Vec<(u16, u16)>, -) -> Vec { - let mut bytes = Vec::with_capacity(6 + n_bytes as usize); - bytes.push(max_val_bit_count); - bytes.push(max_len_bit_count); - bytes.extend_from_slice(&n_bytes.to_be_bytes()); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for &(val, len) in runs { - let mut packed = (remainder << max_val_bit_count) | (val as u32); - let mut bits_left = remainder_bits + max_val_bit_count; - - while bits_left >= 8 { - bits_left -= 8; - bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - packed = (packed << max_len_bit_count) | (len as u32); - bits_left += max_len_bit_count; - - while bits_left >= 8 { - bits_left -= 8; - bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - remainder = packed; - remainder_bits = bits_left; - } - - if remainder_bits > 0 { - bytes.push((remainder << (8 - remainder_bits)) as u8); - } - - bytes -} - -impl FromRLE for BenEncodeFrame { - /// Build a frame from an RLE run vector. - fn from_rle(runs: Vec<(u16, u16)>, _count: Option) -> Self { - let (max_val, max_len) = runs - .iter() - .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { - (max_val.max(val), max_len.max(len)) - }); - let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - let raw_bytes = compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); - - Self { - runs, - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - } - } -} - -impl FromAssign for BenEncodeFrame { - /// Build a frame from a full assignment vector. - fn from_assignment(assignments: impl AsRef<[u16]>, _count: Option) -> Self { - Self::from_rle(assign_to_rle(assignments), _count) - } -} - -impl FromRLE for MkvBenEncodeFrame { - /// Build a frame from an RLE run vector. - fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self { - let count = match count { - Some(v) => v, - None => 1, - }; - - let (max_val, max_len) = runs - .iter() - .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { - (max_val.max(val), max_len.max(len)) - }); - let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - let mut raw_bytes = - compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); - - raw_bytes.extend(count.to_be_bytes()); - - Self { - runs, - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - count, - } - } -} - -impl FromAssign for MkvBenEncodeFrame { - /// Build a frame from a full assignment vector. - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { - Self::from_rle(assign_to_rle(assignments), count) - } -} diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 67aab9c..7110fc3 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -3,13 +3,11 @@ mod ben; mod errors; mod jsonl; -mod traits; mod twodelta; mod xz; pub(crate) use ben::encode_ben32_assignments; pub use errors::EncodeError; -pub use traits::{FromAssign, FromRLE}; pub(crate) use twodelta::encode_twodelta_frame_with_hint; pub use twodelta::encode_twodelta_frame; diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 6e6d5fb..a4d83aa 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::codec::frames::BenEncodeFrame; +use crate::codec::frames::{BenEncodeFrame, FromAssign, FromRLE}; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; diff --git a/ben/src/codec/encode/traits.rs b/ben/src/codec/encode/traits.rs deleted file mode 100644 index 0380056..0000000 --- a/ben/src/codec/encode/traits.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub trait FromRLE { - fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; -} - -pub trait FromAssign { - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self; -} diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 939f094..1c9e755 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -1,133 +1,8 @@ use super::errors::EncodeError; -use crate::codec::frames::TwoDeltaFrame; +use crate::codec::frames::TwoDeltaEncodeFrame; use std::collections::HashMap; use std::io::{Error, ErrorKind, Result}; -impl TwoDeltaFrame { - /// Build a TwoDelta frame by packing a run-length vector into the binary format. - /// - /// Run lengths are packed at `max_len_bit_count` bits per value (the minimum - /// bit width needed to represent the largest run length), MSB-first with no - /// padding between values. If the total bit count is not a multiple of 8, the - /// final byte is zero-padded on the right. - /// - /// The serialized layout is: - /// ```text - /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...] - /// ``` - /// where the payload is the bit-packed run lengths. - /// - /// # Arguments - /// - /// * `pair` - The ordered pair of assignment ids. `pair.0` corresponds to the first run. - /// * `run_length_vector` - The lengths of alternating runs of `pair.0` and `pair.1` - /// over the positions occupied by the pair, in position order. - /// - /// # Returns - /// - /// A fully serialized `TwoDeltaFrame` with both the packed `raw_bytes` and the - /// original `run_length_vector` stored on the struct. - pub fn from_run_lengths(pair: (u16, u16), run_length_vector: Vec) -> Self { - let max_len = run_length_vector.iter().copied().max().unwrap_or(0); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - - let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - - // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) - let mut raw_bytes = Vec::with_capacity((n_bytes + 9) as usize); - raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); - raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); - raw_bytes.push(max_len_bit_count); - raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for &item in &run_length_vector { - let mut packed = (remainder << max_len_bit_count) | item as u32; - let mut bits_left = remainder_bits + max_len_bit_count; - - while bits_left >= 8 { - bits_left -= 8; - raw_bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - remainder = packed; - remainder_bits = bits_left; - } - - if remainder_bits > 0 { - raw_bytes.push((remainder << (8 - remainder_bits)) as u8); - } - - Self { - pair, - max_len_bit_count, - n_bytes, - run_length_vector, - raw_bytes, - } - } - - /// Reconstruct a TwoDelta frame from already-parsed header fields and a raw payload. - /// - /// This is the inverse of `from_run_lengths`: it re-assembles the serialized bytes - /// and decodes the bit-packed payload back into the run-length vector so that both - /// representations are available on the resulting frame. - /// - /// The decoding reads `max_len_bit_count` bits at a time from the payload, MSB-first, - /// and discards any trailing zero-valued items produced by right-padding in the final byte. - /// - /// # Arguments - /// - /// * `pair` - The ordered pair of assignment ids as read from the frame header. - /// * `max_len_bit_count` - The bit width of each packed run length, as read from the - /// frame header. - /// * `payload` - The raw packed payload bytes, not including the 9-byte header. - /// - /// # Returns - /// - /// A `TwoDeltaFrame` with both `raw_bytes` (header + payload) and the decoded - /// `run_length_vector` populated. - pub fn from_parts(pair: (u16, u16), max_len_bit_count: u8, payload: Vec) -> Self { - let n_bytes = payload.len() as u32; - let mut raw_bytes = Vec::with_capacity(9 + payload.len()); - raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); - raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); - raw_bytes.push(max_len_bit_count); - raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); - raw_bytes.extend_from_slice(&payload); - - let mut run_length_vector = Vec::new(); - let mut buffer: u32 = 0; - let mut n_bits_in_buff: u16 = 0; - - for byte in payload { - buffer |= (byte as u32).to_be() >> n_bits_in_buff; - n_bits_in_buff += 8; - - while n_bits_in_buff >= max_len_bit_count as u16 { - let item = (buffer >> (32 - max_len_bit_count)) as u16; - buffer <<= max_len_bit_count; - n_bits_in_buff -= max_len_bit_count as u16; - if item > 0 { - run_length_vector.push(item); - } - } - } - - Self { - pair, - max_len_bit_count, - n_bytes, - run_length_vector, - raw_bytes, - } - } -} - /// Encode a transition between two assignment vectors as a TwoDelta frame, optionally /// using caller-supplied hints to accelerate encoding. /// @@ -143,7 +18,7 @@ impl TwoDeltaFrame { /// /// # Returns /// -/// A `TwoDeltaFrame` describing the transition from `previous_assignment` to +/// A `TwoDeltaEncodeFrame` describing the transition from `previous_assignment` to /// `new_assignment`. /// /// # TwoDelta encoding @@ -191,7 +66,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( new_assignment: impl AsRef<[u16]>, delta_pair: Option<(u16, u16)>, previous_masks: Option<&mut HashMap>>, -) -> Result { +) -> Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); @@ -227,7 +102,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment), } - // Ok(TwoDeltaFrame::from_run_lengths(ordered_pair, run_lengths)) + // Ok(TwoDeltaEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) } /// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return @@ -301,7 +176,7 @@ fn validate_masks_and_order_pairs_for_twodelta( /// /// # Returns /// -/// A `TwoDeltaFrame` for the transition, or `BenEncodeError::RepeatedSample` if no +/// A `TwoDeltaEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if no /// position actually changed value (signalling the frame can be deduplicated), or /// another error if a mask entry is inconsistent with the assignment data. fn construct_twodelta_frame_from_pair_and_mask_hints( @@ -309,7 +184,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( current: &[u16], delta_pair: (u16, u16), previous_masks: &mut HashMap>, -) -> Result { +) -> Result { let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, previous_masks, current) { Ok(pair) => pair, @@ -402,7 +277,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous_masks.insert(pair.0, new_mask_a); previous_masks.insert(pair.1, new_mask_b); - Ok(TwoDeltaFrame::from_run_lengths(pair, run_lengths)) + Ok(TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths)) } /// Build a TwoDelta frame using only pre-computed position masks, inferring the pair @@ -422,13 +297,13 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( /// /// # Returns /// -/// A `TwoDeltaFrame` for the transition, or `BenEncodeError::RepeatedSample` if the +/// A `TwoDeltaEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if the /// two assignments are identical. fn construct_twodelta_frame_from_mask_hint( previous: &[u16], current: &[u16], previous_masks: &mut HashMap>, -) -> Result { +) -> Result { for (&assign0, &assign1) in previous.iter().zip(current.iter()) { if assign0 != assign1 { return construct_twodelta_frame_from_pair_and_mask_hints( @@ -459,12 +334,12 @@ fn construct_twodelta_frame_from_mask_hint( /// /// # Returns /// -/// A `TwoDeltaFrame` for the transition, or an error if more than two distinct ids +/// A `TwoDeltaEncodeFrame` for the transition, or an error if more than two distinct ids /// appear across all changed positions. fn construct_twodelta_frame_from_scratch( previous: &[u16], current: &[u16], -) -> Result { +) -> Result { // Find the pair at the first changed position. let first_change = previous .iter() @@ -506,7 +381,7 @@ fn construct_twodelta_frame_from_scratch( } run_lengths.push(run_count); - Ok(TwoDeltaFrame::from_run_lengths(enc_pair, run_lengths)) + Ok(TwoDeltaEncodeFrame::from_run_lengths(enc_pair, run_lengths)) } /// Encode a transition between two assignment vectors as a TwoDelta frame. @@ -531,6 +406,6 @@ fn construct_twodelta_frame_from_scratch( pub fn encode_twodelta_frame( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, -) -> Result { +) -> Result { encode_twodelta_frame_with_hint(previous_assignment, new_assignment, None, None) } diff --git a/ben/src/codec/frames.rs b/ben/src/codec/frames.rs deleted file mode 100644 index be86690..0000000 --- a/ben/src/codec/frames.rs +++ /dev/null @@ -1,241 +0,0 @@ -/// Canonical representation of a BEN frame. -/// -/// The frame stores the semantic RLE runs together with the derived header -/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN -/// frame, including the two one-byte bit-width fields and the four-byte payload -/// length. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BenEncodeFrame { - // The RLE runs that were encoded into this frame, stored here for reference - pub runs: Vec<(u16, u16)>, - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized BEN frame bytes, including the header and payload. - pub raw_bytes: Vec, -} - -impl BenEncodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for BenEncodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for BenEncodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for BenEncodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &BenEncodeFrame) -> bool { - *self == other.raw_bytes - } -} - -/// Canonical representation of a BEN frame. -/// -/// The frame stores the semantic RLE runs together with the derived header -/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN -/// frame, including the two one-byte bit-width fields and the four-byte payload -/// length. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct MkvBenEncodeFrame { - // The RLE runs that were encoded into this frame, stored here for reference - pub runs: Vec<(u16, u16)>, - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized MKVBEN frame bytes, including the header and payload. - pub raw_bytes: Vec, - // The number of times that this frame was repeated - pub count: u16, -} - -impl MkvBenEncodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for MkvBenEncodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for MkvBenEncodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for MkvBenEncodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &MkvBenEncodeFrame) -> bool { - *self == other.raw_bytes - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BenDecodeFrame { - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized BEN frame bytes, including the header and payload. - pub raw_bytes: Vec, - // The number of times this frame was repeated - pub count: u16, -} - -impl BenDecodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for BenDecodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for BenDecodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for BenDecodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &BenDecodeFrame) -> bool { - *self == other.raw_bytes - } -} - -/// Canonical representation of a TwoDelta frame. -/// -/// A TwoDelta frame stores the two assignment ids that may change relative to -/// the previous sample and then encodes the lengths of alternating runs over -/// just those two ids. The first run always corresponds to `pair.0`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TwoDeltaFrame { - // The pair of assignment ids that are encoded in this frame, stored here for reference. - // Canonically, `pair.0` is the id for the first run in the run-length vector and `pair.1` - // is the id for the second run. - pub pair: (u16, u16), - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The run-length vector that was encoded into this frame, stored here for reference. - pub run_length_vector: Vec, - // The full serialized TwoDelta frame bytes, including the header and payload. - pub raw_bytes: Vec, -} - -impl TwoDeltaFrame { - /// Borrow just the packed payload bytes. - pub fn payload(&self) -> &[u8] { - &self.raw_bytes[9..] - } - - /// Borrow the serialized TwoDelta frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized TwoDelta frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for TwoDeltaFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for TwoDeltaFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} diff --git a/ben/src/codec/frames/ben_decode.rs b/ben/src/codec/frames/ben_decode.rs new file mode 100644 index 0000000..9f9c4d1 --- /dev/null +++ b/ben/src/codec/frames/ben_decode.rs @@ -0,0 +1,56 @@ +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BenDecodeFrame { + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized BEN frame bytes, including the header and payload. + pub raw_bytes: Vec, + // The number of times this frame was repeated + pub count: u16, +} + +impl BenDecodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for BenDecodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenDecodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenDecodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenDecodeFrame) -> bool { + *self == other.raw_bytes + } +} diff --git a/ben/src/codec/frames/ben_encode.rs b/ben/src/codec/frames/ben_encode.rs new file mode 100644 index 0000000..cf290bc --- /dev/null +++ b/ben/src/codec/frames/ben_encode.rs @@ -0,0 +1,97 @@ +use super::{compress_rle_to_bytes, FromAssign, FromRLE}; +use crate::util::rle::assign_to_rle; + +/// Canonical representation of a BEN frame. +/// +/// The frame stores the semantic RLE runs together with the derived header +/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN +/// frame, including the two one-byte bit-width fields and the four-byte payload +/// length. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BenEncodeFrame { + // The RLE runs that were encoded into this frame, stored here for reference + pub runs: Vec<(u16, u16)>, + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized BEN frame bytes, including the header and payload. + pub raw_bytes: Vec, +} + +impl BenEncodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for BenEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenEncodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenEncodeFrame) -> bool { + *self == other.raw_bytes + } +} + +impl FromRLE for BenEncodeFrame { + /// Build a frame from an RLE run vector. + fn from_rle(runs: Vec<(u16, u16)>, _count: Option) -> Self { + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let raw_bytes = compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + + Self { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + } + } +} + +impl FromAssign for BenEncodeFrame { + /// Build a frame from a full assignment vector. + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { + Self::from_rle(assign_to_rle(assignments), count) + } +} diff --git a/ben/src/codec/frames/mkv_encode.rs b/ben/src/codec/frames/mkv_encode.rs new file mode 100644 index 0000000..6c3d53d --- /dev/null +++ b/ben/src/codec/frames/mkv_encode.rs @@ -0,0 +1,108 @@ +use super::{compress_rle_to_bytes, FromAssign, FromRLE}; +use crate::util::rle::assign_to_rle; + +/// Canonical representation of a BEN frame. +/// +/// The frame stores the semantic RLE runs together with the derived header +/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN +/// frame, including the two one-byte bit-width fields and the four-byte payload +/// length. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MkvBenEncodeFrame { + // The RLE runs that were encoded into this frame, stored here for reference + pub runs: Vec<(u16, u16)>, + // The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The full serialized MKVBEN frame bytes, including the header and payload. + pub raw_bytes: Vec, + // The number of times that this frame was repeated + pub count: u16, +} + +impl MkvBenEncodeFrame { + /// Borrow the serialized BEN frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized BEN frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized BEN bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl AsRef<[u8]> for MkvBenEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for MkvBenEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for MkvBenEncodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &MkvBenEncodeFrame) -> bool { + *self == other.raw_bytes + } +} + +impl FromRLE for MkvBenEncodeFrame { + /// Build a frame from an RLE run vector. + fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self { + let count = match count { + Some(v) => v, + None => 1, + }; + + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let mut raw_bytes = + compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + + raw_bytes.extend(count.to_be_bytes()); + + Self { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + } + } +} + +impl FromAssign for MkvBenEncodeFrame { + /// Build a frame from a full assignment vector. + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { + Self::from_rle(assign_to_rle(assignments), count) + } +} diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs new file mode 100644 index 0000000..c8006a2 --- /dev/null +++ b/ben/src/codec/frames/mod.rs @@ -0,0 +1,62 @@ +mod ben_decode; +mod ben_encode; +mod mkv_encode; +mod twodelta; + +pub use ben_decode::BenDecodeFrame; +pub use ben_encode::BenEncodeFrame; +pub use mkv_encode::MkvBenEncodeFrame; +pub use twodelta::TwoDeltaEncodeFrame; + +pub trait FromRLE { + fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; +} + +pub trait FromAssign { + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self; +} + +/// Compresses a run-length encoded vector into BEN payload bytes. +pub(super) fn compress_rle_to_bytes( + max_val_bit_count: u8, + max_len_bit_count: u8, + n_bytes: u32, + runs: &Vec<(u16, u16)>, +) -> Vec { + let mut bytes = Vec::with_capacity(6 + n_bytes as usize); + bytes.push(max_val_bit_count); + bytes.push(max_len_bit_count); + bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &(val, len) in runs { + let mut packed = (remainder << max_val_bit_count) | (val as u32); + let mut bits_left = remainder_bits + max_val_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + packed = (packed << max_len_bit_count) | (len as u32); + bits_left += max_len_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + bytes +} diff --git a/ben/src/codec/frames/twodelta.rs b/ben/src/codec/frames/twodelta.rs new file mode 100644 index 0000000..24bc4a2 --- /dev/null +++ b/ben/src/codec/frames/twodelta.rs @@ -0,0 +1,179 @@ +/// Canonical representation of a TwoDelta frame. +/// +/// A TwoDelta frame stores the two assignment ids that may change relative to +/// the previous sample and then encodes the lengths of alternating runs over +/// just those two ids. The first run always corresponds to `pair.0`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TwoDeltaEncodeFrame { + // The pair of assignment ids that are encoded in this frame, stored here for reference. + // Canonically, `pair.0` is the id for the first run in the run-length vector and `pair.1` + // is the id for the second run. + pub pair: (u16, u16), + // The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + // The number of bytes in the packed payload. + pub n_bytes: u32, + // The run-length vector that was encoded into this frame, stored here for reference. + pub run_length_vector: Vec, + // The full serialized TwoDelta frame bytes, including the header and payload. + pub raw_bytes: Vec, +} + +impl TwoDeltaEncodeFrame { + /// Borrow just the packed payload bytes. + pub fn payload(&self) -> &[u8] { + &self.raw_bytes[9..] + } + + /// Borrow the serialized TwoDelta frame bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the serialized TwoDelta frame bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the serialized bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } + + /// Build a TwoDelta frame by packing a run-length vector into the binary format. + /// + /// Run lengths are packed at `max_len_bit_count` bits per value (the minimum + /// bit width needed to represent the largest run length), MSB-first with no + /// padding between values. If the total bit count is not a multiple of 8, the + /// final byte is zero-padded on the right. + /// + /// The serialized layout is: + /// ```text + /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...] + /// ``` + /// where the payload is the bit-packed run lengths. + /// + /// # Arguments + /// + /// * `pair` - The ordered pair of assignment ids. `pair.0` corresponds to the first run. + /// * `run_length_vector` - The lengths of alternating runs of `pair.0` and `pair.1` + /// over the positions occupied by the pair, in position order. + /// + /// # Returns + /// + /// A fully serialized `TwoDeltaEncodeFrame` with both the packed `raw_bytes` and the + /// original `run_length_vector` stored on the struct. + pub fn from_run_lengths(pair: (u16, u16), run_length_vector: Vec) -> Self { + let max_len = run_length_vector.iter().copied().max().unwrap_or(0); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + + let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + + // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) + let mut raw_bytes = Vec::with_capacity((n_bytes + 9) as usize); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &item in &run_length_vector { + let mut packed = (remainder << max_len_bit_count) | item as u32; + let mut bits_left = remainder_bits + max_len_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + raw_bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + raw_bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + Self { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + } + } + + /// Reconstruct a TwoDelta frame from already-parsed header fields and a raw payload. + /// + /// This is the inverse of `from_run_lengths`: it re-assembles the serialized bytes + /// and decodes the bit-packed payload back into the run-length vector so that both + /// representations are available on the resulting frame. + /// + /// The decoding reads `max_len_bit_count` bits at a time from the payload, MSB-first, + /// and discards any trailing zero-valued items produced by right-padding in the final byte. + /// + /// # Arguments + /// + /// * `pair` - The ordered pair of assignment ids as read from the frame header. + /// * `max_len_bit_count` - The bit width of each packed run length, as read from the + /// frame header. + /// * `payload` - The raw packed payload bytes, not including the 9-byte header. + /// + /// # Returns + /// + /// A `TwoDeltaEncodeFrame` with both `raw_bytes` (header + payload) and the decoded + /// `run_length_vector` populated. + pub fn from_parts(pair: (u16, u16), max_len_bit_count: u8, payload: Vec) -> Self { + let n_bytes = payload.len() as u32; + let mut raw_bytes = Vec::with_capacity(9 + payload.len()); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + raw_bytes.extend_from_slice(&payload); + + let mut run_length_vector = Vec::new(); + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + + for byte in payload { + buffer |= (byte as u32).to_be() >> n_bits_in_buff; + n_bits_in_buff += 8; + + while n_bits_in_buff >= max_len_bit_count as u16 { + let item = (buffer >> (32 - max_len_bit_count)) as u16; + buffer <<= max_len_bit_count; + n_bits_in_buff -= max_len_bit_count as u16; + if item > 0 { + run_length_vector.push(item); + } + } + } + + Self { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + } + } +} + +impl AsRef<[u8]> for TwoDeltaEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for TwoDeltaEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index b37ecb2..e2a5cd6 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -10,5 +10,4 @@ pub mod encode; pub mod frames; pub mod translate; -pub use encode::{FromAssign, FromRLE}; -pub use frames::{BenDecodeFrame, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaFrame}; +pub use frames::{BenDecodeFrame, BenEncodeFrame, FromAssign, FromRLE, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs index ced58db..7254db7 100644 --- a/ben/src/io/reader/ben.rs +++ b/ben/src/io/reader/ben.rs @@ -4,8 +4,7 @@ use super::twodelta::{ }; use crate::codec::decode::{decode_ben32_line, decode_ben_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; -use crate::codec::{BenDecodeFrame, BenEncodeFrame, TwoDeltaFrame}; -use crate::codec::encode::FromAssign; +use crate::codec::{BenDecodeFrame, BenEncodeFrame, FromAssign, TwoDeltaEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::util::rle::rle_to_vec; @@ -37,7 +36,7 @@ pub struct BenDecoder { enum StoredBenFrame { Ben(BenDecodeFrame), - TwoDelta { frame: TwoDeltaFrame, count: u16 }, + TwoDelta { frame: TwoDeltaEncodeFrame, count: u16 }, } impl StoredBenFrame { @@ -231,7 +230,7 @@ impl BenDecoder { }; Some(Ok(StoredBenFrame::TwoDelta { - frame: TwoDeltaFrame::from_parts((pair_a, pair_b), max_len_bits, payload), + frame: TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload), count, })) } @@ -439,7 +438,7 @@ fn apply_twodelta_runs_to_assignment( /// Returns the updated assignment vector. fn decode_twodelta_frame_to_assignment( assignment: Vec, - frame: &TwoDeltaFrame, + frame: &TwoDeltaEncodeFrame, ) -> io::Result> { apply_twodelta_runs_to_assignment(assignment, frame.pair, &frame.run_length_vector) } diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs index 1680d6f..b02feab 100644 --- a/ben/src/io/writer/frames.rs +++ b/ben/src/io/writer/frames.rs @@ -1,4 +1,4 @@ -use crate::codec::{BenEncodeFrame, TwoDeltaFrame}; +use crate::codec::{BenEncodeFrame, TwoDeltaEncodeFrame}; /// A buffered delta frame awaiting chunk serialization. pub(super) struct BufferedDeltaFrame { @@ -9,7 +9,7 @@ pub(super) struct BufferedDeltaFrame { pub(super) enum BufferedBenFrame { Ben(BenEncodeFrame), - TwoDelta(TwoDeltaFrame), + TwoDelta(TwoDeltaEncodeFrame), } impl BufferedBenFrame { diff --git a/ben/src/io/writer/xben.rs b/ben/src/io/writer/xben.rs index 95efcb7..542ac47 100644 --- a/ben/src/io/writer/xben.rs +++ b/ben/src/io/writer/xben.rs @@ -9,7 +9,7 @@ use super::utils::{ use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; -use crate::codec::TwoDeltaFrame; +use crate::codec::TwoDeltaEncodeFrame; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -270,7 +270,7 @@ impl XBenEncoder { self.flush_pending_frame()?; } - let encoded_frame: TwoDeltaFrame = match encode_twodelta_frame_with_hint( + let encoded_frame: TwoDeltaEncodeFrame = match encode_twodelta_frame_with_hint( &self.previous_assignment, &assign_vec, hints.delta_pair, @@ -375,7 +375,7 @@ impl XBenEncoder { let count = reader.read_u16::()?; // Unpack bitpacked run lengths. - let frame = TwoDeltaFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); + let frame = TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); let run_lengths = frame.run_length_vector; // Flush the initial full frame before the first delta chunk. diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index f34deb2..b19df49 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -9,7 +9,7 @@ use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, encode_twodelta_frame, }; -use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE, TwoDeltaFrame}; +use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE, TwoDeltaEncodeFrame}; use binary_ensemble::format::banners::{ banner_for_variant, has_known_banner_prefix, variant_from_banner, BANNER_LEN, MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, @@ -1540,14 +1540,14 @@ fn encode_twodelta_frame_single_value_swap() { } // ────────────────────────────────────────────────────────────────────────────── -// TwoDeltaFrame accessors +// TwoDeltaEncodeFrame accessors // ────────────────────────────────────────────────────────────────────────────── #[test] fn twodelta_frame_pair_accessor() { let pair = (3u16, 7u16); let run_lengths = vec![2u16, 3, 1]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); assert_eq!(frame.pair, pair); } @@ -1556,7 +1556,7 @@ fn twodelta_frame_max_len_bits_accessor() { // max run length = 4 = 0b100 → 3 bits let pair = (1u16, 2u16); let run_lengths = vec![4u16, 4]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); assert_eq!(frame.max_len_bit_count, 3); } @@ -1564,7 +1564,7 @@ fn twodelta_frame_max_len_bits_accessor() { fn twodelta_frame_n_bytes_and_payload_consistent() { let pair = (5u16, 10u16); let run_lengths = vec![1u16, 2, 3]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); assert_eq!(frame.n_bytes as usize, frame.payload().len()); } @@ -1572,7 +1572,7 @@ fn twodelta_frame_n_bytes_and_payload_consistent() { fn twodelta_frame_to_bytes_and_as_slice_same() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); assert_eq!(frame.to_bytes(), frame.as_slice()); } @@ -1580,7 +1580,7 @@ fn twodelta_frame_to_bytes_and_as_slice_same() { fn twodelta_frame_into_bytes_consumes() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); let expected = frame.to_bytes(); let actual = frame.into_bytes(); assert_eq!(actual, expected); @@ -1590,8 +1590,8 @@ fn twodelta_frame_into_bytes_consumes() { fn twodelta_frame_from_parts_round_trip() { let pair = (10u16, 20u16); let run_lengths = vec![2u16, 5, 1]; - let original = TwoDeltaFrame::from_run_lengths(pair, run_lengths); - let reconstructed = TwoDeltaFrame::from_parts( + let original = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let reconstructed = TwoDeltaEncodeFrame::from_parts( pair, original.max_len_bit_count, original.payload().to_vec(), @@ -1606,7 +1606,7 @@ fn twodelta_frame_from_parts_round_trip() { fn twodelta_frame_asref_and_deref() { let pair = (1u16, 2u16); let run_lengths = vec![3u16]; - let frame = TwoDeltaFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); let as_ref: &[u8] = frame.as_ref(); let deref: &[u8] = &*frame; assert_eq!(as_ref, deref); From 6078ab2dbaedc43a0cf6b00f0c9fc28793296c48 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:22:56 -0600 Subject: [PATCH 047/221] Centralized constructor impl for Ben-type frames --- ben/src/codec/encode/tests.rs | 2 +- ben/src/codec/frames/ben_encode.rs | 15 ++++----------- ben/src/codec/frames/mkv_encode.rs | 14 +++----------- ben/src/codec/frames/mod.rs | 15 ++++++++++----- ben/src/codec/mod.rs | 2 +- ben/src/codec/translate/mod.rs | 2 +- ben/src/io/reader/ben.rs | 2 +- ben/src/io/writer/ben.rs | 2 +- ben/src/ops/relabel/mod.rs | 2 +- ben/src/ops/relabel/tests.rs | 2 +- ben/tests/test_coverage.rs | 2 +- ben/tests/test_impls_pipeline.rs | 2 +- 12 files changed, 26 insertions(+), 36 deletions(-) diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index a4d83aa..2129188 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::codec::frames::{BenEncodeFrame, FromAssign, FromRLE}; +use crate::codec::frames::{BenConstruct, BenEncodeFrame}; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; diff --git a/ben/src/codec/frames/ben_encode.rs b/ben/src/codec/frames/ben_encode.rs index cf290bc..20fbf6c 100644 --- a/ben/src/codec/frames/ben_encode.rs +++ b/ben/src/codec/frames/ben_encode.rs @@ -1,5 +1,4 @@ -use super::{compress_rle_to_bytes, FromAssign, FromRLE}; -use crate::util::rle::assign_to_rle; +use super::{compress_rle_to_ben_bytes, BenConstruct}; /// Canonical representation of a BEN frame. /// @@ -64,7 +63,7 @@ impl PartialEq for Vec { } } -impl FromRLE for BenEncodeFrame { +impl BenConstruct for BenEncodeFrame { /// Build a frame from an RLE run vector. fn from_rle(runs: Vec<(u16, u16)>, _count: Option) -> Self { let (max_val, max_len) = runs @@ -77,7 +76,8 @@ impl FromRLE for BenEncodeFrame { let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; let payload_bits = assign_bits * runs.len() as u32; let n_bytes = payload_bits.div_ceil(8); - let raw_bytes = compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + let raw_bytes = + compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); Self { runs, @@ -88,10 +88,3 @@ impl FromRLE for BenEncodeFrame { } } } - -impl FromAssign for BenEncodeFrame { - /// Build a frame from a full assignment vector. - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { - Self::from_rle(assign_to_rle(assignments), count) - } -} diff --git a/ben/src/codec/frames/mkv_encode.rs b/ben/src/codec/frames/mkv_encode.rs index 6c3d53d..311a909 100644 --- a/ben/src/codec/frames/mkv_encode.rs +++ b/ben/src/codec/frames/mkv_encode.rs @@ -1,5 +1,4 @@ -use super::{compress_rle_to_bytes, FromAssign, FromRLE}; -use crate::util::rle::assign_to_rle; +use super::{compress_rle_to_ben_bytes, BenConstruct}; /// Canonical representation of a BEN frame. /// @@ -66,7 +65,7 @@ impl PartialEq for Vec { } } -impl FromRLE for MkvBenEncodeFrame { +impl BenConstruct for MkvBenEncodeFrame { /// Build a frame from an RLE run vector. fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self { let count = match count { @@ -85,7 +84,7 @@ impl FromRLE for MkvBenEncodeFrame { let payload_bits = assign_bits * runs.len() as u32; let n_bytes = payload_bits.div_ceil(8); let mut raw_bytes = - compress_rle_to_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); raw_bytes.extend(count.to_be_bytes()); @@ -99,10 +98,3 @@ impl FromRLE for MkvBenEncodeFrame { } } } - -impl FromAssign for MkvBenEncodeFrame { - /// Build a frame from a full assignment vector. - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self { - Self::from_rle(assign_to_rle(assignments), count) - } -} diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index c8006a2..ce5da0b 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -8,16 +8,21 @@ pub use ben_encode::BenEncodeFrame; pub use mkv_encode::MkvBenEncodeFrame; pub use twodelta::TwoDeltaEncodeFrame; -pub trait FromRLE { +use crate::util::rle::assign_to_rle; + +pub trait BenConstruct { fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; -} -pub trait FromAssign { - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self; + fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self + where + Self: Sized, + { + Self::from_rle(assign_to_rle(assignments), count) + } } /// Compresses a run-length encoded vector into BEN payload bytes. -pub(super) fn compress_rle_to_bytes( +pub(super) fn compress_rle_to_ben_bytes( max_val_bit_count: u8, max_len_bit_count: u8, n_bytes: u32, diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index e2a5cd6..75bf32f 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -10,4 +10,4 @@ pub mod encode; pub mod frames; pub mod translate; -pub use frames::{BenDecodeFrame, BenEncodeFrame, FromAssign, FromRLE, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; +pub use frames::{BenConstruct, BenDecodeFrame, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index c4e277c..c9a54c2 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -3,7 +3,7 @@ mod errors; use errors::TranslateError; -use crate::codec::FromRLE; +use crate::codec::BenConstruct; use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Read, Write}; diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs index 7254db7..0b6d130 100644 --- a/ben/src/io/reader/ben.rs +++ b/ben/src/io/reader/ben.rs @@ -4,7 +4,7 @@ use super::twodelta::{ }; use crate::codec::decode::{decode_ben32_line, decode_ben_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; -use crate::codec::{BenDecodeFrame, BenEncodeFrame, FromAssign, TwoDeltaEncodeFrame}; +use crate::codec::{BenConstruct, BenDecodeFrame, BenEncodeFrame, TwoDeltaEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::util::rle::rle_to_vec; diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index 993b47f..a5477c4 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -1,7 +1,7 @@ use super::frames::BufferedBenFrame; use super::utils::{analyze_twodelta_transition, is_repeated_assignment, parse_json_assignment}; use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::{BenEncodeFrame, FromAssign}; +use crate::codec::{BenConstruct, BenEncodeFrame}; use crate::format::banners::banner_for_variant; use crate::BenVariant; use serde_json::Value; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index e39ccf2..dcaccf1 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -4,7 +4,7 @@ mod errors; use errors::RelabelError; use crate::codec::decode::decode_ben_line; -use crate::codec::{BenEncodeFrame, FromRLE}; +use crate::codec::{BenConstruct, BenEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::BenDecoder; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 442e005..6ddd7ea 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -1,7 +1,7 @@ use super::*; use crate::codec::decode::decode_ben_to_jsonl; use crate::codec::encode::encode_jsonl_to_ben; -use crate::codec::{BenEncodeFrame, FromRLE}; +use crate::codec::{BenConstruct, BenEncodeFrame}; use crate::util::rle::assign_to_rle; use rand::seq::SliceRandom; use rand::SeedableRng; diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index b19df49..69dc4a6 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -9,7 +9,7 @@ use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, encode_twodelta_frame, }; -use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE, TwoDeltaEncodeFrame}; +use binary_ensemble::codec::{BenConstruct, BenEncodeFrame, TwoDeltaEncodeFrame}; use binary_ensemble::format::banners::{ banner_for_variant, has_known_banner_prefix, variant_from_banner, BANNER_LEN, MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 5be0a65..37b26ae 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -6,7 +6,7 @@ use binary_ensemble::codec::decode::{ use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; -use binary_ensemble::codec::{BenEncodeFrame, FromAssign, FromRLE}; +use binary_ensemble::codec::{BenConstruct, BenEncodeFrame}; use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, DecodeFrame, DecoderInitError, SubsampleFrameDecoder, XBenDecoder, From cd0233ab31b8a220a66cc229a8c839afa57b46c2 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:38:00 -0600 Subject: [PATCH 048/221] Change BenEncoder -> AssignmentWriter --- ben/src/cli/pben.rs | 4 ++-- ben/src/codec/decode/xz.rs | 4 ++-- ben/src/codec/encode/jsonl.rs | 4 ++-- ben/src/io/writer/ben.rs | 8 ++++---- ben/src/io/writer/mod.rs | 2 +- ben/src/ops/relabel/mod.rs | 4 ++-- ben/tests/test_coverage.rs | 26 +++++++++++++------------- ben/tests/test_impls_pipeline.rs | 27 +++++++++++++++------------ pyben/src/encode/mod.rs | 22 ++++++++++++---------- 9 files changed, 53 insertions(+), 48 deletions(-) diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 3dfe31e..84abff9 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -1,6 +1,6 @@ use crate::cli::common::{check_overwrite, set_verbose}; use crate::io::reader::BenDecoder; -use crate::io::writer::{BenEncoder, XBenEncoder}; +use crate::io::writer::{AssignmentWriter, XBenEncoder}; use crate::BenVariant; use clap::{Parser, ValueEnum}; use pipe::pipe; @@ -223,7 +223,7 @@ fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { /// Read zero-based assignment vectors and encode them as BEN. fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { - let mut ben_writer = BenEncoder::new(writer, BenVariant::MkvChain)?; + let mut ben_writer = AssignmentWriter::new(writer, BenVariant::MkvChain)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index aef9793..521a567 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -3,7 +3,7 @@ use crate::codec::translate::ben32_to_ben_lines; use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::XBenDecoder; -use crate::io::writer::BenEncoder; +use crate::io::writer::AssignmentWriter; use crate::{progress, BenVariant}; use serde_json::json; use std::io::{self, BufRead, BufReader, Read, Write}; @@ -45,7 +45,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BufReader::new(decoder), BenVariant::TwoDelta, ); - let mut ben = BenEncoder::new(writer, BenVariant::TwoDelta)?; + let mut ben = AssignmentWriter::new(writer, BenVariant::TwoDelta)?; for record in &mut xben { let (assignment, count) = record?; ben.write_assignment(assignment.clone())?; diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index f03d11a..aa65075 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,5 +1,5 @@ use crate::codec::encode::errors::EncodeError; -use crate::io::writer::{BenEncoder, XBenEncoder}; +use crate::io::writer::{AssignmentWriter, XBenEncoder}; use crate::{progress, BenVariant}; use serde_json::Value; use std::io::{self, BufRead, Result, Write}; @@ -98,7 +98,7 @@ pub fn encode_jsonl_to_ben( variant: BenVariant, ) -> Result<()> { let mut line_num = 1; - let mut ben_encoder = BenEncoder::new(writer, variant)?; + let mut ben_encoder = AssignmentWriter::new(writer, variant)?; for line_result in reader.lines() { progress!("Encoding line: {}\r", line_num); line_num += 1; diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs index a5477c4..744bbd7 100644 --- a/ben/src/io/writer/ben.rs +++ b/ben/src/io/writer/ben.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::io::{self, Result, Write}; /// A struct to make the writing of BEN files easier and more ergonomic. -pub struct BenEncoder { +pub struct AssignmentWriter { writer: W, previous_sample: Vec, previous_masks: HashMap>, @@ -25,7 +25,7 @@ pub(super) struct AssignmentHints { pub delta_pair: Option<(u16, u16)>, } -impl BenEncoder { +impl AssignmentWriter { /// Create a new BEN writer and immediately emit the BEN banner. /// /// # Arguments @@ -39,7 +39,7 @@ impl BenEncoder { pub fn new(mut writer: W, variant: BenVariant) -> io::Result { writer.write_all(banner_for_variant(variant))?; - Ok(BenEncoder { + Ok(AssignmentWriter { writer, previous_sample: Vec::new(), previous_masks: HashMap::new(), @@ -233,7 +233,7 @@ impl BenEncoder { } } -impl Drop for BenEncoder { +impl Drop for AssignmentWriter { /// Flush any buffered BEN state during drop. fn drop(&mut self) { let _ = self.finish(); diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 9f7116c..fef7d44 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -5,6 +5,6 @@ pub(crate) mod twodelta; pub(crate) mod utils; pub mod xben; -pub use ben::BenEncoder; +pub use ben::AssignmentWriter; pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; pub use xben::XBenEncoder; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index dcaccf1..4f0c031 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -8,7 +8,7 @@ use crate::codec::{BenConstruct, BenEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::BenDecoder; -use crate::io::writer::BenEncoder; +use crate::io::writer::AssignmentWriter; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; use byteorder::{BigEndian, ReadBytesExt}; @@ -126,7 +126,7 @@ where F: FnMut(&[u16]) -> io::Result>, { let mut decoder = BenDecoder::new(reader)?.silent(true); - let mut encoder = BenEncoder::new(writer, variant)?; + let mut encoder = AssignmentWriter::new(writer, variant)?; let mut sample_number = 0usize; decoder.for_each_assignment(|assignment, count| { diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 69dc4a6..8c09301 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -17,7 +17,7 @@ use binary_ensemble::format::banners::{ use binary_ensemble::io::reader::{ BenDecoder, BenFrameDecoeder, DecoderInitError, XBenDecoder, XBenFrameDecoder, }; -use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::json::graph::{ sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, }; @@ -540,7 +540,7 @@ fn xben_decoder_reads_variant_from_banner_twodelta() { #[test] fn ben_encoder_writes_correct_banner_standard() { let mut out = Vec::new(); - let encoder = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let encoder = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); drop(encoder); assert!(out.starts_with(STANDARD_BEN_BANNER)); } @@ -548,7 +548,7 @@ fn ben_encoder_writes_correct_banner_standard() { #[test] fn ben_encoder_writes_correct_banner_mkvchain() { let mut out = Vec::new(); - let encoder = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + let encoder = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); drop(encoder); assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); } @@ -556,7 +556,7 @@ fn ben_encoder_writes_correct_banner_mkvchain() { #[test] fn ben_encoder_writes_correct_banner_twodelta() { let mut out = Vec::new(); - let encoder = BenEncoder::new(&mut out, BenVariant::TwoDelta).unwrap(); + let encoder = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); drop(encoder); assert!(out.starts_with(TWODELTA_BEN_BANNER)); } @@ -566,7 +566,7 @@ fn ben_encoder_standard_single_assignment_round_trip() { let assignment = vec![1u16, 2, 3, 3, 2, 1]; let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.finish().unwrap(); } @@ -581,7 +581,7 @@ fn ben_encoder_standard_single_assignment_round_trip() { fn ben_encoder_finish_is_idempotent() { let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); enc.write_assignment(vec![1u16, 2]).unwrap(); enc.finish().unwrap(); let len_after_first_finish = enc.finish().unwrap(); // second call @@ -598,7 +598,7 @@ fn ben_encoder_write_json_value_valid_input() { let data = json!({"assignment": [1, 2, 3], "sample": 1}); let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); enc.write_json_value(data).unwrap(); enc.finish().unwrap(); } @@ -610,7 +610,7 @@ fn ben_encoder_write_json_value_valid_input() { fn ben_encoder_write_json_value_missing_assignment_field_errors() { let data = json!({"sample": 1}); // no "assignment" let mut out = Vec::new(); - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!( result.is_err(), @@ -623,7 +623,7 @@ fn ben_encoder_write_json_value_value_too_large_errors() { // 65536 doesn't fit in u16. let data = json!({"assignment": [65536], "sample": 1}); let mut out = Vec::new(); - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!(result.is_err(), "expected error for value out of u16 range"); } @@ -632,7 +632,7 @@ fn ben_encoder_write_json_value_value_too_large_errors() { fn ben_encoder_write_json_value_negative_value_errors() { let data = json!({"assignment": [-1], "sample": 1}); let mut out = Vec::new(); - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!( result.is_err(), @@ -646,7 +646,7 @@ fn ben_encoder_standard_identical_assignments_still_written() { let assignment = vec![2u16, 2, 2]; let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); @@ -664,7 +664,7 @@ fn ben_encoder_mkv_identical_assignments_deduplicated() { let assignment = vec![2u16, 2, 2]; let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::MkvChain).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); @@ -685,7 +685,7 @@ fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { let next = vec![2u16, 2, 1, 1, 2, 1]; // all 1s→2s and 2s→1s let mut out = Vec::new(); { - let mut enc = BenEncoder::new(&mut out, BenVariant::TwoDelta).unwrap(); + let mut enc = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); enc.write_assignment(base.clone()).unwrap(); enc.write_assignment(next.clone()).unwrap(); enc.finish().unwrap(); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 37b26ae..c5eccd1 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -11,7 +11,7 @@ use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenDecoder, DecodeFrame, DecoderInitError, SubsampleFrameDecoder, XBenDecoder, }; -use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::ops::extract::extract_assignment_ben; use binary_ensemble::BenVariant; @@ -654,7 +654,7 @@ fn benencoder_finish_flushes_once() { let mut ben_vec = Vec::new(); { - let mut enc = BenEncoder::new(&mut ben_vec, BenVariant::MkvChain).unwrap(); + let mut enc = AssignmentWriter::new(&mut ben_vec, BenVariant::MkvChain).unwrap(); for line in lines.lines() { let v: serde_json::Value = serde_json::from_str(line).unwrap(); enc.write_json_value(v).unwrap(); @@ -926,7 +926,7 @@ fn xz_mt_params_are_capped_and_safe() { fn ben_encoder_write_assignment_path_roundtrips() { let mut ben = Vec::new(); { - let mut enc = BenEncoder::new(&mut ben, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); enc.write_assignment(vec![9u16, 9, 2, 2, 2]).unwrap(); enc.finish().unwrap(); } @@ -1008,7 +1008,7 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { let mut payload_only = Vec::new(); { - let mut enc = BenEncoder::new(&mut payload_only, BenVariant::Standard).unwrap(); + let mut enc = AssignmentWriter::new(&mut payload_only, BenVariant::Standard).unwrap(); enc.write_assignment(vec![5u16, 5, 7]).unwrap(); enc.finish().unwrap(); } @@ -1023,7 +1023,8 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { .encoder() .unwrap(); let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); - let mut xben = binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard).unwrap(); + let mut xben = + binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard).unwrap(); xben.write_ben_file(BufReader::new(payload_only.as_slice())) .unwrap(); } @@ -1345,7 +1346,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let mut ben = Vec::new(); { - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); for assignment in &assignments { encoder.write_assignment(assignment.clone()).unwrap(); } @@ -1368,9 +1369,11 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let frames = BenDecoder::new(ben.as_slice()).unwrap().into_frames(); assert_eq!( - collect_frames(frames.map(|res| res.map(|f| (DecodeFrame::Ben(f.clone()), f.count as u16)))) - .unwrap() - .len(), + collect_frames( + frames.map(|res| res.map(|f| (DecodeFrame::Ben(f.clone()), f.count as u16))) + ) + .unwrap() + .len(), 3 ); } @@ -1382,7 +1385,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { let mut ben = Vec::new(); { - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(second).unwrap(); @@ -1405,7 +1408,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { #[test] fn twodelta_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); let err = encoder.write_assignment(vec![1u16, 3, 2, 4]).err().unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -1414,7 +1417,7 @@ fn twodelta_rejects_non_pair_transition() { #[test] fn twodelta_write_json_value_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = BenEncoder::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) .unwrap(); diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 43153c4..071c58c 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -2,7 +2,7 @@ use crate::common::{open_input, open_output, parse_variant, validate_input_outpu use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, }; -use binary_ensemble::io::writer::BenEncoder; +use binary_ensemble::io::writer::AssignmentWriter; use pyo3::exceptions::PyIOError; use pyo3::prelude::PyResult; use pyo3::{pyclass, pyfunction, pymethods}; @@ -12,7 +12,7 @@ use std::path::PathBuf; #[pyclass] pub struct PyBenEncoder { - encoder: Option>>, + encoder: Option>>, } #[pymethods] @@ -24,7 +24,7 @@ impl PyBenEncoder { let ben_var = parse_variant(variant.as_deref())?; let writer = open_output(&file_path, overwrite)?; - let encoder = BenEncoder::new(writer, ben_var) + let encoder = AssignmentWriter::new(writer, ben_var) .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {}", e)))?; Ok(PyBenEncoder { encoder: Some(encoder), @@ -136,12 +136,14 @@ pub fn compress_jsonl_to_xben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert JSONL to XBEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; + encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err( + |e| { + PyIOError::new_err(format!( + "Failed to convert JSONL to XBEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + }, + )?; Ok(()) } From 8321f3e636479e97b351ce2a9838107a9cef7756 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:40:51 -0600 Subject: [PATCH 049/221] Change XBenEncoder -> XZAssignmentWriter --- ben/src/cli/pben.rs | 4 ++-- ben/src/codec/encode/jsonl.rs | 4 ++-- ben/src/codec/encode/xz.rs | 4 ++-- ben/src/io/writer/mod.rs | 2 +- ben/src/io/writer/xben.rs | 11 ++++++----- ben/tests/test_impls_pipeline.rs | 3 ++- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 84abff9..ecdfae3 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -1,6 +1,6 @@ use crate::cli::common::{check_overwrite, set_verbose}; use crate::io::reader::BenDecoder; -use crate::io::writer::{AssignmentWriter, XBenEncoder}; +use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; use clap::{Parser, ValueEnum}; use pipe::pipe; @@ -239,7 +239,7 @@ fn assignment_encode_ben(reader: R, writer: W) -> i /// Read zero-based assignment vectors and encode them as XBEN. fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { let encoder = XzEncoder::new(writer, 9); - let mut xben_writer = XBenEncoder::new(encoder, BenVariant::MkvChain)?; + let mut xben_writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index aa65075..d3f7a3b 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,5 +1,5 @@ use crate::codec::encode::errors::EncodeError; -use crate::io::writer::{AssignmentWriter, XBenEncoder}; +use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::{progress, BenVariant}; use serde_json::Value; use std::io::{self, BufRead, Result, Write}; @@ -50,7 +50,7 @@ pub fn encode_jsonl_to_xben( .encoder() .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let encoder = XzEncoder::new_stream(writer, mt); - let mut ben_encoder = XBenEncoder::new(encoder, variant)?; + let mut ben_encoder = XZAssignmentWriter::new(encoder, variant)?; if let Some(cs) = chunk_size { ben_encoder = ben_encoder.with_chunk_size(cs); } diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 92f8f92..0295a85 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -1,7 +1,7 @@ use crate::codec::encode::errors::EncodeError; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::writer::XBenEncoder; +use crate::io::writer::XZAssignmentWriter; use std::io::{self, BufRead, Cursor, Read, Result, Write}; use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; @@ -109,7 +109,7 @@ pub fn encode_ben_to_xben( actual: check_buffer.to_vec(), }) })?; - let mut ben_encoder = XBenEncoder::new(encoder, variant)?; + let mut ben_encoder = XZAssignmentWriter::new(encoder, variant)?; if let Some(cs) = chunk_size { ben_encoder = ben_encoder.with_chunk_size(cs); } diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index fef7d44..7402231 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -7,4 +7,4 @@ pub mod xben; pub use ben::AssignmentWriter; pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; -pub use xben::XBenEncoder; +pub use xben::XZAssignmentWriter; diff --git a/ben/src/io/writer/xben.rs b/ben/src/io/writer/xben.rs index 542ac47..af3cef2 100644 --- a/ben/src/io/writer/xben.rs +++ b/ben/src/io/writer/xben.rs @@ -19,7 +19,7 @@ use std::io::{self, BufRead, Read, Result, Write}; use xz2::write::XzEncoder; /// A struct to make the writing of XBEN files easier and more ergonomic. -pub struct XBenEncoder { +pub struct XZAssignmentWriter { encoder: XzEncoder, previous_assignment: Vec, previous_masks: HashMap>, @@ -30,7 +30,7 @@ pub struct XBenEncoder { chunk_buffer: Vec, } -impl XBenEncoder { +impl XZAssignmentWriter { /// Rebuild the value-to-position index map from the current previous assignment. fn rebuild_previous_masks(&mut self) { self.previous_masks.clear(); @@ -184,7 +184,7 @@ impl XBenEncoder { /// Returns a new XBEN encoder ready to accept assignments or BEN frames. pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> io::Result { encoder.write_all(banner_for_variant(variant))?; - Ok(XBenEncoder { + Ok(XZAssignmentWriter { encoder, previous_assignment: Vec::new(), previous_masks: HashMap::new(), @@ -375,7 +375,8 @@ impl XBenEncoder { let count = reader.read_u16::()?; // Unpack bitpacked run lengths. - let frame = TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); + let frame = + TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); let run_lengths = frame.run_length_vector; // Flush the initial full frame before the first delta chunk. @@ -428,7 +429,7 @@ impl XBenEncoder { } } -impl Drop for XBenEncoder { +impl Drop for XZAssignmentWriter { /// Flush any buffered XBEN repetition state during drop. fn drop(&mut self) { if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index c5eccd1..9b1056f 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1024,7 +1024,8 @@ fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { .unwrap(); let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); let mut xben = - binary_ensemble::io::writer::XBenEncoder::new(encoder, BenVariant::Standard).unwrap(); + binary_ensemble::io::writer::XZAssignmentWriter::new(encoder, BenVariant::Standard) + .unwrap(); xben.write_ben_file(BufReader::new(payload_only.as_slice())) .unwrap(); } From 03da0650a9764ca4fbb8c704cbf1c219ba60c971 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:52:02 -0600 Subject: [PATCH 050/221] Change writer/ben.rs -> write/assignment_writer and make better use of encoders --- ben/src/codec/encode/errors.rs | 11 +- ben/src/codec/encode/mod.rs | 4 +- ben/src/codec/encode/twodelta.rs | 34 +++- ben/src/codec/frames/twodelta.rs | 20 +- ben/src/io/writer/assignment_writer.rs | 161 +++++++++++++++++ ben/src/io/writer/ben.rs | 241 ------------------------- ben/src/io/writer/errors.rs | 0 ben/src/io/writer/frames.rs | 16 -- ben/src/io/writer/mod.rs | 8 +- ben/src/io/writer/utils.rs | 60 +++--- ben/tests/test_coverage.rs | 24 +-- ben/tests/test_impls_pipeline.rs | 7 +- 12 files changed, 270 insertions(+), 316 deletions(-) create mode 100644 ben/src/io/writer/assignment_writer.rs delete mode 100644 ben/src/io/writer/ben.rs create mode 100644 ben/src/io/writer/errors.rs diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index 39e38fc..359542f 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -22,8 +22,8 @@ pub enum EncodeError { #[error("TwoDelta received identical assignment to previous frame")] TwoDeltaIdentical, - #[error("TwoDelta requires equal-length assignment vectors, got {prev_len} vs {new_len}")] - TwoDeltaLengthMismatch { prev_len: usize, new_len: usize }, + #[error("Encoders require equal-length assignment vectors, got {prev_len} vs {new_len}")] + LengthMismatch { prev_len: usize, new_len: usize }, #[error("TwoDelta delta_pair hint provided without corresponding masks")] TwoDeltaHintWithoutMasks, @@ -38,7 +38,12 @@ pub enum EncodeError { TwoDeltaEmptyMask { id: u16 }, #[error("TwoDelta mask referenced position {pos} whose value {actual} is outside the pair ({a}, {b})")] - TwoDeltaMaskOutOfPair { pos: usize, actual: u16, a: u16, b: u16 }, + TwoDeltaMaskOutOfPair { + pos: usize, + actual: u16, + a: u16, + b: u16, + }, #[error("XZ encoder initialization failed: {0}")] XzInit(#[source] xz2::stream::Error), diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 7110fc3..ed20b6d 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -1,15 +1,15 @@ //! Encoding routines for BEN and XBEN formats. mod ben; -mod errors; +pub mod errors; mod jsonl; mod twodelta; mod xz; pub(crate) use ben::encode_ben32_assignments; pub use errors::EncodeError; -pub(crate) use twodelta::encode_twodelta_frame_with_hint; pub use twodelta::encode_twodelta_frame; +pub(crate) use twodelta::encode_twodelta_frame_with_hint; #[cfg(test)] pub(crate) use ben::encode_ben32_line; diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 1c9e755..23a2db8 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -66,12 +66,13 @@ pub(crate) fn encode_twodelta_frame_with_hint( new_assignment: impl AsRef<[u16]>, delta_pair: Option<(u16, u16)>, previous_masks: Option<&mut HashMap>>, + count: Option, ) -> Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); if previous_assignment.len() != new_assignment.len() { - return Err(Error::from(EncodeError::TwoDeltaLengthMismatch { + return Err(Error::from(EncodeError::LengthMismatch { prev_len: previous_assignment.len(), new_len: new_assignment.len(), })); @@ -95,11 +96,15 @@ pub(crate) fn encode_twodelta_frame_with_hint( new_assignment, pair, masks, + count, ), - (None, Some(masks)) => { - construct_twodelta_frame_from_mask_hint(previous_assignment, new_assignment, masks) - } - _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment), + (None, Some(masks)) => construct_twodelta_frame_from_mask_hint( + previous_assignment, + new_assignment, + masks, + count, + ), + _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment, count), } // Ok(TwoDeltaEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) @@ -184,6 +189,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( current: &[u16], delta_pair: (u16, u16), previous_masks: &mut HashMap>, + count: Option, ) -> Result { let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, previous_masks, current) { @@ -277,7 +283,11 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous_masks.insert(pair.0, new_mask_a); previous_masks.insert(pair.1, new_mask_b); - Ok(TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths)) + Ok(TwoDeltaEncodeFrame::from_run_lengths( + pair, + run_lengths, + count, + )) } /// Build a TwoDelta frame using only pre-computed position masks, inferring the pair @@ -303,6 +313,7 @@ fn construct_twodelta_frame_from_mask_hint( previous: &[u16], current: &[u16], previous_masks: &mut HashMap>, + count: Option, ) -> Result { for (&assign0, &assign1) in previous.iter().zip(current.iter()) { if assign0 != assign1 { @@ -311,6 +322,7 @@ fn construct_twodelta_frame_from_mask_hint( current, (assign0, assign1), previous_masks, + count, ); } } @@ -339,6 +351,7 @@ fn construct_twodelta_frame_from_mask_hint( fn construct_twodelta_frame_from_scratch( previous: &[u16], current: &[u16], + count: Option, ) -> Result { // Find the pair at the first changed position. let first_change = previous @@ -381,7 +394,11 @@ fn construct_twodelta_frame_from_scratch( } run_lengths.push(run_count); - Ok(TwoDeltaEncodeFrame::from_run_lengths(enc_pair, run_lengths)) + Ok(TwoDeltaEncodeFrame::from_run_lengths( + enc_pair, + run_lengths, + count, + )) } /// Encode a transition between two assignment vectors as a TwoDelta frame. @@ -406,6 +423,7 @@ fn construct_twodelta_frame_from_scratch( pub fn encode_twodelta_frame( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, + count: Option, ) -> Result { - encode_twodelta_frame_with_hint(previous_assignment, new_assignment, None, None) + encode_twodelta_frame_with_hint(previous_assignment, new_assignment, None, None, count) } diff --git a/ben/src/codec/frames/twodelta.rs b/ben/src/codec/frames/twodelta.rs index 24bc4a2..73e6cbe 100644 --- a/ben/src/codec/frames/twodelta.rs +++ b/ben/src/codec/frames/twodelta.rs @@ -22,7 +22,7 @@ pub struct TwoDeltaEncodeFrame { impl TwoDeltaEncodeFrame { /// Borrow just the packed payload bytes. pub fn payload(&self) -> &[u8] { - &self.raw_bytes[9..] + &self.raw_bytes[9..9 + self.n_bytes as usize] } /// Borrow the serialized TwoDelta frame bytes. @@ -49,7 +49,7 @@ impl TwoDeltaEncodeFrame { /// /// The serialized layout is: /// ```text - /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...] + /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...][count: u16 BE] /// ``` /// where the payload is the bit-packed run lengths. /// @@ -63,7 +63,16 @@ impl TwoDeltaEncodeFrame { /// /// A fully serialized `TwoDeltaEncodeFrame` with both the packed `raw_bytes` and the /// original `run_length_vector` stored on the struct. - pub fn from_run_lengths(pair: (u16, u16), run_length_vector: Vec) -> Self { + pub fn from_run_lengths( + pair: (u16, u16), + run_length_vector: Vec, + count: Option, + ) -> Self { + let count = match count { + Some(v) => v, + None => 1, + }; + let max_len = run_length_vector.iter().copied().max().unwrap_or(0); let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); @@ -98,6 +107,8 @@ impl TwoDeltaEncodeFrame { raw_bytes.push((remainder << (8 - remainder_bits)) as u8); } + raw_bytes.extend(count.to_be_bytes()); + Self { pair, max_len_bit_count, @@ -135,12 +146,13 @@ impl TwoDeltaEncodeFrame { raw_bytes.push(max_len_bit_count); raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); raw_bytes.extend_from_slice(&payload); + raw_bytes.extend_from_slice(&1u16.to_be_bytes()); let mut run_length_vector = Vec::new(); let mut buffer: u32 = 0; let mut n_bits_in_buff: u16 = 0; - for byte in payload { + for &byte in payload[..n_bytes as usize].iter() { buffer |= (byte as u32).to_be() >> n_bits_in_buff; n_bits_in_buff += 8; diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs new file mode 100644 index 0000000..20db650 --- /dev/null +++ b/ben/src/io/writer/assignment_writer.rs @@ -0,0 +1,161 @@ +use super::utils::parse_json_assignment; +use crate::codec::encode::encode_twodelta_frame_with_hint; +use crate::codec::{BenConstruct, BenEncodeFrame, MkvBenEncodeFrame}; +use crate::format::banners::banner_for_variant; +use crate::BenVariant; +use serde_json::Value; +use std::collections::HashMap; +use std::io::{self, Result, Write}; + +/// A struct to make the writing of BEN files easier and more ergonomic. +pub struct AssignmentWriter { + writer: W, + previous_sample: Vec, + previous_masks: HashMap>, + pending_sample: Option>, + sample_count: u16, + variant: BenVariant, + complete: bool, +} + +impl AssignmentWriter { + /// Create a new BEN writer and immediately emit the BEN banner. + /// + /// # Arguments + /// + /// * `writer` - The destination that will receive the BEN stream. + /// * `variant` - The BEN variant to encode. + /// + /// # Returns + /// + /// Returns a new encoder ready to accept assignments. + pub fn new(mut writer: W, variant: BenVariant) -> io::Result { + writer.write_all(banner_for_variant(variant))?; + + Ok(AssignmentWriter { + writer, + previous_sample: Vec::new(), + previous_masks: HashMap::new(), + pending_sample: None, + sample_count: 0, + complete: false, + variant, + }) + } + + /// Encode and write the pending assignment with the accumulated repetition count. + /// + /// For TwoDelta, the first frame is written as an MkvBen frame. Subsequent + /// frames are written as TwoDelta frames encoding the transition from + /// `previous_sample`. This is a no-op when no sample is pending. + /// + /// Note: That on the first call to `flush_pending_frame` when `self.pending_sample` is `None`, + /// the method will simply return `Ok(())` without writing anything. Flushing only happens + /// when there is a pending sample to write. + fn flush_pending_frame(&mut self) -> Result<()> { + let pending_sample = match self.pending_sample.take() { + Some(p) => p, + None => return Ok(()), + }; + + match self.variant { + BenVariant::Standard => { + let frame = BenEncodeFrame::from_assignment(&pending_sample, None); + for _ in 0..self.sample_count { + self.writer.write_all(frame.as_slice())?; + } + } + BenVariant::MkvChain => { + let frame = + MkvBenEncodeFrame::from_assignment(&pending_sample, Some(self.sample_count)); + self.writer.write_all(frame.as_slice())?; + } + BenVariant::TwoDelta => { + if self.previous_sample.is_empty() { + // First frame: encode as MkvBen and build the initial masks. + for (idx, &val) in pending_sample.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } + let frame = MkvBenEncodeFrame::from_assignment( + &pending_sample, + Some(self.sample_count), + ); + self.writer.write_all(frame.as_slice())?; + } else { + let frame = encode_twodelta_frame_with_hint( + &self.previous_sample, + &pending_sample, + None, + Some(&mut self.previous_masks), + Some(self.sample_count), + )?; + self.writer.write_all(frame.as_slice())?; + } + } + } + + self.previous_sample = pending_sample; + Ok(()) + } + + /// Encode and write a full assignment vector. + /// + /// Consecutive identical assignments are counted and written as a single + /// frame with the accumulated count for MkvChain and TwoDelta variants. + /// + /// # Arguments + /// + /// * `assign_vec` - The full assignment vector to encode. + /// + /// # Returns + /// + /// Returns `Ok(())` after the assignment has been queued or written. + pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { + if self.pending_sample.as_deref() == Some(assign_vec.as_slice()) { + self.sample_count += 1; + return Ok(()); + } + self.flush_pending_frame()?; + self.pending_sample = Some(assign_vec); + self.sample_count = 1; + Ok(()) + } + + /// Encode and write a JSON assignment record. + /// + /// The input must contain an `assignment` array of integers. Other fields + /// are ignored. + /// + /// # Arguments + /// + /// * `data` - A JSON object containing an `assignment` array. + /// + /// # Returns + /// + /// Returns `Ok(())` after the record has been validated and encoded. + pub fn write_json_value(&mut self, data: Value) -> Result<()> { + let new_assign = parse_json_assignment(data)?; + self.write_assignment(new_assign) + } + + /// Flush any buffered state to the underlying writer. + /// + /// # Returns + /// + /// Returns `Ok(())` once any buffered state has been flushed. + pub fn finish(&mut self) -> Result<()> { + if self.complete { + return Ok(()); + } + self.flush_pending_frame()?; + self.complete = true; + Ok(()) + } +} + +impl Drop for AssignmentWriter { + /// Flush any buffered BEN state during drop. + fn drop(&mut self) { + let _ = self.finish(); + } +} diff --git a/ben/src/io/writer/ben.rs b/ben/src/io/writer/ben.rs deleted file mode 100644 index 744bbd7..0000000 --- a/ben/src/io/writer/ben.rs +++ /dev/null @@ -1,241 +0,0 @@ -use super::frames::BufferedBenFrame; -use super::utils::{analyze_twodelta_transition, is_repeated_assignment, parse_json_assignment}; -use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::{BenConstruct, BenEncodeFrame}; -use crate::format::banners::banner_for_variant; -use crate::BenVariant; -use serde_json::Value; -use std::collections::HashMap; -use std::io::{self, Result, Write}; - -/// A struct to make the writing of BEN files easier and more ergonomic. -pub struct AssignmentWriter { - writer: W, - previous_sample: Vec, - previous_masks: HashMap>, - previous_encoded_sample: Option, - sample_count: u16, - variant: BenVariant, - complete: bool, -} - -#[derive(Clone, Copy, Debug, Default)] -pub(super) struct AssignmentHints { - pub is_repeated: bool, - pub delta_pair: Option<(u16, u16)>, -} - -impl AssignmentWriter { - /// Create a new BEN writer and immediately emit the BEN banner. - /// - /// # Arguments - /// - /// * `writer` - The destination that will receive the BEN stream. - /// * `variant` - The BEN variant to encode. - /// - /// # Returns - /// - /// Returns a new encoder ready to accept assignments or RLE frames. - pub fn new(mut writer: W, variant: BenVariant) -> io::Result { - writer.write_all(banner_for_variant(variant))?; - - Ok(AssignmentWriter { - writer, - previous_sample: Vec::new(), - previous_masks: HashMap::new(), - previous_encoded_sample: None, - sample_count: 0, - complete: false, - variant, - }) - } - - /// Store a new previous sample along with its encoded frame and repetition count. - /// - /// # Arguments - /// - /// * `sample` - The assignment vector to cache. - /// * `encoded` - The already-encoded frame for this assignment. - /// * `sample_count` - The initial repetition count for this sample. - fn set_previous_sample( - &mut self, - sample: Vec, - encoded: BufferedBenFrame, - sample_count: u16, - ) { - self.previous_sample = sample; - self.previous_encoded_sample = Some(encoded); - self.sample_count = sample_count; - } - - /// Encode and write an assignment vector using pre-computed transition hints. - /// - /// The encoding strategy depends on the configured `BenVariant`. Repeated - /// assignments may be deduplicated or counted, and two-delta hints enable - /// compact delta frames when applicable. - /// - /// # Arguments - /// - /// * `assign_vec` - The assignment vector to encode. - /// * `hints` - Pre-computed hints about repetition and delta-pair eligibility. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - fn write_assignment_with_hints( - &mut self, - assign_vec: Vec, - hints: AssignmentHints, - ) -> Result<()> { - match self.variant { - BenVariant::Standard => { - if hints.is_repeated { - if let Some(encoded) = self.previous_encoded_sample.as_ref() { - self.writer.write_all(encoded.as_slice())?; - self.previous_sample = assign_vec; - return Ok(()); - } - } - - let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); - self.writer.write_all(encoded.as_slice())?; - self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 0); - Ok(()) - } - BenVariant::MkvChain => { - if hints.is_repeated { - self.sample_count += 1; - return Ok(()); - } - - if self.sample_count > 0 { - self.flush_pending_frame()?; - } - - let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); - self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); - Ok(()) - } - BenVariant::TwoDelta => { - if self.previous_sample.is_empty() { - let encoded = BenEncodeFrame::from_assignment(&assign_vec, None); - for (idx, &val) in assign_vec.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - self.set_previous_sample(assign_vec, BufferedBenFrame::Ben(encoded), 1); - return Ok(()); - } - - if hints.is_repeated { - self.sample_count += 1; - return Ok(()); - } - - let encoded = encode_twodelta_frame_with_hint( - &self.previous_sample, - &assign_vec, - hints.delta_pair, - Some(&mut self.previous_masks), - )?; - self.flush_pending_frame()?; - - self.previous_sample = assign_vec; - self.previous_encoded_sample = Some(BufferedBenFrame::TwoDelta(encoded)); - self.sample_count = 1; - Ok(()) - } - } - } - - /// Flush the buffered frame and its repetition count to the underlying writer. - /// - /// For MkvChain and TwoDelta variants, the repetition count is appended - /// after the encoded frame. This is a no-op when no samples are pending. - /// - /// # Returns - /// - /// Returns `Ok(())` once the pending frame has been written. - fn flush_pending_frame(&mut self) -> Result<()> { - if self.sample_count == 0 { - return Ok(()); - } - - let encoded = self.previous_encoded_sample.as_ref().ok_or_else(|| { - io::Error::new(io::ErrorKind::InvalidData, "missing previous BEN frame") - })?; - self.writer.write_all(encoded.as_slice())?; - - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { - self.writer.write_all(&self.sample_count.to_be_bytes())?; - } - - Ok(()) - } - - /// Encode and write a full assignment vector. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - let hints = if self.variant == BenVariant::TwoDelta { - let masks = if self.previous_masks.is_empty() { - None - } else { - Some(&self.previous_masks) - }; - analyze_twodelta_transition(&self.previous_sample, &assign_vec, masks) - } else { - AssignmentHints { - is_repeated: is_repeated_assignment(&self.previous_sample, &assign_vec), - delta_pair: None, - } - }; - self.write_assignment_with_hints(assign_vec, hints) - } - - /// Encode and write a JSON assignment record. - /// - /// The input must contain an `assignment` array of integers. Other fields - /// are ignored. - /// - /// # Arguments - /// - /// * `data` - A JSON object containing an `assignment` array. - /// - /// # Returns - /// - /// Returns `Ok(())` after the record has been validated and encoded. - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let new_assign = parse_json_assignment(data)?; - self.write_assignment(new_assign) - } - - /// Flush any buffered repetition state to the underlying writer. - /// - /// This matters for [`BenVariant::MkvChain`], where repeated consecutive - /// samples are emitted only once together with their repetition count. - /// - /// # Returns - /// - /// Returns `Ok(())` once any buffered repetition state has been flushed. - pub fn finish(&mut self) -> Result<()> { - if self.complete { - return Ok(()); - } - self.flush_pending_frame()?; - self.complete = true; - Ok(()) - } -} - -impl Drop for AssignmentWriter { - /// Flush any buffered BEN state during drop. - fn drop(&mut self) { - let _ = self.finish(); - } -} diff --git a/ben/src/io/writer/errors.rs b/ben/src/io/writer/errors.rs new file mode 100644 index 0000000..e69de29 diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs index b02feab..60d4e9e 100644 --- a/ben/src/io/writer/frames.rs +++ b/ben/src/io/writer/frames.rs @@ -1,22 +1,6 @@ -use crate::codec::{BenEncodeFrame, TwoDeltaEncodeFrame}; - /// A buffered delta frame awaiting chunk serialization. pub(super) struct BufferedDeltaFrame { pub pair: (u16, u16), pub run_lengths: Vec, pub count: u16, } - -pub(super) enum BufferedBenFrame { - Ben(BenEncodeFrame), - TwoDelta(TwoDeltaEncodeFrame), -} - -impl BufferedBenFrame { - pub fn as_slice(&self) -> &[u8] { - match self { - Self::Ben(frame) => frame.as_slice(), - Self::TwoDelta(frame) => frame.as_slice(), - } - } -} diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 7402231..2f93639 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -1,10 +1,10 @@ -pub mod ben; +pub mod assignment_writer; pub(crate) mod frames; pub(crate) mod tests; pub(crate) mod twodelta; pub(crate) mod utils; -pub mod xben; +pub mod xz_assignment_writer; -pub use ben::AssignmentWriter; +pub use assignment_writer::AssignmentWriter; pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; -pub use xben::XZAssignmentWriter; +pub use xz_assignment_writer::XZAssignmentWriter; diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index 18c45a6..c2efeaf 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -1,5 +1,11 @@ -use super::ben::AssignmentHints; use super::twodelta::XBEN_TWODELTA_FULL_TAG; + +#[derive(Clone, Copy, Debug, Default)] +pub(super) struct AssignmentHints { + pub is_repeated: bool, + pub delta_pair: Option<(u16, u16)>, +} +use crate::codec::encode::errors::EncodeError; use crate::util::rle::assign_to_rle; use serde_json::Value; use std::collections::HashMap; @@ -53,20 +59,28 @@ pub(super) fn analyze_twodelta_transition( previous_sample: &[u16], assign_vec: &[u16], masks: Option<&HashMap>>, -) -> AssignmentHints { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return AssignmentHints::default(); +) -> Result { + if previous_sample.is_empty() { + return Ok(AssignmentHints::default()); } - // Fast path: use masks to find the pair in O(K) instead of O(N). - if let Some(masks) = masks { - if previous_sample == assign_vec { - return AssignmentHints { - is_repeated: true, - delta_pair: None, - }; + if previous_sample.len() != assign_vec.len() { + return Err(EncodeError::LengthMismatch { + prev_len: previous_sample.len(), + new_len: assign_vec.len(), } + .into()); + } + if previous_sample == assign_vec { + return Ok(AssignmentHints { + is_repeated: true, + delta_pair: None, + }); + } + + // Fast path: use masks to find the pair in O(K) instead of O(N). + if let Some(masks) = masks { // Check each label's mask positions. Only labels involved in the swap // will have any changed positions; all others short-circuit immediately. let mut pair: Option<(u16, u16)> = None; @@ -84,20 +98,20 @@ pub(super) fn analyze_twodelta_transition( break; } // More than two values involved. - return AssignmentHints { + return Ok(AssignmentHints { is_repeated: false, delta_pair: None, - }; + }); } } } } } - return AssignmentHints { + return Ok(AssignmentHints { is_repeated: false, delta_pair: pair, - }; + }); } // Slow path: full O(N) scan when masks are not available. @@ -106,10 +120,10 @@ pub(super) fn analyze_twodelta_transition( .zip(assign_vec.iter()) .position(|(&previous, ¤t)| previous != current) else { - return AssignmentHints { + return Ok(AssignmentHints { is_repeated: true, delta_pair: None, - }; + }); }; let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); @@ -124,24 +138,24 @@ pub(super) fn analyze_twodelta_transition( } if previous != pair.0 && previous != pair.1 { - return AssignmentHints { + return Ok(AssignmentHints { is_repeated: false, delta_pair: None, - }; + }); } if current != pair.0 && current != pair.1 { - return AssignmentHints { + return Ok(AssignmentHints { is_repeated: false, delta_pair: None, - }; + }); } } - AssignmentHints { + Ok(AssignmentHints { is_repeated: false, delta_pair: Some(pair), - } + }) } /// Extract and validate the `assignment` array from a JSON object. diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 8c09301..947c054 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -1498,7 +1498,7 @@ fn ben_decoder_accepts_cursor_reader() { fn encode_twodelta_frame_different_lengths_errors() { let prev = vec![1u16, 2, 3]; let next = vec![1u16, 2]; - let err = encode_twodelta_frame(&prev, &next).unwrap_err(); + let err = encode_twodelta_frame(&prev, &next, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("equal-length")); } @@ -1506,7 +1506,7 @@ fn encode_twodelta_frame_different_lengths_errors() { #[test] fn encode_twodelta_frame_identical_assignments_errors() { let assign = vec![1u16, 2, 3]; - let err = encode_twodelta_frame(&assign, &assign).unwrap_err(); + let err = encode_twodelta_frame(&assign, &assign, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("identical")); } @@ -1516,7 +1516,7 @@ fn encode_twodelta_frame_more_than_two_values_errors() { // prev = [1,2,3], next = [3,1,2]: positions 0,1,2 all change and involve ids 1,2,3 → 3 ids let prev = vec![1u16, 2, 3]; let next = vec![3u16, 1, 2]; - let err = encode_twodelta_frame(&prev, &next).unwrap_err(); + let err = encode_twodelta_frame(&prev, &next, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("two distinct assignment ids")); } @@ -1525,7 +1525,7 @@ fn encode_twodelta_frame_more_than_two_values_errors() { fn encode_twodelta_frame_valid_two_value_transition() { let prev = vec![1u16, 1, 2, 2]; let next = vec![2u16, 2, 1, 1]; - let frame = encode_twodelta_frame(&prev, &next).unwrap(); + let frame = encode_twodelta_frame(&prev, &next, Some(1)).unwrap(); // All 4 positions belong to the pair, and all flip assert_eq!(frame.n_bytes as usize, frame.payload().len()); } @@ -1535,7 +1535,7 @@ fn encode_twodelta_frame_single_value_swap() { // Only one position changes: prev[3]=2 → next[3]=1; pair is (new_val, old_val) = (1, 2) let prev = vec![1u16, 1, 1, 2]; let next = vec![1u16, 1, 1, 1]; - let frame = encode_twodelta_frame(&prev, &next).unwrap(); + let frame = encode_twodelta_frame(&prev, &next, Some(1)).unwrap(); assert_eq!(frame.pair, (1, 2)); } @@ -1547,7 +1547,7 @@ fn encode_twodelta_frame_single_value_swap() { fn twodelta_frame_pair_accessor() { let pair = (3u16, 7u16); let run_lengths = vec![2u16, 3, 1]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); assert_eq!(frame.pair, pair); } @@ -1556,7 +1556,7 @@ fn twodelta_frame_max_len_bits_accessor() { // max run length = 4 = 0b100 → 3 bits let pair = (1u16, 2u16); let run_lengths = vec![4u16, 4]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); assert_eq!(frame.max_len_bit_count, 3); } @@ -1564,7 +1564,7 @@ fn twodelta_frame_max_len_bits_accessor() { fn twodelta_frame_n_bytes_and_payload_consistent() { let pair = (5u16, 10u16); let run_lengths = vec![1u16, 2, 3]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); assert_eq!(frame.n_bytes as usize, frame.payload().len()); } @@ -1572,7 +1572,7 @@ fn twodelta_frame_n_bytes_and_payload_consistent() { fn twodelta_frame_to_bytes_and_as_slice_same() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); assert_eq!(frame.to_bytes(), frame.as_slice()); } @@ -1580,7 +1580,7 @@ fn twodelta_frame_to_bytes_and_as_slice_same() { fn twodelta_frame_into_bytes_consumes() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); let expected = frame.to_bytes(); let actual = frame.into_bytes(); assert_eq!(actual, expected); @@ -1590,7 +1590,7 @@ fn twodelta_frame_into_bytes_consumes() { fn twodelta_frame_from_parts_round_trip() { let pair = (10u16, 20u16); let run_lengths = vec![2u16, 5, 1]; - let original = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let original = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); let reconstructed = TwoDeltaEncodeFrame::from_parts( pair, original.max_len_bit_count, @@ -1606,7 +1606,7 @@ fn twodelta_frame_from_parts_round_trip() { fn twodelta_frame_asref_and_deref() { let pair = (1u16, 2u16); let run_lengths = vec![3u16]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths); + let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); let as_ref: &[u8] = frame.as_ref(); let deref: &[u8] = &*frame; assert_eq!(as_ref, deref); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 9b1056f..fad0ebb 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1411,7 +1411,8 @@ fn twodelta_rejects_non_pair_transition() { let mut ben = Vec::new(); let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); - let err = encoder.write_assignment(vec![1u16, 3, 2, 4]).err().unwrap(); + encoder.write_assignment(vec![1u16, 3, 2, 4]).unwrap(); + let err = encoder.finish().err().unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } @@ -1422,10 +1423,10 @@ fn twodelta_write_json_value_rejects_non_pair_transition() { encoder .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) .unwrap(); - let err = encoder + encoder .write_json_value(json!({"assignment": [1u16, 3, 2, 4]})) - .err() .unwrap(); + let err = encoder.finish().err().unwrap(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } From f8735dc51b0bd7ca8e500e1260446ee84afca8c7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 23 Mar 2026 09:34:01 -0600 Subject: [PATCH 051/221] Update XZAssignment writer to be more parallel with the AssigmentWriter --- .../{xben.rs => xz_assignment_writer.rs} | 230 ++++++------------ 1 file changed, 75 insertions(+), 155 deletions(-) rename ben/src/io/writer/{xben.rs => xz_assignment_writer.rs} (62%) diff --git a/ben/src/io/writer/xben.rs b/ben/src/io/writer/xz_assignment_writer.rs similarity index 62% rename from ben/src/io/writer/xben.rs rename to ben/src/io/writer/xz_assignment_writer.rs index af3cef2..7a62a80 100644 --- a/ben/src/io/writer/xben.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -2,10 +2,7 @@ use super::frames::BufferedDeltaFrame; use super::twodelta::{ DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, }; -use super::utils::{ - analyze_twodelta_transition, encode_xben_twodelta_full_frame, is_repeated_assignment, - parse_json_assignment, -}; +use super::utils::{encode_xben_twodelta_full_frame, parse_json_assignment}; use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; @@ -23,102 +20,47 @@ pub struct XZAssignmentWriter { encoder: XzEncoder, previous_assignment: Vec, previous_masks: HashMap>, - previous_frame: Vec, + pending_assignment: Option>, count: u16, variant: BenVariant, chunk_size: usize, chunk_buffer: Vec, + complete: bool, } impl XZAssignmentWriter { - /// Rebuild the value-to-position index map from the current previous assignment. - fn rebuild_previous_masks(&mut self) { - self.previous_masks.clear(); - for (idx, &assignment) in self.previous_assignment.iter().enumerate() { - self.previous_masks.entry(assignment).or_default().push(idx); - } - } - - /// Store a new previous assignment along with its encoded frame and repetition count. - /// - /// # Arguments - /// - /// * `assignment` - The assignment vector to cache. - /// * `frame` - The already-encoded frame bytes for this assignment. - /// * `count` - The initial repetition count for this assignment. - fn set_previous_assignment(&mut self, assignment: Vec, frame: Vec, count: u16) { - self.previous_assignment = assignment; - self.rebuild_previous_masks(); - self.previous_frame = frame; - self.count = count; - } - - /// Update the value-to-position masks incrementally for a two-delta transition. + /// Encode and write the pending assignment with the accumulated count. /// - /// Instead of rebuilding the entire mask HashMap, only the positions belonging - /// to the two swapped values are repartitioned. This is O(pair_positions) - /// rather than O(assignment_length). - /// - /// # Arguments - /// - /// * `new_sample` - The new assignment vector after the transition. - /// * `pair` - The two values involved in the delta swap. - #[allow(dead_code)] - fn update_masks_for_delta(&mut self, new_sample: &[u16], pair: (u16, u16)) { - if pair.0 == pair.1 { - return; - } + /// For TwoDelta, builds the initial masks and writes the full frame followed + /// by the count. For MkvChain, encodes the assignment and appends the count. + /// This is a no-op when no assignment is pending. + fn flush_pending_frame(&mut self) -> Result<()> { + let pending = match self.pending_assignment.take() { + Some(p) => p, + None => return Ok(()), + }; - let pos_a = self.previous_masks.remove(&pair.0).unwrap_or_default(); - let pos_b = self.previous_masks.remove(&pair.1).unwrap_or_default(); - - let mut new_a = Vec::with_capacity(pos_a.len() + pos_b.len()); - let mut new_b = Vec::with_capacity(pos_a.len() + pos_b.len()); - - let (mut i, mut j) = (0, 0); - while i < pos_a.len() || j < pos_b.len() { - let pos = if j >= pos_b.len() || (i < pos_a.len() && pos_a[i] < pos_b[j]) { - let p = pos_a[i]; - i += 1; - p - } else { - let p = pos_b[j]; - j += 1; - p - }; - if new_sample[pos] == pair.0 { - new_a.push(pos); - } else { - new_b.push(pos); + match self.variant { + BenVariant::Standard => { + let encoded = encode_ben32_assignments(&pending)?; + self.encoder.write_all(&encoded)?; + } + BenVariant::MkvChain => { + let encoded = encode_ben32_assignments(&pending)?; + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&self.count.to_be_bytes())?; + } + BenVariant::TwoDelta => { + for (idx, &val) in pending.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } + let encoded = encode_xben_twodelta_full_frame(&pending); + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&self.count.to_be_bytes())?; } } - if !new_a.is_empty() { - self.previous_masks.insert(pair.0, new_a); - } - if !new_b.is_empty() { - self.previous_masks.insert(pair.1, new_b); - } - } - - /// Flush the buffered frame and its repetition count to the XZ encoder. - /// - /// For MkvChain and TwoDelta variants, the repetition count is appended - /// after the encoded frame. This is a no-op when no samples are pending. - /// - /// # Returns - /// - /// Returns `Ok(())` once the pending frame has been written. - fn flush_pending_frame(&mut self) -> Result<()> { - if self.count == 0 { - return Ok(()); - } - - self.encoder.write_all(&self.previous_frame)?; - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) { - self.encoder.write_all(&self.count.to_be_bytes())?; - } - self.count = 0; + self.previous_assignment = pending; Ok(()) } @@ -188,11 +130,12 @@ impl XZAssignmentWriter { encoder, previous_assignment: Vec::new(), previous_masks: HashMap::new(), - previous_frame: Vec::new(), + pending_assignment: None, count: 0, variant, chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, chunk_buffer: Vec::new(), + complete: false, }) } @@ -228,74 +171,59 @@ impl XZAssignmentWriter { let encoded = encode_ben32_assignments(&assign_vec)?; self.encoder.write_all(&encoded)?; self.previous_assignment = assign_vec; - self.previous_frame = encoded; - Ok(()) } BenVariant::MkvChain => { - if is_repeated_assignment(&self.previous_assignment, &assign_vec) { + if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { self.count += 1; return Ok(()); } - self.flush_pending_frame()?; - let encoded = encode_ben32_assignments(&assign_vec)?; - self.set_previous_assignment(assign_vec, encoded, 1); - Ok(()) + self.pending_assignment = Some(assign_vec); + self.count = 1; } BenVariant::TwoDelta => { - if self.previous_assignment.is_empty() { - let encoded = encode_xben_twodelta_full_frame(&assign_vec); - self.set_previous_assignment(assign_vec, encoded, 1); + // First assignment ever: buffer as the initial full frame. + if self.pending_assignment.is_none() && self.previous_assignment.is_empty() { + self.pending_assignment = Some(assign_vec); + self.count = 1; return Ok(()); } - - let masks = if self.previous_masks.is_empty() { - None - } else { - Some(&self.previous_masks) - }; - let hints = - analyze_twodelta_transition(&self.previous_assignment, &assign_vec, masks); - if hints.is_repeated { - if self.chunk_buffer.is_empty() { - self.count += 1; - } else { - self.chunk_buffer.last_mut().unwrap().count += 1; - } + // Repeat of the pending initial full frame. + if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { + self.count += 1; return Ok(()); } - - // Flush the initial full frame before the first delta. - if self.chunk_buffer.is_empty() { + // Repeat of the last delta frame in the current chunk. + if !self.chunk_buffer.is_empty() + && self.previous_assignment.as_slice() == assign_vec.as_slice() + { + self.chunk_buffer.last_mut().unwrap().count += 1; + return Ok(()); + } + // New distinct assignment: flush the initial full frame if pending. + if self.pending_assignment.is_some() { self.flush_pending_frame()?; } - - let encoded_frame: TwoDeltaEncodeFrame = match encode_twodelta_frame_with_hint( + // Encode the delta frame and add it to the chunk buffer. + let frame = encode_twodelta_frame_with_hint( &self.previous_assignment, &assign_vec, - hints.delta_pair, + None, Some(&mut self.previous_masks), - ) { - Ok(frame) => frame, - Err(e) => { - return Err(e); - } - }; - + None, + )?; self.chunk_buffer.push(BufferedDeltaFrame { - pair: encoded_frame.pair, - run_lengths: encoded_frame.run_length_vector, + pair: frame.pair, + run_lengths: frame.run_length_vector, count: 1, }); - self.previous_assignment = assign_vec; - if self.chunk_buffer.len() >= self.chunk_size { self.flush_chunk()?; } - Ok(()) } } + Ok(()) } /// Encode and write a JSON assignment record into the compressed XBEN stream. @@ -311,18 +239,21 @@ impl XZAssignmentWriter { self.write_assignment(parse_json_assignment(data)?) } - /// Read BEN frames from `reader` and write them into this XBEN stream. - /// - /// If the source still contains the 17-byte BEN banner, it is consumed and - /// replaced by the banner already written by this encoder. - /// - /// # Arguments - /// - /// * `reader` - The BEN input stream, with or without its banner. + /// Flush any buffered state to the underlying XZ encoder. /// /// # Returns /// - /// Returns `Ok(())` after the BEN stream has been translated into XBEN. + /// Returns `Ok(())` once all buffered state has been flushed. + pub fn finish(&mut self) -> Result<()> { + if self.complete { + return Ok(()); + } + self.flush_pending_frame()?; + self.flush_chunk()?; + self.complete = true; + Ok(()) + } + /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without /// materializing full assignment vectors. /// @@ -353,8 +284,8 @@ impl XZAssignmentWriter { encoded.extend_from_slice(&value.to_be_bytes()); encoded.extend_from_slice(&len.to_be_bytes()); } - self.previous_frame = encoded; - self.count = first_count; + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&first_count.to_be_bytes())?; let mut sample_count = first_count as usize; progress!("Encoding line: {}\r", sample_count); @@ -379,11 +310,6 @@ impl XZAssignmentWriter { TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); let run_lengths = frame.run_length_vector; - // Flush the initial full frame before the first delta chunk. - if self.chunk_buffer.is_empty() && self.count > 0 { - self.flush_pending_frame()?; - } - self.chunk_buffer.push(BufferedDeltaFrame { pair: frame.pair, run_lengths, @@ -398,7 +324,6 @@ impl XZAssignmentWriter { progress!("Encoding line: {}\r", sample_count); } - // Flush remaining partial chunk (Drop will also catch this, but be explicit). self.flush_chunk()?; tracing::trace!(""); @@ -430,13 +355,8 @@ impl XZAssignmentWriter { } impl Drop for XZAssignmentWriter { - /// Flush any buffered XBEN repetition state during drop. + /// Flush any buffered XBEN state during drop. fn drop(&mut self) { - if matches!(self.variant, BenVariant::MkvChain | BenVariant::TwoDelta) && self.count > 0 { - let _ = self.flush_pending_frame(); - } - if !self.chunk_buffer.is_empty() { - let _ = self.flush_chunk(); - } + let _ = self.finish(); } } From b126117ee702a22102bb35eb5df1b9327464b33f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 23 Mar 2026 09:54:32 -0600 Subject: [PATCH 052/221] Rename twodelta.rs -> twodelta_encode.rs --- ben/src/codec/frames/mod.rs | 4 ++-- ben/src/codec/frames/{twodelta.rs => twodelta_encode.rs} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ben/src/codec/frames/{twodelta.rs => twodelta_encode.rs} (100%) diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index ce5da0b..dcc2fc2 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -1,12 +1,12 @@ mod ben_decode; mod ben_encode; mod mkv_encode; -mod twodelta; +mod twodelta_encode; pub use ben_decode::BenDecodeFrame; pub use ben_encode::BenEncodeFrame; pub use mkv_encode::MkvBenEncodeFrame; -pub use twodelta::TwoDeltaEncodeFrame; +pub use twodelta_encode::TwoDeltaEncodeFrame; use crate::util::rle::assign_to_rle; diff --git a/ben/src/codec/frames/twodelta.rs b/ben/src/codec/frames/twodelta_encode.rs similarity index 100% rename from ben/src/codec/frames/twodelta.rs rename to ben/src/codec/frames/twodelta_encode.rs From c8e234824b5758b2738b41f2a39ead749e6b5ba4 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 23 Mar 2026 12:03:34 -0600 Subject: [PATCH 053/221] Update decoder and readers --- ben/src/cli/pben.rs | 4 +- ben/src/codec/decode/ben.rs | 22 +- ben/src/codec/decode/jsonl.rs | 137 ++ ben/src/codec/decode/mod.rs | 9 +- ben/src/codec/decode/twodelta.rs | 72 + ben/src/codec/decode/xz.rs | 116 +- ben/src/codec/frames/ben_decode.rs | 33 +- ben/src/codec/frames/mkv_decode.rs | 94 ++ ben/src/codec/frames/mod.rs | 13 + ben/src/codec/frames/twodelta_decode.rs | 53 + ben/src/codec/mod.rs | 5 +- ben/src/io/reader/assignment_reader.rs | 373 +++++ ben/src/io/reader/ben.rs | 1493 --------------------- ben/src/io/reader/mod.rs | 15 +- ben/src/io/reader/subsample.rs | 297 ++++ ben/src/io/reader/xz_assignment_reader.rs | 678 ++++++++++ ben/src/io/writer/utils.rs | 154 --- ben/src/ops/extract/mod.rs | 9 +- ben/src/ops/relabel/mod.rs | 4 +- ben/tests/test_coverage.rs | 88 +- ben/tests/test_impls_pipeline.rs | 96 +- pyben/src/decode/mod.rs | 8 +- 22 files changed, 1876 insertions(+), 1897 deletions(-) create mode 100644 ben/src/codec/decode/jsonl.rs create mode 100644 ben/src/codec/decode/twodelta.rs create mode 100644 ben/src/codec/frames/mkv_decode.rs create mode 100644 ben/src/codec/frames/twodelta_decode.rs create mode 100644 ben/src/io/reader/assignment_reader.rs delete mode 100644 ben/src/io/reader/ben.rs create mode 100644 ben/src/io/reader/subsample.rs create mode 100644 ben/src/io/reader/xz_assignment_reader.rs diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index ecdfae3..2b7ade0 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -1,5 +1,5 @@ use crate::cli::common::{check_overwrite, set_verbose}; -use crate::io::reader::BenDecoder; +use crate::io::reader::AssignmentReader; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; use clap::{Parser, ValueEnum}; @@ -190,7 +190,7 @@ fn derive_output_path(mode: Mode, input_file: &str) -> String { /// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { - let ben_reader = BenDecoder::new(&mut reader)?; + let ben_reader = AssignmentReader::new(&mut reader)?; let mut line = String::new(); for result in ben_reader { diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 801bfc2..2a02e4e 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -1,5 +1,4 @@ -use crate::io::reader::BenDecoder; -use std::io::{self, Read, Write}; +use std::io::{self, Read}; /// Decode a single BEN frame payload into run-length encoded assignments. /// @@ -93,22 +92,3 @@ pub fn decode_ben_line( Ok(output_rle) } - -/// Decode a BEN stream into JSONL assignment records. -/// -/// Each decoded sample is written as a JSON object containing an `assignment` -/// vector and a 1-based `sample` index. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including the 17-byte BEN banner. -/// * `writer` - The destination that will receive one JSON object per decoded -/// sample. -/// -/// # Returns -/// -/// Returns `Ok(())` after the stream has been fully decoded and written. -pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { - let mut ben_decoder = BenDecoder::new(reader)?; - ben_decoder.write_all_jsonl(writer) -} diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs new file mode 100644 index 0000000..d18a36e --- /dev/null +++ b/ben/src/codec/decode/jsonl.rs @@ -0,0 +1,137 @@ +use crate::io::reader::{AssignmentReader, XZAssignmentReader}; +use crate::{progress, BenVariant}; +use crate::codec::decode::jsonl_decode_ben32; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::format::FormatError; +use serde_json::json; +use std::io::{self, BufRead, BufReader, Read, Write}; +use xz2::read::XzDecoder; + +/// Decode a BEN stream into JSONL assignment records. +/// +/// Each decoded sample is written as a JSON object containing an `assignment` +/// vector and a 1-based `sample` index. +/// +/// # Arguments +/// +/// * `reader` - The input BEN stream, including the 17-byte BEN banner. +/// * `writer` - The destination that will receive one JSON object per decoded +/// sample. +/// +/// # Returns +/// +/// Returns `Ok(())` after the stream has been fully decoded and written. +pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { + let mut ben_decoder = AssignmentReader::new(reader)?; + ben_decoder.write_all_jsonl(writer) +} + +/// Decode an XBEN stream directly into JSONL assignment records. +/// +/// # Arguments +/// +/// * `reader` - The compressed XBEN input stream. +/// * `writer` - The destination that will receive one JSON object per decoded +/// sample. +/// +/// # Returns +/// +/// Returns `Ok(())` after the XBEN stream has been fully decoded into JSONL. +pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { + let mut decoder = XzDecoder::new(reader); + + let mut first_buffer = [0u8; BANNER_LEN]; + + if let Err(e) = decoder.read_exact(&mut first_buffer) { + return Err(e); + } + + let variant = match variant_from_banner(&first_buffer) { + Some(BenVariant::Standard) => BenVariant::Standard, + Some(BenVariant::MkvChain) => BenVariant::MkvChain, + Some(BenVariant::TwoDelta) => { + let mut xben = XZAssignmentReader::from_decompressed_stream( + BufReader::new(decoder), + BenVariant::TwoDelta, + ); + let mut sample_number = 1usize; + for record in &mut xben { + let (assignment, count) = record?; + for _ in 0..count { + progress!("Decoding sample: {}\r", sample_number); + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + sample_number += 1; + } + } + tracing::trace!(""); + tracing::trace!("Done!"); + return Ok(()); + } + None => { + return Err(io::Error::from(FormatError::UnknownBanner { + actual: first_buffer.to_vec(), + })); + } + }; + + let mut buffer = [0u8; 1 << 20]; + let mut overflow: Vec = Vec::new(); + + let mut line_count: usize = 0; + let mut starting_sample: usize = 0; + while let Ok(count) = decoder.read(&mut buffer) { + if count == 0 { + break; + } + + overflow.extend(&buffer[..count]); + + let mut last_valid_assignment = 0; + + match variant { + BenVariant::Standard => { + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 1; + line_count += 1; + progress!("Decoding sample: {}\r", line_count); + } + } + } + BenVariant::MkvChain => { + for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 3; + let lines = &overflow[i + 1..i + 3]; + let n_lines = u16::from_be_bytes([lines[0], lines[1]]); + line_count += n_lines as usize; + progress!("Decoding sample: {}\r", line_count); + } + } + } + BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), + } + + if last_valid_assignment == 0 { + continue; + } + + jsonl_decode_ben32( + &overflow[0..last_valid_assignment], + &mut writer, + starting_sample, + variant, + )?; + overflow.drain(..last_valid_assignment); + starting_sample = line_count; + } + tracing::trace!(""); + tracing::trace!("Done!"); + Ok(()) +} diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index 9b5cce7..e2b4528 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -4,11 +4,16 @@ mod ben; mod ben32; pub(crate) mod errors; pub(crate) use errors::DecodeError; +mod jsonl; +mod twodelta; mod xz; -pub use ben::{decode_ben_line, decode_ben_to_jsonl}; +pub use ben::decode_ben_line; +pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; -pub use xz::{decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress}; +pub use twodelta::decode_twodelta_frame; +pub(crate) use twodelta::apply_twodelta_runs_to_assignment; +pub use xz::{decode_xben_to_ben, xz_decompress}; #[cfg(test)] mod tests; diff --git a/ben/src/codec/decode/twodelta.rs b/ben/src/codec/decode/twodelta.rs new file mode 100644 index 0000000..50c6585 --- /dev/null +++ b/ben/src/codec/decode/twodelta.rs @@ -0,0 +1,72 @@ +use super::errors::DecodeError; +use crate::codec::TwoDeltaEncodeFrame; +use std::io; + +/// Apply decoded TwoDelta run lengths to produce a new assignment vector. +/// +/// Positions in `assignment` that hold either value of `pair` are overwritten +/// according to the alternating run-length encoding. `pair.0` fills the first +/// run, `pair.1` the second, and so on. +/// +/// # Arguments +/// +/// * `assignment` - The assignment from the preceding frame (consumed and returned). +/// * `pair` - The two label values that participate in the delta. +/// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. +/// +/// # Returns +/// +/// Returns the updated assignment vector, or an error if the run lengths are +/// exhausted before all relevant positions are covered. +pub(crate) fn apply_twodelta_runs_to_assignment( + mut assignment: Vec, + pair: (u16, u16), + run_lengths: &[u16], +) -> io::Result> { + let (first, second) = pair; + + let mut run_idx = 0usize; + let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); + let mut current_value = first; + + for (pos, val) in assignment.iter_mut().enumerate() { + if *val == first || *val == second { + if remaining_in_run == 0 { + run_idx += 1; + if run_idx >= run_lengths.len() { + return Err(io::Error::from(DecodeError::TwoDeltaRunsExhausted { + run_idx, + pos, + })); + } + remaining_in_run = run_lengths[run_idx]; + current_value = if current_value == first { + second + } else { + first + }; + } + *val = current_value; + remaining_in_run -= 1; + } + } + + Ok(assignment) +} + +/// Decode a TwoDelta frame by applying its delta to the previous assignment. +/// +/// # Arguments +/// +/// * `previous` - The assignment vector from the preceding frame. +/// * `frame` - The TwoDelta frame containing the pair and run-length vector. +/// +/// # Returns +/// +/// Returns the updated assignment vector. +pub fn decode_twodelta_frame( + previous: Vec, + frame: &TwoDeltaEncodeFrame, +) -> io::Result> { + apply_twodelta_runs_to_assignment(previous, frame.pair, &frame.run_length_vector) +} diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 521a567..11ef039 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,11 +1,9 @@ -use crate::codec::decode::jsonl_decode_ben32; use crate::codec::translate::ben32_to_ben_lines; use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::reader::XBenDecoder; +use crate::io::reader::XZAssignmentReader; use crate::io::writer::AssignmentWriter; use crate::{progress, BenVariant}; -use serde_json::json; use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; @@ -41,7 +39,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BenVariant::MkvChain } Some(BenVariant::TwoDelta) => { - let mut xben = XBenDecoder::from_decompressed_stream( + let mut xben = XZAssignmentReader::from_decompressed_stream( BufReader::new(decoder), BenVariant::TwoDelta, ); @@ -134,113 +132,3 @@ pub fn xz_decompress(reader: R, mut writer: W) -> io::Resu Ok(()) } - -/// Decode an XBEN stream directly into JSONL assignment records. -/// -/// # Arguments -/// -/// * `reader` - The compressed XBEN input stream. -/// * `writer` - The destination that will receive one JSON object per decoded -/// sample. -/// -/// # Returns -/// -/// Returns `Ok(())` after the XBEN stream has been fully decoded into JSONL. -pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - - let mut first_buffer = [0u8; BANNER_LEN]; - - if let Err(e) = decoder.read_exact(&mut first_buffer) { - return Err(e); - } - - let variant = match variant_from_banner(&first_buffer) { - Some(BenVariant::Standard) => BenVariant::Standard, - Some(BenVariant::MkvChain) => BenVariant::MkvChain, - Some(BenVariant::TwoDelta) => { - let mut xben = XBenDecoder::from_decompressed_stream( - BufReader::new(decoder), - BenVariant::TwoDelta, - ); - let mut sample_number = 1usize; - for record in &mut xben { - let (assignment, count) = record?; - for _ in 0..count { - progress!("Decoding sample: {}\r", sample_number); - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - sample_number += 1; - } - } - tracing::trace!(""); - tracing::trace!("Done!"); - return Ok(()); - } - None => { - return Err(io::Error::from(FormatError::UnknownBanner { - actual: first_buffer.to_vec(), - })); - } - }; - - let mut buffer = [0u8; 1 << 20]; - let mut overflow: Vec = Vec::new(); - - let mut line_count: usize = 0; - let mut starting_sample: usize = 0; - while let Ok(count) = decoder.read(&mut buffer) { - if count == 0 { - break; - } - - overflow.extend(&buffer[..count]); - - let mut last_valid_assignment = 0; - - match variant { - BenVariant::Standard => { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - progress!("Decoding sample: {}\r", line_count); - } - } - } - BenVariant::MkvChain => { - for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - progress!("Decoding sample: {}\r", line_count); - } - } - } - BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), - } - - if last_valid_assignment == 0 { - continue; - } - - jsonl_decode_ben32( - &overflow[0..last_valid_assignment], - &mut writer, - starting_sample, - variant, - )?; - overflow.drain(..last_valid_assignment); - starting_sample = line_count; - } - tracing::trace!(""); - tracing::trace!("Done!"); - Ok(()) -} diff --git a/ben/src/codec/frames/ben_decode.rs b/ben/src/codec/frames/ben_decode.rs index 9f9c4d1..07e0b2c 100644 --- a/ben/src/codec/frames/ben_decode.rs +++ b/ben/src/codec/frames/ben_decode.rs @@ -1,3 +1,7 @@ +use super::BenDecode; +use byteorder::{BigEndian, ReadBytesExt}; +use std::io; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct BenDecodeFrame { // The number of bits used to encode the maximum label value in this frame. @@ -8,8 +12,6 @@ pub struct BenDecodeFrame { pub n_bytes: u32, // The full serialized BEN frame bytes, including the header and payload. pub raw_bytes: Vec, - // The number of times this frame was repeated - pub count: u16, } impl BenDecodeFrame { @@ -54,3 +56,30 @@ impl PartialEq for Vec { *self == other.raw_bytes } } + +impl BenDecode for BenDecodeFrame { + /// Read the next Standard BEN frame from the stream. + /// + /// Standard BEN frames have no trailing count; `count` is always set to 1. + /// Returns `Ok(None)` on a clean EOF at a frame boundary. + fn from_reader(reader: &mut impl io::Read) -> io::Result> { + let max_val_bit_count = match reader.read_u8() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let max_len_bit_count = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut raw_bytes = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut raw_bytes)?; + + Ok(Some(BenDecodeFrame { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + })) + } +} diff --git a/ben/src/codec/frames/mkv_decode.rs b/ben/src/codec/frames/mkv_decode.rs new file mode 100644 index 0000000..f9022a2 --- /dev/null +++ b/ben/src/codec/frames/mkv_decode.rs @@ -0,0 +1,94 @@ +use super::BenDecode; +use byteorder::{BigEndian, ReadBytesExt}; +use std::io::{self, Read}; + +/// A decoded MkvChain BEN frame, including its repetition count. +/// +/// Symmetric to `MkvBenEncodeFrame` but stores only the decoded payload bytes +/// and header fields rather than the original RLE runs. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MkvBenDecodeFrame { + /// The number of bits used to encode the maximum label value in this frame. + pub max_val_bit_count: u8, + /// The number of bits used to encode the maximum run length in this frame. + pub max_len_bit_count: u8, + /// The number of bytes in the packed payload. + pub n_bytes: u32, + /// The packed payload bytes (not including the 6-byte header or count). + pub raw_bytes: Vec, + /// The number of times this assignment repeats. + pub count: u16, +} + +impl MkvBenDecodeFrame { + /// Borrow the packed payload bytes. + pub fn as_slice(&self) -> &[u8] { + &self.raw_bytes + } + + /// Clone out the packed payload bytes. + pub fn to_bytes(&self) -> Vec { + self.raw_bytes.clone() + } + + /// Consume the frame and return the packed payload bytes without cloning. + pub fn into_bytes(self) -> Vec { + self.raw_bytes + } +} + +impl BenDecode for MkvBenDecodeFrame { + /// Read the next MkvChain BEN frame from the stream. + /// + /// MkvChain frames carry a trailing `u16` repetition count. + /// Returns `Ok(None)` on a clean EOF at a frame boundary. + fn from_reader(reader: &mut impl Read) -> io::Result> { + let max_val_bit_count = match reader.read_u8() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let max_len_bit_count = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut raw_bytes = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut raw_bytes)?; + + let count = reader.read_u16::()?; + + Ok(Some(MkvBenDecodeFrame { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + })) + } +} + +impl AsRef<[u8]> for MkvBenDecodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for MkvBenDecodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for MkvBenDecodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.raw_bytes == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &MkvBenDecodeFrame) -> bool { + *self == other.raw_bytes + } +} diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index dcc2fc2..3dfb9a9 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -1,14 +1,19 @@ mod ben_decode; mod ben_encode; +mod mkv_decode; mod mkv_encode; +mod twodelta_decode; mod twodelta_encode; pub use ben_decode::BenDecodeFrame; pub use ben_encode::BenEncodeFrame; +pub use mkv_decode::MkvBenDecodeFrame; pub use mkv_encode::MkvBenEncodeFrame; +pub use twodelta_decode::TwoDeltaDecodeFrame; pub use twodelta_encode::TwoDeltaEncodeFrame; use crate::util::rle::assign_to_rle; +use std::io; pub trait BenConstruct { fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; @@ -21,6 +26,14 @@ pub trait BenConstruct { } } +pub trait BenDecode: Sized { + /// Read the next frame from a byte stream. + /// + /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` + /// on success, and `Err` on any IO or format error. + fn from_reader(reader: &mut impl io::Read) -> io::Result>; +} + /// Compresses a run-length encoded vector into BEN payload bytes. pub(super) fn compress_rle_to_ben_bytes( max_val_bit_count: u8, diff --git a/ben/src/codec/frames/twodelta_decode.rs b/ben/src/codec/frames/twodelta_decode.rs new file mode 100644 index 0000000..d2eea41 --- /dev/null +++ b/ben/src/codec/frames/twodelta_decode.rs @@ -0,0 +1,53 @@ +use super::twodelta_encode::TwoDeltaEncodeFrame; +use super::BenDecode; +use byteorder::{BigEndian, ReadBytesExt}; +use std::io::{self, Read}; + +/// A decoded TwoDelta delta frame, containing only what's needed to apply the delta. +/// +/// Unlike `TwoDeltaEncodeFrame`, this type does not retain raw bytes or +/// bit-packing metadata. It delegates bit-unpacking of the run lengths to +/// `TwoDeltaEncodeFrame::from_parts` and then discards everything except +/// `pair`, `run_lengths`, and `count`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TwoDeltaDecodeFrame { + /// The ordered pair of assignment ids involved in the delta. + pub pair: (u16, u16), + /// The unpacked run-length vector over the positions occupied by the pair. + pub run_lengths: Vec, + /// The number of times this delta repeats. + pub count: u16, +} + +impl BenDecode for TwoDeltaDecodeFrame { + /// Read the next TwoDelta delta frame from the stream. + /// + /// Reads pair, max_len_bits, n_bytes, payload, and count, then delegates + /// bit-unpacking to `TwoDeltaEncodeFrame::from_parts`. + /// Returns `Ok(None)` on a clean EOF at a frame boundary. + fn from_reader(reader: &mut impl Read) -> io::Result> { + let pair_a = match reader.read_u16::() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let pair_b = reader.read_u16::()?; + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut payload)?; + + let count = reader.read_u16::()?; + + let encode_frame = + TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload); + + Ok(Some(TwoDeltaDecodeFrame { + pair: encode_frame.pair, + run_lengths: encode_frame.run_length_vector, + count, + })) + } +} diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index 75bf32f..5033978 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -10,4 +10,7 @@ pub mod encode; pub mod frames; pub mod translate; -pub use frames::{BenConstruct, BenDecodeFrame, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; +pub use frames::{ + BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, MkvBenEncodeFrame, + TwoDeltaDecodeFrame, TwoDeltaEncodeFrame, +}; diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs new file mode 100644 index 0000000..af5b176 --- /dev/null +++ b/ben/src/io/reader/assignment_reader.rs @@ -0,0 +1,373 @@ +use super::errors::DecoderInitError; +use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError}; +use crate::codec::{ + BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, + TwoDeltaDecodeFrame, +}; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::util::rle::rle_to_vec; +use crate::{progress, BenVariant}; +use serde_json::json; +use std::io::{self, Cursor, Read, Write}; + +/// Iterator over decoded assignments in an uncompressed BEN stream. +pub struct AssignmentReader { + reader: R, + sample_count: usize, + variant: BenVariant, + previous_assignment: Option>, + twodelta_consumed_first_frame: bool, + silent: bool, +} + +/// Internal frame representation, one variant per BEN encoding type. +enum StoredBenFrame { + /// A Standard BEN frame (count is always 1). + Standard(BenDecodeFrame), + /// An MkvChain BEN frame carrying its repetition count. + MkvChain(MkvBenDecodeFrame), + /// A TwoDelta delta frame carrying its pair, run lengths, and count. + TwoDelta(TwoDeltaDecodeFrame), +} + +impl StoredBenFrame { + fn count(&self) -> u16 { + match self { + Self::Standard(_) => 1, + Self::MkvChain(f) => f.count, + Self::TwoDelta(f) => f.count, + } + } +} + +impl AssignmentReader { + /// Create a decoder for an uncompressed BEN stream. + /// + /// The reader must begin with one of the BEN banners such as + /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. + /// + /// # Arguments + /// + /// * `reader` - The input BEN stream, including its 17-byte banner. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first BEN frame. + pub fn new(mut reader: R) -> Result { + let mut check_buffer = [0u8; BANNER_LEN]; + + if let Err(e) = reader.read_exact(&mut check_buffer) { + return Err(DecoderInitError::Io(e)); + } + + match variant_from_banner(&check_buffer) { + Some(variant) => Ok(AssignmentReader { + reader, + sample_count: 0, + variant, + previous_assignment: None, + twodelta_consumed_first_frame: false, + silent: false, + }), + None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), + } + } + + /// Suppress progress output from this decoder's iterator. + pub fn silent(mut self, silent: bool) -> Self { + self.silent = silent; + self + } + + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.variant + } + + /// Decode the remaining BEN stream and write it as JSONL. + /// + /// Each decoded sample is written as a JSON object containing an + /// `assignment` vector and a 1-based `sample` index. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + Ok(true) + }) + } + + /// Read and return the next stored frame from the underlying BEN stream. + /// + /// Delegates to the appropriate `BenDecode::from_reader` implementation + /// based on the variant and whether the first TwoDelta frame has been read. + /// + /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read + /// failure, or `None` at a clean end of stream. + fn pop_frame_from_reader(&mut self) -> Option> { + match self.variant { + BenVariant::Standard => BenDecodeFrame::from_reader(&mut self.reader) + .transpose() + .map(|r| r.map(StoredBenFrame::Standard)), + BenVariant::MkvChain => MkvBenDecodeFrame::from_reader(&mut self.reader) + .transpose() + .map(|r| r.map(StoredBenFrame::MkvChain)), + BenVariant::TwoDelta => { + if !self.twodelta_consumed_first_frame { + // First TwoDelta frame is encoded in MkvChain format. + self.twodelta_consumed_first_frame = true; + MkvBenDecodeFrame::from_reader(&mut self.reader) + .transpose() + .map(|r| r.map(StoredBenFrame::MkvChain)) + } else { + TwoDeltaDecodeFrame::from_reader(&mut self.reader) + .transpose() + .map(|r| r.map(StoredBenFrame::TwoDelta)) + } + } + } + } + + /// Consume this decoder and iterate over raw BEN frames instead of + /// materialized assignments. + pub fn into_frames(self) -> AssignmentFrameReader { + AssignmentFrameReader { inner: self } + } + + /// Count the number of samples remaining in the BEN stream. + /// + /// Walks frame boundaries rather than expanding every assignment. + pub fn count_samples(self) -> io::Result { + let mut this = self; + let mut total = 0usize; + while let Some(frame_res) = this.pop_frame_from_reader() { + total += frame_res?.count() as usize; + } + Ok(total) + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// Unlike the `Iterator` implementation, this avoids cloning the assignment + /// buffer on every frame. The callback receives a borrowed slice and its + /// repetition count. Return `true` to continue or `false` to stop early. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + loop { + let frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + + let count = frame.count(); + + let assignment = match frame { + StoredBenFrame::Standard(f) => decode_ben_frame_to_assignment(&f)?, + StoredBenFrame::MkvChain(f) => decode_mkv_frame_to_assignment(&f)?, + StoredBenFrame::TwoDelta(f) => { + let prev = self + .previous_assignment + .take() + .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?; + apply_twodelta_runs_to_assignment(prev, f.pair, &f.run_lengths)? + } + }; + + let keep_going = f(&assignment, count)?; + self.previous_assignment = Some(assignment); + self.sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } + if !keep_going { + return Ok(()); + } + } + } +} + +/// Decode a raw Standard BEN frame into a full assignment vector. +pub(super) fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { + decode_ben_line( + Cursor::new(&frame.raw_bytes), + frame.max_val_bit_count, + frame.max_len_bit_count, + frame.n_bytes, + ) + .map(rle_to_vec) +} + +/// Decode a raw MkvChain BEN frame into a full assignment vector. +pub(super) fn decode_mkv_frame_to_assignment(frame: &MkvBenDecodeFrame) -> io::Result> { + decode_ben_line( + Cursor::new(&frame.raw_bytes), + frame.max_val_bit_count, + frame.max_len_bit_count, + frame.n_bytes, + ) + .map(rle_to_vec) +} + +/// Decode a stored BEN frame into a full assignment vector. +fn decode_stored_frame_to_assignment( + previous_assignment: &mut Option>, + frame: &StoredBenFrame, +) -> io::Result> { + match frame { + StoredBenFrame::Standard(f) => decode_ben_frame_to_assignment(f), + StoredBenFrame::MkvChain(f) => decode_mkv_frame_to_assignment(f), + StoredBenFrame::TwoDelta(f) => { + let prev = previous_assignment + .take() + .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?; + apply_twodelta_runs_to_assignment(prev, f.pair, &f.run_lengths) + } + } +} + +impl Iterator for AssignmentReader { + type Item = io::Result; + + fn next(&mut self) -> Option> { + let frame = match self.pop_frame_from_reader() { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Some(Err(e)), + None => return None, + }; + let count = frame.count(); + let assignment = + match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { + Ok(assgn) => assgn, + Err(e) => return Some(Err(e)), + }; + self.previous_assignment = Some(assignment.clone()); + self.sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", self.sample_count); + } + Some(Ok((assignment, count))) + } +} + +/// Iterator over raw BEN frames. +pub struct AssignmentFrameReader { + pub(super) inner: AssignmentReader, +} + +impl AssignmentFrameReader { + /// Create a raw BEN frame iterator from a reader. + pub fn new(reader: R) -> Result { + Ok(Self { + inner: AssignmentReader::new(reader)?, + }) + } +} + +impl Iterator for AssignmentFrameReader { + type Item = io::Result<(BenDecodeFrame, u16)>; + + /// Return the next raw BEN frame from the input stream. + /// + /// For Standard and MkvChain streams, returns the raw decoded frame paired + /// with its repetition count. + /// For TwoDelta streams, materializes each assignment and re-encodes it. + fn next(&mut self) -> Option { + match self.inner.variant { + BenVariant::Standard | BenVariant::MkvChain => { + match self.inner.pop_frame_from_reader() { + Some(Ok(StoredBenFrame::Standard(frame))) => Some(Ok((frame, 1))), + Some(Ok(StoredBenFrame::MkvChain(frame))) => { + let count = frame.count; + Some(Ok(( + BenDecodeFrame { + max_val_bit_count: frame.max_val_bit_count, + max_len_bit_count: frame.max_len_bit_count, + n_bytes: frame.n_bytes, + raw_bytes: frame.raw_bytes, + }, + count, + ))) + } + Some(Ok(StoredBenFrame::TwoDelta(_))) => Some(Err(io::Error::from( + DecodeError::UnexpectedTwoDeltaFrame { + variant: self.inner.variant, + }, + ))), + Some(Err(err)) => Some(Err(err)), + None => None, + } + } + BenVariant::TwoDelta => match self.inner.next() { + Some(Ok((assignment, count))) => { + let encoded = BenEncodeFrame::from_assignment(&assignment, None); + Some(Ok(( + BenDecodeFrame { + max_val_bit_count: encoded.max_val_bit_count, + max_len_bit_count: encoded.max_len_bit_count, + n_bytes: encoded.n_bytes, + raw_bytes: encoded.raw_bytes[6..].to_vec(), + }, + count, + ))) + } + Some(Err(err)) => Some(Err(err)), + None => None, + }, + } + } +} + +impl AssignmentReader { + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> super::subsample::SubsampleFrameDecoder< + impl Iterator> + Send, + > + where + T: IntoIterator, + { + let frames = self + .into_frames() + .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); + super::subsample::SubsampleFrameDecoder::by_indices(frames, indices) + } + + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> super::subsample::SubsampleFrameDecoder< + impl Iterator> + Send, + > { + let frames = self + .into_frames() + .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); + super::subsample::SubsampleFrameDecoder::by_range(frames, start, end) + } + + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> super::subsample::SubsampleFrameDecoder< + impl Iterator> + Send, + > { + let frames = self + .into_frames() + .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); + super::subsample::SubsampleFrameDecoder::every(frames, step, offset) + } +} + diff --git a/ben/src/io/reader/ben.rs b/ben/src/io/reader/ben.rs deleted file mode 100644 index 0b6d130..0000000 --- a/ben/src/io/reader/ben.rs +++ /dev/null @@ -1,1493 +0,0 @@ -use super::errors::DecoderInitError; -use super::twodelta::{ - XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_DELTA_TAG, XBEN_TWODELTA_FULL_TAG, -}; -use crate::codec::decode::{decode_ben32_line, decode_ben_line, DecodeError}; -use crate::codec::encode::encode_ben32_assignments; -use crate::codec::{BenConstruct, BenDecodeFrame, BenEncodeFrame, TwoDeltaEncodeFrame}; -use crate::format::banners::{variant_from_banner, BANNER_LEN}; -use crate::format::FormatError; -use crate::util::rle::rle_to_vec; -use crate::{progress, BenVariant}; -use byteorder::{BigEndian, ReadBytesExt}; -use serde_json::json; -use std::fs::File; -use std::io::{self, BufReader, Cursor, Read, Write}; -use std::iter::Peekable; -use std::path::{Path, PathBuf}; -use xz2::read::XzDecoder; - -/// A decoded assignment together with the number of times it repeats. -pub type MkvRecord = (Vec, u16); -/// A raw ben32 frame together with the number of times it repeats. -pub type Ben32Frame = (Vec, u16); -/// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. -pub type FrameIter = Box> + Send>; - -/// Iterator over decoded assignments in an uncompressed BEN stream. -pub struct BenDecoder { - reader: R, - sample_count: usize, - variant: BenVariant, - previous_assignment: Option>, - twodelta_consumed_first_frame: bool, - silent: bool, -} - -enum StoredBenFrame { - Ben(BenDecodeFrame), - TwoDelta { frame: TwoDeltaEncodeFrame, count: u16 }, -} - -impl StoredBenFrame { - fn count(&self) -> u16 { - match self { - Self::Ben(frame) => frame.count, - Self::TwoDelta { count, .. } => *count, - } - } -} - -impl BenDecoder { - /// Create a decoder for an uncompressed BEN stream. - /// - /// The reader must begin with one of the BEN banners such as - /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. - /// - /// # Arguments - /// - /// * `reader` - The input BEN stream, including its 17-byte banner. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first BEN frame. - pub fn new(mut reader: R) -> Result { - let mut check_buffer = [0u8; BANNER_LEN]; - - if let Err(e) = reader.read_exact(&mut check_buffer) { - return Err(DecoderInitError::Io(e)); - } - - match variant_from_banner(&check_buffer) { - Some(variant) => Ok(BenDecoder { - reader, - sample_count: 0, - variant, - previous_assignment: None, - twodelta_consumed_first_frame: false, - silent: false, - }), - None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), - } - } - - /// Suppress progress output from this decoder's iterator. - /// - /// # Arguments - /// - /// * `silent` - When `true`, the decoder will not emit progress messages. - /// - /// # Returns - /// - /// Returns `self` for method chaining. - pub fn silent(mut self, silent: bool) -> Self { - self.silent = silent; - self - } - - /// Decode the remaining BEN stream and write it as JSONL. - /// - /// # Arguments - /// - /// * `writer` - The destination that will receive one JSON object per - /// decoded sample. - /// - /// # Returns - /// - /// Returns `Ok(())` after the remaining stream has been fully decoded. - pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - let mut sample_number = 0usize; - self.for_each_assignment(|assignment, count| { - for _ in 0..count { - sample_number += 1; - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - } - Ok(true) - }) - } - - /// Read and return the next raw BEN frame stored in standard BEN layout. - /// - /// # Arguments - /// - /// * `with_count` - When `true`, read a trailing `u16` repetition count; - /// otherwise the count defaults to `1`. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_standard_frame_from_reader( - &mut self, - with_count: bool, - ) -> Option> { - let mut b1 = [0u8; 1]; - let max_val_bit_count = match self.reader.read_exact(&mut b1) { - Ok(()) => b1[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - tracing::trace!(""); - tracing::trace!("Done!"); - return None; - } - return Some(Err(e)); - } - }; - - let mut b2 = [0u8; 1]; - if let Err(e) = self.reader.read_exact(&mut b2) { - return Some(Err(e)); - } - let max_len_bit_count = b2[0]; - - let n_bytes = match self.reader.read_u32::() { - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - - let mut raw_assignment = vec![0u8; n_bytes as usize]; - if let Err(e) = self.reader.read_exact(&mut raw_assignment) { - return Some(Err(e)); - } - - let count = if with_count { - match self.reader.read_u16::() { - Ok(c) => c, - Err(e) => return Some(Err(e)), - } - } else { - 1 - }; - - Some(Ok(BenDecodeFrame { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes: raw_assignment, - count, - })) - } - - /// Read and return the next raw TwoDelta frame from the underlying stream. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next TwoDelta frame, `Some(Err(...))` - /// for a read failure, or `None` at a clean end of stream. - fn pop_twodelta_frame_from_reader(&mut self) -> Option> { - let pair_a = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - tracing::trace!(""); - tracing::trace!("Done!"); - return None; - } - return Some(Err(e)); - } - }; - - let pair_b = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - let mut bits = [0u8; 1]; - if let Err(e) = self.reader.read_exact(&mut bits) { - return Some(Err(e)); - } - let max_len_bits = bits[0]; - - let n_bytes = match self.reader.read_u32::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - let mut payload = vec![0u8; n_bytes as usize]; - if let Err(e) = self.reader.read_exact(&mut payload) { - return Some(Err(e)); - } - - let count = match self.reader.read_u16::() { - Ok(value) => value, - Err(e) => return Some(Err(e)), - }; - - Some(Ok(StoredBenFrame::TwoDelta { - frame: TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload), - count, - })) - } - - /// Read and return the next stored frame from the underlying BEN stream. - /// - /// # Arguments - /// - /// * `&mut self` - The decoder whose internal reader is advanced. - /// - /// # Returns - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_frame_from_reader(&mut self) -> Option> { - match self.variant { - BenVariant::Standard => self - .pop_standard_frame_from_reader(false) - .map(|res| res.map(StoredBenFrame::Ben)), - BenVariant::MkvChain => self - .pop_standard_frame_from_reader(true) - .map(|res| res.map(StoredBenFrame::Ben)), - BenVariant::TwoDelta => { - if !self.twodelta_consumed_first_frame { - self.twodelta_consumed_first_frame = true; - self.pop_standard_frame_from_reader(true) - .map(|res| res.map(StoredBenFrame::Ben)) - } else { - self.pop_twodelta_frame_from_reader() - } - } - } - } - - /// Consume this decoder and iterate over raw BEN frames instead of - /// materialized assignments. - /// - /// # Returns - /// - /// Returns an iterator that yields raw BEN frames from the remaining input. - pub fn into_frames(self) -> BenFrameDecoeder { - BenFrameDecoeder { inner: self } - } - - /// Count the number of samples remaining in the BEN stream. - /// - /// This consumes the decoder but only walks frame boundaries rather than - /// expanding every assignment into a full vector. - /// - /// # Returns - /// - /// Returns the number of remaining samples in the stream. - pub fn count_samples(self) -> io::Result { - let mut this = self; - let mut total = 0usize; - while let Some(frame_res) = this.pop_frame_from_reader() { - total += frame_res?.count() as usize; - } - Ok(total) - } - - /// Decode assignments and pass each one to a callback by reference. - /// - /// Unlike the `Iterator` implementation, this avoids cloning the assignment - /// buffer on every frame. The decoder owns a single buffer, mutates it in - /// place for TwoDelta frames, and lends `&[u16]` to the callback. This - /// eliminates one full-length memcpy per frame. - /// - /// The callback receives a borrowed assignment slice and its repetition - /// count. Return `true` to continue decoding or `false` to stop early. - /// - /// # Arguments - /// - /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. - /// - /// # Returns - /// - /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. - pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> - where - F: FnMut(&[u16], u16) -> io::Result, - { - loop { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Err(e), - None => return Ok(()), - }; - - let count = frame.count(); - - match frame { - StoredBenFrame::Ben(ben_frame) => { - let assignment = decode_ben_frame_to_assignment(&ben_frame)?; - let keep_going = f(&assignment, count)?; - self.previous_assignment = Some(assignment); - if !keep_going { - return Ok(()); - } - } - StoredBenFrame::TwoDelta { frame, count } => { - let assignment = self.previous_assignment.take().ok_or_else(|| { - io::Error::from(DecodeError::TwoDeltaNoAnchorFrame) - })?; - let run_lengths = frame.run_length_vector; - let assignment = - apply_twodelta_runs_to_assignment(assignment, frame.pair, &run_lengths)?; - let keep_going = f(&assignment, count)?; - self.previous_assignment = Some(assignment); - if !keep_going { - return Ok(()); - } - } - } - - self.sample_count += count as usize; - if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); - } - } - } -} - -/// Decode a raw BEN frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame` - The raw BEN frame to decode. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { - decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bit_count, - frame.max_len_bit_count, - frame.n_bytes, - ) - .map(rle_to_vec) -} - -/// Apply decoded TwoDelta run lengths to produce a new assignment vector. -/// -/// Positions in `previous_assignment` that hold either value of `pair` are -/// overwritten according to the alternating run-length encoding. -/// -/// # Arguments -/// -/// * `assignment` - The assignment from the preceding frame (mutated in place). -/// * `pair` - The two label values that participate in the delta. -/// * `run_lengths` - Alternating run lengths starting with the first value of `pair`. -/// -/// # Returns -/// -/// Returns the updated assignment vector. -fn apply_twodelta_runs_to_assignment( - mut assignment: Vec, - pair: (u16, u16), - run_lengths: &[u16], -) -> io::Result> { - let (first, second) = pair; - - let mut run_idx = 0usize; - let mut remaining_in_run: u16 = *run_lengths.first().unwrap_or(&0); - let mut current_value = first; - - for (pos, val) in assignment.iter_mut().enumerate() { - if *val == first || *val == second { - if remaining_in_run == 0 { - run_idx += 1; - if run_idx >= run_lengths.len() { - return Err(io::Error::from(DecodeError::TwoDeltaRunsExhausted { - run_idx, - pos, - })); - } - remaining_in_run = run_lengths[run_idx]; - current_value = if current_value == first { - second - } else { - first - }; - } - *val = current_value; - remaining_in_run -= 1; - } - } - - Ok(assignment) -} - -/// Decode a raw TwoDelta frame into a full assignment vector. -/// -/// Unpacks the bitpacked run lengths from the frame payload, then applies -/// them in a single pass over the assignment. -/// -/// # Arguments -/// -/// * `assignment` - The assignment from the preceding frame (mutated in place). -/// * `frame` - The TwoDelta frame whose packed payload is decoded and applied. -/// -/// # Returns -/// -/// Returns the updated assignment vector. -fn decode_twodelta_frame_to_assignment( - assignment: Vec, - frame: &TwoDeltaEncodeFrame, -) -> io::Result> { - apply_twodelta_runs_to_assignment(assignment, frame.pair, &frame.run_length_vector) -} - -/// Decode a stored BEN frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `previous_assignment` - The assignment from the preceding frame, required -/// for TwoDelta frames. -/// * `frame` - The stored frame to decode. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_stored_frame_to_assignment( - previous_assignment: &mut Option>, - frame: &StoredBenFrame, -) -> io::Result> { - match frame { - StoredBenFrame::Ben(frame) => decode_ben_frame_to_assignment(frame), - StoredBenFrame::TwoDelta { frame, .. } => decode_twodelta_frame_to_assignment( - previous_assignment - .take() - .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?, - frame, - ), - } -} - -impl Iterator for BenDecoder { - type Item = io::Result; - - /// Decode and return the next assignment from the BEN stream. - fn next(&mut self) -> Option> { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; - let assignment = - match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { - Ok(assgn) => assgn, - Err(e) => return Some(Err(e)), - }; - let count = frame.count(); - self.previous_assignment = Some(assignment.clone()); - self.sample_count += count as usize; - if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); - } - Some(Ok((assignment, count))) - } -} - -/// Iterator over raw BEN frames. -pub struct BenFrameDecoeder { - inner: BenDecoder, -} - -impl BenFrameDecoeder { - /// Create a raw BEN frame iterator from a reader. - /// - /// # Arguments - /// - /// * `reader` - The input BEN stream, including its 17-byte banner. - /// - /// # Returns - /// - /// Returns an iterator over raw BEN frames. - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: BenDecoder::new(reader)?, - }) - } -} - -impl Iterator for BenFrameDecoeder { - type Item = io::Result; - - /// Return the next raw BEN frame from the input stream. - fn next(&mut self) -> Option { - match self.inner.variant { - BenVariant::Standard | BenVariant::MkvChain => match self.inner.pop_frame_from_reader() - { - Some(Ok(StoredBenFrame::Ben(frame))) => Some(Ok(frame)), - Some(Ok(StoredBenFrame::TwoDelta { .. })) => Some(Err(io::Error::from( - DecodeError::UnexpectedTwoDeltaFrame { - variant: self.inner.variant, - }, - ))), - Some(Err(err)) => Some(Err(err)), - None => None, - }, - BenVariant::TwoDelta => match self.inner.next() { - Some(Ok((assignment, count))) => { - let encoded = BenEncodeFrame::from_assignment(&assignment, None); - Some(Ok(BenDecodeFrame { - max_val_bit_count: encoded.max_val_bit_count, - max_len_bit_count: encoded.max_len_bit_count, - count, - n_bytes: encoded.n_bytes, - raw_bytes: encoded.raw_bytes[6..].to_vec(), - })) - } - Some(Err(err)) => Some(Err(err)), - None => None, - }, - } - } -} - -/// Iterator over decoded assignments in an XBEN stream. -pub struct XBenDecoder { - xz: BufReader>, - /// Variant encoded in the XBEN banner. - pub variant: BenVariant, - overflow: Vec, - buf: Box<[u8]>, - previous_assignment: Option>, - chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, -} - -impl XBenDecoder { - /// Create an XBEN decoder from an already-opened decompressed stream. - /// - /// # Arguments - /// - /// * `xz` - A buffered XZ decompression reader positioned past the banner. - /// * `variant` - The BEN variant indicated by the banner. - /// - /// # Returns - /// - /// Returns a new decoder ready to yield frames from the stream. - pub(crate) fn from_decompressed_stream( - xz: BufReader>, - variant: BenVariant, - ) -> Self { - Self { - xz, - variant, - overflow: Vec::with_capacity(1 << 20), - buf: vec![0u8; 1 << 20].into_boxed_slice(), - previous_assignment: None, - chunk_queue: std::collections::VecDeque::new(), - } - } - - /// Create a decoder for an XBEN stream. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first ben32 frame in the - /// decompressed payload. - pub fn new(reader: R) -> io::Result { - let xz = XzDecoder::new(reader); - let mut xz = BufReader::with_capacity(1 << 20, xz); - - let mut first = [0u8; BANNER_LEN]; - xz.read_exact(&mut first)?; - let variant = variant_from_banner(&first).ok_or_else(|| { - io::Error::from(FormatError::UnknownBanner { - actual: first.to_vec(), - }) - })?; - - Ok(Self::from_decompressed_stream(xz, variant)) - } - - /// Try to extract one complete ben32 frame from the buffered overflow. - /// - /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 - /// frame and, for MkvChain streams, reads the trailing repetition count. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain one or more - /// complete ben32 frames. - /// - /// # Returns - /// - /// Returns the frame bytes, the number of consumed bytes, and the decoded - /// repetition count when a complete frame is available. - fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { - match self.variant { - BenVariant::Standard => { - if overflow.len() < 4 { - return None; - } - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let end = i + 1; - let frame = &overflow[..end]; - return Some((frame, end, 1)); - } - } - None - } - BenVariant::MkvChain => { - if overflow.len() < 6 { - return None; - } - for i in (3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let count_hi = overflow[i + 1]; - let count_lo = overflow[i + 2]; - let count = u16::from_be_bytes([count_hi, count_lo]); - let end = i + 3; - let frame = &overflow[..end]; - return Some((frame, end, count)); - } - } - None - } - BenVariant::TwoDelta => None, - } - } - - /// Try to extract one complete TwoDelta frame from the buffered overflow. - /// - /// Inspects the leading tag byte to determine whether the frame is a full - /// RLE frame or a delta frame, then reads the corresponding payload. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain a complete - /// TwoDelta frame. - /// - /// # Returns - /// - /// Returns the parsed frame, the number of consumed bytes, and the - /// repetition count when a complete frame is available. - fn pop_twodelta_frame_from_overflow( - &self, - overflow: &[u8], - ) -> Option> { - let tag = *overflow.first()?; - match tag { - XBEN_TWODELTA_FULL_TAG => { - if overflow.len() < 7 { - return None; - } - let run_count = - u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) - as usize; - let payload_len = run_count.checked_mul(4)?; - let total_len = 1usize - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; - if overflow.len() < total_len { - return None; - } - - let mut runs = Vec::with_capacity(run_count); - let mut cursor = 5usize; - for _ in 0..run_count { - let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); - runs.push((value, len)); - cursor += 4; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) - } - XBEN_TWODELTA_DELTA_TAG => { - if overflow.len() < 11 { - return None; - } - let pair = ( - u16::from_be_bytes([overflow[1], overflow[2]]), - u16::from_be_bytes([overflow[3], overflow[4]]), - ); - let run_count = - u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) - as usize; - let payload_len = run_count.checked_mul(2)?; - let total_len = 1usize - .checked_add(2)? - .checked_add(2)? - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; - if overflow.len() < total_len { - return None; - } - - let mut run_lengths = Vec::with_capacity(run_count); - let mut cursor = 9usize; - for _ in 0..run_count { - run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); - cursor += 2; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok(( - XBenTwoDeltaFrame::Delta { pair, run_lengths }, - total_len, - count, - ))) - } - XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. - _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { tag }))), - } - } - - /// Try to parse a columnar TwoDelta chunk from the overflow buffer. - /// - /// If the overflow starts with the chunk tag and contains enough bytes for - /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. - /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, - /// or `None` when the overflow is incomplete. - fn try_parse_twodelta_chunk(&mut self) -> Option> { - if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { - return None; - } - if self.overflow.len() < 5 { - return None; - } - - let n_frames = u32::from_be_bytes([ - self.overflow[1], - self.overflow[2], - self.overflow[3], - self.overflow[4], - ]) as usize; - - // Calculate total chunk size: tag(1) + n_frames(4) - // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) - let header_len = 5; - let pairs_len = n_frames * 4; - let counts_len = n_frames * 2; - let run_counts_len = n_frames * 4; - let fixed_len = header_len + pairs_len + counts_len + run_counts_len; - - if self.overflow.len() < fixed_len { - return None; - } - - // Read run-length counts to determine total run data size. - let run_counts_start = header_len + pairs_len + counts_len; - let mut total_runs = 0usize; - let mut run_counts = Vec::with_capacity(n_frames); - for i in 0..n_frames { - let offset = run_counts_start + i * 4; - let rc = u32::from_be_bytes([ - self.overflow[offset], - self.overflow[offset + 1], - self.overflow[offset + 2], - self.overflow[offset + 3], - ]) as usize; - run_counts.push(rc); - total_runs += rc; - } - - let run_data_len = total_runs * 2; - let total_len = fixed_len + run_data_len; - if self.overflow.len() < total_len { - return None; - } - - // Parse pairs channel. - let pairs_start = header_len; - // Parse counts channel. - let counts_start = pairs_start + pairs_len; - // Run data starts after run counts. - let run_data_start = run_counts_start + run_counts_len; - - let mut run_cursor = run_data_start; - for i in 0..n_frames { - let po = pairs_start + i * 4; - let pair = ( - u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), - u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), - ); - let co = counts_start + i * 2; - let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); - - let rc = run_counts[i]; - let mut run_lengths = Vec::with_capacity(rc); - for _ in 0..rc { - run_lengths.push(u16::from_be_bytes([ - self.overflow[run_cursor], - self.overflow[run_cursor + 1], - ])); - run_cursor += 2; - } - - self.chunk_queue - .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); - } - - self.overflow.drain(..total_len); - Some(Ok(())) - } - - /// Consume this decoder and iterate over raw ben32 frames instead of - /// materialized assignments. - /// - /// # Returns - /// - /// Returns an iterator that yields raw ben32 frames from the remaining - /// input. - pub fn into_frames(self) -> XBenFrameDecoder { - XBenFrameDecoder { inner: self } - } - - /// Count the number of samples remaining in the XBEN stream. - /// - /// # Returns - /// - /// Returns the number of remaining samples in the stream. - pub fn count_samples(self) -> io::Result { - let mut total = 0usize; - for frame_res in self.into_frames() { - let (_bytes, cnt) = frame_res?; - total += cnt as usize; - } - Ok(total) - } -} - -/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame_bytes` - The ben32 frame bytes. -/// * `variant` - The BEN variant used to interpret the frame tail. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_xben_frame_to_assignment( - frame_bytes: &[u8], - variant: BenVariant, -) -> io::Result> { - let cursor = Cursor::new(frame_bytes); - let (assignment, _) = decode_ben32_line(cursor, variant)?; - Ok(assignment) -} - -impl Iterator for XBenDecoder { - type Item = io::Result; - - /// Decode and return the next assignment from the XBEN stream. - fn next(&mut self) -> Option { - loop { - match self.variant { - BenVariant::Standard | BenVariant::MkvChain => { - if let Some((frame_bytes, consumed, count)) = - self.pop_frame_from_overflow(&self.overflow) - { - let res = match decode_xben_frame_to_assignment(frame_bytes, self.variant) { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - Ok((assignment, count)) - } - Err(e) => Err(e), - }; - self.overflow.drain(..consumed); - return Some(res); - } - } - BenVariant::TwoDelta => { - // Drain frames from a previously parsed chunk first. - if let Some((frame, count)) = self.chunk_queue.pop_front() { - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(prev) => { - apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) - } - None => { - Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)) - } - } - } - }; - return Some(match assignment { - Ok(a) => { - self.previous_assignment = Some(a.clone()); - Ok((a, count)) - } - Err(e) => Err(e), - }); - } - - // Try to parse a columnar chunk. - if let Some(result) = self.try_parse_twodelta_chunk() { - match result { - Ok(()) => continue, // Loop to drain chunk_queue. - Err(e) => return Some(Err(e)), - } - } - - // Try a single legacy frame (tag 0 or 1). - if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { - let res = match parsed { - Ok((frame, consumed, count)) => { - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(previous_assignment) => { - apply_twodelta_runs_to_assignment( - previous_assignment, - pair, - &run_lengths, - ) - } - None => Err(io::Error::from( - DecodeError::TwoDeltaNoAnchorFrame, - )), - } - } - }; - match assignment { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - self.overflow.drain(..consumed); - Ok((assignment, count)) - } - Err(err) => { - self.overflow.drain(..consumed); - Err(err) - } - } - } - Err(err) => { - self.overflow.clear(); - Err(err) - } - }; - return Some(res); - } - } - } - - let read = match self.xz.read(&mut self.buf) { - Ok(0) => { - if self.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::from(DecodeError::XBenTruncated))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.overflow.extend_from_slice(&self.buf[..read]); - } - } -} - -/// Iterator over raw ben32 frames inside an XBEN stream. -pub struct XBenFrameDecoder { - inner: XBenDecoder, -} - -impl XBenFrameDecoder { - /// Create a raw XBEN frame iterator from a reader. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns an iterator over raw ben32 frames. - pub fn new(reader: R) -> io::Result { - Ok(Self { - inner: XBenDecoder::new(reader)?, - }) - } -} - -impl Iterator for XBenFrameDecoder { - type Item = io::Result; - - /// Return the next raw ben32 frame from the input stream. - fn next(&mut self) -> Option { - if self.inner.variant == BenVariant::TwoDelta { - return self.inner.next().map(|result| { - result.and_then(|(assignment, count)| { - Ok((encode_ben32_assignments(&assignment)?, count)) - }) - }); - } - - loop { - if let Some((frame, consumed, count)) = - self.inner.pop_frame_from_overflow(&self.inner.overflow) - { - let out = frame.to_vec(); - self.inner.overflow.drain(..consumed); - return Some(Ok((out, count))); - } - - let read = match self.inner.xz.read(&mut self.inner.buf) { - Ok(0) => { - if self.inner.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::from(DecodeError::XBenTruncated))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.inner - .overflow - .extend_from_slice(&self.inner.buf[..read]); - } - } -} - -#[derive(Clone)] -/// A generalized frame type used by the subsampling machinery. -pub enum DecodeFrame { - /// A raw BEN frame. - Ben(BenDecodeFrame), - /// A raw ben32 frame from an XBEN stream together with its variant. - XBen(Vec, BenVariant), -} - -/// A selection strategy for extracting only part of a frame stream. -pub enum Selection { - /// Select explicit 1-based indices. - Indices(Peekable>), - /// Select every `step` samples starting at the 1-based `offset`. - Every { step: usize, offset: usize }, - /// Select the inclusive 1-based range `[start, end]`. - Range { start: usize, end: usize }, -} - -/// Decode a generic frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame` - Either a BEN frame or an XBEN ben32 frame. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { - match frame { - DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), - DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), - } -} - -/// Iterator adaptor that decodes only selected samples from a frame stream. -pub struct SubsampleFrameDecoder -where - I: Iterator>, -{ - inner: I, - selection: Selection, - sample: usize, -} - -impl SubsampleFrameDecoder -where - I: Iterator>, -{ - /// Create a subsampling iterator from a lower-level frame iterator. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `selection` - The sample-selection rule to apply. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn new(inner: I, selection: Selection) -> Self { - Self { - inner, - selection, - sample: 0, - } - } - - /// Select a set of 1-based sample indices. - /// - /// Indices are sorted and deduplicated before iteration begins. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn by_indices(inner: I, indices: T) -> Self - where - T: IntoIterator, - { - let mut v: Vec = indices.into_iter().collect(); - v.sort_unstable(); - v.dedup(); - Self::new(inner, Selection::Indices(v.into_iter().peekable())) - } - - /// Select the inclusive 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn by_range(inner: I, start: usize, end: usize) -> Self { - assert!( - start >= 1 && end >= start, - "range must be 1-based and end >= start" - ); - Self::new(inner, Selection::Range { start, end }) - } - - /// Select every `step` samples beginning from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn every(inner: I, step: usize, offset: usize) -> Self { - assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); - Self::new(inner, Selection::Every { step, offset }) - } - - /// Count how many selected samples fall within an inclusive sample interval. - /// - /// # Arguments - /// - /// * `lo` - The first 1-based sample index covered by the current frame. - /// * `hi` - The last 1-based sample index covered by the current frame. - /// - /// # Returns - /// - /// Returns the number of selected samples represented by the frame. - fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { - match &mut self.selection { - Selection::Indices(iter) => { - let mut taken = 0u16; - while let Some(&next) = iter.peek() { - if next < lo { - iter.next(); - continue; - } - if next > hi { - break; - } - iter.next(); - taken = taken.saturating_add(1); - } - taken - } - Selection::Every { step, offset } => { - let start = lo.max(*offset); - if start > hi { - return 0; - } - let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; - let first = start + ((*step - r) % *step); - if first > hi { - 0 - } else { - (1 + (hi - first) / *step) as u16 - } - } - Selection::Range { start, end } => { - if hi < *start || lo > *end { - 0 - } else { - let a = lo.max(*start); - let b = hi.min(*end); - (b - a + 1) as u16 - } - } - } - } -} - -impl Iterator for SubsampleFrameDecoder -where - I: Iterator>, -{ - type Item = io::Result; - - /// Return the next decoded sample selected by the subsampling rule. - fn next(&mut self) -> Option { - loop { - if let Selection::Range { end, .. } = self.selection { - if self.sample >= end { - return None; - } - } - if let Selection::Indices(ref mut it) = self.selection { - if it.peek().is_none() { - return None; - } - } - - let (frame, count) = match self.inner.next()? { - Ok(x) => x, - Err(e) => return Some(Err(e)), - }; - - let lo = self.sample + 1; - let hi = self.sample + count as usize; - let selected = self.count_selected_in(lo, hi); - - self.sample = hi; - - if selected > 0 { - match decode_frame_to_assignment(&frame) { - Ok(assignment) => return Some(Ok((assignment, selected))), - Err(e) => return Some(Err(e)), - } - } - } - } -} - -/// Build a generic frame iterator from a BEN or XBEN file path. -/// -/// Frame iteration is useful for subsampling and counting because it avoids -/// decoding every sample into a full assignment vector. -/// -/// # Arguments -/// -/// * `file_path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns a boxed iterator over generic frames and their repetition counts. -pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { - let file = File::options().read(true).open(file_path)?; - let reader = BufReader::new(file); - - match mode { - "ben" => { - let frames = BenFrameDecoeder::new(reader)?; - let mapped = frames.map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - Ok(Box::new(mapped)) - } - "xben" => { - let x = XBenDecoder::new(reader)?; - let variant = x.variant; - let frames = x.into_frames(); - let mapped = frames - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - Ok(Box::new(mapped)) - } - _ => Err(io::Error::from(DecoderInitError::UnknownMode { - mode: mode.to_string(), - })), - } -} - -impl BenDecoder { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let count = f.count; - (DecodeFrame::Ben(f), count) - }) - }); - SubsampleFrameDecoder::by_indices(frames, indices) - } - - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::by_range(frames, start, end) - } - - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let frames = self.into_frames().map(|res| { - res.map(|f| { - let cnt = f.count; - (DecodeFrame::Ben(f), cnt) - }) - }); - SubsampleFrameDecoder::every(frames, step, offset) - } -} - -impl XBenDecoder { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_indices(Box::new(frames), indices) - } - - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_range(Box::new(frames), start, end) - } - - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::every(Box::new(frames), step, offset) - } -} - -/// Count the number of samples in a BEN or XBEN file on disk. -/// -/// The file is walked frame-by-frame, so this is linear in file size but avoids -/// materializing full assignment vectors. -/// -/// # Arguments -/// -/// * `path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns the number of samples in the file. -pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { - let iter = build_frame_iter(&path.to_path_buf(), mode)?; - let mut total = 0usize; - for item in iter { - let (_frame, cnt) = item?; - total += cnt as usize; - } - Ok(total) -} diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs index 01ff258..6e1eaf6 100644 --- a/ben/src/io/reader/mod.rs +++ b/ben/src/io/reader/mod.rs @@ -1,11 +1,14 @@ -pub mod ben; +pub mod assignment_reader; pub mod errors; +pub mod subsample; pub(crate) mod tests; pub(crate) mod twodelta; +pub mod xz_assignment_reader; -pub use ben::{ - build_frame_iter, count_samples_from_file, Ben32Frame, BenDecoder, BenFrameDecoeder, - DecodeFrame, FrameIter, MkvRecord, Selection, SubsampleFrameDecoder, XBenDecoder, - XBenFrameDecoder, -}; +pub use assignment_reader::{AssignmentFrameReader, AssignmentReader}; pub use errors::DecoderInitError; +pub use subsample::{ + build_frame_iter, count_samples_from_file, Ben32Frame, DecodeFrame, FrameIter, MkvRecord, + Selection, SubsampleFrameDecoder, +}; +pub use xz_assignment_reader::{XZAssignmentFrameReader, XZAssignmentReader}; diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs new file mode 100644 index 0000000..a9d4d75 --- /dev/null +++ b/ben/src/io/reader/subsample.rs @@ -0,0 +1,297 @@ +use super::assignment_reader::decode_ben_frame_to_assignment; +use super::assignment_reader::AssignmentFrameReader; +use super::errors::DecoderInitError; +use super::xz_assignment_reader::decode_xben_frame_to_assignment; +use super::xz_assignment_reader::XZAssignmentReader; +use crate::codec::BenDecodeFrame; +use crate::BenVariant; +use std::fs::File; +use std::io::{self, BufReader}; +use std::iter::Peekable; +use std::path::{Path, PathBuf}; + +/// A decoded assignment together with the number of times it repeats. +pub type MkvRecord = (Vec, u16); +/// A raw ben32 frame together with the number of times it repeats. +pub type Ben32Frame = (Vec, u16); +/// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. +pub type FrameIter = Box> + Send>; + +#[derive(Clone)] +/// A generalized frame type used by the subsampling machinery. +pub enum DecodeFrame { + /// A raw BEN frame. + Ben(BenDecodeFrame), + /// A raw ben32 frame from an XBEN stream together with its variant. + XBen(Vec, BenVariant), +} + +/// A selection strategy for extracting only part of a frame stream. +pub enum Selection { + /// Select explicit 1-based indices. + Indices(Peekable>), + /// Select every `step` samples starting at the 1-based `offset`. + Every { step: usize, offset: usize }, + /// Select the inclusive 1-based range `[start, end]`. + Range { start: usize, end: usize }, +} + +/// Decode a generic frame into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame` - Either a BEN frame or an XBEN ben32 frame. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +pub(super) fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { + match frame { + DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), + DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), + } +} + +/// Iterator adaptor that decodes only selected samples from a frame stream. +pub struct SubsampleFrameDecoder +where + I: Iterator>, +{ + inner: I, + selection: Selection, + sample: usize, +} + +impl SubsampleFrameDecoder +where + I: Iterator>, +{ + /// Create a subsampling iterator from a lower-level frame iterator. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `selection` - The sample-selection rule to apply. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn new(inner: I, selection: Selection) -> Self { + Self { + inner, + selection, + sample: 0, + } + } + + /// Select a set of 1-based sample indices. + /// + /// Indices are sorted and deduplicated before iteration begins. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn by_indices(inner: I, indices: T) -> Self + where + T: IntoIterator, + { + let mut v: Vec = indices.into_iter().collect(); + v.sort_unstable(); + v.dedup(); + Self::new(inner, Selection::Indices(v.into_iter().peekable())) + } + + /// Select the inclusive 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn by_range(inner: I, start: usize, end: usize) -> Self { + assert!( + start >= 1 && end >= start, + "range must be 1-based and end >= start" + ); + Self::new(inner, Selection::Range { start, end }) + } + + /// Select every `step` samples beginning from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `inner` - The source iterator yielding frames and repetition counts. + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn every(inner: I, step: usize, offset: usize) -> Self { + assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); + Self::new(inner, Selection::Every { step, offset }) + } + + /// Count how many selected samples fall within an inclusive sample interval. + /// + /// # Arguments + /// + /// * `lo` - The first 1-based sample index covered by the current frame. + /// * `hi` - The last 1-based sample index covered by the current frame. + /// + /// # Returns + /// + /// Returns the number of selected samples represented by the frame. + fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { + match &mut self.selection { + Selection::Indices(iter) => { + let mut taken = 0u16; + while let Some(&next) = iter.peek() { + if next < lo { + iter.next(); + continue; + } + if next > hi { + break; + } + iter.next(); + taken = taken.saturating_add(1); + } + taken + } + Selection::Every { step, offset } => { + let start = lo.max(*offset); + if start > hi { + return 0; + } + let r = (start as isize - *offset as isize).rem_euclid(*step as isize) as usize; + let first = start + ((*step - r) % *step); + if first > hi { + 0 + } else { + (1 + (hi - first) / *step) as u16 + } + } + Selection::Range { start, end } => { + if hi < *start || lo > *end { + 0 + } else { + let a = lo.max(*start); + let b = hi.min(*end); + (b - a + 1) as u16 + } + } + } + } +} + +impl Iterator for SubsampleFrameDecoder +where + I: Iterator>, +{ + type Item = io::Result; + + /// Return the next decoded sample selected by the subsampling rule. + fn next(&mut self) -> Option { + loop { + if let Selection::Range { end, .. } = self.selection { + if self.sample >= end { + return None; + } + } + if let Selection::Indices(ref mut it) = self.selection { + if it.peek().is_none() { + return None; + } + } + + let (frame, count) = match self.inner.next()? { + Ok(x) => x, + Err(e) => return Some(Err(e)), + }; + + let lo = self.sample + 1; + let hi = self.sample + count as usize; + let selected = self.count_selected_in(lo, hi); + + self.sample = hi; + + if selected > 0 { + match decode_frame_to_assignment(&frame) { + Ok(assignment) => return Some(Ok((assignment, selected))), + Err(e) => return Some(Err(e)), + } + } + } + } +} + +/// Build a generic frame iterator from a BEN or XBEN file path. +/// +/// Frame iteration is useful for subsampling and counting because it avoids +/// decoding every sample into a full assignment vector. +/// +/// # Arguments +/// +/// * `file_path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns a boxed iterator over generic frames and their repetition counts. +pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { + let file = File::options().read(true).open(file_path)?; + let reader = BufReader::new(file); + + match mode { + "ben" => { + let frames = AssignmentFrameReader::new(reader)?; + let mapped = frames + .map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))); + Ok(Box::new(mapped)) + } + "xben" => { + let x = XZAssignmentReader::new(reader)?; + let variant = x.variant(); + let frames = x.into_frames(); + let mapped = frames + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + Ok(Box::new(mapped)) + } + _ => Err(io::Error::from(DecoderInitError::UnknownMode { + mode: mode.to_string(), + })), + } +} + +/// Count the number of samples in a BEN or XBEN file on disk. +/// +/// The file is walked frame-by-frame, so this is linear in file size but avoids +/// materializing full assignment vectors. +/// +/// # Arguments +/// +/// * `path` - Path to a `.ben` or `.xben` file. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns the number of samples in the file. +pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { + let iter = build_frame_iter(&path.to_path_buf(), mode)?; + let mut total = 0usize; + for item in iter { + let (_frame, cnt) = item?; + total += cnt as usize; + } + Ok(total) +} diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs new file mode 100644 index 0000000..bad3fef --- /dev/null +++ b/ben/src/io/reader/xz_assignment_reader.rs @@ -0,0 +1,678 @@ +use super::errors::DecoderInitError; +use super::subsample::{Ben32Frame, DecodeFrame, MkvRecord, SubsampleFrameDecoder}; +use super::twodelta::{ + XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_DELTA_TAG, XBEN_TWODELTA_FULL_TAG, +}; +use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; +use crate::codec::encode::encode_ben32_assignments; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::util::rle::rle_to_vec; +use crate::{progress, BenVariant}; +use serde_json::json; +use std::io::{self, BufReader, Cursor, Read, Write}; +use xz2::read::XzDecoder; + +/// Iterator over decoded assignments in an XBEN stream. +pub struct XZAssignmentReader { + xz: BufReader>, + /// Variant encoded in the XBEN banner (private; use `.variant()` accessor). + inner_variant: BenVariant, + overflow: Vec, + buf: Box<[u8]>, + previous_assignment: Option>, + chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, + silent: bool, +} + +impl XZAssignmentReader { + /// Create an XBEN decoder from an already-opened decompressed stream. + /// + /// # Arguments + /// + /// * `xz` - A buffered XZ decompression reader positioned past the banner. + /// * `variant` - The BEN variant indicated by the banner. + /// + /// # Returns + /// + /// Returns a new decoder ready to yield frames from the stream. + pub(crate) fn from_decompressed_stream( + xz: BufReader>, + variant: BenVariant, + ) -> Self { + Self { + xz, + inner_variant: variant, + overflow: Vec::with_capacity(1 << 20), + buf: vec![0u8; 1 << 20].into_boxed_slice(), + previous_assignment: None, + chunk_queue: std::collections::VecDeque::new(), + silent: false, + } + } + + /// Create a decoder for an XBEN stream. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns a new decoder positioned at the first ben32 frame in the + /// decompressed payload. + pub fn new(reader: R) -> Result { + let xz = XzDecoder::new(reader); + let mut xz = BufReader::with_capacity(1 << 20, xz); + + let mut first = [0u8; BANNER_LEN]; + if let Err(e) = xz.read_exact(&mut first) { + return Err(DecoderInitError::Io(e)); + } + let variant = match variant_from_banner(&first) { + Some(v) => v, + None => return Err(DecoderInitError::InvalidFileFormat(first.to_vec())), + }; + + Ok(Self::from_decompressed_stream(xz, variant)) + } + + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.inner_variant + } + + /// Suppress progress output from this decoder's iterator. + /// + /// # Arguments + /// + /// * `silent` - When `true`, the decoder will not emit progress messages. + /// + /// # Returns + /// + /// Returns `self` for method chaining. + pub fn silent(mut self, silent: bool) -> Self { + self.silent = silent; + self + } + + /// Try to extract one complete ben32 frame from the buffered overflow. + /// + /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 + /// frame and, for MkvChain streams, reads the trailing repetition count. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain one or more + /// complete ben32 frames. + /// + /// # Returns + /// + /// Returns the frame bytes, the number of consumed bytes, and the decoded + /// repetition count when a complete frame is available. + fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { + match self.inner_variant { + BenVariant::Standard => { + if overflow.len() < 4 { + return None; + } + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let end = i + 1; + let frame = &overflow[..end]; + return Some((frame, end, 1)); + } + } + None + } + BenVariant::MkvChain => { + if overflow.len() < 6 { + return None; + } + for i in (3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let count_hi = overflow[i + 1]; + let count_lo = overflow[i + 2]; + let count = u16::from_be_bytes([count_hi, count_lo]); + let end = i + 3; + let frame = &overflow[..end]; + return Some((frame, end, count)); + } + } + None + } + BenVariant::TwoDelta => None, + } + } + + /// Try to extract one complete TwoDelta frame from the buffered overflow. + /// + /// Inspects the leading tag byte to determine whether the frame is a full + /// RLE frame or a delta frame, then reads the corresponding payload. + /// + /// # Arguments + /// + /// * `overflow` - Buffered decompressed bytes that may contain a complete + /// TwoDelta frame. + /// + /// # Returns + /// + /// Returns the parsed frame, the number of consumed bytes, and the + /// repetition count when a complete frame is available. + fn pop_twodelta_frame_from_overflow( + &self, + overflow: &[u8], + ) -> Option> { + let tag = *overflow.first()?; + match tag { + XBEN_TWODELTA_FULL_TAG => { + if overflow.len() < 7 { + return None; + } + let run_count = + u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) + as usize; + let payload_len = run_count.checked_mul(4)?; + let total_len = 1usize + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut runs = Vec::with_capacity(run_count); + let mut cursor = 5usize; + for _ in 0..run_count { + let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); + runs.push((value, len)); + cursor += 4; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) + } + XBEN_TWODELTA_DELTA_TAG => { + if overflow.len() < 11 { + return None; + } + let pair = ( + u16::from_be_bytes([overflow[1], overflow[2]]), + u16::from_be_bytes([overflow[3], overflow[4]]), + ); + let run_count = + u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) + as usize; + let payload_len = run_count.checked_mul(2)?; + let total_len = 1usize + .checked_add(2)? + .checked_add(2)? + .checked_add(4)? + .checked_add(payload_len)? + .checked_add(2)?; + if overflow.len() < total_len { + return None; + } + + let mut run_lengths = Vec::with_capacity(run_count); + let mut cursor = 9usize; + for _ in 0..run_count { + run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); + cursor += 2; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok(( + XBenTwoDeltaFrame::Delta { pair, run_lengths }, + total_len, + count, + ))) + } + XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. + _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { tag }))), + } + } + + /// Try to parse a columnar TwoDelta chunk from the overflow buffer. + /// + /// If the overflow starts with the chunk tag and contains enough bytes for + /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. + /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, + /// or `None` when the overflow is incomplete. + fn try_parse_twodelta_chunk(&mut self) -> Option> { + if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { + return None; + } + if self.overflow.len() < 5 { + return None; + } + + let n_frames = u32::from_be_bytes([ + self.overflow[1], + self.overflow[2], + self.overflow[3], + self.overflow[4], + ]) as usize; + + // Calculate total chunk size: tag(1) + n_frames(4) + // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) + let header_len = 5; + let pairs_len = n_frames * 4; + let counts_len = n_frames * 2; + let run_counts_len = n_frames * 4; + let fixed_len = header_len + pairs_len + counts_len + run_counts_len; + + if self.overflow.len() < fixed_len { + return None; + } + + // Read run-length counts to determine total run data size. + let run_counts_start = header_len + pairs_len + counts_len; + let mut total_runs = 0usize; + let mut run_counts = Vec::with_capacity(n_frames); + for i in 0..n_frames { + let offset = run_counts_start + i * 4; + let rc = u32::from_be_bytes([ + self.overflow[offset], + self.overflow[offset + 1], + self.overflow[offset + 2], + self.overflow[offset + 3], + ]) as usize; + run_counts.push(rc); + total_runs += rc; + } + + let run_data_len = total_runs * 2; + let total_len = fixed_len + run_data_len; + if self.overflow.len() < total_len { + return None; + } + + // Parse pairs channel. + let pairs_start = header_len; + // Parse counts channel. + let counts_start = pairs_start + pairs_len; + // Run data starts after run counts. + let run_data_start = run_counts_start + run_counts_len; + + let mut run_cursor = run_data_start; + for i in 0..n_frames { + let po = pairs_start + i * 4; + let pair = ( + u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), + u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), + ); + let co = counts_start + i * 2; + let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); + + let rc = run_counts[i]; + let mut run_lengths = Vec::with_capacity(rc); + for _ in 0..rc { + run_lengths.push(u16::from_be_bytes([ + self.overflow[run_cursor], + self.overflow[run_cursor + 1], + ])); + run_cursor += 2; + } + + self.chunk_queue + .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); + } + + self.overflow.drain(..total_len); + Some(Ok(())) + } + + /// Consume this decoder and iterate over raw ben32 frames instead of + /// materialized assignments. + /// + /// # Returns + /// + /// Returns an iterator that yields raw ben32 frames from the remaining + /// input. + pub fn into_frames(self) -> XZAssignmentFrameReader { + XZAssignmentFrameReader { inner: self } + } + + /// Count the number of samples remaining in the XBEN stream. + /// + /// # Returns + /// + /// Returns the number of remaining samples in the stream. + pub fn count_samples(self) -> io::Result { + let mut total = 0usize; + for frame_res in self.into_frames() { + let (_bytes, cnt) = frame_res?; + total += cnt as usize; + } + Ok(total) + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// The callback receives a borrowed assignment slice and its repetition + /// count. Return `true` to continue decoding or `false` to stop early. + /// + /// # Arguments + /// + /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. + /// + /// # Returns + /// + /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + let mut sample_count = 0usize; + loop { + match self.next() { + Some(Ok((assignment, count))) => { + sample_count += count as usize; + if !self.silent { + progress!("Decoding sample: {}\r", sample_count); + } + let keep_going = f(&assignment, count)?; + if !keep_going { + return Ok(()); + } + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + } + } + } + + /// Decode the remaining XBEN stream and write it as JSONL. + /// + /// # Arguments + /// + /// * `writer` - The destination that will receive one JSON object per + /// decoded sample. + /// + /// # Returns + /// + /// Returns `Ok(())` after the remaining stream has been fully decoded. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + Ok(true) + }) + } +} + +/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. +/// +/// # Arguments +/// +/// * `frame_bytes` - The ben32 frame bytes. +/// * `variant` - The BEN variant used to interpret the frame tail. +/// +/// # Returns +/// +/// Returns the expanded assignment vector. +pub(super) fn decode_xben_frame_to_assignment( + frame_bytes: &[u8], + variant: BenVariant, +) -> io::Result> { + let cursor = Cursor::new(frame_bytes); + let (assignment, _) = decode_ben32_line(cursor, variant)?; + Ok(assignment) +} + +impl Iterator for XZAssignmentReader { + type Item = io::Result; + + /// Decode and return the next assignment from the XBEN stream. + fn next(&mut self) -> Option { + loop { + match self.inner_variant { + BenVariant::Standard | BenVariant::MkvChain => { + if let Some((frame_bytes, consumed, count)) = + self.pop_frame_from_overflow(&self.overflow) + { + let res = match decode_xben_frame_to_assignment(frame_bytes, self.inner_variant) { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + Ok((assignment, count)) + } + Err(e) => Err(e), + }; + self.overflow.drain(..consumed); + return Some(res); + } + } + BenVariant::TwoDelta => { + // Drain frames from a previously parsed chunk first. + if let Some((frame, count)) = self.chunk_queue.pop_front() { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.take() { + Some(prev) => { + apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) + } + None => { + Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)) + } + } + } + }; + return Some(match assignment { + Ok(a) => { + self.previous_assignment = Some(a.clone()); + Ok((a, count)) + } + Err(e) => Err(e), + }); + } + + // Try to parse a columnar chunk. + if let Some(result) = self.try_parse_twodelta_chunk() { + match result { + Ok(()) => continue, // Loop to drain chunk_queue. + Err(e) => return Some(Err(e)), + } + } + + // Try a single legacy frame (tag 0 or 1). + if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { + let res = match parsed { + Ok((frame, consumed, count)) => { + let assignment = match frame { + XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), + XBenTwoDeltaFrame::Delta { pair, run_lengths } => { + match self.previous_assignment.take() { + Some(previous_assignment) => { + apply_twodelta_runs_to_assignment( + previous_assignment, + pair, + &run_lengths, + ) + } + None => Err(io::Error::from( + DecodeError::TwoDeltaNoAnchorFrame, + )), + } + } + }; + match assignment { + Ok(assignment) => { + self.previous_assignment = Some(assignment.clone()); + self.overflow.drain(..consumed); + Ok((assignment, count)) + } + Err(err) => { + self.overflow.drain(..consumed); + Err(err) + } + } + } + Err(err) => { + self.overflow.clear(); + Err(err) + } + }; + return Some(res); + } + } + } + + let read = match self.xz.read(&mut self.buf) { + Ok(0) => { + if self.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.overflow.extend_from_slice(&self.buf[..read]); + } + } +} + +/// Iterator over raw ben32 frames inside an XBEN stream. +pub struct XZAssignmentFrameReader { + pub(super) inner: XZAssignmentReader, +} + +impl XZAssignmentFrameReader { + /// Create a raw XBEN frame iterator from a reader. + /// + /// # Arguments + /// + /// * `reader` - The compressed XBEN input stream. + /// + /// # Returns + /// + /// Returns an iterator over raw ben32 frames. + pub fn new(reader: R) -> Result { + Ok(Self { + inner: XZAssignmentReader::new(reader)?, + }) + } +} + +impl Iterator for XZAssignmentFrameReader { + type Item = io::Result; + + /// Return the next raw ben32 frame from the input stream. + fn next(&mut self) -> Option { + if self.inner.inner_variant == BenVariant::TwoDelta { + return self.inner.next().map(|result| { + result.and_then(|(assignment, count)| { + Ok((encode_ben32_assignments(&assignment)?, count)) + }) + }); + } + + loop { + if let Some((frame, consumed, count)) = + self.inner.pop_frame_from_overflow(&self.inner.overflow) + { + let out = frame.to_vec(); + self.inner.overflow.drain(..consumed); + return Some(Ok((out, count))); + } + + let read = match self.inner.xz.read(&mut self.inner.buf) { + Ok(0) => { + if self.inner.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + self.inner + .overflow + .extend_from_slice(&self.inner.buf[..read]); + } + } +} + +impl XZAssignmentReader { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + /// + /// # Arguments + /// + /// * `indices` - A collection of 1-based sample indices. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + Send> + where + T: IntoIterator, + { + let variant = self.inner_variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_indices(Box::new(frames), indices) + } + + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + /// + /// # Arguments + /// + /// * `start` - The first 1-based sample index to include. + /// * `end` - The last 1-based sample index to include. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.inner_variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::by_range(Box::new(frames), start, end) + } + + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + /// + /// # Arguments + /// + /// * `step` - The stride between selected samples. + /// * `offset` - The 1-based index of the first selected sample. + /// + /// # Returns + /// + /// Returns a decoder that yields only the selected samples. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> + Send> { + let variant = self.inner_variant; + let frames = self + .into_frames() + .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); + SubsampleFrameDecoder::every(Box::new(frames), step, offset) + } +} + diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index c2efeaf..dcb7855 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -1,163 +1,9 @@ use super::twodelta::XBEN_TWODELTA_FULL_TAG; -#[derive(Clone, Copy, Debug, Default)] -pub(super) struct AssignmentHints { - pub is_repeated: bool, - pub delta_pair: Option<(u16, u16)>, -} -use crate::codec::encode::errors::EncodeError; use crate::util::rle::assign_to_rle; use serde_json::Value; -use std::collections::HashMap; use std::io::{self, Result}; -/// Check whether two assignment vectors are identical element-by-element. -/// -/// # Arguments -/// -/// * `previous_sample` - The previous assignment vector. -/// * `assign_vec` - The current assignment vector. -/// -/// # Returns -/// -/// Returns `true` if both vectors have the same length and every element matches. -pub(super) fn is_repeated_assignment(previous_sample: &[u16], assign_vec: &[u16]) -> bool { - if previous_sample.is_empty() || previous_sample.len() != assign_vec.len() { - return false; - } - - for (&previous, ¤t) in previous_sample.iter().zip(assign_vec.iter()) { - if previous != current { - return false; - } - } - - true -} - -/// Analyze the transition between two assignment vectors for two-delta encoding. -/// -/// Determines whether the assignments are identical (repeated) or differ by -/// exactly one swapped pair of values, which qualifies for delta encoding. -/// -/// When `masks` are available the pair is detected in O(K) where K is the -/// number of unique label values, by checking each label's mask positions for -/// changes rather than scanning the full assignment array. -/// -/// # Arguments -/// -/// * `previous_sample` - The previous assignment vector. -/// * `assign_vec` - The current assignment vector. -/// * `masks` - An optional index map from each label value to its sorted -/// positions in the previous assignment. -/// -/// # Returns -/// -/// Returns an `AssignmentHints` with `is_repeated` set if the vectors match, -/// or `delta_pair` set if all differences involve exactly two values. -pub(super) fn analyze_twodelta_transition( - previous_sample: &[u16], - assign_vec: &[u16], - masks: Option<&HashMap>>, -) -> Result { - if previous_sample.is_empty() { - return Ok(AssignmentHints::default()); - } - - if previous_sample.len() != assign_vec.len() { - return Err(EncodeError::LengthMismatch { - prev_len: previous_sample.len(), - new_len: assign_vec.len(), - } - .into()); - } - - if previous_sample == assign_vec { - return Ok(AssignmentHints { - is_repeated: true, - delta_pair: None, - }); - } - - // Fast path: use masks to find the pair in O(K) instead of O(N). - if let Some(masks) = masks { - // Check each label's mask positions. Only labels involved in the swap - // will have any changed positions; all others short-circuit immediately. - let mut pair: Option<(u16, u16)> = None; - for (&label, positions) in masks { - for &pos in positions { - if assign_vec[pos] != label { - let other = assign_vec[pos]; - match pair { - None => { - pair = Some((label, other)); - break; - } - Some((a, b)) => { - if (label == a || label == b) && (other == a || other == b) { - break; - } - // More than two values involved. - return Ok(AssignmentHints { - is_repeated: false, - delta_pair: None, - }); - } - } - } - } - } - - return Ok(AssignmentHints { - is_repeated: false, - delta_pair: pair, - }); - } - - // Slow path: full O(N) scan when masks are not available. - let Some(first_mismatch) = previous_sample - .iter() - .zip(assign_vec.iter()) - .position(|(&previous, ¤t)| previous != current) - else { - return Ok(AssignmentHints { - is_repeated: true, - delta_pair: None, - }); - }; - - let pair = (previous_sample[first_mismatch], assign_vec[first_mismatch]); - - for (&previous, ¤t) in previous_sample - .iter() - .zip(assign_vec.iter()) - .skip(first_mismatch + 1) - { - if previous == current { - continue; - } - - if previous != pair.0 && previous != pair.1 { - return Ok(AssignmentHints { - is_repeated: false, - delta_pair: None, - }); - } - - if current != pair.0 && current != pair.1 { - return Ok(AssignmentHints { - is_repeated: false, - delta_pair: None, - }); - } - } - - Ok(AssignmentHints { - is_repeated: false, - delta_pair: Some(pair), - }) -} - /// Extract and validate the `assignment` array from a JSON object. /// /// # Arguments diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 9de303d..35c8181 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -1,7 +1,7 @@ //! Sample extraction helpers for BEN and XBEN streams. use crate::codec::decode::decode_ben32_line; -use crate::io::reader::{BenDecoder, XBenDecoder}; +use crate::io::reader::{AssignmentReader, XZAssignmentReader}; use serde_json::Error as SerdeError; use std::io::Cursor; use std::io::{self, Read}; @@ -65,7 +65,7 @@ pub fn extract_assignment_ben( } let mut current_sample = 1; - let inner_decoder = BenDecoder::new(&mut reader).map_err(io::Error::from)?; + let inner_decoder = AssignmentReader::new(&mut reader).map_err(io::Error::from)?; for record in inner_decoder { let (assignment, count) = record.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + count as usize > sample_number { @@ -97,8 +97,9 @@ pub fn extract_assignment_xben( return Err(SampleError::InvalidSampleNumber); } - let inner_decoder = XBenDecoder::new(&mut reader).map_err(SampleError::new_io_error)?; - let variant = inner_decoder.variant; + let inner_decoder = XZAssignmentReader::new(&mut reader) + .map_err(|e| SampleError::new_io_error(io::Error::from(e)))?; + let variant = inner_decoder.variant(); let frame_iterator = inner_decoder.into_frames(); let mut current_sample = 1; diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 4f0c031..8fa350c 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -7,7 +7,7 @@ use crate::codec::decode::decode_ben_line; use crate::codec::{BenConstruct, BenEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::reader::BenDecoder; +use crate::io::reader::AssignmentReader; use crate::io::writer::AssignmentWriter; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::{progress, BenVariant}; @@ -125,7 +125,7 @@ fn relabel_ben_file_via_decoder( where F: FnMut(&[u16]) -> io::Result>, { - let mut decoder = BenDecoder::new(reader)?.silent(true); + let mut decoder = AssignmentReader::new(reader)?.silent(true); let mut encoder = AssignmentWriter::new(writer, variant)?; let mut sample_number = 0usize; diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 947c054..7772cb2 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -15,7 +15,7 @@ use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::reader::{ - BenDecoder, BenFrameDecoeder, DecoderInitError, XBenDecoder, XBenFrameDecoder, + AssignmentReader, AssignmentFrameReader, DecoderInitError, XZAssignmentReader, XZAssignmentFrameReader, }; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::json::graph::{ @@ -333,12 +333,12 @@ fn decoder_init_error_converts_to_io_error_from_invalid_format() { } // ────────────────────────────────────────────────────────────────────────────── -// io::reader – BenDecoder +// io::reader – AssignmentReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_rejects_empty_input() { - match BenDecoder::new(io::empty()) { + match AssignmentReader::new(io::empty()) { Err(DecoderInitError::Io(_)) => {} Ok(_) => panic!("expected Io error"), Err(e) => panic!("unexpected error variant: {e}"), @@ -347,7 +347,7 @@ fn ben_decoder_rejects_empty_input() { #[test] fn ben_decoder_rejects_wrong_banner() { - match BenDecoder::new(b"BAD BAD BAD BAD!!".as_slice()) { + match AssignmentReader::new(b"BAD BAD BAD BAD!!".as_slice()) { Err(DecoderInitError::InvalidFileFormat(_)) => {} Ok(_) => panic!("expected InvalidFileFormat error"), Err(e) => panic!("unexpected error variant: {e}"), @@ -358,7 +358,7 @@ fn ben_decoder_rejects_wrong_banner() { fn ben_decoder_rejects_xz_data_with_helpful_message() { // Manufacture a valid XZ header prefix. let xz_magic = b"\xFD\x37\x7A\x58\x5A\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - match BenDecoder::new(xz_magic.as_slice()) { + match AssignmentReader::new(xz_magic.as_slice()) { Err(DecoderInitError::InvalidFileFormat(ref header)) => { let e = DecoderInitError::InvalidFileFormat(header.clone()); let msg = e.to_string(); @@ -374,7 +374,7 @@ fn ben_decoder_standard_single_assignment_round_trip() { let assignment = vec![1u16, 1, 2, 3, 3, 3]; let ben = encode_standard_ben(&[assignment.clone()]); - let mut decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap(); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); assert_eq!(decoded, assignment); @@ -386,7 +386,7 @@ fn ben_decoder_standard_multiple_assignments_round_trip() { let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![1u16, 1, 1]]; let ben = encode_standard_ben(&assignments); - let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); for expected in &assignments { let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); @@ -411,7 +411,7 @@ fn ben_decoder_mkv_preserves_repetition_counts() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); let (a1, c1) = decoder.next().unwrap().unwrap(); assert_eq!(a1, vec![1u16, 2, 3]); @@ -428,7 +428,7 @@ fn ben_decoder_mkv_preserves_repetition_counts() { fn ben_decoder_count_samples_standard() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 3); } @@ -447,7 +447,7 @@ fn ben_decoder_count_samples_mkv_with_repetitions() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 4); } @@ -456,7 +456,7 @@ fn ben_decoder_write_all_jsonl_produces_correct_output() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; let ben = encode_standard_ben(&assignments); - let mut decoder = BenDecoder::new(ben.as_slice()).unwrap(); + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap(); let mut out = Vec::new(); decoder.write_all_jsonl(&mut out).unwrap(); @@ -474,7 +474,7 @@ fn ben_decoder_for_each_assignment_early_stop() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_standard_ben(&assignments); - let mut decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); let mut seen = Vec::new(); decoder .for_each_assignment(|a, _count| { @@ -489,7 +489,7 @@ fn ben_decoder_for_each_assignment_early_stop() { } // ────────────────────────────────────────────────────────────────────────────── -// io::reader – XBenDecoder +// io::reader – XZAssignmentReader // ────────────────────────────────────────────────────────────────────────────── fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { @@ -511,16 +511,16 @@ fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { fn xben_decoder_reads_variant_from_banner_standard() { let assignments = vec![vec![1u16, 2, 3]]; let xben = make_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); - assert_eq!(decoder.variant, BenVariant::Standard); + let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant(), BenVariant::Standard); } #[test] fn xben_decoder_reads_variant_from_banner_mkvchain() { let assignments = vec![vec![1u16, 2, 3]]; let xben = make_xben(&assignments, BenVariant::MkvChain); - let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); - assert_eq!(decoder.variant, BenVariant::MkvChain); + let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant(), BenVariant::MkvChain); } #[test] @@ -529,8 +529,8 @@ fn xben_decoder_reads_variant_from_banner_twodelta() { let base = vec![1u16, 1, 2, 2]; let second = vec![1u16, 2, 2, 1]; // swap positions 1 & 3 let xben = make_xben(&[base, second], BenVariant::TwoDelta); - let decoder = XBenDecoder::new(xben.as_slice()).unwrap(); - assert_eq!(decoder.variant, BenVariant::TwoDelta); + let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + assert_eq!(decoder.variant(), BenVariant::TwoDelta); } // ────────────────────────────────────────────────────────────────────────────── @@ -1354,14 +1354,14 @@ fn relabel_ben_file_with_map_as_variant_permutes_correctly() { } // ────────────────────────────────────────────────────────────────────────────── -// BenDecoder – iterator interface +// AssignmentReader – iterator interface // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_iterator_collects_all_frames() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6], vec![7u16, 8, 9]]; let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert_eq!(frames.len(), 3); for (i, (a, count)) in frames.iter().enumerate() { @@ -1373,7 +1373,7 @@ fn ben_decoder_iterator_collects_all_frames() { #[test] fn ben_decoder_iterator_on_empty_payload_yields_nothing() { let ben = STANDARD_BEN_BANNER.to_vec(); // banner only, no frames - let decoder = BenDecoder::new(ben.as_slice()).unwrap().silent(true); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert!(frames.is_empty()); } @@ -1485,7 +1485,7 @@ fn ben_decoder_accepts_cursor_reader() { let assignment = vec![1u16, 2, 3]; let ben = encode_standard_ben(&[assignment.clone()]); let cursor = Cursor::new(ben); - let mut decoder = BenDecoder::new(cursor).unwrap().silent(true); + let mut decoder = AssignmentReader::new(cursor).unwrap().silent(true); let (decoded, _) = decoder.next().unwrap().unwrap(); assert_eq!(decoded, assignment); } @@ -1850,15 +1850,15 @@ fn sort_by_ordering_large_graph_multilevel_verifies_permutation() { } // ────────────────────────────────────────────────────────────────────────────── -// XBenDecoder / XBenFrameDecoder +// XZAssignmentReader / XZAssignmentFrameReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn xben_decoder_iterator_standard_collects_all() { let assignments = vec![vec![1u16, 1, 2, 2], vec![3u16, 3, 3, 3]]; let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); - assert_eq!(decoder.variant, BenVariant::Standard); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(decoder.variant(), BenVariant::Standard); let results: Vec> = decoder.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -1871,7 +1871,7 @@ fn xben_decoder_count_samples_standard() { vec![5u16, 6, 5, 6], ]; let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 3); } @@ -1879,7 +1879,7 @@ fn xben_decoder_count_samples_standard() { fn xben_decoder_count_samples_mkvchain() { let assignments: Vec> = (0..5u16).map(|i| vec![i, i + 1]).collect(); let xben = encode_xben(&assignments, BenVariant::MkvChain); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 5); } @@ -1887,7 +1887,7 @@ fn xben_decoder_count_samples_mkvchain() { fn xben_frame_decoder_new_and_iterate() { let assignments = vec![vec![1u16, 1, 2], vec![2u16, 2, 1]]; let xben = encode_xben(&assignments, BenVariant::Standard); - let frame_iter = XBenFrameDecoder::new(Cursor::new(xben)).unwrap(); + let frame_iter = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); let frames: Vec<(Vec, u16)> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); for (frame_bytes, count) in &frames { @@ -1898,18 +1898,18 @@ fn xben_frame_decoder_new_and_iterate() { } // ────────────────────────────────────────────────────────────────────────────── -// BenFrameDecoeder (note: typo in source name is intentional) +// AssignmentFrameReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_frame_decoder_standard_iterates() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; let ben = encode_standard_ben(&assignments); - let frame_iter = BenFrameDecoeder::new(Cursor::new(ben)).unwrap(); + let frame_iter = AssignmentFrameReader::new(Cursor::new(ben)).unwrap(); let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); - assert_eq!(frames[0].count, 1); - assert_eq!(frames[1].count, 1); + assert_eq!(frames[0].1, 1); + assert_eq!(frames[1].1, 1); } #[test] @@ -1921,22 +1921,22 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::TwoDelta).unwrap(); - // BenFrameDecoeder should re-encode TwoDelta frames back to standard BEN frames - let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + // AssignmentFrameReader should re-encode TwoDelta frames back to standard BEN frames + let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); let frame_iter = decoder.into_frames(); let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); } // ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — BenDecoder subsample methods +// SubsampleFrameDecoder — AssignmentReader subsample methods // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_subsample_by_indices() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 4]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); // 1-based indices: 2, 5, 8 let selected: Vec> = decoder .into_subsample_by_indices(vec![2usize, 5, 8]) @@ -1952,7 +1952,7 @@ fn ben_decoder_subsample_by_indices() { fn ben_decoder_subsample_by_range() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 3]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); // Inclusive 1-based range [3, 6] let selected: Vec> = decoder .into_subsample_by_range(3, 6) @@ -1967,7 +1967,7 @@ fn ben_decoder_subsample_by_range() { fn ben_decoder_subsample_every_nth() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); // Every 3rd sample starting at 1-based offset 1: samples 1, 4, 7, 10 let selected: Vec> = decoder .into_subsample_every(3, 1) @@ -1984,7 +1984,7 @@ fn ben_decoder_subsample_every_nth() { fn ben_decoder_subsample_by_indices_dedup() { let assignments: Vec> = (0u16..5).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = BenDecoder::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); // Duplicate index 2 → after dedup only samples 2 and 3 are selected let selected: Vec> = decoder .into_subsample_by_indices(vec![2usize, 2, 3]) @@ -1996,14 +1996,14 @@ fn ben_decoder_subsample_by_indices_dedup() { } // ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — XBenDecoder subsample methods +// SubsampleFrameDecoder — XZAssignmentReader subsample methods // ────────────────────────────────────────────────────────────────────────────── #[test] fn xben_decoder_subsample_by_indices() { let assignments: Vec> = (1u16..=5).map(|i| vec![i; 4]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let selected: Vec> = decoder .into_subsample_by_indices(vec![1usize, 3, 5]) .map(|r| r.unwrap().0) @@ -2018,7 +2018,7 @@ fn xben_decoder_subsample_by_indices() { fn xben_decoder_subsample_by_range() { let assignments: Vec> = (0u16..6).map(|i| vec![i; 3]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let selected: Vec> = decoder .into_subsample_by_range(2, 4) .map(|r| r.unwrap().0) @@ -2032,7 +2032,7 @@ fn xben_decoder_subsample_by_range() { fn xben_decoder_subsample_every() { let assignments: Vec> = (0u16..6).map(|i| vec![i; 2]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XBenDecoder::new(Cursor::new(xben)).unwrap(); + let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); // Every 2nd sample starting from offset 1: samples 1, 3, 5 let selected: Vec> = decoder .into_subsample_every(2, 1) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index fad0ebb..1909b84 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -8,8 +8,8 @@ use binary_ensemble::codec::encode::{ }; use binary_ensemble::codec::{BenConstruct, BenEncodeFrame}; use binary_ensemble::io::reader::{ - build_frame_iter, count_samples_from_file, BenDecoder, DecodeFrame, DecoderInitError, - SubsampleFrameDecoder, XBenDecoder, + build_frame_iter, count_samples_from_file, AssignmentReader, DecodeFrame, DecoderInitError, + SubsampleFrameDecoder, XZAssignmentReader, }; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::ops::extract::extract_assignment_ben; @@ -192,7 +192,7 @@ fn strat_threads_levels() -> impl Strategy { // ---------- Tests ---------- proptest! { - // JSONL -> BEN(Standard) -> JSONL round-trip via BenEncoder/BenDecoder entry points. + // JSONL -> BEN(Standard) -> JSONL round-trip via BenEncoder/AssignmentReader entry points. #[test] fn fuzz_roundtrip_ben_standard(seq in strat_assignment_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -337,7 +337,7 @@ proptest! { prop_assert_eq!(direct, via); } - // Iterator surface: XBenDecoder -> records matches direct JSONL + // Iterator surface: XZAssignmentReader -> records matches direct JSONL #[test] fn fuzz_xbendecoder_iterator_matches_jsonl(seq in strat_assignment_seq(), params in strat_threads_levels()) { let (threads, level) = params; @@ -353,7 +353,7 @@ proptest! { None, ).unwrap(); - let mut dec = XBenDecoder::new(xben.as_slice()).unwrap(); + let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let iter_jsonl = jsonl_from_records(&recs, 0); @@ -365,7 +365,7 @@ proptest! { prop_assert_eq!(iter_jsonl, direct); } - // Iterator surface: XBenDecoder over TwoDelta XBEN matches direct JSONL. + // Iterator surface: XZAssignmentReader over TwoDelta XBEN matches direct JSONL. #[test] fn fuzz_xbendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq(), params in strat_threads_levels()) { let (threads, level) = params; @@ -381,7 +381,7 @@ proptest! { None, ).unwrap(); - let mut dec = XBenDecoder::new(xben.as_slice()).unwrap(); + let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let iter_jsonl = jsonl_from_records(&recs, 0); @@ -391,7 +391,7 @@ proptest! { prop_assert_eq!(iter_jsonl, direct); } - // Iterator surface: BenDecoder over BEN produced by BenEncoder. + // Iterator surface: AssignmentReader over BEN produced by BenEncoder. #[test] fn fuzz_bendecoder_iterator_matches_jsonl(seq in strat_assignment_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -400,15 +400,15 @@ proptest! { let mut ben = Vec::new(); encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::Standard).unwrap(); - // Iterate BenDecoder - let mut dec = BenDecoder::new(ben.as_slice()).unwrap(); + // Iterate AssignmentReader + let mut dec = AssignmentReader::new(ben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let out = jsonl_from_records(&recs, 0); prop_assert_eq!(out, jsonl); } - // Iterator surface: BenDecoder over TwoDelta BEN matches JSONL. + // Iterator surface: AssignmentReader over TwoDelta BEN matches JSONL. #[test] fn fuzz_bendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -416,7 +416,7 @@ proptest! { let mut ben = Vec::new(); encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::TwoDelta).unwrap(); - let mut dec = BenDecoder::new(ben.as_slice()).unwrap(); + let mut dec = AssignmentReader::new(ben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let out = jsonl_from_records(&recs, 0); prop_assert_eq!(out, jsonl); @@ -444,7 +444,7 @@ proptest! { let mut want: Vec = (1..=n).step_by(3).collect(); // 1,4,7,… if want.is_empty() { want.push(1); } - let xb = XBenDecoder::new(xben.as_slice()).unwrap(); + let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_by_indices(want.clone()); let recs = collect_records(&mut sub).unwrap(); @@ -488,7 +488,7 @@ proptest! { } } - let xb = XBenDecoder::new(xben.as_slice()).unwrap(); + let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_every(step, offset); let recs = collect_records(&mut sub).unwrap(); @@ -522,7 +522,7 @@ proptest! { let truth: Vec> = (s..=e).map(|i| seq[i-1].clone()).collect(); - let xb = XBenDecoder::new(xben.as_slice()).unwrap(); + let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_by_range(s, e); let recs = collect_records(&mut sub).unwrap(); @@ -546,7 +546,7 @@ proptest! { want.push(1); } - let mut sub = BenDecoder::new(ben.as_slice()) + let mut sub = AssignmentReader::new(ben.as_slice()) .unwrap() .into_subsample_by_indices(want.clone()); let recs = collect_records(&mut sub).unwrap(); @@ -590,7 +590,7 @@ fn invalid_ben_header_yields_error() { bogus.extend_from_slice(b"NOT A BEN HEADER!"); bogus.resize(17, 0); - let err = BenDecoder::new(Cursor::new(bogus)) + let err = AssignmentReader::new(Cursor::new(bogus)) .err() .expect("expeced InvalidFileFormat error"); match err { @@ -608,10 +608,10 @@ fn xben_decoder_rejects_bad_banner() { let mut xz = Vec::new(); xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); - let err = XBenDecoder::new(xz.as_slice()) + let err = XZAssignmentReader::new(xz.as_slice()) .err() .expect("expeced InvalidFileFormat error"); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + assert_eq!(std::io::Error::from(err).kind(), std::io::ErrorKind::InvalidData); } #[test] @@ -631,7 +631,7 @@ fn subsample_every_respects_offset() { .unwrap(); // Keep every 1 starting at offset=2 -> only second sample. - let xb = XBenDecoder::new(xben.as_slice()).unwrap(); + let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_every(1, 2); let recs = collect_records(&mut sub).unwrap(); @@ -712,7 +712,7 @@ fn ben_new_invalid_header_detects_xz() { .unwrap(); // Try to treat it as BEN - let err = BenDecoder::new(xz.as_slice()) + let err = AssignmentReader::new(xz.as_slice()) .err() .expect("expected error"); match err { @@ -737,10 +737,10 @@ fn xben_new_invalid_banner() { Some(0), ) .unwrap(); - let err = XBenDecoder::new(wrong.as_slice()) + let err = XZAssignmentReader::new(wrong.as_slice()) .err() .expect("expected invalid data"); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + assert_eq!(std::io::Error::from(err).kind(), std::io::ErrorKind::InvalidData); } #[test] @@ -763,7 +763,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { // Trim the last byte to force partial frame after decompress let trimmed = &xz[..xz.len() - 1]; // Iterating should surface UnexpectedEof (partial frame) - let mut it = XBenDecoder::new(trimmed).unwrap(); + let mut it = XZAssignmentReader::new(trimmed).unwrap(); // Drain until error while let Some(res) = it.next() { if let Err(e) = res { @@ -846,7 +846,7 @@ fn subsample_by_indices_sorts_and_dedups() { None, ) .unwrap(); - let xb = XBenDecoder::new(xz.as_slice()).unwrap(); + let xb = XZAssignmentReader::new(xz.as_slice()).unwrap(); // Deliberately unsorted and duplicated indices let mut sub = xb.into_subsample_by_indices(vec![5, 2, 2, 1, 5, 3]); @@ -942,7 +942,7 @@ fn ben_encoder_write_assignment_path_roundtrips() { #[test] fn ben_decoder_new_reports_short_header_as_io_error() { - let err = BenDecoder::new([1u8, 2, 3].as_slice()).err().unwrap(); + let err = AssignmentReader::new([1u8, 2, 3].as_slice()).err().unwrap(); match err { DecoderInitError::Io(e) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), other => panic!("unexpected error: {other:?}"), @@ -954,7 +954,7 @@ fn ben_decoder_write_all_jsonl_propagates_frame_errors() { let mut malformed = b"STANDARD BEN FILE".to_vec(); malformed.extend_from_slice(&[3]); // start of a frame, but truncated - let mut decoder = BenDecoder::new(malformed.as_slice()).unwrap(); + let mut decoder = AssignmentReader::new(malformed.as_slice()).unwrap(); let err = decoder.write_all_jsonl(Vec::new()).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); } @@ -964,7 +964,7 @@ fn ben_decoder_count_samples_propagates_frame_errors() { let mut malformed = b"STANDARD BEN FILE".to_vec(); malformed.extend_from_slice(&[3]); - let err = BenDecoder::new(malformed.as_slice()) + let err = AssignmentReader::new(malformed.as_slice()) .unwrap() .count_samples() .unwrap_err(); @@ -987,11 +987,11 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { ) .unwrap(); - let mut frames = binary_ensemble::io::reader::XBenFrameDecoder::new(xz.as_slice()).unwrap(); + let mut frames = binary_ensemble::io::reader::XZAssignmentFrameReader::new(xz.as_slice()).unwrap(); assert!(frames.next().unwrap().is_ok()); let trimmed = &xz[..xz.len() - 1]; - let mut frames = binary_ensemble::io::reader::XBenFrameDecoder::new(trimmed).unwrap(); + let mut frames = binary_ensemble::io::reader::XZAssignmentFrameReader::new(trimmed).unwrap(); loop { match frames.next() { Some(Err(e)) => { @@ -1070,7 +1070,7 @@ impl std::io::Read for FailAfterN { fn ben_decoder_frame_read_error_paths() { let banner = b"STANDARD BEN FILE".to_vec(); - let err = BenDecoder::new(FailAfterN { + let err = AssignmentReader::new(FailAfterN { data: [banner.clone(), vec![3]].concat(), pos: 0, fail_at: 18, @@ -1081,7 +1081,7 @@ fn ben_decoder_frame_read_error_paths() { .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); - let err = BenDecoder::new(FailAfterN { + let err = AssignmentReader::new(FailAfterN { data: [banner.clone(), vec![3, 3, 0]].concat(), pos: 0, fail_at: 20, @@ -1092,7 +1092,7 @@ fn ben_decoder_frame_read_error_paths() { .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); - let err = BenDecoder::new(FailAfterN { + let err = AssignmentReader::new(FailAfterN { data: [banner.clone(), vec![3, 3, 0, 0, 0, 1]].concat(), pos: 0, fail_at: 23, @@ -1114,7 +1114,7 @@ fn ben_decoder_mkv_count_read_error_path() { ) .unwrap(); let truncated = ben[..ben.len() - 1].to_vec(); - let err = BenDecoder::new(truncated.as_slice()) + let err = AssignmentReader::new(truncated.as_slice()) .unwrap() .next() .unwrap() @@ -1197,7 +1197,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - BenDecoder::new(ben.as_slice()) + AssignmentReader::new(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1215,7 +1215,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - XBenDecoder::new(xben.as_slice()) + XZAssignmentReader::new(xben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1237,7 +1237,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - XBenDecoder::new(twodelta_xben.as_slice()) + XZAssignmentReader::new(twodelta_xben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1307,7 +1307,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { ) .unwrap(); - let mut by_indices = BenDecoder::new(ben.as_slice()) + let mut by_indices = AssignmentReader::new(ben.as_slice()) .unwrap() .into_subsample_by_indices(vec![4, 1, 1, 3]); let picked = collect_records(&mut by_indices).unwrap(); @@ -1316,7 +1316,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { vec![1, 3, 4] ); - let mut by_range = BenDecoder::new(ben.as_slice()) + let mut by_range = AssignmentReader::new(ben.as_slice()) .unwrap() .into_subsample_by_range(2, 3); let picked = collect_records(&mut by_range).unwrap(); @@ -1325,7 +1325,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { vec![2, 3] ); - let mut every = BenDecoder::new(ben.as_slice()) + let mut every = AssignmentReader::new(ben.as_slice()) .unwrap() .into_subsample_every(2, 2); let picked = collect_records(&mut every).unwrap(); @@ -1354,7 +1354,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { encoder.finish().unwrap(); } - let records = collect_records(BenDecoder::new(ben.as_slice()).unwrap()).unwrap(); + let records = collect_records(AssignmentReader::new(ben.as_slice()).unwrap()).unwrap(); assert_eq!( records, vec![ @@ -1368,10 +1368,10 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { decode_ben_to_jsonl(ben.as_slice(), &mut jsonl).unwrap(); assert_eq!(jsonl, jsonl_from_assignments(&assignments)); - let frames = BenDecoder::new(ben.as_slice()).unwrap().into_frames(); + let frames = AssignmentReader::new(ben.as_slice()).unwrap().into_frames(); assert_eq!( collect_frames( - frames.map(|res| res.map(|f| (DecodeFrame::Ben(f.clone()), f.count as u16))) + frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))) ) .unwrap() .len(), @@ -1449,22 +1449,22 @@ fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { .unwrap(); assert_eq!( - BenDecoder::new(ben.as_slice()) + AssignmentReader::new(ben.as_slice()) .unwrap() .count_samples() .unwrap(), 4 ); - let frames: Vec<_> = BenDecoder::new(ben.as_slice()) + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) .unwrap() .into_frames() .collect::>>() .unwrap(); assert_eq!(frames.len(), 3); - assert_eq!(frames[0].count, 2); - assert_eq!(frames[1].count, 1); - assert_eq!(frames[2].count, 1); + assert_eq!(frames[0].1, 2); + assert_eq!(frames[1].1, 1); + assert_eq!(frames[2].1, 1); let picked = extract_assignment_ben(ben.as_slice(), 3).unwrap(); assert_eq!(picked, assignments[2]); diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index 2e526b3..3124409 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -3,8 +3,8 @@ use binary_ensemble::codec::decode::{ decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, }; use binary_ensemble::io::reader::{ - build_frame_iter, count_samples_from_file, BenDecoder, MkvRecord, Selection, - SubsampleFrameDecoder, XBenDecoder, + build_frame_iter, count_samples_from_file, AssignmentReader, MkvRecord, Selection, + SubsampleFrameDecoder, XZAssignmentReader, }; use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; @@ -242,13 +242,13 @@ fn build_iter(py: Python<'_>, source: &DecoderSource) -> PyResult { let reader = open_input(&source.path)?; match source.mode { DecoderMode::Ben => { - let ben = BenDecoder::new(reader) + let ben = AssignmentReader::new(reader) .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; Ok(Box::new(ben)) } DecoderMode::XBen => { warn_xben_startup(py)?; - let xben = XBenDecoder::new(reader) + let xben = XZAssignmentReader::new(reader) .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; Ok(Box::new(xben)) } From 2433fa997ba11c3320178125990972603cf054a6 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:04:13 -0600 Subject: [PATCH 054/221] Update some tests --- ben/src/codec/decode/tests/mkvchain.rs | 645 ++++++++++ ben/src/codec/decode/tests/mod.rs | 3 + .../decode/{tests.rs => tests/standard.rs} | 3 +- ben/src/codec/decode/tests/twodelta.rs | 551 +++++++++ ben/tests/test_assignment_reader.rs | 1074 +++++++++++++++++ 5 files changed, 2275 insertions(+), 1 deletion(-) create mode 100644 ben/src/codec/decode/tests/mkvchain.rs create mode 100644 ben/src/codec/decode/tests/mod.rs rename ben/src/codec/decode/{tests.rs => tests/standard.rs} (99%) create mode 100644 ben/src/codec/decode/tests/twodelta.rs create mode 100644 ben/tests/test_assignment_reader.rs diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs new file mode 100644 index 0000000..87b7d06 --- /dev/null +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -0,0 +1,645 @@ +use crate::codec::decode::jsonl_decode_ben32; +use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; +use crate::codec::encode::{encode_ben_to_xben, xz_compress}; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; +use serde_json::{json, Value}; +use std::io::{self, BufReader}; + +// The bit-packed payload for assignment [(1,4),(2,1),(3,3)] = [1,1,1,1,2,3,3,3]. +// max_val_bit_count=2, max_len_bit_count=3, n_bytes=2: +// bits 00-04: 01100 → val=01=1, len=100=4 +// bits 05-09: 10001 → val=10=2, len=001=1 +// bits 10-14: 11011 → val=11=3, len=011=3 +// bit 15: 0 → padding +const FRAME_HEADER: &[u8] = &[2, 3, 0, 0, 0, 2]; +const FRAME_PAYLOAD: &[u8] = &[0b01100_100, 0b01_11011_0]; + +fn mkv_ben(count: u16) -> Vec { + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(FRAME_HEADER); + ben.extend_from_slice(FRAME_PAYLOAD); + ben.extend_from_slice(&count.to_be_bytes()); + ben +} + +fn expected_line(assignment: &[u16], sample: usize) -> String { + json!({"assignment": assignment, "sample": sample}).to_string() + "\n" +} + +// ─── decode_ben_to_jsonl ─────────────────────────────────────────────── + +#[test] +fn decode_ben_to_jsonl_count_one() { + let ben = mkv_ben(1); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + assert_eq!(out, expected_line(&assign, 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_count_three_expands_to_three_lines() { + let ben = mkv_ben(3); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + let expected: String = (1..=3).map(|i| expected_line(&assign, i)).collect(); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_sample_numbers_continue_across_frames() { + // Frame 1: [1,1,1,1,2,3,3,3] count=2 → samples 1,2 + // Frame 2: [23] count=3 → samples 3,4,5 + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(FRAME_HEADER); + ben.extend_from_slice(FRAME_PAYLOAD); + ben.extend_from_slice(&2u16.to_be_bytes()); + // Frame for assignment [23]: max_val_bits=5, max_len_bits=1, n_bytes=1 + // payload 0b101111_00 = bits 10111_1 → val=10111=23, len=1=1 + ben.extend_from_slice(&[5, 1, 0, 0, 0, 1, 0b101111_00]); + ben.extend_from_slice(&3u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let a1 = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + let a2 = [23u16]; + let expected: String = (1..=2) + .map(|i| expected_line(&a1, i)) + .chain((3..=5).map(|i| expected_line(&a2, i))) + .collect(); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_16bit_value_with_count() { + // Frame bytes from test_jsonl_decode_ben_16_bit_val (assignment [1,1,1,1,512,3,3,3]) + // with count=2 appended. + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[10, 3, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b00000000, + 0b01100_100, + 0b00000000, + 0b01_000000, + 0b0011011_0, + ]); + ben.extend_from_slice(&2u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (512, 1), (3, 3)]); + let expected: String = (1..=2).map(|i| expected_line(&assign, i)).collect(); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_empty_stream_produces_no_output() { + let ben = b"MKVCHAIN BEN FILE".to_vec(); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + assert!(out.is_empty()); +} + +// ─── jsonl_decode_ben32 ──────────────────────────────────────────────── + +#[test] +fn jsonl_decode_ben32_mkvchain_count_one() { + // ben32: [(1,4),(2,1),(3,3)] + terminator + count=1 + let input: Vec = vec![ + 0, 1, 0, 4, // (1, 4) + 0, 2, 0, 1, // (2, 1) + 0, 3, 0, 3, // (3, 3) + 0, 0, 0, 0, // terminator + 0, 1, // count = 1 + ]; + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + assert_eq!(out, expected_line(&assign, 1).as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_mkvchain_count_five_expands_correctly() { + // Single record with count=5 → 5 lines + let mut input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; + input.extend_from_slice(&5u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let expected: String = (1..=5).map(|i| expected_line(&[23], i)).collect(); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_mkvchain_two_records_correct_sample_numbers() { + // Record 1: [23] count=2 → samples 1,2 + // Record 2: [1,2,3,4] count=1 → sample 3 + let mut input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; + input.extend_from_slice(&2u16.to_be_bytes()); + input.extend_from_slice(&[0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 0, 0, 0]); + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let expected = + expected_line(&[23], 1) + &expected_line(&[23], 2) + &expected_line(&[1, 2, 3, 4], 3); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_mkvchain_starting_sample_offset() { + // starting_sample=5 → first output line has sample=6 + let mut input: Vec = vec![0, 7, 0, 1, 0, 0, 0, 0]; + input.extend_from_slice(&2u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 5, BenVariant::MkvChain).unwrap(); + + let expected = expected_line(&[7], 6) + &expected_line(&[7], 7); + assert_eq!(out, expected.as_bytes()); +} + +// ─── decode_xben_to_ben round-trip ──────────────────────────────────── + +#[test] +fn decode_xben_to_ben_mkvchain_roundtrip() { + let ben = mkv_ben(1); + let mut xben = Vec::new(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut decoded).unwrap(); + + // Verify by decoding the reconstructed BEN to JSONL + let mut jsonl = Vec::new(); + decode_ben_to_jsonl(decoded.as_slice(), &mut jsonl).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + assert_eq!(jsonl, expected_line(&assign, 1).as_bytes()); +} + +#[test] +fn decode_xben_to_jsonl_mkvchain_count_expands() { + // count=4 frame, verify XBEN → JSONL produces 4 lines + let ben = mkv_ben(4); + let mut xben = Vec::new(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + + let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); + let expected: String = (1..=4).map(|i| expected_line(&assign, i)).collect(); + assert_eq!(jsonl, expected.as_bytes()); +} + +// ─── error paths ────────────────────────────────────────────────────── + +#[test] +fn decode_ben_to_jsonl_truncated_count_field_errors() { + // Frame with only 1 byte of the 2-byte count field + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(FRAME_HEADER); + ben.extend_from_slice(FRAME_PAYLOAD); + ben.push(0x00); // only one byte of count instead of two + + let err = decode_ben_to_jsonl(ben.as_slice(), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn decode_xben_to_jsonl_rejects_mkvchain_partial_overflow() { + // Compress just the banner + 3 garbage bytes → no valid frames + let mut xz = Vec::new(); + let mut inner = b"MKVCHAIN BEN FILE".to_vec(); + inner.extend_from_slice(&[1, 2, 3]); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + + let mut out = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); + assert!(out.is_empty()); +} + +// ─── decode_ben_to_jsonl — byte-level frame encoding counterparts ────── +// These mirror the Standard tests in standard.rs exactly, differing only in +// the MKVCHAIN banner and the trailing u16 BE count field appended to each frame. + +#[test] +fn decode_ben_to_jsonl_exact() { + // Same 5-byte payload as test_jsonl_decode_ben_exact, count=1. + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 3, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b01100_100, + 0b01_11011_1, + 0b0010_1111, + 0b1_01001_10, + 0b001_11001_, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![ + (1u16, 4), + (2, 1), + (3, 3), + (2, 2), + (3, 7), + (1, 1), + (2, 1), + (3, 1), + ]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_16bit_len() { + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 10, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b01000000, + 0b0100_1010, + 0b00000000_, + 0b11000000, + 0b0011_0000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 4), (2, 512), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_val_65535() { + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[16, 4, 0, 0, 0, 8]); + ben.extend_from_slice(&[ + 0b00000000, + 0b00010111, + 0b0100_1111, + 0b11111111, + 0b11111111_, + 0b00000000, + 0b00001000, + 0b0011_0000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(23u16, 4), (65535, 15), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_len_65535() { + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[6, 16, 0, 0, 0, 9]); + ben.extend_from_slice(&[ + 0b01011100, + 0b00000000, + 0b000100_11, + 0b11001111, + 0b11111111, + 0b1111_0010, + 0b00000000, + 0b000000000, + 0b11_000000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(23u16, 4), (60, 65535), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_val_and_len_65535() { + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[16, 16, 0, 0, 0, 12]); + ben.extend_from_slice(&[ + 0b00000000, + 0b00000001, + 0b00000000, + 0b00000011_, + 0b11111111, + 0b11111111, + 0b11111111, + 0b11111111_, + 0b00000000, + 0b00001000, + 0b00000000, + 0b00000100_, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 3), (65535, 65535), (8, 4)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_single_element() { + // Assignment [23], count=1. + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[5, 1, 0, 0, 0, 1, 0b101111_00]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + assert_eq!(out, expected_line(&[23u16], 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_single_one() { + // Assignment [1], count=1. + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + ben.extend_from_slice(&[1, 1, 0, 0, 0, 1, 0b11_000000]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + assert_eq!(out, expected_line(&[1u16], 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_three_frames() { + // Three distinct frames, each count=1 — mirrors test_decode_ben_multiple_simple_lines. + let mut ben = b"MKVCHAIN BEN FILE".to_vec(); + // Frame 1: rle [(1,4),(2,4),(3,4),(4,4)] + ben.extend_from_slice(&[3, 3, 0, 0, 0, 3, 0b001100_01, 0b0100_0111, 0b00_100100]); + ben.extend_from_slice(&1u16.to_be_bytes()); + // Frame 2: rle [(2,2),(3,7),(1,1),(2,1),(3,1)] + ben.extend_from_slice(&[ + 2, + 3, + 0, + 0, + 0, + 4, + 0b10010_111, + 0b11_01001_1, + 0b0001_1100, + 0b1_0000000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + // Frame 3: rle [(1..10, each 1)] + ben.extend_from_slice(&[ + 4, + 1, + 0, + 0, + 0, + 7, + 0b00011_001, + 0b01_00111_0, + 0b1001_0101, + 0b1_01101_01, + 0b111_10001, + 0b10011_101, + 0b01_000000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_lst: Vec> = vec![ + vec![(1, 4), (2, 4), (3, 4), (4, 4)], + vec![(2, 2), (3, 7), (1, 1), (2, 1), (3, 1)], + vec![ + (1, 1), + (2, 1), + (3, 1), + (4, 1), + (5, 1), + (6, 1), + (7, 1), + (8, 1), + (9, 1), + (10, 1), + ], + ]; + let expected: String = rle_lst + .into_iter() + .enumerate() + .map(|(i, rle)| { + json!({ + "assignment": rle_to_vec(rle).iter().map(|x| json!(x)).collect::>(), + "sample": i + 1 + }) + .to_string() + + "\n" + }) + .collect(); + assert_eq!(out, expected.as_bytes()); +} + +// ─── jsonl_decode_ben32 — byte-level counterparts ───────────────────── +// Each Standard ben32 record has [pairs...][0,0,0,0] terminator. +// Each MkvChain ben32 record appends a u16 BE count after the terminator. + +#[test] +fn jsonl_decode_ben32_16bit_val() { + let mut input = vec![0, 1, 0, 4, 2, 0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]; + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let rle_assign = vec![(1u16, 4), (512, 1), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_16bit_len() { + let mut input = vec![0, 1, 0, 4, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0]; + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let rle_assign = vec![(1u16, 4), (2, 512), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_max_val_65535() { + let mut input = vec![0, 23, 0, 4, 255, 255, 0, 15, 0, 8, 0, 3, 0, 0, 0, 0]; + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let rle_assign = vec![(23u16, 4), (65535, 15), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_max_len_65535() { + let mut input = vec![0, 23, 0, 4, 0, 60, 255, 255, 0, 8, 0, 3, 0, 0, 0, 0]; + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let rle_assign = vec![(23u16, 4), (60, 65535), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_single_element() { + let mut input = vec![0, 23, 0, 1, 0, 0, 0, 0]; + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let expected = + json!({"assignment": [23u16], "sample": 1}).to_string() + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn jsonl_decode_ben32_three_frames() { + // Three ben32 records with count=1 each — mirrors test_decode_ben32_multiple_simple_lines. + let mut input: Vec = Vec::new(); + // Record 1: rle [(1,4),(2,4),(3,4),(4,4)] + input.extend_from_slice(&[ + 0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, + ]); + input.extend_from_slice(&1u16.to_be_bytes()); + // Record 2: rle [(2,2),(3,7),(1,1),(2,1),(3,1)] + input.extend_from_slice(&[ + 0, 2, 0, 2, 0, 3, 0, 7, 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 0, 0, 0, + ]); + input.extend_from_slice(&1u16.to_be_bytes()); + // Record 3: rle [(1..10, each 1)] + input.extend_from_slice(&[ + 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 5, 0, 1, 0, 6, 0, 1, 0, 7, 0, 1, 0, + 8, 0, 1, 0, 9, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, + ]); + input.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + + let rle_lst: Vec> = vec![ + vec![(1, 4), (2, 4), (3, 4), (4, 4)], + vec![(2, 2), (3, 7), (1, 1), (2, 1), (3, 1)], + vec![ + (1, 1), + (2, 1), + (3, 1), + (4, 1), + (5, 1), + (6, 1), + (7, 1), + (8, 1), + (9, 1), + (10, 1), + ], + ]; + let expected: String = rle_lst + .into_iter() + .enumerate() + .map(|(i, rle)| { + json!({ + "assignment": rle_to_vec(rle).iter().map(|x| json!(x)).collect::>(), + "sample": i + 1 + }) + .to_string() + + "\n" + }) + .collect(); + assert_eq!(out, expected.as_bytes()); +} diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs new file mode 100644 index 0000000..5cef702 --- /dev/null +++ b/ben/src/codec/decode/tests/mod.rs @@ -0,0 +1,3 @@ +mod mkvchain; +mod standard; +mod twodelta; diff --git a/ben/src/codec/decode/tests.rs b/ben/src/codec/decode/tests/standard.rs similarity index 99% rename from ben/src/codec/decode/tests.rs rename to ben/src/codec/decode/tests/standard.rs index f6f5453..bb74deb 100644 --- a/ben/src/codec/decode/tests.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -1,4 +1,5 @@ -use super::*; +use crate::codec::decode::jsonl_decode_ben32; +use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use crate::codec::encode::xz_compress; use crate::util::rle::rle_to_vec; use crate::BenVariant; diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs new file mode 100644 index 0000000..047adbc --- /dev/null +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -0,0 +1,551 @@ +use crate::codec::decode::{ + apply_twodelta_runs_to_assignment, decode_ben_to_jsonl, decode_twodelta_frame, + decode_xben_to_jsonl, +}; +use crate::codec::encode::{encode_ben_to_xben, encode_twodelta_frame}; +use crate::codec::frames::TwoDeltaEncodeFrame; +use crate::io::writer::AssignmentWriter; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; +use serde_json::{json, Value}; +use std::io::BufReader; + +// Build a TwoDelta BEN stream for the given sequence of assignments. +fn make_twodelta_ben(assignments: &[Vec]) -> Vec { + let mut out = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); + for a in assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + out +} + +fn expected_line(assignment: &[u16], sample: usize) -> String { + json!({"assignment": assignment, "sample": sample}).to_string() + "\n" +} + +// ─── apply_twodelta_runs_to_assignment ───────────────────────────────── + +#[test] +fn apply_runs_basic_two_position_swap() { + // prev: [1,2,1,2], run_lengths=[2,2] starting with value 1 + // → first 2 pair positions get value 1, next 2 get value 2 + // pair positions (where val is 1 or 2): 0,1,2,3 + // run 1 (len=2, val=1): pos 0,1 → 1,1; run 2 (len=2, val=2): pos 2,3 → 2,2 + let prev = vec![1u16, 2, 1, 2]; + let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[2, 2]).unwrap(); + assert_eq!(result, vec![1, 1, 2, 2]); +} + +#[test] +fn apply_runs_non_pair_positions_unchanged() { + // prev: [1,2,3,1,2], pair=(1,2), run_lengths=[2,2] + // pair positions: 0,1,3,4 (index 2 holds value 3 → unchanged) + // run 1 (len=2, val=1): pos 0,1 → 1,1 + // run 2 (len=2, val=2): pos 3,4 → 2,2 + let prev = vec![1u16, 2, 3, 1, 2]; + let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[2, 2]).unwrap(); + assert_eq!(result, vec![1, 1, 3, 2, 2]); +} + +#[test] +fn apply_runs_full_reversal() { + // prev: [1,1,2,2], pair=(2,1), run_lengths=[2,2] + // pair positions: 0,1,2,3; pair.0=2 comes first + // run 1 (len=2, val=2): pos 0,1 → 2,2; run 2 (len=2, val=1): pos 2,3 → 1,1 + let prev = vec![1u16, 1, 2, 2]; + let result = apply_twodelta_runs_to_assignment(prev, (2, 1), &[2, 2]).unwrap(); + assert_eq!(result, vec![2, 2, 1, 1]); +} + +#[test] +fn apply_runs_exhausted_before_all_positions_covered_errors() { + // prev: [1,2,1], pair=(1,2), run_lengths=[1] — too short + // After consuming run 0 (1 position with value 1), run 1 missing → error + let prev = vec![1u16, 2, 1]; + let err = apply_twodelta_runs_to_assignment(prev, (1, 2), &[1]).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn apply_runs_alternating_single_positions() { + // prev: [1,2,1,2,1], pair=(1,2), run_lengths=[1,1,1,1,1] + // Each pair position flips: run alternates 1,2,1,2,1 + let prev = vec![1u16, 2, 1, 2, 1]; + let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[1, 1, 1, 1, 1]).unwrap(); + // run[0]=1 → pos0=1; run[1]=1 → pos1=2; run[2]=1 → pos2=1; etc. + assert_eq!(result, vec![1, 2, 1, 2, 1]); +} + +// ─── decode_twodelta_frame ───────────────────────────────────────────── + +#[test] +fn decode_twodelta_frame_basic() { + let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let prev = vec![1u16, 2, 1, 2]; + let result = decode_twodelta_frame(prev, &frame).unwrap(); + assert_eq!(result, vec![1, 1, 2, 2]); +} + +#[test] +fn decode_twodelta_frame_full_swap() { + // pair=(2,1) means run starts with value 2; run_lengths=[2,2] + // prev [1,2,1,2]: pair positions 0,1,2,3 → [2,2,1,1] + let frame = TwoDeltaEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None); + let prev = vec![1u16, 2, 1, 2]; + let result = decode_twodelta_frame(prev, &frame).unwrap(); + assert_eq!(result, vec![2, 2, 1, 1]); +} + +#[test] +fn decode_twodelta_frame_chain_returns_to_original() { + // Frame 1: (1,2) run=[2,2] applied to [1,2,1,2] → [1,1,2,2] + // Frame 2: (1,2) run=[1,1,1,1] applied to [1,1,2,2] → [1,2,1,2] + let f1 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let f2 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None); + let initial = vec![1u16, 2, 1, 2]; + let after_f1 = decode_twodelta_frame(initial.clone(), &f1).unwrap(); + assert_eq!(after_f1, vec![1, 1, 2, 2]); + let after_f2 = decode_twodelta_frame(after_f1, &f2).unwrap(); + assert_eq!(after_f2, initial); +} + +#[test] +fn decode_twodelta_frame_roundtrip_with_encode() { + // Verify that encode_twodelta_frame + decode_twodelta_frame is identity. + let prev = vec![1u16, 1, 2, 2, 1, 2, 1, 2]; + let next = vec![2u16, 2, 1, 1, 1, 2, 1, 2]; + let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); + let decoded = decode_twodelta_frame(prev, &frame).unwrap(); + assert_eq!(decoded, next); +} + +#[test] +fn decode_twodelta_frame_larger_assignment() { + let prev: Vec = (0..100).map(|i| if i < 50 { 1 } else { 2 }).collect(); + let next: Vec = (0..100).map(|i| if i < 50 { 2 } else { 1 }).collect(); + let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); + let result = decode_twodelta_frame(prev, &frame).unwrap(); + assert_eq!(result, next); +} + +// ─── decode_ben_to_jsonl with TwoDelta BEN streams ───────────────────── + +#[test] +fn decode_ben_to_jsonl_twodelta_anchor_only() { + let assignments = vec![vec![1u16, 2, 1, 2]]; + let ben = make_twodelta_ben(&assignments); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + assert_eq!(out, expected_line(&[1, 2, 1, 2], 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_twodelta_anchor_plus_one_delta() { + let assignments = vec![ + vec![1u16, 2, 1, 2], // anchor + vec![1u16, 1, 2, 2], // delta: swap positions 1 and 2 + ]; + let ben = make_twodelta_ben(&assignments); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let expected = expected_line(&[1, 2, 1, 2], 1) + &expected_line(&[1, 1, 2, 2], 2); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_twodelta_chain_of_deltas() { + let a0 = vec![1u16, 2, 1, 2]; + let a1 = vec![1u16, 1, 2, 2]; + let a2 = vec![2u16, 1, 2, 1]; + let a3 = vec![2u16, 2, 1, 1]; + let assignments = vec![a0.clone(), a1.clone(), a2.clone(), a3.clone()]; + let ben = make_twodelta_ben(&assignments); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let expected = expected_line(&a0, 1) + + &expected_line(&a1, 2) + + &expected_line(&a2, 3) + + &expected_line(&a3, 4); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_twodelta_repeated_anchor_expands() { + // Writing the same assignment 3 times then a delta; anchor frame should have count=3. + let anchor = vec![1u16, 2, 1, 2]; + let delta = vec![1u16, 1, 2, 2]; + let assignments = vec![ + anchor.clone(), + anchor.clone(), + anchor.clone(), + delta.clone(), + ]; + let ben = make_twodelta_ben(&assignments); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let expected = expected_line(&anchor, 1) + + &expected_line(&anchor, 2) + + &expected_line(&anchor, 3) + + &expected_line(&delta, 4); + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_twodelta_multiple_repeated_deltas() { + // Anchor repeated twice, then a delta repeated twice + let anchor = vec![1u16, 2, 1, 2]; + let delta = vec![2u16, 1, 2, 1]; + let assignments = vec![anchor.clone(), anchor.clone(), delta.clone(), delta.clone()]; + let ben = make_twodelta_ben(&assignments); + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let expected = expected_line(&anchor, 1) + + &expected_line(&anchor, 2) + + &expected_line(&delta, 3) + + &expected_line(&delta, 4); + assert_eq!(out, expected.as_bytes()); +} + +// ─── decode_ben_to_jsonl — byte-level anchor frame counterparts ──────── +// The TwoDelta first frame (anchor) is encoded in MkvChain format. These tests +// mirror every byte-level Standard / MkvChain decode_ben_to_jsonl test using the +// TWODELTA banner and the same bit-packed frame bytes, verifying that the anchor +// path decodes the same payload correctly regardless of variant. + +#[test] +fn decode_ben_to_jsonl_underflow_anchor() { + // Mirrors test_jsonl_decode_ben_underflow: 2-byte payload, 1 padding bit. + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 3, 0, 0, 0, 2, 0b01100_100, 0b01_11011_0]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 4), (2, 1), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_exact_anchor() { + // Mirrors test_jsonl_decode_ben_exact: 5-byte payload, zero padding. + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 3, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b01100_100, + 0b01_11011_1, + 0b0010_1111, + 0b1_01001_10, + 0b001_11001_, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![ + (1u16, 4), + (2, 1), + (3, 3), + (2, 2), + (3, 7), + (1, 1), + (2, 1), + (3, 1), + ]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_16bit_val_anchor() { + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[10, 3, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b00000000, + 0b01100_100, + 0b00000000, + 0b01_000000, + 0b0011011_0, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 4), (512, 1), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_16bit_len_anchor() { + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 10, 0, 0, 0, 5]); + ben.extend_from_slice(&[ + 0b01000000, + 0b0100_1010, + 0b00000000_, + 0b11000000, + 0b0011_0000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 4), (2, 512), (3, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_val_65535_anchor() { + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[16, 4, 0, 0, 0, 8]); + ben.extend_from_slice(&[ + 0b00000000, + 0b00010111, + 0b0100_1111, + 0b11111111, + 0b11111111_, + 0b00000000, + 0b00001000, + 0b0011_0000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(23u16, 4), (65535, 15), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_len_65535_anchor() { + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[6, 16, 0, 0, 0, 9]); + ben.extend_from_slice(&[ + 0b01011100, + 0b00000000, + 0b000100_11, + 0b11001111, + 0b11111111, + 0b1111_0010, + 0b00000000, + 0b000000000, + 0b11_000000, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(23u16, 4), (60, 65535), (8, 3)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_max_val_and_len_65535_anchor() { + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[16, 16, 0, 0, 0, 12]); + ben.extend_from_slice(&[ + 0b00000000, + 0b00000001, + 0b00000000, + 0b00000011_, + 0b11111111, + 0b11111111, + 0b11111111, + 0b11111111_, + 0b00000000, + 0b00001000, + 0b00000000, + 0b00000100_, + ]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let rle_assign = vec![(1u16, 3), (65535, 65535), (8, 4)]; + let expected = json!({ + "assignment": rle_to_vec(rle_assign).iter().map(|x| json!(x)).collect::>(), + "sample": 1 + }) + .to_string() + + "\n"; + assert_eq!(out, expected.as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_single_element_anchor() { + // Anchor assignment [23], count=1. + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[5, 1, 0, 0, 0, 1, 0b101111_00]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + assert_eq!(out, expected_line(&[23u16], 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_single_one_anchor() { + // Anchor assignment [1], count=1. + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[1, 1, 0, 0, 0, 1, 0b11_000000]); + ben.extend_from_slice(&1u16.to_be_bytes()); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + assert_eq!(out, expected_line(&[1u16], 1).as_bytes()); +} + +#[test] +fn decode_ben_to_jsonl_three_frames_byte_level() { + // Hand-crafted TwoDelta BEN stream with explicit wire bytes: + // anchor [1,2] (count=1) in MkvChain format + // delta [1,2]→[2,1] (count=1) in TwoDelta format + // delta [2,1]→[1,2] (count=1) in TwoDelta format + // + // Anchor [1,2]: + // max_val_bits=2, max_len_bits=1, n_bytes=1 + // RLE (1,1),(2,1) → 3 bits each: 011_101_XX = 0b01110100 = 0x74 + // raw_bytes = [2, 1, 0,0,0,1, 0x74, 0,1] + // + // Delta [1,2]→[2,1]: + // pair=(2,1), run_lengths=[1,1], max_len_bits=1, n_bytes=1 + // payload: 2 × 1-bit values packed → 0b11000000 = 0xC0 + // raw_bytes = [0,2, 0,1, 1, 0,0,0,1, 0xC0, 0,1] + // + // Delta [2,1]→[1,2]: + // pair=(1,2), run_lengths=[1,1], same encoding + // raw_bytes = [0,1, 0,2, 1, 0,0,0,1, 0xC0, 0,1] + let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.extend_from_slice(&[2, 1, 0, 0, 0, 1, 0x74, 0, 1]); + ben.extend_from_slice(&[0, 2, 0, 1, 1, 0, 0, 0, 1, 0xC0, 0, 1]); + ben.extend_from_slice(&[0, 1, 0, 2, 1, 0, 0, 0, 1, 0xC0, 0, 1]); + + let mut out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap(); + + let expected = expected_line(&[1u16, 2], 1) + + &expected_line(&[2u16, 1], 2) + + &expected_line(&[1u16, 2], 3); + assert_eq!(out, expected.as_bytes()); +} + +// ─── decode_xben_to_jsonl round-trip ────────────────────────────────── + +#[test] +fn decode_xben_to_jsonl_twodelta_anchor_only() { + let anchor = vec![1u16, 2, 1, 2]; + let ben = make_twodelta_ben(&[anchor.clone()]); + let mut xben = Vec::new(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + + assert_eq!(jsonl, expected_line(&anchor, 1).as_bytes()); +} + +#[test] +fn decode_xben_to_jsonl_twodelta_chain_roundtrip() { + let a0 = vec![1u16, 2, 1, 2]; + let a1 = vec![1u16, 1, 2, 2]; + let a2 = vec![2u16, 1, 2, 1]; + let assignments = vec![a0.clone(), a1.clone(), a2.clone()]; + + let ben = make_twodelta_ben(&assignments); + let mut xben = Vec::new(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + + let expected = expected_line(&a0, 1) + &expected_line(&a1, 2) + &expected_line(&a2, 3); + assert_eq!(jsonl, expected.as_bytes()); +} + +#[test] +fn decode_xben_to_jsonl_twodelta_with_repetitions() { + // Repeated assignments in XBEN → correct expansion + let anchor = vec![1u16, 2, 1, 2]; + let assignments = vec![anchor.clone(), anchor.clone(), anchor.clone()]; + let ben = make_twodelta_ben(&assignments); + let mut xben = Vec::new(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + + let expected: String = (1..=3).map(|i| expected_line(&anchor, i)).collect(); + assert_eq!(jsonl, expected.as_bytes()); +} diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs new file mode 100644 index 0000000..c110221 --- /dev/null +++ b/ben/tests/test_assignment_reader.rs @@ -0,0 +1,1074 @@ +//! Rigorous tests for `AssignmentReader` with the MkvChain and TwoDelta BEN variants. +//! +//! Standard-variant tests already exist in `test_coverage.rs`. This file adds +//! equivalent depth for the two more complex variants. The helpers intentionally +//! mirror those in `test_coverage.rs` so that the two suites are easy to compare. + +use binary_ensemble::codec::decode::{decode_ben_line, decode_ben_to_jsonl}; +use binary_ensemble::codec::encode::encode_jsonl_to_ben; +use binary_ensemble::format::banners::{MKVCHAIN_BEN_BANNER, TWODELTA_BEN_BANNER}; +use binary_ensemble::io::reader::{AssignmentFrameReader, AssignmentReader}; +use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::util::rle::rle_to_vec; +use binary_ensemble::BenVariant; + +use serde_json::json; +use std::io::{self, Cursor, Write}; + +// ────────────────────────────────────────────────────────────────────────────── +// Shared helpers +// ────────────────────────────────────────────────────────────────────────────── + +fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { + let mut buf = Vec::new(); + for (i, a) in assignments.iter().enumerate() { + writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); + } + buf +} + +fn encode_ben(assignments: &[Vec], variant: BenVariant) -> Vec { + let jsonl = jsonl_from_assignments(assignments); + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, variant).unwrap(); + ben +} + +/// Expand all repetitions by calling `for_each_assignment`. +fn expand_assignments(ben: &[u8]) -> Vec> { + let mut decoder = AssignmentReader::new(ben).unwrap().silent(true); + let mut out = Vec::new(); + decoder + .for_each_assignment(|a, count| { + for _ in 0..count { + out.push(a.to_vec()); + } + Ok(true) + }) + .unwrap(); + out +} + +// ────────────────────────────────────────────────────────────────────────────── +// MkvChain variant +// ────────────────────────────────────────────────────────────────────────────── + +mod mkvchain { + use super::*; + + // ─── banner and initialisation ──────────────────────────────────────────── + + #[test] + fn banner_is_correct() { + let ben = encode_ben(&[vec![1u16, 2, 3]], BenVariant::MkvChain); + assert!(ben.starts_with(MKVCHAIN_BEN_BANNER)); + } + + #[test] + fn variant_accessor_returns_mkvchain() { + let ben = encode_ben(&[vec![1u16, 2]], BenVariant::MkvChain); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + assert_eq!(decoder.variant(), BenVariant::MkvChain); + } + + #[test] + fn empty_payload_yields_nothing() { + let ben = MKVCHAIN_BEN_BANNER.to_vec(); + let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let frames: Vec<_> = decoder.collect::>>().unwrap(); + assert!(frames.is_empty()); + } + + // ─── iterator / round-trips ─────────────────────────────────────────────── + + #[test] + fn single_assignment_round_trip() { + let assignment = vec![3u16, 3, 1, 2, 2, 1]; + let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 1); + assert_eq!(decoded, assignment); + assert!(decoder.next().is_none()); + } + + #[test] + fn multiple_distinct_assignments_each_have_count_one() { + let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![2u16, 1, 3]]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + for expected in &assignments { + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 1, "distinct assignment should have count=1"); + assert_eq!(&decoded, expected); + } + assert!(decoder.next().is_none()); + } + + #[test] + fn identical_run_compressed_into_single_frame_with_correct_count() { + // 5 identical assignments → one frame with count = 5. + let assignment = vec![2u16, 2, 1, 1]; + let assignments = vec![assignment.clone(); 5]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 5, "expected compressed count=5, got {count}"); + assert_eq!(decoded, assignment); + assert!(decoder.next().is_none()); + } + + #[test] + fn mixed_runs_yield_correct_frame_counts() { + // [A×3, B×2, C×1] → three frames with counts [3, 2, 1]. + let a = vec![1u16, 1, 1]; + let b = vec![2u16, 2, 2]; + let c = vec![3u16, 3, 3]; + let assignments = [ + a.clone(), a.clone(), a.clone(), + b.clone(), b.clone(), + c.clone(), + ]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + + let (d1, c1) = decoder.next().unwrap().unwrap(); + assert_eq!(c1, 3); + assert_eq!(d1, a); + + let (d2, c2) = decoder.next().unwrap().unwrap(); + assert_eq!(c2, 2); + assert_eq!(d2, b); + + let (d3, c3) = decoder.next().unwrap().unwrap(); + assert_eq!(c3, 1); + assert_eq!(d3, c); + + assert!(decoder.next().is_none()); + } + + #[test] + fn alternating_assignments_each_have_count_one() { + // A,B,A,B,A — no adjacent pair is identical, so every frame has count=1. + let a = vec![1u16, 2, 3]; + let b = vec![3u16, 2, 1]; + let assignments: Vec> = (0..5) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let records: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .collect::>>() + .unwrap(); + assert_eq!(records.len(), 5); + for (_, count) in &records { + assert_eq!(*count, 1, "alternating frames should each have count=1"); + } + } + + #[test] + fn iterator_values_match_original_assignments() { + let assignments: Vec> = (0u16..8).map(|i| vec![i, i + 1]).collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let decoded: Vec> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(decoded, assignments); + } + + // ─── count_samples ──────────────────────────────────────────────────────── + + #[test] + fn count_samples_with_no_repetitions() { + let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + } + + #[test] + fn count_samples_expands_repetitions() { + // 3×A + 2×B = 5 total samples from 2 frames. + let a = vec![1u16, 0]; + let b = vec![0u16, 1]; + let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..2).map(|_| b.clone())).collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 5); + } + + #[test] + fn count_samples_empty_stream() { + let ben = MKVCHAIN_BEN_BANNER.to_vec(); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 0); + } + + // ─── write_all_jsonl ────────────────────────────────────────────────────── + + #[test] + fn write_all_jsonl_expands_repetitions() { + // A single frame with count=3 must produce 3 separate JSONL lines. + let assignment = vec![5u16, 5, 5]; + let ben = encode_ben(&vec![assignment.clone(); 3], BenVariant::MkvChain); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + + assert_eq!(s.lines().count(), 3, "expected 3 JSONL lines for 3 samples"); + for line in s.lines() { + assert!(line.contains("\"assignment\":[5,5,5]"), "bad line: {line}"); + } + } + + #[test] + fn write_all_jsonl_sample_numbers_are_sequential() { + // Sample numbers must be 1, 2, 3 even when originating from one compressed frame. + let assignment = vec![1u16, 2, 3]; + let ben = encode_ben(&vec![assignment; 3], BenVariant::MkvChain); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + + let parsed: Vec = + s.lines().map(|l| serde_json::from_str(l).unwrap()).collect(); + for (i, v) in parsed.iter().enumerate() { + assert_eq!(v["sample"], i as u64 + 1, "sample number mismatch at position {i}"); + } + } + + #[test] + fn write_all_jsonl_mixed_runs_are_correct() { + let a = vec![10u16, 20]; + let b = vec![30u16, 40]; + // A, A, B → 3 lines, first two are [10,20], third is [30,40]. + let ben = encode_ben(&[a.clone(), a.clone(), b.clone()], BenVariant::MkvChain); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + let lines: Vec<&str> = s.lines().collect(); + + assert_eq!(lines.len(), 3); + assert!(lines[0].contains("[10,20]"), "line 0: {}", lines[0]); + assert!(lines[1].contains("[10,20]"), "line 1: {}", lines[1]); + assert!(lines[2].contains("[30,40]"), "line 2: {}", lines[2]); + } + + #[test] + fn write_all_jsonl_matches_codec_decode() { + let assignments: Vec> = vec![ + vec![1u16, 2, 1, 2], + vec![1u16, 2, 1, 2], + vec![2u16, 1, 2, 1], + ]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut via_reader = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut via_reader).unwrap(); + + let mut via_codec = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut via_codec).unwrap(); + + assert_eq!(via_reader, via_codec); + } + + // ─── for_each_assignment ───────────────────────────────────────────────── + + #[test] + fn for_each_receives_correct_count() { + let assignment = vec![7u16, 8, 9]; + let ben = encode_ben(&vec![assignment.clone(); 4], BenVariant::MkvChain); + + let mut seen_count = 0u16; + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|a, count| { + assert_eq!(a, assignment.as_slice()); + seen_count = count; + Ok(true) + }) + .unwrap(); + assert_eq!(seen_count, 4); + } + + #[test] + fn for_each_mixed_runs_delivers_correct_pairs() { + let a = vec![1u16, 1]; + let b = vec![2u16, 2]; + let c = vec![3u16, 3]; + let assignments = [a.clone(), a.clone(), a.clone(), b.clone(), b.clone(), c.clone()]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut frames: Vec<(Vec, u16)> = Vec::new(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + frames.push((assignment.to_vec(), count)); + Ok(true) + }) + .unwrap(); + + assert_eq!(frames, vec![(a, 3), (b, 2), (c, 1)]); + } + + #[test] + fn for_each_early_stop_terminates_after_first_frame() { + let a = vec![1u16, 1]; + let b = vec![2u16, 2]; + let c = vec![3u16, 3]; + let assignments = [a.clone(), a.clone(), b.clone(), c.clone()]; + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let mut seen = Vec::new(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, _count| { + seen.push(assignment.to_vec()); + Ok(false) // stop immediately after first frame + }) + .unwrap(); + assert_eq!(seen, vec![a]); + } + + // ─── into_frames (AssignmentFrameReader) ────────────────────────────────── + + #[test] + fn frame_reader_yields_count_in_tuple() { + // A run of 3 identical assignments → one frame tuple (frame, 3). + let assignment = vec![5u16, 6, 7]; + let ben = encode_ben(&vec![assignment; 3], BenVariant::MkvChain); + + let frames: Vec<_> = AssignmentFrameReader::new(Cursor::new(ben)) + .unwrap() + .collect::>>() + .unwrap(); + assert_eq!(frames.len(), 1, "expected one compressed frame"); + assert_eq!(frames[0].1, 3, "count should be 3"); + } + + #[test] + fn frame_reader_mixed_runs_yield_correct_counts() { + let a = vec![1u16, 0]; + let b = vec![0u16, 1]; + // A×2, B×1 → 2 frames with counts [2, 1]. + let ben = encode_ben(&[a.clone(), a.clone(), b.clone()], BenVariant::MkvChain); + + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .collect::>>() + .unwrap(); + assert_eq!(frames.len(), 2); + assert_eq!(frames[0].1, 2, "first frame count"); + assert_eq!(frames[1].1, 1, "second frame count"); + } + + #[test] + fn frame_reader_bytes_decode_back_to_original_assignment() { + let assignment = vec![3u16, 3, 1, 2]; + let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); + + let (frame, _count) = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .next() + .unwrap() + .unwrap(); + + let decoded = decode_ben_line( + Cursor::new(&frame.raw_bytes), + frame.max_val_bit_count, + frame.max_len_bit_count, + frame.n_bytes, + ) + .map(rle_to_vec) + .unwrap(); + assert_eq!(decoded, assignment); + } + + // ─── subsampling ────────────────────────────────────────────────────────── + // + // SubsampleFrameDecoder operates at the frame level: it returns one + // (assignment, count) tuple per frame that contains any selected indices, + // where count is the number of selected indices in that frame. + + #[test] + fn subsample_by_indices_locates_correct_sample_in_run() { + // A×5, B×5; index 3 is in the A run, index 6 is the first B. + let a = vec![1u16; 4]; + let b = vec![2u16; 4]; + let assignments: Vec<_> = (0..5).map(|_| a.clone()).chain((0..5).map(|_| b.clone())).collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_indices(vec![3usize, 6]) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(selected, vec![(a, 1), (b, 1)]); + } + + #[test] + fn subsample_by_indices_multiple_in_same_frame_returns_count() { + // A×5; indices 2 and 4 both fall in the single A frame → one result with count=2. + let a = vec![1u16; 4]; + let ben = encode_ben(&vec![a.clone(); 5], BenVariant::MkvChain); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_indices(vec![2usize, 4]) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(selected.len(), 1, "two indices in same frame → one result tuple"); + assert_eq!(selected[0].0, a); + assert_eq!(selected[0].1, 2, "count should be 2"); + } + + #[test] + fn subsample_by_range_spans_repeated_frames() { + // A×3, B×3; range [2, 5] → A contributes samples 2,3 (count=2) and B contributes 4,5 (count=2). + let a = vec![10u16; 3]; + let b = vec![20u16; 3]; + let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_range(2, 5) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(selected.len(), 2, "two frames contribute to range [2,5]"); + assert_eq!(selected[0], (a, 2)); // samples 2,3 from A + assert_eq!(selected[1], (b, 2)); // samples 4,5 from B + } + + #[test] + fn subsample_every_within_single_run() { + // A×6; every 2nd from offset 1 → indices 1,3,5 all in the A frame → count=3. + let a = vec![99u16; 2]; + let ben = encode_ben(&vec![a.clone(); 6], BenVariant::MkvChain); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_every(2, 1) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(selected.len(), 1, "all selected indices in one frame → one result"); + assert_eq!(selected[0].0, a); + assert_eq!(selected[0].1, 3, "indices 1,3,5 selected → count=3"); + } + + #[test] + fn subsample_every_across_two_runs() { + // A×4, B×4; every 2nd from offset 2 → indices 2,4,6,8 → 2 from A, 2 from B. + let a = vec![10u16; 2]; + let b = vec![20u16; 2]; + let assignments: Vec<_> = (0..4).map(|_| a.clone()).chain((0..4).map(|_| b.clone())).collect(); + let ben = encode_ben(&assignments, BenVariant::MkvChain); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_every(2, 2) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(selected, vec![(a, 2), (b, 2)]); + } + + // ─── error paths ───────────────────────────────────────────────────────── + + #[test] + fn truncated_count_field_errors_on_next() { + // Drop the last byte of the MkvChain count (u16 BE) from the stream. + let assignment = vec![1u16, 1]; + let ben = encode_ben(&[assignment], BenVariant::MkvChain); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn truncated_mid_payload_errors_on_next() { + let assignment = vec![1u16, 2, 3, 4, 5]; + let ben = encode_ben(&[assignment], BenVariant::MkvChain); + let truncated = &ben[..ben.len() - 5]; + let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn count_samples_propagates_truncation_error() { + let assignment = vec![1u16, 2]; + let ben = encode_ben(&[assignment], BenVariant::MkvChain); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated).unwrap().count_samples().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn write_all_jsonl_propagates_truncation_error() { + let assignment = vec![1u16, 2]; + let ben = encode_ben(&[assignment], BenVariant::MkvChain); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated) + .unwrap() + .write_all_jsonl(io::sink()) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } +} + +// ────────────────────────────────────────────────────────────────────────────── +// TwoDelta variant +// ────────────────────────────────────────────────────────────────────────────── + +mod twodelta { + use super::*; + + /// Encode via `AssignmentWriter` so we control the exact frame layout. + fn encode_twodelta(assignments: &[Vec]) -> Vec { + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + ben + } + + // ─── banner and initialisation ──────────────────────────────────────────── + + #[test] + fn banner_is_correct() { + let ben = encode_twodelta(&[vec![1u16, 2, 3]]); + assert!(ben.starts_with(TWODELTA_BEN_BANNER)); + } + + #[test] + fn variant_accessor_returns_twodelta() { + let ben = encode_twodelta(&[vec![1u16, 2]]); + assert_eq!( + AssignmentReader::new(ben.as_slice()).unwrap().variant(), + BenVariant::TwoDelta + ); + } + + // ─── round-trips ────────────────────────────────────────────────────────── + + #[test] + fn single_anchor_frame_round_trip() { + // A stream with only one assignment contains just the anchor frame. + let assignment = vec![1u16, 1, 2, 2, 3, 3]; + let ben = encode_twodelta(&[assignment.clone()]); + assert_eq!(expand_assignments(&ben), vec![assignment]); + } + + #[test] + fn anchor_then_single_delta_round_trip() { + let anchor = vec![1u16, 1, 2, 2]; + let next = vec![2u16, 2, 1, 1]; // all 1s↔2s swapped + let input = vec![anchor.clone(), next.clone()]; + let ben = encode_twodelta(&input); + assert_eq!(expand_assignments(&ben), input); + } + + #[test] + fn multiple_deltas_round_trip() { + // a→b→a→b: two alternating pair-swap assignments. + let a = vec![1u16, 1, 2, 2, 3, 3]; + let b = vec![2u16, 2, 1, 1, 3, 3]; // 1↔2, 3s unchanged + let input = vec![a.clone(), b.clone(), a.clone(), b.clone()]; + let ben = encode_twodelta(&input); + assert_eq!(expand_assignments(&ben), input); + } + + #[test] + fn delta_values_are_applied_correctly() { + // Explicit value check: the decoder must correctly update the previous + // assignment when it applies the delta. + // anchor: [1, 2, 1, 2, 1] + // next: [2, 1, 2, 1, 2] (every element swaps 1↔2) + let anchor = vec![1u16, 2, 1, 2, 1]; + let next = vec![2u16, 1, 2, 1, 2]; + let ben = encode_twodelta(&[anchor, next.clone()]); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let _ = decoder.next().unwrap().unwrap(); // skip anchor + let (decoded_next, _) = decoder.next().unwrap().unwrap(); + assert_eq!(decoded_next, next); + } + + #[test] + fn partial_swap_delta_is_correct() { + // Only some positions change: [1,2,3,1] → [2,2,3,2] (1s → 2s, 2s stay). + let anchor = vec![1u16, 2, 3, 1]; + let next = vec![2u16, 2, 3, 2]; + let input = vec![anchor, next]; + let ben = encode_twodelta(&input); + assert_eq!(expand_assignments(&ben), input); + } + + #[test] + fn long_delta_chain_round_trip() { + // A longer chain: a, b, a, b, a, b (6 assignments, 3 a→b and 2 b→a deltas). + let a = vec![1u16, 1, 2, 2, 1, 2]; + let b = vec![2u16, 2, 1, 1, 2, 1]; // 1↔2 everywhere + let input: Vec> = (0..6).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let ben = encode_twodelta(&input); + assert_eq!(expand_assignments(&ben), input); + } + + // ─── repetition counts ──────────────────────────────────────────────────── + + #[test] + fn anchor_repetition_count_is_correct() { + // Three identical anchor assignments → one frame with count=3. + let anchor = vec![1u16, 1, 2, 2]; + let ben = encode_twodelta(&vec![anchor.clone(); 3]); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let (decoded, count) = decoder.next().unwrap().unwrap(); + assert_eq!(count, 3, "anchor count should be 3"); + assert_eq!(decoded, anchor); + assert!(decoder.next().is_none()); + } + + #[test] + fn delta_repetition_count_is_correct() { + // a, b, b, b → anchor(a, 1), delta(a→b, 3). + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = std::iter::once(a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); + let ben = encode_twodelta(&assignments); + + let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + + let (d_anchor, c_anchor) = decoder.next().unwrap().unwrap(); + assert_eq!(c_anchor, 1, "anchor count"); + assert_eq!(d_anchor, a); + + let (d_delta, c_delta) = decoder.next().unwrap().unwrap(); + assert_eq!(c_delta, 3, "delta count should be 3"); + assert_eq!(d_delta, b); + + assert!(decoder.next().is_none()); + } + + #[test] + fn anchor_and_delta_repetitions_round_trip() { + // a×2, b×3 → anchor(2), delta(3). Expanding must give 5 correct assignments. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + assert_eq!(expand_assignments(&ben), assignments); + } + + #[test] + fn interleaved_repetitions_round_trip() { + // a, b, b, a, a, a, b → anchor(a,1), delta(a→b,2), delta(b→a,3), delta(a→b,1). + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments = vec![ + a.clone(), + b.clone(), b.clone(), + a.clone(), a.clone(), a.clone(), + b.clone(), + ]; + let ben = encode_twodelta(&assignments); + assert_eq!(expand_assignments(&ben), assignments); + } + + // ─── count_samples ──────────────────────────────────────────────────────── + + #[test] + fn count_samples_single_anchor() { + let ben = encode_twodelta(&[vec![1u16, 2, 3]]); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 1); + } + + #[test] + fn count_samples_anchor_plus_two_deltas() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments = vec![a.clone(), b.clone(), a.clone()]; + let ben = encode_twodelta(&assignments); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + } + + #[test] + fn count_samples_expands_repetitions() { + // a×2, b×3 → 5 total. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 5); + } + + // ─── write_all_jsonl ────────────────────────────────────────────────────── + + #[test] + fn write_all_jsonl_single_anchor() { + let assignment = vec![1u16, 2, 3]; + let ben = encode_twodelta(&[assignment.clone()]); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + + assert_eq!(s.lines().count(), 1); + assert!(s.contains("[1,2,3]")); + } + + #[test] + fn write_all_jsonl_expands_all_repetitions() { + // a×2, b×3 → 5 lines with correct content. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + let lines: Vec<&str> = s.lines().collect(); + + assert_eq!(lines.len(), 5); + assert!(lines[0].contains("[1,1,2,2]"), "line 0: {}", lines[0]); + assert!(lines[1].contains("[1,1,2,2]"), "line 1: {}", lines[1]); + assert!(lines[2].contains("[2,2,1,1]"), "line 2: {}", lines[2]); + assert!(lines[3].contains("[2,2,1,1]"), "line 3: {}", lines[3]); + assert!(lines[4].contains("[2,2,1,1]"), "line 4: {}", lines[4]); + } + + #[test] + fn write_all_jsonl_sample_numbers_are_sequential() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + + let mut out = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + + let parsed: Vec = + s.lines().map(|l| serde_json::from_str(l).unwrap()).collect(); + for (i, v) in parsed.iter().enumerate() { + assert_eq!(v["sample"], i as u64 + 1, "sample number at position {i}"); + } + } + + #[test] + fn write_all_jsonl_matches_codec_decode() { + let a = vec![1u16, 2, 1, 2]; + let b = vec![2u16, 1, 2, 1]; + let ben = encode_twodelta(&[a.clone(), b.clone(), a.clone()]); + + let mut via_reader = Vec::new(); + AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut via_reader).unwrap(); + + let mut via_codec = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut via_codec).unwrap(); + + assert_eq!(via_reader, via_codec); + } + + // ─── for_each_assignment ───────────────────────────────────────────────── + + #[test] + fn for_each_receives_anchor_count() { + let anchor = vec![1u16, 1, 2, 2]; + let ben = encode_twodelta(&vec![anchor.clone(); 4]); + + let mut seen: Vec<(Vec, u16)> = Vec::new(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|a, count| { + seen.push((a.to_vec(), count)); + Ok(true) + }) + .unwrap(); + + assert_eq!(seen, vec![(anchor, 4)]); + } + + #[test] + fn for_each_receives_anchor_and_delta_counts() { + // a×2, b×3 → callback invoked twice: (a, 2) then (b, 3). + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + + let mut frames: Vec<(Vec, u16)> = Vec::new(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + frames.push((assignment.to_vec(), count)); + Ok(true) + }) + .unwrap(); + + assert_eq!(frames, vec![(a, 2), (b, 3)]); + } + + #[test] + fn for_each_early_stop() { + // Three distinct frames; stopping after the second delivers exactly 2. + let a = vec![1u16, 2, 1, 2]; + let b = vec![2u16, 1, 2, 1]; + let c = vec![1u16, 1, 2, 2]; + let ben = encode_twodelta(&[a.clone(), b.clone(), c.clone()]); + + let mut seen = Vec::new(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, _count| { + seen.push(assignment.to_vec()); + Ok(seen.len() < 2) + }) + .unwrap(); + + assert_eq!(seen.len(), 2); + assert_eq!(seen[0], a); + assert_eq!(seen[1], b); + } + + // ─── into_frames (AssignmentFrameReader) ────────────────────────────────── + + #[test] + fn into_frames_count_is_preserved_through_re_encoding() { + // a×2, b×3 → 2 re-encoded frames with counts [2, 3]. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .collect::>>() + .unwrap(); + + assert_eq!(frames.len(), 2); + assert_eq!(frames[0].1, 2, "anchor frame count"); + assert_eq!(frames[1].1, 3, "delta frame count"); + } + + #[test] + fn into_frames_decodes_to_correct_assignments() { + // Each re-encoded frame must decode back to the materialized assignment. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let c = vec![1u16, 2, 1, 2]; + let input = vec![a.clone(), b.clone(), c.clone()]; + let ben = encode_twodelta(&input); + + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .collect::>>() + .unwrap(); + + assert_eq!(frames.len(), 3); + for (i, (frame, _count)) in frames.iter().enumerate() { + let decoded = decode_ben_line( + Cursor::new(&frame.raw_bytes), + frame.max_val_bit_count, + frame.max_len_bit_count, + frame.n_bytes, + ) + .map(rle_to_vec) + .unwrap(); + assert_eq!(decoded, input[i], "frame {i} decoded incorrectly"); + } + } + + #[test] + fn into_frames_from_anchor_only_has_single_frame_with_count_one() { + let assignment = vec![1u16, 2, 3]; + let ben = encode_twodelta(&[assignment]); + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .collect::>>() + .unwrap(); + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].1, 1); + } + + #[test] + fn into_frames_length_matches_unique_assignment_count() { + // a, b, a, b, a → 5 distinct frames (no run-length compression). + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let input: Vec<_> = (0..5).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let ben = encode_twodelta(&input); + + let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_frames() + .collect::>>() + .unwrap(); + assert_eq!(frames.len(), 5, "expected one frame per unique transition"); + } + + // ─── subsampling ────────────────────────────────────────────────────────── + + #[test] + fn subsample_by_indices_distinct_frames() { + // Five distinct assignments; select 1-based indices 1, 3, 5. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let c = vec![1u16, 2, 1, 2]; + let input = vec![a.clone(), b.clone(), c.clone(), a.clone(), b.clone()]; + let ben = encode_twodelta(&input); + + let selected: Vec> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_indices(vec![1usize, 3, 5]) + .map(|r| r.unwrap().0) + .collect(); + + assert_eq!(selected, vec![a.clone(), c.clone(), b.clone()]); + } + + #[test] + fn subsample_by_range_distinct_frames() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let c = vec![1u16, 2, 1, 2]; + let input = vec![a.clone(), b.clone(), c.clone(), a.clone(), b.clone()]; + let ben = encode_twodelta(&input); + + // Range [2, 4] → 3 assignments: b, c, a. + let selected: Vec> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_range(2, 4) + .map(|r| r.unwrap().0) + .collect(); + + assert_eq!(selected, vec![b.clone(), c.clone(), a.clone()]); + } + + #[test] + fn subsample_every_distinct_frames() { + // 6 cycling assignments: a,b,c,a,b,c. Every 3 from offset 1 → indices 1,4 → a,a. + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let c = vec![1u16, 2, 1, 2]; + let input = vec![a.clone(), b.clone(), c.clone(), a.clone(), b.clone(), c.clone()]; + let ben = encode_twodelta(&input); + + let selected: Vec> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_every(3, 1) + .map(|r| r.unwrap().0) + .collect(); + + assert_eq!(selected, vec![a.clone(), a.clone()]); + } + + #[test] + fn subsample_by_indices_across_repeated_frames() { + // a×3, b×3 → 6 samples from 2 frames. + // Indices 1 and 3 fall in the anchor (a) frame → (a, 2). + // Index 4 is the first b → (b, 1). + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let ben = encode_twodelta(&assignments); + + let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .into_subsample_by_indices(vec![1usize, 3, 4]) + .map(|r| r.unwrap()) + .collect(); + + assert_eq!(selected.len(), 2); + assert_eq!(selected[0], (a, 2)); // indices 1,3 in anchor frame + assert_eq!(selected[1], (b, 1)); // index 4 in delta frame + } + + // ─── error paths ───────────────────────────────────────────────────────── + + #[test] + fn truncated_anchor_errors_on_next() { + let assignment = vec![1u16, 2, 3]; + let ben = encode_twodelta(&[assignment]); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn truncated_delta_errors_on_next() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let ben = encode_twodelta(&[a.clone(), b.clone()]); + let truncated = &ben[..ben.len() - 1]; + + let mut decoder = AssignmentReader::new(truncated).unwrap().silent(true); + let _ = decoder.next().unwrap().unwrap(); // anchor succeeds + let err = decoder.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn count_samples_propagates_truncation_error() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let ben = encode_twodelta(&[a, b]); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated).unwrap().count_samples().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn write_all_jsonl_propagates_truncation_error() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let ben = encode_twodelta(&[a, b]); + let truncated = &ben[..ben.len() - 1]; + let err = AssignmentReader::new(truncated) + .unwrap() + .write_all_jsonl(io::sink()) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + } +} From 9e8fa327906838cb958899063d05698ec26e3c7b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:18:10 -0600 Subject: [PATCH 055/221] Add twodelta into cli --- ben/src/cli/ben.rs | 128 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 8b75486..1289d3f 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -16,6 +16,32 @@ use std::{ type DynReader = Box; type DynWriter = Box; +#[derive(Debug, Clone, Copy, ValueEnum, PartialEq)] +enum CliVariant { + /// Store each sample independently. + Standard, + /// Store one frame plus a repetition count for repeated consecutive samples. + #[value(alias = "mkv_chain")] + Mkvchain, + /// Store delta-encoded frames. + #[value(alias = "two_delta")] + Twodelta, +} + +/// Resolve the BEN variant from the CLI flags. +/// +/// `--variant` takes precedence over `--save-all`. +/// If neither is given, defaults to MkvChain. +fn resolve_variant(variant: Option, save_all: bool) -> BenVariant { + match variant { + Some(CliVariant::Standard) => BenVariant::Standard, + Some(CliVariant::Mkvchain) => BenVariant::MkvChain, + Some(CliVariant::Twodelta) => BenVariant::TwoDelta, + None if save_all => BenVariant::Standard, + None => BenVariant::MkvChain, + } +} + #[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] /// Defines the mode of operation. enum Mode { @@ -85,8 +111,15 @@ struct Args { /// of that assignment vector (this is useful for Markov chian methods /// like ReCom). This flag will cause the program to forgo the repetition /// count and just save all of the assignment vectors as they are encountered. + /// Equivalent to `--variant standard`. Ignored if `--variant` is set. #[arg(short = 'a', long)] save_all: bool, + /// BEN variant to use when encoding. + /// Possible values: standard, mkvchain, twodelta. + /// Defaults to mkvchain if neither this nor --save-all is given. + /// Takes precedence over --save-all when both are provided. + #[arg(short = 't', long, value_enum)] + variant: Option, /// If the output file already exists, this flag /// will cause the program to overwrite it without /// asking the user for confirmation. @@ -286,13 +319,8 @@ pub fn run() { }, }; - let possible_error = if args.save_all { - encode_jsonl_to_ben(reader, writer, BenVariant::Standard) - } else { - encode_jsonl_to_ben(reader, writer, BenVariant::MkvChain) - }; - - if let Err(err) = possible_error { + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = encode_jsonl_to_ben(reader, writer, variant) { eprintln!("Error: {:?}", err); } } @@ -340,26 +368,15 @@ pub fn run() { eprintln!("Error: {:?}", err); } } else if jsonl_and_xben { - let possible_error = if args.save_all { - encode_jsonl_to_xben( - reader, - writer, - BenVariant::Standard, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) - } else { - encode_jsonl_to_xben( - reader, - writer, - BenVariant::MkvChain, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) - }; - if let Err(e) = possible_error { + let variant = resolve_variant(args.variant, args.save_all); + if let Err(e) = encode_jsonl_to_xben( + reader, + writer, + variant, + args.n_cpus, + args.compression_level, + args.chunk_size, + ) { eprintln!("Error: {:?}", e); } } else { @@ -579,6 +596,63 @@ mod tests { assert!(args.verbose); } + #[test] + fn parse_variant_flag() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "twodelta", + "input.jsonl", + ]) + .unwrap(); + + assert_eq!(args.variant, Some(CliVariant::Twodelta)); + } + + #[test] + fn parse_variant_aliases() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "mkv_chain", + "input.jsonl", + ]) + .unwrap(); + assert_eq!(args.variant, Some(CliVariant::Mkvchain)); + + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "two_delta", + "input.jsonl", + ]) + .unwrap(); + assert_eq!(args.variant, Some(CliVariant::Twodelta)); + } + + #[test] + fn resolve_variant_precedence() { + // --variant takes precedence over --save-all + assert_eq!( + resolve_variant(Some(CliVariant::Twodelta), true), + BenVariant::TwoDelta + ); + assert_eq!( + resolve_variant(Some(CliVariant::Mkvchain), true), + BenVariant::MkvChain + ); + // --save-all alone means Standard + assert_eq!(resolve_variant(None, true), BenVariant::Standard); + // neither means MkvChain + assert_eq!(resolve_variant(None, false), BenVariant::MkvChain); + } + #[test] fn parse_xencode_stream_flags() { let args = Args::try_parse_from([ From edc9647802e242a632b004277f7917a36b06e468 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:18:13 -0600 Subject: [PATCH 056/221] add twodelta to python side --- pyben/binary_ensemble/_core.pyi | 12 ++++++------ pyben/src/common.rs | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyben/binary_ensemble/_core.pyi b/pyben/binary_ensemble/_core.pyi index adc8b2c..64e223e 100644 --- a/pyben/binary_ensemble/_core.pyi +++ b/pyben/binary_ensemble/_core.pyi @@ -138,7 +138,7 @@ class PyBenEncoder: self, file_path: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain"] | None = None, + variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, ) -> None: """Initializes the encoder and opens the underlying file. @@ -148,7 +148,7 @@ class PyBenEncoder: Path to the output BEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain"}, optional + variant : {"standard", "mkv_chain", "twodelta"}, optional Select BEN variant. If None, defaults to "mkv_chain". Raises @@ -247,7 +247,7 @@ def compress_jsonl_to_ben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain"] | None = None, + variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, ) -> None: """Converts a JSONL file to a BEN file. @@ -259,7 +259,7 @@ def compress_jsonl_to_ben( Path to the output BEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain"}, optional + variant : {"standard", "mkv_chain", "twodelta"}, optional Select BEN variant. If None, defaults to "mkv_chain". Raises @@ -275,7 +275,7 @@ def compress_jsonl_to_xben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain"] | None = None, + variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, n_threads: int | None = None, compression_level: int | None = None, ) -> None: @@ -289,7 +289,7 @@ def compress_jsonl_to_xben( Path to the output XBEN file. overwrite : Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain"}, optional + variant : {"standard", "mkv_chain", "twodelta"}, optional Select BEN variant. If None, defaults to "mkv_chain". n_threads : Number of threads to use for compression. If None, defaults to the number of CPU cores. diff --git a/pyben/src/common.rs b/pyben/src/common.rs index bdd1673..88f5a40 100644 --- a/pyben/src/common.rs +++ b/pyben/src/common.rs @@ -9,8 +9,9 @@ pub fn parse_variant(variant: Option<&str>) -> PyResult { match variant { Some("standard") => Ok(BenVariant::Standard), Some("mkv_chain") | Some("markov") | None => Ok(BenVariant::MkvChain), + Some("twodelta") | Some("two_delta") => Ok(BenVariant::TwoDelta), Some(other) => Err(PyValueError::new_err(format!( - "Unknown variant: {other}. Supported variants are 'standard' and 'mkv_chain'." + "Unknown variant: {other}. Supported variants are 'standard', 'mkv_chain', and 'twodelta'." ))), } } From f53715ed8e40c059b3e36851d65c5cbe948638bf Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:27:35 -0600 Subject: [PATCH 057/221] improve docs for things that should not use twodelta --- ben/src/codec/translate/errors.rs | 5 ++++- ben/src/codec/translate/mod.rs | 17 +++++++++++++++++ ben/src/lib.rs | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/ben/src/codec/translate/errors.rs b/ben/src/codec/translate/errors.rs index c26501d..785eb37 100644 --- a/ben/src/codec/translate/errors.rs +++ b/ben/src/codec/translate/errors.rs @@ -13,7 +13,10 @@ pub enum TranslateError { )] Ben32MissingTerminator { actual: [u8; 4], offset: usize }, - #[error("TwoDelta BEN streams cannot be translated to ben32/XBEN via this path")] + #[error( + "TwoDelta BEN streams cannot be translated to ben32; \ + use XZAssignmentWriter/XZAssignmentReader for TwoDelta compressed I/O" + )] TwoDeltaUnsupported, #[error("IO error: {0}")] diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index c9a54c2..31a4c71 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -1,4 +1,10 @@ //! Translation helpers between BEN and ben32 representations. +//! +//! The ben32 intermediate format is used only by the Standard and MkvChain +//! variants. TwoDelta streams use a separate columnar layout and bypass +//! ben32 entirely — see [`XZAssignmentWriter`](crate::io::writer::XZAssignmentWriter) +//! and [`XZAssignmentReader`](crate::io::reader::XZAssignmentReader) for the +//! TwoDelta compressed-I/O path. mod errors; use errors::TranslateError; @@ -59,6 +65,11 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { /// This is primarily used while decoding XBEN, where the compressed payload is /// stored in ben32 form. /// +/// Only the [`Standard`](BenVariant::Standard) and +/// [`MkvChain`](BenVariant::MkvChain) variants are supported. +/// TwoDelta streams use a different compressed layout and do not pass through +/// ben32; see the module-level documentation for details. +/// /// # Arguments /// /// * `reader` - The ben32 input stream. @@ -147,6 +158,12 @@ fn ben_to_ben32_line( /// This is the format used inside XBEN after the outer XZ compression layer is /// removed. /// +/// Only the [`Standard`](BenVariant::Standard) and +/// [`MkvChain`](BenVariant::MkvChain) variants are supported. +/// Passing [`TwoDelta`](BenVariant::TwoDelta) returns an error. TwoDelta +/// streams use a separate columnar layout and bypass ben32 entirely; see +/// the module-level documentation for details. +/// /// # Arguments /// /// * `reader` - The BEN input stream without its 17-byte file banner. diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 687c332..6ab7497 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -56,6 +56,6 @@ pub enum BenVariant { Standard, /// Store one frame plus a repetition count for repeated consecutive samples. MkvChain, - /// Reserved for a future delta-based variant. + /// Store delta-encoded frames for improved compression of correlated samples. TwoDelta, } From b1c08135dcf6732a6f2c6be8f4bbe7b511d2e39f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:31:52 -0600 Subject: [PATCH 058/221] Improve test suite --- ben/src/codec/decode/tests/mod.rs | 151 ++++++++++++++ ben/src/codec/encode/tests.rs | 243 +++++++++++++++++++++- ben/src/codec/frames/mod.rs | 3 + ben/src/codec/frames/tests.rs | 298 ++++++++++++++++++++++++++ ben/src/codec/translate/tests.rs | 19 ++ ben/src/format/mod.rs | 3 + ben/src/format/tests.rs | 20 ++ ben/src/io/reader/tests.rs | 335 ++++++++++++++++++++++++++++++ ben/src/ops/relabel/tests.rs | 20 ++ 9 files changed, 1091 insertions(+), 1 deletion(-) create mode 100644 ben/src/codec/frames/tests.rs create mode 100644 ben/src/format/tests.rs diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 5cef702..68cde27 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -1,3 +1,154 @@ mod mkvchain; mod standard; mod twodelta; + +use std::io; + +#[test] +fn decode_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let decode_err = super::DecodeError::Io(inner); + let io_err: io::Error = decode_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} + +#[test] +fn decode_error_non_io_becomes_invalid_data() { + let decode_err = super::DecodeError::TwoDeltaNoAnchorFrame; + let io_err: io::Error = decode_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn decode_xben_to_ben_twodelta_roundtrip() { + use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; + use crate::codec::encode::encode_jsonl_to_xben; + use crate::BenVariant; + use serde_json::Value; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,1,2,2],"sample":2} +{"assignment":[2,2,2,2],"sample":3} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::TwoDelta, + Some(1), + Some(1), + None, + ) + .unwrap(); + + // Decode XBEN → BEN + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + assert!(!ben.is_empty()); + + // Decode BEN → JSONL and verify + let mut jsonl_out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut jsonl_out).unwrap(); + let output_str = String::from_utf8(jsonl_out).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([2, 1, 2, 2])); + let v3: Value = serde_json::from_str(lines[2]).unwrap(); + assert_eq!(v3["assignment"], serde_json::json!([2, 2, 2, 2])); +} + +#[test] +fn decode_xben_to_jsonl_twodelta() { + use crate::codec::decode::decode_xben_to_jsonl; + use crate::codec::encode::encode_jsonl_to_xben; + use crate::BenVariant; + use serde_json::Value; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,1,2,2],"sample":2} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::TwoDelta, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut jsonl_out = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl_out).unwrap(); + let output_str = String::from_utf8(jsonl_out).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); +} + +#[test] +fn decode_xben_to_jsonl_rejects_invalid_banner() { + use crate::codec::decode::decode_xben_to_jsonl; + use crate::codec::encode::xz_compress; + use std::io::BufReader; + + // Create XZ-compressed data with a bad banner + let mut bad_data = b"GARBAGE BANNER!!!".to_vec(); + bad_data.extend_from_slice(&[0u8; 20]); + let mut xz = Vec::new(); + xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1)).unwrap(); + + let mut output = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut output).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_ben_to_xben_roundtrip() { + use crate::codec::decode::decode_xben_to_ben; + use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; + use crate::BenVariant; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + // JSONL → BEN + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + // BEN → XBEN + let mut xben = Vec::new(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None).unwrap(); + + // XBEN → BEN + let mut ben2 = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); + + assert_eq!(ben, ben2); +} + +#[test] +fn encode_ben_to_xben_with_chunk_size() { + use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; + use crate::BenVariant; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let mut xben = Vec::new(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), Some(1)).unwrap(); + assert!(!xben.is_empty()); +} diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 2129188..68a7e88 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -4,7 +4,7 @@ use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; use serde_json::Value; -use std::io::{BufRead, Write}; +use std::io::{self, BufRead, Write}; #[test] fn test_encode_jsonl_to_ben_underflow() { @@ -672,3 +672,244 @@ fn encode_jsonl_to_ben32_multiple_simple_lines() { } assert_eq!(buffer, expected_output) } + +#[test] +fn encode_ben32_line_missing_assignment_field() { + let data = json!({"sample": 1}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("assignment")); +} + +#[test] +fn encode_ben32_line_non_integer_value() { + let data = json!({"assignment": ["not_a_number"]}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_ben32_line_value_too_large_for_u16() { + let data = json!({"assignment": [100000]}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("too large")); +} + +#[test] +fn encode_ben32_assignments_empty_vec() { + let result = encode_ben32_assignments(Vec::::new()).unwrap(); + // Empty vec produces only the terminator + assert_eq!(result, vec![0, 0, 0, 0]); +} + +#[test] +fn encode_ben32_assignments_single_element() { + let result = encode_ben32_assignments(vec![5u16]).unwrap(); + // (5 << 16) | 1 = 0x00050001, then terminator + assert_eq!(result, vec![0, 5, 0, 1, 0, 0, 0, 0]); +} + +#[test] +fn encode_jsonl_to_ben_invalid_json_errors() { + let input = b"not valid json\n"; + let mut output = Vec::new(); + let err = encode_jsonl_to_ben(input.as_slice(), &mut output, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_jsonl_to_xben_roundtrip() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::Standard, + Some(1), + Some(1), + None, + ) + .unwrap(); + assert!(!xben.is_empty()); +} + +#[test] +fn encode_jsonl_to_xben_with_chunk_size() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::Standard, + Some(1), + Some(1), + Some(2), + ) + .unwrap(); + assert!(!xben.is_empty()); +} + +#[test] +fn encode_jsonl_to_xben_invalid_json_errors() { + let input = b"not valid json\n"; + let mut output = Vec::new(); + let err = encode_jsonl_to_xben( + input.as_slice(), + &mut output, + BenVariant::Standard, + Some(1), + Some(1), + None, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_jsonl_to_xben_mkv_variant() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(1), + None, + ) + .unwrap(); + assert!(!xben.is_empty()); +} + +#[test] +fn twodelta_encode_with_pair_and_mask_hints() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![2, 3]); + + let frame = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap(); + assert_eq!(frame.pair, (2, 1)); + assert!(!frame.run_length_vector.is_empty()); + // Verify masks were updated + assert_eq!(masks[&2], vec![0, 2]); + assert_eq!(masks[&1], vec![1, 3]); +} + +#[test] +fn twodelta_encode_with_mask_hint_only() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![2, 3]); + + let frame = encode_twodelta_frame_with_hint(&prev, &curr, None, Some(&mut masks), None) + .unwrap(); + assert_eq!(frame.pair, (2, 1)); +} + +#[test] +fn twodelta_encode_length_mismatch() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + + let prev = vec![1u16, 1, 2]; + let curr = vec![2u16, 1, 2, 1]; + let err = encode_twodelta_frame_with_hint(&prev, &curr, None, None, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_hint_without_masks_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), None, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_identical_pair_hint_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks = HashMap::new(); + masks.insert(1u16, vec![0, 1]); + + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 1)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_identical_assignments_errors() { + use crate::codec::encode::encode_twodelta_frame; + + let a = vec![1u16, 1, 2, 2]; + let err = encode_twodelta_frame(&a, &a, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_too_many_ids_errors() { + use crate::codec::encode::encode_twodelta_frame; + + let prev = vec![1u16, 2, 3, 4]; + let curr = vec![2u16, 1, 4, 3]; // 4 ids changing + let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_mask_hint_identical_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let a = vec![1u16, 1, 2, 2]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![2, 3]); + + let err = + encode_twodelta_frame_with_hint(&a, &a, None, Some(&mut masks), None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let encode_err = super::errors::EncodeError::Io(inner); + let io_err: io::Error = encode_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} + +#[test] +fn encode_error_non_io_becomes_invalid_data() { + let encode_err = super::errors::EncodeError::TwoDeltaTooManyIds; + let io_err: io::Error = encode_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("two distinct")); +} diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index 3dfb9a9..2088f8d 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -5,6 +5,9 @@ mod mkv_encode; mod twodelta_decode; mod twodelta_encode; +#[cfg(test)] +mod tests; + pub use ben_decode::BenDecodeFrame; pub use ben_encode::BenEncodeFrame; pub use mkv_decode::MkvBenDecodeFrame; diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs new file mode 100644 index 0000000..a99adf9 --- /dev/null +++ b/ben/src/codec/frames/tests.rs @@ -0,0 +1,298 @@ +use super::*; +use std::io; + +// ── BenDecodeFrame ────────────────────────────────────────────────────────── + +#[test] +fn ben_decode_frame_from_reader_standard_frame() { + // Header: max_val_bits=2, max_len_bits=3, n_bytes=2 + // Payload: 2 bytes + let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; + let mut cursor = io::Cursor::new(data); + let frame = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + assert_eq!(frame.max_val_bit_count, 2); + assert_eq!(frame.max_len_bit_count, 3); + assert_eq!(frame.n_bytes, 2); + assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); +} + +#[test] +fn ben_decode_frame_from_reader_eof_returns_none() { + let data: Vec = vec![]; + let mut cursor = io::Cursor::new(data); + let result = BenDecodeFrame::from_reader(&mut cursor).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn ben_decode_frame_from_reader_truncated_header_errors() { + // Only 1 byte — too short for a full header + let data: Vec = vec![2]; + let mut cursor = io::Cursor::new(data); + let err = BenDecodeFrame::from_reader(&mut cursor).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn ben_decode_frame_to_bytes() { + let frame = BenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 2, + raw_bytes: vec![0xAB, 0xCD], + }; + let bytes = frame.to_bytes(); + assert_eq!(bytes, vec![0xAB, 0xCD]); + // Original frame still usable (not consumed) + assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); +} + +#[test] +fn ben_decode_frame_into_bytes() { + let frame = BenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 2, + raw_bytes: vec![0xAB, 0xCD], + }; + let bytes = frame.into_bytes(); + assert_eq!(bytes, vec![0xAB, 0xCD]); +} + +#[test] +fn ben_decode_frame_as_ref() { + let frame = BenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 2, + raw_bytes: vec![0xAB, 0xCD], + }; + let slice: &[u8] = frame.as_ref(); + assert_eq!(slice, &[0xAB, 0xCD]); +} + +#[test] +fn ben_decode_frame_deref() { + let frame = BenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 2, + raw_bytes: vec![0xAB, 0xCD], + }; + // Deref lets us call slice methods directly + assert_eq!(frame.len(), 2); + assert_eq!(frame[0], 0xAB); + assert_eq!(frame[1], 0xCD); +} + +#[test] +fn ben_decode_frame_partial_eq_vec() { + let frame = BenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 2, + raw_bytes: vec![0xAB, 0xCD], + }; + let v = vec![0xAB, 0xCD]; + // Both directions + assert_eq!(frame, v); + assert_eq!(v, frame); + // Inequality + let v2 = vec![0xFF]; + assert_ne!(frame, v2); + assert_ne!(v2, frame); +} + +// ── MkvBenDecodeFrame ─────────────────────────────────────────────────────── + +#[test] +fn mkv_decode_frame_from_reader() { + // Header: max_val_bits=2, max_len_bits=3, n_bytes=2 + // Payload: 2 bytes + // Count: u16 BE = 5 + let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD, 0, 5]; + let mut cursor = io::Cursor::new(data); + let frame = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + assert_eq!(frame.max_val_bit_count, 2); + assert_eq!(frame.max_len_bit_count, 3); + assert_eq!(frame.n_bytes, 2); + assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); + assert_eq!(frame.count, 5); +} + +#[test] +fn mkv_decode_frame_from_reader_eof_returns_none() { + let data: Vec = vec![]; + let mut cursor = io::Cursor::new(data); + let result = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn mkv_decode_frame_from_reader_truncated_count_errors() { + // Valid header + payload, but missing count bytes + let data: Vec = vec![2, 3, 0, 0, 0, 1, 0xFF]; + let mut cursor = io::Cursor::new(data); + let err = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn mkv_decode_frame_to_bytes() { + let frame = MkvBenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 1, + raw_bytes: vec![0xFF], + count: 3, + }; + let bytes = frame.to_bytes(); + assert_eq!(bytes, vec![0xFF]); + assert_eq!(frame.raw_bytes, vec![0xFF]); +} + +#[test] +fn mkv_decode_frame_into_bytes() { + let frame = MkvBenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 1, + raw_bytes: vec![0xFF], + count: 3, + }; + let bytes = frame.into_bytes(); + assert_eq!(bytes, vec![0xFF]); +} + +#[test] +fn mkv_decode_frame_as_ref() { + let frame = MkvBenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 1, + raw_bytes: vec![0xFF], + count: 3, + }; + let slice: &[u8] = frame.as_ref(); + assert_eq!(slice, &[0xFF]); +} + +#[test] +fn mkv_decode_frame_deref() { + let frame = MkvBenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 1, + raw_bytes: vec![0xFF], + count: 3, + }; + assert_eq!(frame.len(), 1); + assert_eq!(frame[0], 0xFF); +} + +#[test] +fn mkv_decode_frame_partial_eq_vec() { + let frame = MkvBenDecodeFrame { + max_val_bit_count: 2, + max_len_bit_count: 3, + n_bytes: 1, + raw_bytes: vec![0xFF], + count: 3, + }; + let v = vec![0xFF]; + assert_eq!(frame, v); + assert_eq!(v, frame); + let v2 = vec![0x00]; + assert_ne!(frame, v2); + assert_ne!(v2, frame); +} + +// ── MkvBenEncodeFrame ─────────────────────────────────────────────────────── + +#[test] +fn mkv_encode_frame_from_rle_count_none_defaults_to_1() { + let runs = vec![(1u16, 4u16), (2, 1)]; + let frame = MkvBenEncodeFrame::from_rle(runs.clone(), None); + assert_eq!(frame.count, 1); + assert_eq!(frame.runs, runs); +} + +#[test] +fn mkv_encode_frame_from_rle_with_count() { + let runs = vec![(1u16, 4u16), (2, 1)]; + let frame = MkvBenEncodeFrame::from_rle(runs.clone(), Some(7)); + assert_eq!(frame.count, 7); +} + +#[test] +fn mkv_encode_frame_to_bytes() { + let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); + let bytes = frame.to_bytes(); + assert_eq!(bytes, frame.raw_bytes); + // Frame still usable + assert!(!frame.raw_bytes.is_empty()); +} + +#[test] +fn mkv_encode_frame_into_bytes() { + let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); + let expected = frame.raw_bytes.clone(); + let bytes = frame.into_bytes(); + assert_eq!(bytes, expected); +} + +#[test] +fn mkv_encode_frame_as_ref() { + let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); + let slice: &[u8] = frame.as_ref(); + assert_eq!(slice, &frame.raw_bytes); +} + +#[test] +fn mkv_encode_frame_deref() { + let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); + assert_eq!(frame.len(), frame.raw_bytes.len()); +} + +#[test] +fn mkv_encode_frame_partial_eq_vec() { + let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); + let v = frame.raw_bytes.clone(); + assert_eq!(frame, v); + assert_eq!(v, frame); + let v2 = vec![0xFF, 0xFF, 0xFF]; + assert_ne!(frame, v2); + assert_ne!(v2, frame); +} + +// ── TwoDeltaDecodeFrame ───────────────────────────────────────────────────── + +#[test] +fn twodelta_decode_frame_from_reader() { + // pair: (0, 2), (0, 1), max_len_bits: 1, n_bytes: 0,0,0,1, payload: 0xC0, count: 0,1 + let data: Vec = vec![0, 2, 0, 1, 1, 0, 0, 0, 1, 0xC0, 0, 1]; + let mut cursor = io::Cursor::new(data); + let frame = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); + assert_eq!(frame.pair, (2, 1)); + assert_eq!(frame.count, 1); + assert!(!frame.run_lengths.is_empty()); +} + +#[test] +fn twodelta_decode_frame_from_reader_eof_returns_none() { + let data: Vec = vec![]; + let mut cursor = io::Cursor::new(data); + let result = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn twodelta_decode_frame_from_reader_truncated_errors() { + // Only pair_a, missing pair_b + let data: Vec = vec![0, 2]; + let mut cursor = io::Cursor::new(data); + let err = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 185bfb5..3ac28a8 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -331,3 +331,22 @@ fn test_ben_to_ben32_lines_mkv_roundtrip() { assert_eq!(round, ben[17..]); } + +#[test] +fn test_ben_to_ben32_lines_rejects_twodelta() { + let ben_data = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; + let mut output = Vec::new(); + let err = ben_to_ben32_lines(ben_data.as_slice(), &mut output, BenVariant::TwoDelta) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Unsupported); + assert!(err.to_string().contains("TwoDelta")); +} + +#[test] +fn test_translate_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let translate_err = super::errors::TranslateError::Io(inner); + let io_err: io::Error = translate_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} diff --git a/ben/src/format/mod.rs b/ben/src/format/mod.rs index c0ca8cf..4105d03 100644 --- a/ben/src/format/mod.rs +++ b/ben/src/format/mod.rs @@ -3,3 +3,6 @@ pub mod banners; pub mod errors; pub use errors::FormatError; + +#[cfg(test)] +mod tests; diff --git a/ben/src/format/tests.rs b/ben/src/format/tests.rs new file mode 100644 index 0000000..6c96388 --- /dev/null +++ b/ben/src/format/tests.rs @@ -0,0 +1,20 @@ +use std::io; + +#[test] +fn format_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let format_err = super::FormatError::Io(inner); + let io_err: io::Error = format_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} + +#[test] +fn format_error_unknown_banner_becomes_invalid_data() { + let format_err = super::FormatError::UnknownBanner { + actual: b"GARBAGE BANNER!!!".to_vec(), + }; + let io_err: io::Error = format_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("unrecognized BEN banner")); +} diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index e69de29..f9ddff0 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -0,0 +1,335 @@ +use crate::codec::encode::encode_jsonl_to_xben; +use crate::io::reader::{XZAssignmentReader, XZAssignmentFrameReader}; +use crate::io::writer::XZAssignmentWriter; +use crate::BenVariant; +use std::io::Cursor; +use xz2::write::XzEncoder; + +/// Build a minimal XBEN stream from JSONL input for testing. +fn make_xben(jsonl: &str, variant: BenVariant) -> Vec { + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + variant, + Some(1), + Some(1), + None, + ) + .unwrap(); + xben +} + +/// Build a minimal XBEN stream using XZAssignmentWriter directly. +fn make_xben_from_assignments(assignments: &[Vec], variant: BenVariant) -> Vec { + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + xben +} + +// ── XZAssignmentReader ────────────────────────────────────────────────────── + +#[test] +fn xz_reader_standard_iterator() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.variant(), BenVariant::Standard); + let results: Vec<_> = reader.collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0].as_ref().unwrap().0, vec![1, 1, 2, 2]); + assert_eq!(results[0].as_ref().unwrap().1, 1); + assert_eq!(results[1].as_ref().unwrap().0, vec![2, 2, 1, 1]); +} + +#[test] +fn xz_reader_mkv_iterator() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::MkvChain); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.variant(), BenVariant::MkvChain); + let results: Vec<_> = reader.collect(); + // MkvChain collapses identical consecutive assignments + assert_eq!(results.len(), 2); + assert_eq!(results[0].as_ref().unwrap().1, 2); // count=2 + assert_eq!(results[1].as_ref().unwrap().1, 1); // count=1 +} + +#[test] +fn xz_reader_twodelta_iterator() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + vec![2, 2, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.variant(), BenVariant::TwoDelta); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn xz_reader_count_samples_standard() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[1,2,1,2],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.count_samples().unwrap(), 3); +} + +#[test] +fn xz_reader_count_samples_mkv() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::MkvChain); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.count_samples().unwrap(), 3); +} + +#[test] +fn xz_reader_silent_suppresses_output() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)) + .unwrap() + .silent(true); + let results: Vec<_> = reader.collect(); + assert_eq!(results.len(), 1); +} + +#[test] +fn xz_reader_for_each_assignment() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut collected = Vec::new(); + reader + .for_each_assignment(|assignment, count| { + collected.push((assignment.to_vec(), count)); + Ok(true) + }) + .unwrap(); + assert_eq!(collected.len(), 2); + assert_eq!(collected[0].0, vec![1, 1, 2, 2]); + assert_eq!(collected[1].0, vec![2, 2, 1, 1]); +} + +#[test] +fn xz_reader_for_each_assignment_early_stop() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut collected = Vec::new(); + reader + .for_each_assignment(|assignment, _count| { + collected.push(assignment.to_vec()); + Ok(false) // stop after first + }) + .unwrap(); + assert_eq!(collected.len(), 1); +} + +#[test] +fn xz_reader_write_all_jsonl() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + let v1: serde_json::Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + assert_eq!(v1["sample"], 1); +} + +#[test] +fn xz_reader_write_all_jsonl_mkv_expands_counts() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::MkvChain); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); // expanded from count=2 + count=1 +} + +#[test] +fn xz_reader_into_frames_standard() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let frames: Vec<_> = reader.into_frames().collect(); + assert_eq!(frames.len(), 2); + for f in &frames { + let (bytes, count) = f.as_ref().unwrap(); + assert!(!bytes.is_empty()); + assert_eq!(*count, 1); + } +} + +#[test] +fn xz_reader_into_frames_twodelta() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let frames: Vec<_> = reader.into_frames().collect(); + assert_eq!(frames.len(), 2); +} + +#[test] +fn xz_frame_reader_new() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let frames: Vec<_> = reader.collect(); + assert_eq!(frames.len(), 1); +} + +#[test] +fn xz_reader_new_rejects_invalid_data() { + let garbage = vec![0u8; 100]; + let result = XZAssignmentReader::new(Cursor::new(garbage)); + assert!(result.is_err()); +} + +// ── XZAssignmentReader subsample ──────────────────────────────────────────── + +#[test] +fn xz_reader_subsample_by_indices() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_indices(vec![1, 3]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![1, 1, 2, 2]); + assert_eq!(results[1], vec![3, 3, 3, 3]); +} + +#[test] +fn xz_reader_subsample_by_range() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_range(2, 3) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![2, 2, 1, 1]); + assert_eq!(results[1], vec![3, 3, 3, 3]); +} + +#[test] +fn xz_reader_subsample_every() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +{"assignment":[4,4,4,4],"sample":4} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_every(2, 1) // samples 1, 3 + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![1, 1, 2, 2]); + assert_eq!(results[1], vec![3, 3, 3, 3]); +} + +// ── XZAssignmentReader for_each_assignment with silent ────────────────────── + +#[test] +fn xz_reader_for_each_assignment_silent() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)) + .unwrap() + .silent(true); + let mut count = 0usize; + reader + .for_each_assignment(|_assignment, _cnt| { + count += 1; + Ok(true) + }) + .unwrap(); + assert_eq!(count, 2); +} + +// ── XZAssignmentReader TwoDelta write_all_jsonl ───────────────────────────── + +#[test] +fn xz_reader_write_all_jsonl_twodelta() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + vec![2, 2, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); +} + +// ── XZAssignmentReader TwoDelta count_samples ─────────────────────────────── + +#[test] +fn xz_reader_count_samples_twodelta() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.count_samples().unwrap(), 2); +} diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 6ddd7ea..8c89541 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -566,3 +566,23 @@ fn test_relabel_lines_with_map_propagate_non_eof_reader_error() { .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); } + +#[test] +fn relabel_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let relabel_err = super::errors::RelabelError::Io(inner); + let io_err: io::Error = relabel_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} + +#[test] +fn relabel_error_non_io_becomes_invalid_input() { + let relabel_err = super::errors::RelabelError::NonContiguousMap { + max_key: 10, + missing: 3, + }; + let io_err: io::Error = relabel_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidInput); + assert!(io_err.to_string().contains("contiguous")); +} From 0f8d0ffc826d8ed1faeaa8acd4663bc57c338c19 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:55:44 -0600 Subject: [PATCH 059/221] add edge-case tests --- ben/src/codec/decode/tests/mod.rs | 72 ++++++ ben/src/codec/encode/tests.rs | 274 ++++++++++++++++++++++ ben/src/codec/frames/tests.rs | 270 +++++++++++++++++++++ ben/src/io/reader/tests.rs | 374 +++++++++++++++++++++++++++++- 4 files changed, 989 insertions(+), 1 deletion(-) diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 68cde27..6005f25 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -139,8 +139,11 @@ fn encode_ben_to_xben_roundtrip() { #[test] fn encode_ben_to_xben_with_chunk_size() { + use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; use crate::BenVariant; + use serde_json::Value; + use std::io::BufReader; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[2,2,1,1],"sample":2} @@ -151,4 +154,73 @@ fn encode_ben_to_xben_with_chunk_size() { let mut xben = Vec::new(); encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), Some(1)).unwrap(); assert!(!xben.is_empty()); + + // Verify content roundtrips correctly + let mut ben2 = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); + let mut jsonl_out = Vec::new(); + decode_ben_to_jsonl(ben2.as_slice(), &mut jsonl_out).unwrap(); + let output_str = String::from_utf8(jsonl_out).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([2, 2, 1, 1])); +} + +#[test] +fn encode_ben_to_xben_mkvchain_roundtrip() { + use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; + use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; + use crate::BenVariant; + use serde_json::Value; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut xben = Vec::new(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None).unwrap(); + + let mut ben2 = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); + let mut jsonl_out = Vec::new(); + decode_ben_to_jsonl(ben2.as_slice(), &mut jsonl_out).unwrap(); + let output_str = String::from_utf8(jsonl_out).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([1, 1, 2, 2])); + let v3: Value = serde_json::from_str(lines[2]).unwrap(); + assert_eq!(v3["assignment"], serde_json::json!([2, 2, 1, 1])); +} + +#[test] +fn decode_error_remaining_variants() { + // Test DecodeError variants we haven't covered + let err = super::DecodeError::XBenUnknownFrameTag { tag: 0xFF }; + let io_err: io::Error = err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("0xff")); + + let err = super::DecodeError::XBenTruncated; + let io_err: io::Error = err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + + let err = super::DecodeError::UnexpectedTwoDeltaFrame { + variant: crate::BenVariant::Standard, + }; + let io_err: io::Error = err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + + let err = super::DecodeError::TwoDeltaRunsExhausted { run_idx: 3, pos: 7 }; + let io_err: io::Error = err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); } diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 68a7e88..cfaac0c 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -913,3 +913,277 @@ fn encode_error_non_io_becomes_invalid_data() { assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); assert!(io_err.to_string().contains("two distinct")); } + +// ── XBEN roundtrip with content verification ──────────────────────────────── + +#[test] +fn encode_jsonl_to_xben_roundtrip_verifies_content() { + use crate::codec::decode::decode_xben_to_jsonl; + use std::io::BufReader; + use serde_json::Value; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::Standard, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut decoded).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([2, 2, 1, 1])); +} + +#[test] +fn encode_jsonl_to_xben_mkv_verifies_content() { + use crate::codec::decode::decode_xben_to_jsonl; + use std::io::BufReader; + use serde_json::Value; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut decoded).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + // First two should be identical (MkvChain de-duplication) + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + assert_eq!(v2["assignment"], serde_json::json!([1, 1, 2, 2])); + let v3: Value = serde_json::from_str(lines[2]).unwrap(); + assert_eq!(v3["assignment"], serde_json::json!([2, 2, 1, 1])); +} + +// ── TwoDelta with explicit count parameter ────────────────────────────────── + +#[test] +fn twodelta_encode_with_count() { + use crate::codec::encode::encode_twodelta_frame; + let prev = vec![1u16, 1, 2, 2]; + let next = vec![2u16, 1, 2, 1]; + let frame = encode_twodelta_frame(&prev, &next, Some(5)).unwrap(); + // Verify the count is embedded in the raw_bytes tail + let raw = &frame.raw_bytes; + let count = u16::from_be_bytes([raw[raw.len() - 2], raw[raw.len() - 1]]); + assert_eq!(count, 5); +} + +// ── TwoDelta run_length_vector verification ───────────────────────────────── + +#[test] +fn twodelta_encode_run_lengths_correct() { + use crate::codec::encode::encode_twodelta_frame; + // prev: [1,1,2,2], next: [2,1,2,1] + // pair positions (1 or 2): 0,1,2,3 + // In next: pos0=2, pos1=1, pos2=2, pos3=1 → runs of (2,1,2,1) = [1,1,1,1] + // pair.0 = value at first pair position in next = 2 + let prev = vec![1u16, 1, 2, 2]; + let next = vec![2u16, 1, 2, 1]; + let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); + assert_eq!(frame.pair, (2, 1)); + assert_eq!(frame.run_length_vector, vec![1, 1, 1, 1]); +} + +#[test] +fn twodelta_encode_run_lengths_with_non_pair_gaps() { + use crate::codec::encode::encode_twodelta_frame; + // prev: [1,3,2,3,1], next: [2,3,1,3,2] + // pair=(1,2), pair positions: 0,2,4 (positions with value 1 or 2) + // In next: pos0=2, pos2=1, pos4=2 → runs [1,1,1] + let prev = vec![1u16, 3, 2, 3, 1]; + let next = vec![2u16, 3, 1, 3, 2]; + let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); + assert_eq!(frame.run_length_vector, vec![1, 1, 1]); +} + +// ── TwoDelta encode→decode roundtrip ──────────────────────────────────────── + +#[test] +fn twodelta_encode_decode_roundtrip_via_codec() { + use crate::codec::decode::decode_twodelta_frame; + use crate::codec::encode::encode_twodelta_frame; + + let prev = vec![1u16, 1, 2, 2, 1, 2, 1, 2]; + let next = vec![2u16, 2, 1, 1, 1, 2, 1, 2]; // first 4 positions swap + let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); + let decoded = decode_twodelta_frame(prev, &frame).unwrap(); + assert_eq!(decoded, next); +} + +// ── TwoDelta error variants ───────────────────────────────────────────────── + +#[test] +fn twodelta_encode_missing_mask_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + // Missing mask for value 2 + + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_empty_mask_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![]); // Empty mask + + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_mask_out_of_pair_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // prev has value 3 at position 2, but mask claims it's part of pair (1,2) + let prev = vec![1u16, 1, 3, 2]; + let curr = vec![2u16, 1, 3, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![2, 3]); // position 2 in prev is actually 3, not 2 + + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +// ── JSON encoding edge cases ──────────────────────────────────────────────── + +#[test] +fn encode_ben32_line_negative_value_errors() { + let data = serde_json::json!({"assignment": [-1, 2, 3]}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_ben32_line_float_value_errors() { + let data = serde_json::json!({"assignment": [1.5, 2, 3]}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_ben32_line_null_value_errors() { + let data = serde_json::json!({"assignment": [null, 2, 3]}); + let err = encode_ben32_line(data).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn encode_ben32_line_value_at_u16_max() { + let data = serde_json::json!({"assignment": [65535, 1]}); + let result = encode_ben32_line(data).unwrap(); + // (65535 << 16) | 1 → 0xFFFF0001 then (1 << 16) | 1 → 0x00010001 then terminator + assert_eq!( + result, + vec![0xFF, 0xFF, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0] + ); +} + +// ── Encoding empty and single-element JSONL ───────────────────────────────── + +#[test] +fn encode_jsonl_to_ben_empty_input() { + let jsonl = b""; + let mut output = Vec::new(); + encode_jsonl_to_ben(jsonl.as_slice(), &mut output, BenVariant::Standard).unwrap(); + // Should only have the banner + assert_eq!(output, b"STANDARD BEN FILE"); +} + +#[test] +fn encode_jsonl_to_ben_single_sample() { + use crate::codec::decode::decode_ben_to_jsonl; + use serde_json::Value; + + let jsonl = b"{\"assignment\":[42],\"sample\":1}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::Standard).unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + let v: Value = serde_json::from_slice(decoded.trim_ascii()).unwrap(); + assert_eq!(v["assignment"], serde_json::json!([42])); +} + +// ── TwoDelta JSONL encoding edge cases ────────────────────────────────────── + +#[test] +fn encode_jsonl_to_xben_twodelta_roundtrip() { + use crate::codec::decode::decode_xben_to_jsonl; + use std::io::BufReader; + use serde_json::Value; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,1,2,1],"sample":2} +{"assignment":[2,2,1,1],"sample":3} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::TwoDelta, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut decoded).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v2: Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([2, 1, 2, 1])); + let v3: Value = serde_json::from_str(lines[2]).unwrap(); + assert_eq!(v3["assignment"], serde_json::json!([2, 2, 1, 1])); +} diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index a99adf9..b715909 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -296,3 +296,273 @@ fn twodelta_decode_frame_from_reader_truncated_errors() { let err = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } + +// ── Encode→Decode Roundtrips ──────────────────────────────────────────────── + +#[test] +fn ben_encode_decode_roundtrip_standard() { + use crate::codec::decode::decode_ben_line; + // Encode a Standard frame, then decode it via BenDecodeFrame::from_reader + let runs = vec![(1u16, 4), (2, 1), (3, 3)]; + let encode_frame = BenEncodeFrame::from_rle(runs.clone(), None); + + // from_reader expects just the header+payload (no banner) + let mut cursor = io::Cursor::new(encode_frame.as_slice()); + let decode_frame = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + + assert_eq!(decode_frame.max_val_bit_count, encode_frame.max_val_bit_count); + assert_eq!(decode_frame.max_len_bit_count, encode_frame.max_len_bit_count); + assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); + + // Verify the payload decodes back to the original RLE runs + let decoded_runs = decode_ben_line( + io::Cursor::new(&decode_frame.raw_bytes), + decode_frame.max_val_bit_count, + decode_frame.max_len_bit_count, + decode_frame.n_bytes, + ) + .unwrap(); + assert_eq!(decoded_runs, runs); +} + +#[test] +fn mkv_encode_decode_roundtrip() { + use crate::codec::decode::decode_ben_line; + let runs = vec![(1u16, 4), (2, 1), (3, 3)]; + let encode_frame = MkvBenEncodeFrame::from_rle(runs.clone(), Some(42)); + + let mut cursor = io::Cursor::new(encode_frame.as_slice()); + let decode_frame = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + + assert_eq!(decode_frame.max_val_bit_count, encode_frame.max_val_bit_count); + assert_eq!(decode_frame.max_len_bit_count, encode_frame.max_len_bit_count); + assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); + assert_eq!(decode_frame.count, 42); + + let decoded_runs = decode_ben_line( + io::Cursor::new(&decode_frame.raw_bytes), + decode_frame.max_val_bit_count, + decode_frame.max_len_bit_count, + decode_frame.n_bytes, + ) + .unwrap(); + assert_eq!(decoded_runs, runs); +} + +#[test] +fn twodelta_encode_decode_roundtrip() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let run_lengths = vec![3u16, 2, 1, 4]; + let encode_frame = + TwoDeltaEncodeFrame::from_run_lengths((5, 10), run_lengths.clone(), Some(7)); + + // Write the raw_bytes (which include pair, max_len_bits, n_bytes, payload, count) + let mut cursor = io::Cursor::new(encode_frame.as_slice()); + let decode_frame = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + + assert_eq!(decode_frame.pair, (5, 10)); + assert_eq!(decode_frame.count, 7); + assert_eq!(decode_frame.run_lengths, run_lengths); +} + +// ── Back-to-back frame reads ──────────────────────────────────────────────── + +#[test] +fn ben_decode_two_frames_back_to_back() { + let f1 = BenEncodeFrame::from_rle(vec![(1u16, 2), (3, 4)], None); + let f2 = BenEncodeFrame::from_rle(vec![(7u16, 1), (8, 1), (9, 1)], None); + + let mut data = Vec::new(); + data.extend_from_slice(f1.as_slice()); + data.extend_from_slice(f2.as_slice()); + + let mut cursor = io::Cursor::new(data); + let d1 = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d2 = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d3 = BenDecodeFrame::from_reader(&mut cursor).unwrap(); + + assert_eq!(d1.max_val_bit_count, f1.max_val_bit_count); + assert_eq!(d2.max_val_bit_count, f2.max_val_bit_count); + assert!(d3.is_none()); // clean EOF +} + +#[test] +fn mkv_decode_two_frames_back_to_back() { + let f1 = MkvBenEncodeFrame::from_rle(vec![(1u16, 2)], Some(10)); + let f2 = MkvBenEncodeFrame::from_rle(vec![(5u16, 5)], Some(20)); + + let mut data = Vec::new(); + data.extend_from_slice(f1.as_slice()); + data.extend_from_slice(f2.as_slice()); + + let mut cursor = io::Cursor::new(data); + let d1 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d2 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d3 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap(); + + assert_eq!(d1.count, 10); + assert_eq!(d2.count, 20); + assert!(d3.is_none()); +} + +#[test] +fn twodelta_decode_two_frames_back_to_back() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let f1 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], Some(1)); + let f2 = TwoDeltaEncodeFrame::from_run_lengths((3, 4), vec![1, 1, 1], Some(5)); + + let mut data = Vec::new(); + data.extend_from_slice(f1.as_slice()); + data.extend_from_slice(f2.as_slice()); + + let mut cursor = io::Cursor::new(data); + let d1 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d2 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d3 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap(); + + assert_eq!(d1.pair, (1, 2)); + assert_eq!(d1.run_lengths, vec![3, 2]); + assert_eq!(d1.count, 1); + assert_eq!(d2.pair, (3, 4)); + assert_eq!(d2.run_lengths, vec![1, 1, 1]); + assert_eq!(d2.count, 5); + assert!(d3.is_none()); +} + +// ── Boundary values ───────────────────────────────────────────────────────── + +#[test] +fn mkv_decode_frame_count_max_u16() { + let f = MkvBenEncodeFrame::from_rle(vec![(1u16, 1)], Some(u16::MAX)); + let mut cursor = io::Cursor::new(f.as_slice()); + let d = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + assert_eq!(d.count, u16::MAX); +} + +#[test] +fn twodelta_decode_frame_count_max_u16() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let f = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(u16::MAX)); + let mut cursor = io::Cursor::new(f.as_slice()); + let d = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + assert_eq!(d.count, u16::MAX); +} + +#[test] +fn ben_encode_single_run_frame() { + use crate::codec::decode::decode_ben_line; + let runs = vec![(1u16, 1)]; + let frame = BenEncodeFrame::from_rle(runs.clone(), None); + + let mut cursor = io::Cursor::new(frame.as_slice()); + let decoded = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + + let decoded_runs = decode_ben_line( + io::Cursor::new(&decoded.raw_bytes), + decoded.max_val_bit_count, + decoded.max_len_bit_count, + decoded.n_bytes, + ) + .unwrap(); + assert_eq!(decoded_runs, runs); +} + +#[test] +fn ben_encode_large_values_near_u16_max() { + use crate::codec::decode::decode_ben_line; + let runs = vec![(65534u16, 65535u16), (1, 1)]; + let frame = BenEncodeFrame::from_rle(runs.clone(), None); + + let mut cursor = io::Cursor::new(frame.as_slice()); + let decoded = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + + let decoded_runs = decode_ben_line( + io::Cursor::new(&decoded.raw_bytes), + decoded.max_val_bit_count, + decoded.max_len_bit_count, + decoded.n_bytes, + ) + .unwrap(); + assert_eq!(decoded_runs, runs); +} + +#[test] +fn twodelta_from_run_lengths_then_from_parts_roundtrip() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + // Verify that packing via from_run_lengths then unpacking via from_parts + // reproduces the same run_length_vector + let run_lengths = vec![5u16, 3, 7, 1, 2]; + let encoded = TwoDeltaEncodeFrame::from_run_lengths((10, 20), run_lengths.clone(), None); + + let reconstructed = TwoDeltaEncodeFrame::from_parts( + encoded.pair, + encoded.max_len_bit_count, + encoded.payload().to_vec(), + ); + assert_eq!(reconstructed.run_length_vector, run_lengths); + assert_eq!(reconstructed.pair, (10, 20)); +} + +#[test] +fn twodelta_from_run_lengths_single_run() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let run_lengths = vec![100u16]; + let encoded = TwoDeltaEncodeFrame::from_run_lengths((1, 2), run_lengths.clone(), None); + + let mut cursor = io::Cursor::new(encoded.as_slice()); + let decoded = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + assert_eq!(decoded.run_lengths, run_lengths); +} + +// ── BenEncodeFrame trait impls ────────────────────────────────────────────── + +#[test] +fn ben_encode_frame_partial_eq_vec_both_directions() { + let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); + let v = frame.raw_bytes.clone(); + assert_eq!(frame, v); + assert_eq!(v, frame); + let v2 = vec![0xFF, 0xFF, 0xFF]; + assert_ne!(frame, v2); + assert_ne!(v2, frame); +} + +#[test] +fn ben_encode_frame_as_ref_and_deref() { + let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); + let slice: &[u8] = frame.as_ref(); + assert_eq!(slice, &frame.raw_bytes[..]); + assert_eq!(frame.len(), frame.raw_bytes.len()); +} + +#[test] +fn ben_encode_frame_to_bytes_and_into_bytes() { + let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); + let to = frame.to_bytes(); + let expected = frame.raw_bytes.clone(); + assert_eq!(to, expected); + let into = frame.into_bytes(); + assert_eq!(into, expected); +} + +// ── TwoDeltaEncodeFrame trait impls ───────────────────────────────────────── + +#[test] +fn twodelta_encode_frame_as_ref_and_deref() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], None); + let slice: &[u8] = frame.as_ref(); + assert_eq!(slice, &frame.raw_bytes[..]); + assert_eq!(frame.len(), frame.raw_bytes.len()); +} + +#[test] +fn twodelta_encode_frame_to_bytes_and_into_bytes() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], None); + let to = frame.to_bytes(); + let expected = frame.raw_bytes.clone(); + assert_eq!(to, expected); + let into = frame.into_bytes(); + assert_eq!(into, expected); +} diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index f9ddff0..d41c0bc 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -1,5 +1,6 @@ use crate::codec::encode::encode_jsonl_to_xben; -use crate::io::reader::{XZAssignmentReader, XZAssignmentFrameReader}; +use crate::io::reader::errors::DecoderInitError; +use crate::io::reader::{XZAssignmentFrameReader, XZAssignmentReader}; use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; use std::io::Cursor; @@ -333,3 +334,374 @@ fn xz_reader_count_samples_twodelta() { let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 2); } + +// ── Content verification tests ───────────────────────────────────────────── + +#[test] +fn xz_reader_into_frames_standard_content() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[3,3,4,4],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let frames: Vec<_> = reader.into_frames().collect(); + assert_eq!(frames.len(), 2); + // Verify frame bytes can be decoded back + for f in &frames { + let (bytes, count) = f.as_ref().unwrap(); + assert!(!bytes.is_empty()); + assert_eq!(*count, 1); + } +} + +#[test] +fn xz_reader_write_all_jsonl_standard_content_verified() { + let jsonl = r#"{"assignment":[5,6,7],"sample":1} +{"assignment":[8,9,10],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + let v1: serde_json::Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([5, 6, 7])); + assert_eq!(v1["sample"], 1); + let v2: serde_json::Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([8, 9, 10])); + assert_eq!(v2["sample"], 2); +} + +#[test] +fn xz_reader_write_all_jsonl_mkv_content_verified() { + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[1,2,3],"sample":2} +{"assignment":[4,5,6],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::MkvChain); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + let v1: serde_json::Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 2, 3])); + let v2: serde_json::Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([1, 2, 3])); + let v3: serde_json::Value = serde_json::from_str(lines[2]).unwrap(); + assert_eq!(v3["assignment"], serde_json::json!([4, 5, 6])); +} + +// ── Single sample streams ────────────────────────────────────────────────── + +#[test] +fn xz_reader_single_sample_standard() { + let jsonl = r#"{"assignment":[42],"sample":1} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0].as_ref().unwrap().0, vec![42]); + assert_eq!(results[0].as_ref().unwrap().1, 1); +} + +#[test] +fn xz_reader_single_sample_twodelta() { + let xben = make_xben_from_assignments(&[vec![1u16, 2, 3]], BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1, 2, 3]]); +} + +// ── Subsample edge cases ──────────────────────────────────────────────────── + +#[test] +fn xz_reader_subsample_by_indices_deduplicates_and_sorts() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + // Pass unsorted duplicates: [3,1,3,1] → sorted+deduped [1,3] + let results: Vec<_> = reader + .into_subsample_by_indices(vec![3, 1, 3, 1]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![1, 1, 2, 2]); + assert_eq!(results[1], vec![3, 3, 3, 3]); +} + +#[test] +fn xz_reader_subsample_by_indices_beyond_stream() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + // Index 5 is beyond the stream (only 2 samples) + let results: Vec<_> = reader + .into_subsample_by_indices(vec![5]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 0); +} + +#[test] +fn xz_reader_subsample_by_range_single_element() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +{"assignment":[3,3,3,3],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_range(2, 2) // only sample 2 + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], vec![2, 2, 1, 1]); +} + +#[test] +fn xz_reader_subsample_every_offset_beyond_stream() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + // Offset 10 is beyond the stream + let results: Vec<_> = reader + .into_subsample_every(1, 10) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 0); +} + +#[test] +fn xz_reader_subsample_mkv_with_count_gt_1() { + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[1,2,3],"sample":2} +{"assignment":[1,2,3],"sample":3} +{"assignment":[4,5,6],"sample":4} +"#; + let xben = make_xben(jsonl, BenVariant::MkvChain); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + // Select sample 2 (middle of the count=3 frame) and sample 4 + let results: Vec<_> = reader + .into_subsample_by_indices(vec![2, 4]) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0].0, vec![1, 2, 3]); + assert_eq!(results[1].0, vec![4, 5, 6]); +} + +#[test] +fn xz_reader_subsample_twodelta() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + vec![2, 2, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_indices(vec![1, 3]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![1, 1, 2, 2]); + assert_eq!(results[1], vec![2, 2, 2, 2]); +} + +// ── DecoderInitError tests ───────────────────────────────────────────────── + +#[test] +fn decoder_init_error_xz_header_detected() { + // Feed XZ-compressed data to a reader that expects uncompressed BEN + use crate::io::reader::AssignmentReader; + let xz_magic = b"\xFD\x37\x7A\x58\x5A\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; + let result = AssignmentReader::new(xz_magic.as_slice()); + assert!(result.is_err()); + let io_err: std::io::Error = result.err().unwrap().into(); + assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("Compressed header")); +} + +#[test] +fn decoder_init_error_unknown_banner() { + use crate::io::reader::AssignmentReader; + let bad_banner = b"THIS IS NOT BEN!!"; + let result = AssignmentReader::new(bad_banner.as_slice()); + assert!(result.is_err()); + let io_err: std::io::Error = result.err().unwrap().into(); + assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("Invalid file format")); +} + +#[test] +fn decoder_init_error_io() { + use crate::io::reader::AssignmentReader; + struct FailReader; + impl std::io::Read for FailReader { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + Err(std::io::Error::new(std::io::ErrorKind::BrokenPipe, "broken")) + } + } + let result = AssignmentReader::new(FailReader); + assert!(result.is_err()); + let io_err: std::io::Error = result.err().unwrap().into(); + assert_eq!(io_err.kind(), std::io::ErrorKind::BrokenPipe); +} + +#[test] +fn decoder_init_error_unknown_mode() { + let err = DecoderInitError::UnknownMode { + mode: "foo".to_string(), + }; + let io_err: std::io::Error = err.into(); + assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidInput); + assert!(io_err.to_string().contains("foo")); +} + +// ── for_each_assignment edge cases ───────────────────────────────────────── + +#[test] +fn xz_reader_for_each_assignment_callback_error_propagates() { + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader + .for_each_assignment(|_assignment, _count| { + Err(std::io::Error::new(std::io::ErrorKind::Other, "callback failed")) + }) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); + assert_eq!(err.to_string(), "callback failed"); +} + +// ── Large assignment vector ──────────────────────────────────────────────── + +#[test] +fn xz_reader_large_assignment_roundtrip() { + let big_assign: Vec = (1..=1000).collect(); + let xben = make_xben_from_assignments(&[big_assign.clone()], BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], big_assign); +} + +// ── AssignmentReader tests ───────────────────────────────────────────────── + +#[test] +fn assignment_reader_standard_roundtrip() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::AssignmentReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[3,3,4,4],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + assert_eq!(reader.variant(), BenVariant::Standard); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1, 1, 2, 2], vec![3, 3, 4, 4]]); +} + +#[test] +fn assignment_reader_mkv_roundtrip() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::AssignmentReader; + + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[1,2,3],"sample":2} +{"assignment":[4,5,6],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + assert_eq!(reader.variant(), BenVariant::MkvChain); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + // MkvChain collapses: first frame count=2, second count=1 + assert_eq!(results.len(), 2); + assert_eq!(results[0].0, vec![1, 2, 3]); + assert_eq!(results[0].1, 2); + assert_eq!(results[1].0, vec![4, 5, 6]); + assert_eq!(results[1].1, 1); +} + +#[test] +fn assignment_reader_twodelta_roundtrip() { + use crate::io::reader::AssignmentReader; + use crate::io::writer::AssignmentWriter; + + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + vec![2, 2, 2, 2], + ]; + + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + + let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + assert_eq!(reader.variant(), BenVariant::TwoDelta); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn assignment_reader_count_samples() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::AssignmentReader; + + let jsonl = r#"{"assignment":[1,2],"sample":1} +{"assignment":[3,4],"sample":2} +{"assignment":[5,6],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + assert_eq!(reader.count_samples().unwrap(), 3); +} + +#[test] +fn assignment_reader_write_all_jsonl() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::AssignmentReader; + + let jsonl = r#"{"assignment":[10,20],"sample":1} +{"assignment":[30,40],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let mut reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let output_str = String::from_utf8(output).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 2); + let v1: serde_json::Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([10, 20])); + let v2: serde_json::Value = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(v2["assignment"], serde_json::json!([30, 40])); +} From cc53e8aa4fef70f016d882cd530a63899da9defe Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:14:39 -0600 Subject: [PATCH 060/221] change format spec for bendl --- docs/bendl-format-spec.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/bendl-format-spec.md b/docs/bendl-format-spec.md index b02643a..6a5e56b 100644 --- a/docs/bendl-format-spec.md +++ b/docs/bendl-format-spec.md @@ -78,7 +78,7 @@ offset size field - `minor_version` - initial value: `0` - `flags` - - bundle-level feature flags + - bundle-level feature flags (64 bits available) - `complete` - `0` means incomplete/unfinalized - `1` means finalized @@ -96,7 +96,7 @@ offset size field - `0` if unknown/unfinalized - `sample_count` - number of expanded samples in the assignment stream - - `u64::MAX` if unknown/unfinalized + - `-1` if unknown/unfinalized - `reserved` - reserved for future extension @@ -294,14 +294,14 @@ pub struct BendlHeader { pub magic: [u8; 8], pub major_version: u16, pub minor_version: u16, - pub flags: u16, + pub flags: u64, pub complete: u8, pub assignment_format: u8, pub directory_offset: u64, pub directory_len: u64, pub stream_offset: u64, pub stream_len: u64, - pub sample_count: u64, + pub sample_count: i28, pub reserved: u64, } From f9224ae0f00d7129f3e9b5d18ce3cac6b759cb73 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 10 Apr 2026 08:13:57 -0600 Subject: [PATCH 061/221] move over to using petgraph internals --- Cargo.lock | 234 +++++ ben/Cargo.toml | 3 + ben/src/json/graph/errors.rs | 60 ++ ben/src/json/graph/mlc.rs | 394 +++++++++ ben/src/json/graph/mod.rs | 731 +++------------- ben/src/json/graph/nx_formats.rs | 59 ++ ben/src/json/graph/petxgraph/mod.rs | 57 ++ ben/src/json/graph/petxgraph/nx_convert.rs | 315 +++++++ ben/src/json/graph/petxgraph/permutation.rs | 63 ++ ben/src/json/graph/petxgraph/sort.rs | 86 ++ ben/src/json/graph/rcm.rs | 135 +++ ben/src/json/graph/tests/mod.rs | 2 + .../graph/{tests.rs => tests/test_algos.rs} | 2 +- ben/src/json/graph/tests/test_io.rs | 802 ++++++++++++++++++ 14 files changed, 2326 insertions(+), 617 deletions(-) create mode 100644 ben/src/json/graph/errors.rs create mode 100644 ben/src/json/graph/mlc.rs create mode 100644 ben/src/json/graph/nx_formats.rs create mode 100644 ben/src/json/graph/petxgraph/mod.rs create mode 100644 ben/src/json/graph/petxgraph/nx_convert.rs create mode 100644 ben/src/json/graph/petxgraph/permutation.rs create mode 100644 ben/src/json/graph/petxgraph/sort.rs create mode 100644 ben/src/json/graph/rcm.rs create mode 100644 ben/src/json/graph/tests/mod.rs rename ben/src/json/graph/{tests.rs => tests/test_algos.rs} (99%) create mode 100644 ben/src/json/graph/tests/test_io.rs diff --git a/Cargo.lock b/Cargo.lock index b3ceb07..82f6b82 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "ansi_term" version = "0.12.1" @@ -95,11 +101,14 @@ dependencies = [ "clap 4.5.48", "lipsum", "pcompress", + "petgraph", "pipe", "proptest", "rand 0.9.2", "rand_chacha 0.9.0", "rand_distr", + "rustworkx-core", + "serde", "serde_json", "thiserror", "tracing", @@ -226,12 +235,43 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.14" @@ -254,12 +294,24 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0399f9d26e5191ce32c498bebd31e7a3ceabc2745f0ac54af3f335126c3f24b3" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "getrandom" version = "0.3.3" @@ -272,6 +324,24 @@ dependencies = [ "wasi", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", + "rayon", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + [[package]] name = "heck" version = "0.3.3" @@ -296,6 +366,17 @@ dependencies = [ "libc", ] +[[package]] +name = "indexmap" +version = "2.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "rayon", +] + [[package]] name = "indoc" version = "2.0.6" @@ -308,6 +389,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -374,6 +464,16 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "memchr" version = "2.7.6" @@ -389,6 +489,22 @@ dependencies = [ "autocfg", ] +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", + "rayon", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -398,6 +514,24 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -431,6 +565,18 @@ dependencies = [ "structopt", ] +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", + "serde", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -458,6 +604,15 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +[[package]] +name = "portable-atomic-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +dependencies = [ + "portable-atomic", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -467,6 +622,17 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "priority-queue" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93980406f12d9f8140ed5abe7155acb10bb1e69ea55c88960b9c2f117445ef96" +dependencies = [ + "equivalent", + "indexmap", + "serde", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -684,6 +850,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_pcg" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b48ac3f7ffaab7fac4d2376632268aa5f89abdb55f7ebf8f4d11fffccb2320f7" +dependencies = [ + "rand_core 0.9.3", +] + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -693,6 +868,43 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex-automata" version = "0.4.14" @@ -723,6 +935,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustworkx-core" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaeee6f84153fd6f62507fc22bfe9499c8485075b44186dcbb918166ef75116f" +dependencies = [ + "fixedbitset", + "foldhash", + "hashbrown 0.15.5", + "indexmap", + "ndarray", + "num-traits", + "petgraph", + "priority-queue", + "rand 0.9.2", + "rand_distr", + "rand_pcg", + "rayon", + "rayon-cond", +] + [[package]] name = "rusty-fork" version = "0.3.1" @@ -748,6 +981,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ "serde_core", + "serde_derive", ] [[package]] diff --git a/ben/Cargo.toml b/ben/Cargo.toml index 14fea11..b69493f 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -18,7 +18,10 @@ name = "binary_ensemble" byteorder = "1.5.0" clap = { version = "^4.5.2", features = ["derive"] } pcompress = "1.0.7" +petgraph = "0.8.3" pipe = "0.4.0" +rustworkx-core = "0.17.1" +serde = {version = "1.0.228", features = ["derive"]} serde_json = "^1.0.107" thiserror = "2.0.18" tracing = "0.1.41" diff --git a/ben/src/json/graph/errors.rs b/ben/src/json/graph/errors.rs new file mode 100644 index 0000000..445e03a --- /dev/null +++ b/ben/src/json/graph/errors.rs @@ -0,0 +1,60 @@ +use serde_json::Value; +use std::fmt; + +/// Errors that can occur when converting between [`NxGraphAdjFormat`] and +/// [`PetxGraph`]. +#[derive(Debug)] +pub(crate) enum NxPetgraphError { + /// The `directed` flag on the input does not match the target graph type. + DirectednessMismatch { + /// The directedness expected by the target type. + expected_directed: bool, + /// The directedness found in the input data. + found_directed: bool, + }, + /// The `nodes` and `adjacency` arrays have different lengths. + NodeAdjacencyLengthMismatch { + /// Number of entries in the `nodes` array. + n_nodes: usize, + /// Number of entries in the `adjacency` array. + n_adjacency_items: usize, + }, + /// A node id appears more than once in the `nodes` array. + DuplicateNodeId(Value), + /// An adjacency entry references a node id not present in `nodes`. + MissingNeighborNode(Value), + /// A catch-all for other conversion errors. + Other(String), +} + +impl fmt::Display for NxPetgraphError { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::DirectednessMismatch { + expected_directed, + found_directed, + } => write!( + formatter, + "directedness mismatch: expected_directed={}, found_directed={}", + expected_directed, found_directed + ), + Self::NodeAdjacencyLengthMismatch { + n_nodes: nodes, + n_adjacency_items: adjacency, + } => write!( + formatter, + "nodes/adjacency length mismatch: {} nodes but {} adjacency lists", + nodes, adjacency + ), + Self::DuplicateNodeId(id) => { + write!(formatter, "duplicate node id in NetworkX data: {}", id) + } + Self::MissingNeighborNode(id) => { + write!(formatter, "adjacency references unknown node id: {}", id) + } + Self::Other(msg) => write!(formatter, "{}", msg), + } + } +} + +impl std::error::Error for NxPetgraphError {} diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs new file mode 100644 index 0000000..422c5bf --- /dev/null +++ b/ben/src/json/graph/mlc.rs @@ -0,0 +1,394 @@ +use super::petxgraph::{apply_permutation, PetxGraph}; +use super::rcm::{local_degree_in_component, rcm_component}; +use petgraph::graph::{Graph, NodeIndex}; +use petgraph::visit::NodeIndexable; +use rustworkx_core::connectivity::connected_components; +use std::cmp::Reverse; +use std::collections::HashSet; + +/// Tracks how many original nodes have been finalized so far and emits +/// periodic `tracing::info!` milestones when verbose logging is enabled. +/// +/// Progress is measured in real-node chunks (base-case RCM calls and +/// per-cluster RCM calls at depth 0). Coarse-graph recursion does not +/// contribute, so `total` corresponds exactly to the number of nodes in +/// the original graph. +struct MlcProgress { + total: usize, + done: usize, + last_logged_pct: usize, +} + +impl MlcProgress { + /// Create a new progress tracker for a graph with `total` real nodes. + fn new(total: usize) -> Self { + Self { + total, + done: 0, + last_logged_pct: 0, + } + } + + /// Record that `chunk` more real nodes have been finalized. Emits an + /// `info!` log line whenever completion crosses a 5% boundary. + fn add(&mut self, chunk: usize) { + self.done += chunk; + let pct = if self.total == 0 { + 100 + } else { + self.done * 100 / self.total + }; + if pct >= self.last_logged_pct + 5 || self.done == self.total { + tracing::info!( + "MLC progress: {}/{} nodes ({}%)", + self.done, + self.total, + pct + ); + self.last_logged_pct = pct; + } + } +} + +/// Compute a multilevel cluster ordering and apply it to the graph in place. +/// +/// The graph is reordered so that nodes which are topologically close end up +/// at adjacent indices. Each connected component is ordered independently, +/// and components are sorted by their minimum node index. +/// +/// Arguments: +/// +/// - `petx_graph`: The graph to reorder in place. Only edge topology is +/// considered; node and edge attributes are preserved but relocated. +/// +/// Returns: +/// +/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` +/// the node occupied before reordering. +pub(super) fn apply_multi_level_clustering(petx_graph: &mut PetxGraph) -> Vec +where + Ty: petgraph::EdgeType, +{ + let total_nodes = petx_graph.graph.node_count(); + tracing::info!("MLC: starting on graph with {} nodes", total_nodes); + + let labels: Vec = (0..petx_graph.graph.node_bound()).collect(); + let mut progress = MlcProgress::new(total_nodes); + let order = mlc_order_inner(&petx_graph.graph, &labels, Some(&mut progress), 0); + *petx_graph = apply_permutation(petx_graph, &order); + + tracing::info!("MLC: complete"); + order +} + +/// Recursively order each connected component via multilevel clustering, then +/// concatenate the results. +/// +/// Components are sorted by their minimum label so that the output order is +/// deterministic. Each component is ordered independently by +/// [`mlc_component`]. +/// +/// # Arguments +/// +/// * `graph` - The input graph to order. Generic over node/edge weights and +/// edge type so it also works with the coarse graph during recursion. +/// * `labels` - A per-node label vector used for tie-breaking when choosing +/// BFS seeds and sorting neighbors. Indexed by `NodeIndex::index()`. +/// * `progress` - Optional progress tracker. `Some(_)` at the top level and +/// `None` when recursing into the coarse graph so only real-node work +/// contributes to the counter. +/// * `depth` - Recursion depth. Zero at the top level, incremented each +/// time we recurse into a coarse graph. Used only for logging. +/// +/// # Returns +/// +/// A permutation vector where `order[new_index]` is the `NodeIndex` of the +/// node that should occupy position `new_index`. +fn mlc_order_inner( + graph: &Graph, + labels: &[usize], + mut progress: Option<&mut MlcProgress>, + depth: usize, +) -> Vec +where + Ty: petgraph::EdgeType, +{ + let mut components: Vec> = connected_components(graph) + .into_iter() + .map(|set| set.into_iter().collect()) + .collect(); + components.sort_by_key(|c| { + c.iter() + .map(|n| labels[n.index()]) + .min() + .unwrap_or(usize::MAX) + }); + + tracing::debug!( + "MLC depth={}: {} component(s) to order", + depth, + components.len() + ); + + let mut order = Vec::with_capacity(graph.node_count()); + for component in components { + order.extend(mlc_component( + graph, + labels, + &component, + progress.as_deref_mut(), + depth, + )); + } + order +} + +/// Recursively order a single connected component via multilevel clustering. +/// +/// Single-node components are returned as-is. Otherwise the component is +/// greedily partitioned into clusters; each cluster is then ordered by +/// recursively applying `mlc_component` to it, a coarse graph of +/// inter-cluster edges is built, and the coarse graph is ordered via +/// [`mlc_order_inner`] to determine the final cluster sequence. +/// +/// If the greedy partition produces a single cluster (or the unreachable +/// all-singletons case), the algorithm cannot make progress and falls back +/// to RCM on the whole component. +/// +/// # Arguments +/// +/// * `graph` - The full graph (only edges within `component` are relevant). +/// * `labels` - Per-node labels for tie-breaking, indexed by +/// `NodeIndex::index()`. +/// * `component` - The subset of `NodeIndex` values to order. +/// * `progress` - Optional progress tracker, `Some(_)` only when ordering +/// real nodes. Advanced when the recursion bottoms out at a singleton +/// component or hits the degenerate RCM fallback. +/// * `depth` - Recursion depth, zero at the top level. Used for logging. +/// +/// # Returns +/// +/// A permutation of the nodes in `component` representing their new order. +fn mlc_component( + graph: &Graph, + labels: &[usize], + component: &[NodeIndex], + mut progress: Option<&mut MlcProgress>, + depth: usize, +) -> Vec +where + Ty: petgraph::EdgeType, +{ + if component.len() == 1 { + tracing::debug!("MLC depth={}: singleton component", depth); + if let Some(p) = progress.as_deref_mut() { + p.add(1); + } + return vec![component[0]]; + } + + let clusters = greedy_cluster_partition(graph, labels, component); + if clusters.len() <= 1 || clusters.len() == component.len() { + tracing::debug!( + "MLC depth={}: degenerate partition ({} clusters from {} nodes), falling back to RCM", + depth, + clusters.len(), + component.len() + ); + let order = rcm_component(graph, labels, component); + if let Some(p) = progress.as_deref_mut() { + p.add(component.len()); + } + return order; + } + + tracing::debug!( + "MLC depth={}: partitioned {} nodes into {} clusters", + depth, + component.len(), + clusters.len() + ); + + let mut cluster_orders: Vec> = Vec::with_capacity(clusters.len()); + for cluster in &clusters { + let order = mlc_component(graph, labels, cluster, progress.as_deref_mut(), depth + 1); + cluster_orders.push(order); + } + + let (coarse_graph, coarse_labels) = build_coarse_graph(graph, labels, &clusters); + let coarse_order = mlc_order_inner(&coarse_graph, &coarse_labels, None, depth + 1); + + let mut order = Vec::with_capacity(component.len()); + for coarse_node in coarse_order { + order.extend(cluster_orders[coarse_node.index()].iter().copied()); + } + order +} + +/// Partition a component into small clusters using a greedy seed-expansion +/// strategy. +/// +/// Seeds are chosen in order of increasing local degree (ties broken by label). +/// Each seed expands to include all of its unassigned neighbors. After each +/// cluster is formed, local degrees are incrementally updated: for every +/// unassigned neighbor of a newly-assigned node, the neighbor's degree is +/// decremented. Nodes are then re-sorted before picking the next seed. +/// +/// # Arguments +/// +/// * `graph` - The full graph (only edges within `component` are relevant). +/// * `labels` - Per-node labels for tie-breaking, indexed by +/// `NodeIndex::index()`. +/// * `component` - The subset of `NodeIndex` values to partition. +/// +/// # Returns +/// +/// A vector of clusters, where each cluster is a vector of `NodeIndex` +/// values. Every node in `component` appears in exactly one cluster. +fn greedy_cluster_partition( + graph: &Graph, + labels: &[usize], + component: &[NodeIndex], +) -> Vec> +where + Ty: petgraph::EdgeType, +{ + let component_set: HashSet = component.iter().copied().collect(); + let mut local_deg = local_degree_in_component(graph, &component_set, component); + + let mut assigned = vec![false; graph.node_bound()]; + let mut remaining: Vec = component.to_vec(); + let mut clusters = Vec::new(); + + // Epoch-based marking for seed neighbors avoids rebuilding a set each + // iteration. + let mut seed_marks = vec![0usize; graph.node_bound()]; + let mut mark_epoch = 1usize; + + while !remaining.is_empty() { + remaining.sort_by_key(|&node| (local_deg[node.index()], labels[node.index()])); + let seed = remaining[0]; + + let mut cluster = vec![seed]; + assigned[seed.index()] = true; + + for neighbor in graph.neighbors(seed) { + if component_set.contains(&neighbor) { + seed_marks[neighbor.index()] = mark_epoch; + } + } + + let mut candidates: Vec = graph + .neighbors(seed) + .filter(|&n| component_set.contains(&n) && !assigned[n.index()]) + .collect(); + candidates.sort_by_key(|&neighbor| { + let shared = graph + .neighbors(neighbor) + .filter(|&next| { + component_set.contains(&next) && seed_marks[next.index()] == mark_epoch + }) + .count(); + ( + Reverse(shared), + local_deg[neighbor.index()], + labels[neighbor.index()], + ) + }); + + for neighbor in candidates { + assigned[neighbor.index()] = true; + cluster.push(neighbor); + } + + mark_epoch = mark_epoch.wrapping_add(1); + if mark_epoch == 0 { + seed_marks.fill(0); + mark_epoch = 1; + } + + // Decrement degrees of unassigned nodes adjacent to the new cluster. + for &node in &cluster { + for neighbor in graph.neighbors(node) { + if component_set.contains(&neighbor) && !assigned[neighbor.index()] { + local_deg[neighbor.index()] -= 1; + } + } + } + + remaining.retain(|&n| !assigned[n.index()]); + clusters.push(cluster); + } + + clusters +} + +/// Build a coarse graph where each cluster is contracted into a single node. +/// +/// The coarse graph is always undirected: an edge exists between two coarse +/// nodes whenever any original-graph edge connects their clusters. Each coarse +/// node's label is the minimum original label among its cluster members. +/// +/// # Arguments +/// +/// * `graph` - The full graph containing the original edges. +/// * `labels` - Per-node labels for the original graph, indexed by +/// `NodeIndex::index()`. +/// * `clusters` - The partition produced by [`greedy_cluster_partition`]. +/// Cluster `i` maps to coarse node `i`. +/// +/// # Returns +/// +/// A tuple of: +/// * The coarse `Graph<(), (), Undirected>` with one node per cluster and +/// one edge per inter-cluster connection. +/// * A label vector for the coarse graph (one entry per cluster), where +/// each label is the minimum original label in that cluster. +fn build_coarse_graph( + graph: &Graph, + labels: &[usize], + clusters: &[Vec], +) -> (Graph<(), (), petgraph::Undirected>, Vec) +where + Ty: petgraph::EdgeType, +{ + let mut cluster_of = vec![usize::MAX; graph.node_bound()]; + for (ci, cluster) in clusters.iter().enumerate() { + for &node in cluster { + cluster_of[node.index()] = ci; + } + } + + let mut coarse_graph = Graph::<(), (), petgraph::Undirected>::with_capacity(clusters.len(), 0); + for _ in 0..clusters.len() { + coarse_graph.add_node(()); + } + + let mut seen_edges: HashSet<(usize, usize)> = HashSet::new(); + for (ci, cluster) in clusters.iter().enumerate() { + for &node in cluster { + for neighbor in graph.neighbors(node) { + let nc = cluster_of[neighbor.index()]; + if nc != ci && nc != usize::MAX { + let canonical = if ci < nc { (ci, nc) } else { (nc, ci) }; + if seen_edges.insert(canonical) { + coarse_graph.add_edge(NodeIndex::new(ci), NodeIndex::new(nc), ()); + } + } + } + } + } + + let coarse_labels: Vec = clusters + .iter() + .map(|cluster| { + cluster + .iter() + .map(|n| labels[n.index()]) + .min() + .unwrap_or(usize::MAX) + }) + .collect(); + + (coarse_graph, coarse_labels) +} diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 013e571..bb0c8d9 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -1,11 +1,19 @@ //! JSON graph helpers used by relabeling workflows. -use crate::progress; -use serde_json::{json, Value}; -use std::cmp::{Ordering, Reverse}; -use std::collections::{HashMap, HashSet, VecDeque}; -use std::io::{self, Read, Result, Write}; -use std::result::Result as StdResult; +use std::collections::HashMap; +use std::io::{self, Error, ErrorKind, Read, Result, Write}; + +mod errors; +mod mlc; +mod nx_formats; +mod petxgraph; +mod rcm; + +use errors::NxPetgraphError; +use nx_formats::NxGraphAdjFormat; +use petgraph::graph::NodeIndex; +use petgraph::{Directed, Undirected}; +use petxgraph::PetxGraph; /// Topology-based graph ordering methods supported by `reben`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -16,695 +24,186 @@ pub enum GraphOrderingMethod { ReverseCuthillMckee, } -#[derive(Clone)] -struct GraphJson { - data: Value, - nodes: Vec, - adjacency: Vec>, - node_ids: Vec, - adjacency_indices: Vec>, -} - -impl GraphJson { - /// Deserialize a NetworkX node-link JSON graph from a reader. - /// - /// # Arguments - /// - /// * `reader` - A source implementing `Read` that provides the JSON data. - /// - /// # Returns - /// - /// Returns a parsed `GraphJson` with precomputed node ids and adjacency indices. - fn from_reader(reader: R) -> io::Result { - let data: Value = serde_json::from_reader(reader)?; - let nodes = data["nodes"].as_array().cloned().unwrap_or_default(); - let adjacency = data["adjacency"] - .as_array() - .map(|rows| { - rows.iter() - .map(|row| row.as_array().cloned().unwrap_or_default()) - .collect::>() - }) - .unwrap_or_default(); - - let node_ids = nodes - .iter() - .map(parse_node_id) - .collect::>>()?; - let id_to_index = node_ids - .iter() - .enumerate() - .map(|(idx, &id)| (id, idx)) - .collect::>(); - let adjacency_indices = adjacency - .iter() - .map(|row| { - row.iter() - .map(|link| { - let id = parse_link_id(link)?; - id_to_index.get(&id).copied().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Adjacency references unknown node id {id}"), - ) - }) - }) - .collect::>>() - }) - .collect::>>()?; - - Ok(Self { - data, - nodes, - adjacency, - node_ids, - adjacency_indices, - }) - } -} - -/// Sorts a JSON-formatted NetworkX graph file by a key. +/// Sorts a JSON-formatted NetworkX graph file by a node attribute. +/// +/// Reads a NetworkX adjacency-format JSON graph, reorders nodes so that they +/// are sorted by the given attribute key, and writes the reordered graph back +/// as JSON. /// /// # Arguments /// -/// * `reader` - The source JSON graph in the NetworkX node-link style used by -/// the relabeling workflow. -/// * `writer` - The destination for the sorted JSON graph. -/// * `key` - The node attribute used to determine the new ordering. +/// * `reader` - A source of JSON bytes in NetworkX adjacency format. +/// * `writer` - Destination for the reordered JSON output. +/// * `key` - The node attribute name to sort by. Use `"id"` to sort by the +/// NetworkX node id. /// /// # Returns /// -/// Returns a map from the original node id to the new node id. +/// A map from each original node id to its new (post-sort) node id. pub fn sort_json_file_by_key( reader: R, writer: W, key: &str, ) -> Result> { tracing::trace!("Loading JSON file..."); - let graph = GraphJson::from_reader(reader)?; - let mut order: Vec = (0..graph.nodes.len()).collect(); + let nx_graph: NxGraphAdjFormat = serde_json::from_reader(reader)?; + let original_ids = extract_usize_ids(&nx_graph)?; tracing::trace!("Sorting JSON file by key: {}", key); - order.sort_by(|&a, &b| compare_node_key(&graph.nodes[a], &graph.nodes[b], key)); + let (result, order) = if nx_graph.directed { + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = petxgraph::sort_by_key(&mut petx, key); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + (result, order) + } else { + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = petxgraph::sort_by_key(&mut petx, key); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + (result, order) + }; - reorder_graph(graph, order, writer) + write_nx_graph(writer, &result)?; + Ok(build_id_mapping(&original_ids, &order)) } /// Reorder a JSON-formatted NetworkX graph file using a topology-based method. /// +/// Reads a NetworkX adjacency-format JSON graph, reorders nodes using the +/// specified graph ordering algorithm, and writes the reordered graph back +/// as JSON. +/// /// # Arguments /// -/// * `reader` - The source JSON graph in the NetworkX node-link style used by -/// the relabeling workflow. -/// * `writer` - The destination for the reordered JSON graph. +/// * `reader` - A source of JSON bytes in NetworkX adjacency format. +/// * `writer` - Destination for the reordered JSON output. /// * `method` - The topology-based ordering algorithm to apply. /// /// # Returns /// -/// Returns a map from the original node id to the new node id. +/// A map from each original node id to its new (post-sort) node id. pub fn sort_json_file_by_ordering( reader: R, writer: W, method: GraphOrderingMethod, ) -> Result> { tracing::trace!("Loading JSON file..."); - let graph = GraphJson::from_reader(reader)?; - tracing::trace!("Sorting JSON file by ordering method: {:?}", method); - - let order = match method { - GraphOrderingMethod::MultiLevelCluster => multi_level_cluster_order(&graph), - GraphOrderingMethod::ReverseCuthillMckee => reverse_cuthill_mckee_order(&graph), - }; - - reorder_graph(graph, order, writer) -} + let nx_graph: NxGraphAdjFormat = serde_json::from_reader(reader)?; + let original_ids = extract_usize_ids(&nx_graph)?; -/// Extract the `id` field from a node JSON value as a `usize`. -/// -/// # Arguments -/// -/// * `node` - A JSON value representing a graph node. -/// -/// # Returns -/// -/// Returns the node id, or an error if the field is missing or not an unsigned integer. -fn parse_node_id(node: &Value) -> io::Result { - node["id"].as_u64().map(|v| v as usize).ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Node id is not an unsigned integer: {}", node["id"]), - ) - }) -} - -/// Extract the `id` field from an adjacency link JSON value as a `usize`. -/// -/// # Arguments -/// -/// * `link` - A JSON value representing an adjacency link (edge target). -/// -/// # Returns -/// -/// Returns the target node id, or an error if the field is missing or not an unsigned integer. -fn parse_link_id(link: &Value) -> io::Result { - link["id"].as_u64().map(|v| v as usize).ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Edge target id is not an unsigned integer: {}", link["id"]), - ) - }) -} - -/// Compare two nodes by a named attribute, using numeric ordering when possible. -/// -/// # Arguments -/// -/// * `a` - The first node JSON value. -/// * `b` - The second node JSON value. -/// * `key` - The attribute name to compare. -/// -/// # Returns -/// -/// Returns the ordering between the two nodes based on the attribute value. -fn compare_node_key(a: &Value, b: &Value, key: &str) -> Ordering { - let extract_value = |val: &Value| -> StdResult { - match &val[key] { - Value::String(s) => s.parse::().map_err(|_| s.clone()), - Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()), - _ => Err(val[key].to_string()), - } + tracing::trace!("Sorting JSON file by ordering method: {:?}", method); + let (result, order) = if nx_graph.directed { + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = run_ordering_method(&mut petx, method); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + (result, order) + } else { + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = run_ordering_method(&mut petx, method); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + (result, order) }; - match (extract_value(a), extract_value(b)) { - (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), - (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), - (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), - (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), - } -} - -/// Apply a permutation to a graph and write the relabeled JSON to a writer. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph to reorder. -/// * `order` - A permutation where `order[new_index]` gives the old index. -/// * `writer` - The destination for the reordered JSON output. -/// -/// # Returns -/// -/// Returns a map from original node id to new node id. -fn reorder_graph( - mut graph: GraphJson, - order: Vec, - mut writer: W, -) -> io::Result> { - let mut old_id_to_new = HashMap::with_capacity(order.len()); - let mut new_nodes = Vec::with_capacity(order.len()); - let mut new_adjacency = Vec::with_capacity(order.len()); - - for (new_idx, &old_idx) in order.iter().enumerate() { - progress!("Relabeling node: {}\r", new_idx + 1); - old_id_to_new.insert(graph.node_ids[old_idx], new_idx); - } - tracing::trace!(""); - - for (new_idx, &old_idx) in order.iter().enumerate() { - let mut node = graph.nodes[old_idx].clone(); - node["id"] = json!(new_idx); - new_nodes.push(node); - } - - for (new_idx, &old_idx) in order.iter().enumerate() { - progress!("Relabeling edge: {}\r", new_idx + 1); - let mut new_edge_lst = graph.adjacency[old_idx].clone(); - for link in &mut new_edge_lst { - let old_neighbor_id = parse_link_id(link)?; - let new_neighbor = old_id_to_new[&old_neighbor_id]; - link["id"] = json!(new_neighbor); - } - new_adjacency.push(Value::Array(new_edge_lst)); - } - tracing::trace!(""); - - graph.data["nodes"] = Value::Array(new_nodes); - graph.data["adjacency"] = Value::Array(new_adjacency); - - tracing::trace!("Writing new json to file..."); - let rendered = serde_json::to_string(&graph.data) - .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?; - writer.write_all(rendered.as_bytes())?; - - Ok(old_id_to_new) -} - -/// Find connected components of a graph using breadth-first search. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph to decompose. -/// -/// # Returns -/// -/// Returns a list of components, each a vector of node indices, sorted by -/// smallest original node id. -fn connected_components(graph: &GraphJson) -> Vec> { - let n = graph.nodes.len(); - let mut seen = vec![false; n]; - let mut components = Vec::new(); - - for start in 0..n { - if seen[start] { - continue; - } - let mut queue = VecDeque::from([start]); - let mut component = Vec::new(); - seen[start] = true; - - while let Some(node) = queue.pop_front() { - component.push(node); - for &neighbor in &graph.adjacency_indices[node] { - if !seen[neighbor] { - seen[neighbor] = true; - queue.push_back(neighbor); - } - } - } - - components.push(component); - } - - components.sort_by_key(|component| graph.node_ids[component[0]]); - components -} - -/// Compute a Reverse Cuthill-McKee ordering for the entire graph. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph to order. -/// -/// # Returns -/// -/// Returns a permutation of node indices that reduces bandwidth. -fn reverse_cuthill_mckee_order(graph: &GraphJson) -> Vec { - let mut order = Vec::with_capacity(graph.nodes.len()); - - for component in connected_components(graph) { - order.extend(reverse_cuthill_mckee_component(graph, &component)); - } - - order -} - -/// Compute a Reverse Cuthill-McKee ordering for a single connected component. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph. -/// * `component` - The node indices belonging to the component. -/// -/// # Returns -/// -/// Returns a reversed BFS ordering of the component starting from the -/// minimum-degree node. -fn reverse_cuthill_mckee_component(graph: &GraphJson, component: &[usize]) -> Vec { - let degrees = graph - .adjacency_indices - .iter() - .map(Vec::len) - .collect::>(); - let component_set = component - .iter() - .copied() - .collect::>(); - let start = component - .iter() - .copied() - .min_by_key(|&node| (degrees[node], graph.node_ids[node])) - .unwrap(); - - let mut visited = vec![false; graph.nodes.len()]; - let mut queue = VecDeque::from([start]); - visited[start] = true; - let mut component_order = Vec::with_capacity(component.len()); - - while let Some(node) = queue.pop_front() { - component_order.push(node); - let mut neighbors = graph.adjacency_indices[node] - .iter() - .copied() - .filter(|neighbor| component_set.contains(neighbor) && !visited[*neighbor]) - .collect::>(); - neighbors.sort_by_key(|&neighbor| (degrees[neighbor], graph.node_ids[neighbor])); - for neighbor in neighbors { - visited[neighbor] = true; - queue.push_back(neighbor); - } - } - - component_order.reverse(); - component_order -} - -/// Compute a multilevel cluster ordering for the entire graph. -/// -/// # Arguments -/// -/// * `graph` - The parsed graph to order. -/// -/// # Returns -/// -/// Returns a permutation of node indices produced by recursive multilevel -/// clustering. -fn multi_level_cluster_order(graph: &GraphJson) -> Vec { - multilevel_cluster_order_generic(&graph.adjacency_indices, &graph.node_ids) -} - -fn subset_mask(size: usize, nodes: &[usize]) -> Vec { - let mut mask = vec![false; size]; - for &node in nodes { - mask[node] = true; - } - mask -} - -/// Find connected components of a generic adjacency list using breadth-first search. -/// -/// # Arguments -/// -/// * `adjacency` - The adjacency list for each node. -/// * `labels` - Node labels used to sort components by smallest label. -/// -/// # Returns -/// -/// Returns a list of components, each a vector of node indices, sorted by -/// minimum label. -fn connected_components_generic(adjacency: &[Vec], labels: &[usize]) -> Vec> { - let mut seen = vec![false; adjacency.len()]; - let mut components = Vec::new(); - - for start in 0..adjacency.len() { - if seen[start] { - continue; - } - let mut queue = VecDeque::from([start]); - let mut component = Vec::new(); - seen[start] = true; - - while let Some(node) = queue.pop_front() { - component.push(node); - for &neighbor in &adjacency[node] { - if !seen[neighbor] { - seen[neighbor] = true; - queue.push_back(neighbor); - } - } - } - - components.push(component); - } - - components.sort_by_key(|component| { - component - .iter() - .map(|&node| labels[node]) - .min() - .unwrap_or(usize::MAX) - }); - components + write_nx_graph(writer, &result)?; + Ok(build_id_mapping(&original_ids, &order)) } -/// Compute a Reverse Cuthill-McKee ordering for a component of a generic graph. +/// Dispatch to the appropriate ordering algorithm. /// /// # Arguments /// -/// * `adjacency` - The adjacency list for each node. -/// * `labels` - Node labels used for tie-breaking. -/// * `component` - The node indices belonging to the component. +/// * `petx` - The graph to reorder in place. +/// * `method` - Which ordering algorithm to run. /// /// # Returns /// -/// Returns a reversed BFS ordering of the component starting from the -/// minimum-degree node. -fn rcm_component_generic( - adjacency: &[Vec], - labels: &[usize], - component: &[usize], -) -> Vec { - let component_mask = subset_mask(adjacency.len(), component); - let local_degree = local_degree_in_subset(adjacency, &component_mask, component); - let start = component - .iter() - .copied() - .min_by_key(|&node| (local_degree[node], labels[node])) - .unwrap(); - - let mut visited = vec![false; adjacency.len()]; - let mut queue = VecDeque::from([start]); - let mut order = Vec::with_capacity(component.len()); - visited[start] = true; - - while let Some(node) = queue.pop_front() { - order.push(node); - let mut neighbors = adjacency[node] - .iter() - .copied() - .filter(|&neighbor| component_mask[neighbor] && !visited[neighbor]) - .collect::>(); - neighbors.sort_by_key(|&neighbor| (local_degree[neighbor], labels[neighbor])); - for neighbor in neighbors { - visited[neighbor] = true; - queue.push_back(neighbor); - } +/// The permutation that was applied: `order[new_index]` is the `NodeIndex` +/// the node occupied before reordering. +fn run_ordering_method( + petx: &mut PetxGraph, + method: GraphOrderingMethod, +) -> Vec { + match method { + GraphOrderingMethod::MultiLevelCluster => mlc::apply_multi_level_clustering(petx), + GraphOrderingMethod::ReverseCuthillMckee => rcm::apply_reverse_cuthill_mckee(petx), } - - order.reverse(); - order } -/// Compute a multilevel cluster ordering for a generic graph. +/// Extract the integer node ids from an [`NxGraphAdjFormat`] in order. /// /// # Arguments /// -/// * `adjacency` - The adjacency list for each node. -/// * `labels` - Node labels used for tie-breaking and component sorting. +/// * `nx_graph` - The parsed NetworkX graph whose node ids are extracted. /// /// # Returns /// -/// Returns a permutation of node indices produced by recursive multilevel -/// clustering across all connected components. -fn multilevel_cluster_order_generic(adjacency: &[Vec], labels: &[usize]) -> Vec { - let mut order = Vec::with_capacity(adjacency.len()); - for component in connected_components_generic(adjacency, labels) { - order.extend(multilevel_cluster_component_generic( - adjacency, labels, &component, - )); - } - order -} - -/// Compute a multilevel cluster ordering for a single component of a generic graph. -/// -/// # Arguments -/// -/// * `adjacency` - The adjacency list for each node. -/// * `labels` - Node labels used for tie-breaking. -/// * `component` - The node indices belonging to the component. +/// A vector of `usize` ids in the same order as `nx_graph.nodes`. /// -/// # Returns +/// # Errors /// -/// Returns an ordering that recursively partitions the component into clusters, -/// orders each cluster with RCM, builds a coarse graph of clusters, and recurses. -fn multilevel_cluster_component_generic( - adjacency: &[Vec], - labels: &[usize], - component: &[usize], -) -> Vec { - if component.len() <= 8 { - return rcm_component_generic(adjacency, labels, component); - } - - let clusters = greedy_cluster_partition(adjacency, labels, component, 6); - if clusters.len() <= 1 || clusters.len() == component.len() { - return rcm_component_generic(adjacency, labels, component); - } - - let cluster_orders = clusters +/// Returns an error if any node id is not a non-negative integer. +fn extract_usize_ids(nx_graph: &NxGraphAdjFormat) -> io::Result> { + nx_graph + .nodes .iter() - .map(|cluster| rcm_component_generic(adjacency, labels, cluster)) - .collect::>(); - let (coarse_adjacency, coarse_labels) = build_coarse_graph(adjacency, labels, &clusters); - let coarse_order = multilevel_cluster_order_generic(&coarse_adjacency, &coarse_labels); - - let mut order = Vec::with_capacity(component.len()); - for cluster_idx in coarse_order { - order.extend(cluster_orders[cluster_idx].iter().copied()); - } - order + .map(|n| { + n.id.as_u64().map(|v| v as usize).ok_or_else(|| { + Error::new( + ErrorKind::InvalidData, + format!("Node id is not an unsigned integer: {}", n.id), + ) + }) + }) + .collect() } -/// Partition a component into small clusters using a greedy seed-expansion strategy. +/// Build a mapping from original node ids to new positional ids. /// /// # Arguments /// -/// * `adjacency` - The adjacency list for each node. -/// * `labels` - Node labels used for tie-breaking. -/// * `component` - The node indices to partition. -/// * `max_cluster_size` - The maximum number of nodes per cluster. +/// * `original_ids` - The node ids before reordering, indexed by the old +/// node position. +/// * `order` - The permutation that was applied: `order[new_index]` is the +/// old `NodeIndex`. /// /// # Returns /// -/// Returns a list of clusters, each a vector of node indices. -fn greedy_cluster_partition( - adjacency: &[Vec], - labels: &[usize], - component: &[usize], - max_cluster_size: usize, -) -> Vec> { - let component_mask = subset_mask(adjacency.len(), component); - let local_degree = local_degree_in_subset(adjacency, &component_mask, component); - let mut assigned = vec![false; adjacency.len()]; - let mut unassigned = component.to_vec(); - unassigned.sort_by_key(|&node| (local_degree[node], labels[node])); - let mut remaining = unassigned.len(); - let mut clusters = Vec::new(); - let mut seed_marks = vec![0usize; adjacency.len()]; - let mut mark_epoch = 1usize; - - while remaining > 0 { - let seed = unassigned - .iter() - .copied() - .find(|&node| !assigned[node]) - .unwrap(); - - let mut cluster = vec![seed]; - assigned[seed] = true; - remaining -= 1; - for &neighbor in &adjacency[seed] { - if component_mask[neighbor] { - seed_marks[neighbor] = mark_epoch; - } - } - - let mut candidates = adjacency[seed] - .iter() - .copied() - .filter(|&neighbor| component_mask[neighbor] && !assigned[neighbor]) - .collect::>(); - candidates.sort_by_key(|&neighbor| { - let shared = adjacency[neighbor] - .iter() - .filter(|&&next| component_mask[next] && seed_marks[next] == mark_epoch) - .count(); - (Reverse(shared), local_degree[neighbor], labels[neighbor]) - }); - - for neighbor in candidates - .into_iter() - .take(max_cluster_size.saturating_sub(1)) - { - assigned[neighbor] = true; - remaining -= 1; - cluster.push(neighbor); - } - - mark_epoch = mark_epoch.wrapping_add(1); - if mark_epoch == 0 { - seed_marks.fill(0); - mark_epoch = 1; - } - - clusters.push(cluster); +/// A map where `mapping[original_id] == new_positional_id`. +fn build_id_mapping(original_ids: &[usize], order: &[NodeIndex]) -> HashMap { + let mut mapping = HashMap::with_capacity(order.len()); + for (new_idx, &old_node_idx) in order.iter().enumerate() { + mapping.insert(original_ids[old_node_idx.index()], new_idx); } - - clusters + mapping } -/// Compute the degree of each node restricted to a subset of the graph. +/// Serialize an [`NxGraphAdjFormat`] to JSON and write it to the given writer. /// /// # Arguments /// -/// * `adjacency` - The adjacency list for each node. -/// * `subset_mask` - A boolean mask indicating which nodes belong to the subset. -/// * `subset` - The node indices in the subset. +/// * `writer` - Destination for the JSON bytes. +/// * `nx_graph` - The graph to serialize. /// /// # Returns /// -/// Returns a vector indexed by node where each entry is the number of neighbors -/// within the subset. -fn local_degree_in_subset( - adjacency: &[Vec], - subset_mask: &[bool], - subset: &[usize], -) -> Vec { - let mut local_degree = vec![0usize; adjacency.len()]; - for &node in subset { - local_degree[node] = adjacency[node] - .iter() - .filter(|&&neighbor| subset_mask[neighbor]) - .count(); - } - local_degree +/// `Ok(())` on success, or an I/O error if serialization or writing fails. +fn write_nx_graph(mut writer: W, nx_graph: &NxGraphAdjFormat) -> io::Result<()> { + let rendered = + serde_json::to_string(nx_graph).map_err(|err| Error::new(ErrorKind::InvalidData, err))?; + writer.write_all(rendered.as_bytes()) } -/// Build a coarse graph where each cluster is contracted into a single node. +/// Convert an [`NxPetgraphError`] into a [`std::io::Error`] with +/// [`ErrorKind::InvalidData`]. /// /// # Arguments /// -/// * `adjacency` - The adjacency list of the original graph. -/// * `labels` - Node labels of the original graph. -/// * `clusters` - The cluster partition of the original graph. +/// * `e` - The conversion error to wrap. /// /// # Returns /// -/// Returns a tuple of the coarse adjacency list and coarse labels, where each -/// coarse label is the minimum original label in the cluster. -fn build_coarse_graph( - adjacency: &[Vec], - labels: &[usize], - clusters: &[Vec], -) -> (Vec>, Vec) { - let mut cluster_of = vec![usize::MAX; adjacency.len()]; - for (cluster_idx, cluster) in clusters.iter().enumerate() { - for &node in cluster { - cluster_of[node] = cluster_idx; - } - } - - let mut coarse_sets = vec![HashSet::new(); clusters.len()]; - for (cluster_idx, cluster) in clusters.iter().enumerate() { - for &node in cluster { - for &neighbor in &adjacency[node] { - let neighbor_cluster = cluster_of[neighbor]; - if neighbor_cluster != cluster_idx && neighbor_cluster != usize::MAX { - coarse_sets[cluster_idx].insert(neighbor_cluster); - } - } - } - } - - let coarse_adjacency = coarse_sets - .into_iter() - .map(|neighbors| { - let mut neighbors = neighbors.into_iter().collect::>(); - neighbors.sort_unstable(); - neighbors - }) - .collect::>(); - let coarse_labels = clusters - .iter() - .map(|cluster| { - cluster - .iter() - .map(|&node| labels[node]) - .min() - .unwrap_or(usize::MAX) - }) - .collect::>(); - - (coarse_adjacency, coarse_labels) +/// An `io::Error` carrying `e` as its inner cause. +fn nx_err(e: NxPetgraphError) -> Error { + Error::new(ErrorKind::InvalidData, e) } #[cfg(test)] diff --git a/ben/src/json/graph/nx_formats.rs b/ben/src/json/graph/nx_formats.rs new file mode 100644 index 0000000..5b002e9 --- /dev/null +++ b/ben/src/json/graph/nx_formats.rs @@ -0,0 +1,59 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::BTreeMap; + +/// A NetworkX graph in adjacency-format JSON. +/// +/// This is the Rust representation of the JSON produced by +/// `networkx.adjacency_data()`. All fields use `#[serde(default)]` so that +/// inputs which omit optional keys (e.g. `"directed"`) still deserialize +/// successfully. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct NxGraphAdjFormat { + /// Whether the graph is directed. Defaults to `false`. + #[serde(default)] + pub directed: bool, + /// Whether the graph allows parallel edges. Defaults to `false`. + #[serde(default)] + pub multigraph: bool, + /// Graph-level attributes as key/value pairs. + #[serde(default)] + pub graph: Vec<(String, Value)>, + + /// The list of nodes, each carrying an `id` and arbitrary attributes. + #[serde(default)] + pub nodes: Vec, + /// Adjacency lists parallel to `nodes`. `adjacency[i]` lists the + /// neighbors (and edge attributes) of `nodes[i]`. + #[serde(default)] + pub adjacency: Vec>, +} + +/// A single node in a [`NxGraphAdjFormat`]. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct NxNode { + /// The node identifier. May be an integer or a string. + #[serde(rename = "id")] + pub id: Value, + + /// All remaining node attributes (flattened from the JSON object). + #[serde(flatten)] + pub attrs: BTreeMap, +} + +/// A single entry in a node's adjacency list, representing one edge. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct NxAdjEntry { + /// The id of the neighbor node this edge points to. + #[serde(rename = "id")] + pub id: Value, + + /// The edge key, present only in multigraphs. Omitted from JSON when + /// `None`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub key: Option, + + /// All remaining edge attributes (flattened from the JSON object). + #[serde(flatten)] + pub attrs: BTreeMap, +} diff --git a/ben/src/json/graph/petxgraph/mod.rs b/ben/src/json/graph/petxgraph/mod.rs new file mode 100644 index 0000000..06437b6 --- /dev/null +++ b/ben/src/json/graph/petxgraph/mod.rs @@ -0,0 +1,57 @@ +mod nx_convert; +mod permutation; +mod sort; + +use super::nx_formats::NxAdjEntry; +use petgraph::graph::{DiGraph, Graph, UnGraph}; +use petgraph::{Directed, Undirected}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::BTreeMap; + +/// A single node in a [`PetxGraph`]. +/// +/// All NetworkX node attributes are stored in `attrs`, including the original +/// node id under the reserved key `"__networkx_id__"`. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct PetxNode { + /// Node attributes. Always contains `"__networkx_id__"` holding the + /// original (or current) NetworkX node id as a [`Value`]. + pub attrs: BTreeMap, +} + +/// A petgraph-backed graph that mirrors a NetworkX adjacency-format graph. +/// +/// The type parameter `Ty` is either [`Directed`] or [`Undirected`] and +/// determines the edge semantics of the underlying [`Graph`]. +/// +/// Graph-level attributes (the `"graph"` array in the NetworkX JSON) are +/// stored alongside the petgraph [`Graph`] so they survive roundtrips. +#[derive(Debug, Clone)] +pub(crate) struct PetxGraph +where + Ty: petgraph::EdgeType, +{ + /// Graph-level key/value attributes from the NetworkX JSON `"graph"` field. + pub graph_attrs: Vec<(String, Value)>, + /// The underlying petgraph graph. Nodes carry [`PetxNode`] weights and + /// edges carry [`NxAdjEntry`] weights. + pub graph: Graph, +} + +/// Convenience alias for a directed [`PetxGraph`]. +pub(crate) type PetxDiGraph = PetxGraph; +/// Convenience alias for an undirected [`PetxGraph`]. +pub(crate) type PetxUnGraph = PetxGraph; +/// Convenience alias for the inner directed petgraph type. +pub(crate) type PetxDiInnerGraph = DiGraph; +/// Convenience alias for the inner undirected petgraph type. +pub(crate) type PetxUnInnerGraph = UnGraph; + +pub(in crate::json::graph) use permutation::apply_permutation; +pub(in crate::json::graph) use sort::sort_by_key; + +#[cfg(test)] +pub(in crate::json::graph) use nx_convert::{ + graph_has_parallel_edges, nx_node_to_petx_node, petx_node_to_nx_node, +}; diff --git a/ben/src/json/graph/petxgraph/nx_convert.rs b/ben/src/json/graph/petxgraph/nx_convert.rs new file mode 100644 index 0000000..e3238bd --- /dev/null +++ b/ben/src/json/graph/petxgraph/nx_convert.rs @@ -0,0 +1,315 @@ +use super::super::errors::NxPetgraphError; +use super::super::nx_formats::{NxAdjEntry, NxGraphAdjFormat, NxNode}; +use super::{PetxGraph, PetxNode}; +use petgraph::graph::{Graph, NodeIndex}; +use petgraph::visit::{EdgeRef, IntoNodeReferences}; +use petgraph::{Directed, Undirected}; +use std::collections::{HashMap, HashSet}; + +/// Convert an [`NxNode`] into a [`PetxNode`]. +/// +/// The node's `id` field is moved into the attribute map under the reserved +/// key `"__networkx_id__"` so it can be recovered later. +/// +/// # Arguments +/// +/// * `nx_node` - The NetworkX node to convert. Consumed by this function. +/// +/// # Returns +/// +/// A [`PetxNode`] whose `attrs` map contains all original attributes plus +/// `"__networkx_id__"`. +pub(in crate::json::graph) fn nx_node_to_petx_node(nx_node: NxNode) -> PetxNode { + let mut attrs = nx_node.attrs; + attrs.insert("__networkx_id__".to_string(), nx_node.id); + PetxNode { attrs } +} + +/// Convert a [`PetxNode`] back into an [`NxNode`]. +/// +/// The `"__networkx_id__"` entry is removed from the attribute map and +/// placed back into the `id` field. +/// +/// # Arguments +/// +/// * `petx_node` - The petgraph node to convert. +/// +/// # Returns +/// +/// An [`NxNode`] with the original id and attributes restored. +/// +/// # Errors +/// +/// Returns [`NxPetgraphError::Other`] if the node has no +/// `"__networkx_id__"` attribute. +pub(in crate::json::graph) fn petx_node_to_nx_node( + petx_node: &PetxNode, +) -> Result { + let mut attrs = petx_node.attrs.clone(); + let id = attrs.remove("__networkx_id__").ok_or_else(|| { + NxPetgraphError::Other("missing __networkx_id__ on petgraph node".to_string()) + })?; + + Ok(NxNode { id, attrs }) +} + +/// Build a [`PetxGraph`] from a parsed [`NxGraphAdjFormat`]. +/// +/// Nodes are added in order and edges are extracted from the adjacency lists. +/// For undirected graphs, duplicate `(u,v)` / `(v,u)` entries are +/// deduplicated so each edge is stored only once. +/// +/// # Arguments +/// +/// * `nx_graph` - The parsed NetworkX graph. Consumed by this function. +/// * `is_directed` - Whether the target graph should be directed. Must match +/// `nx_graph.directed`. +/// +/// # Returns +/// +/// A [`PetxGraph`] with the same topology and attributes. +/// +/// # Errors +/// +/// Returns an [`NxPetgraphError`] if: +/// * `nx_graph.directed` does not match `is_directed`. +/// * The `nodes` and `adjacency` arrays differ in length. +/// * A node id appears more than once. +/// * An adjacency entry references a node id not present in `nodes`. +fn build_petgraph_from_networkx( + nx_graph: NxGraphAdjFormat, + is_directed: bool, +) -> Result, NxPetgraphError> +where + Ty: petgraph::EdgeType, +{ + if nx_graph.directed != is_directed { + return Err(NxPetgraphError::DirectednessMismatch { + expected_directed: is_directed, + found_directed: nx_graph.directed, + }); + } + + if nx_graph.nodes.len() != nx_graph.adjacency.len() { + return Err(NxPetgraphError::NodeAdjacencyLengthMismatch { + n_nodes: nx_graph.nodes.len(), + n_adjacency_items: nx_graph.adjacency.len(), + }); + } + + let NxGraphAdjFormat { + directed: _, + multigraph: _, + graph: graph_attrs, + nodes, + adjacency, + } = nx_graph; + + let mut graph = Graph::::with_capacity(nodes.len(), 0); + let mut node_id_to_index: HashMap = + HashMap::with_capacity(nodes.len()); + + for node in nodes { + if node_id_to_index.contains_key(&node.id) { + return Err(NxPetgraphError::DuplicateNodeId(node.id)); + } + + let node_id = node.id.clone(); + let petx_node = nx_node_to_petx_node(node); + let index = graph.add_node(petx_node); + node_id_to_index.insert(node_id, index); + } + + // NetworkX adjacency format is a list of adjacency lists, where the i-th adjacency list + // corresponds to the i-th node in the nodes list. + // + // For undirected graphs, the format may contain both (u, v) and (v, u), so we track + // canonicalized edge endpoint pairs and only add each undirected edge once. + let mut seen_undirected_edges: HashSet<(String, String, Option)> = HashSet::new(); + + for (source_idx_orig, neighbors) in adjacency.into_iter().enumerate() { + let source_idx = NodeIndex::new(source_idx_orig); + let source_node = graph.node_weight(source_idx).ok_or_else(|| { + NxPetgraphError::Other(format!( + "invalid adjacency: source index {} out of bounds for nodes list", + source_idx.index() + )) + })?; + + let source_id = source_node.attrs.get("__networkx_id__").ok_or_else(|| { + NxPetgraphError::Other("missing __networkx_id__ on source node".to_string()) + })?; + + let source_key = serde_json::to_string(source_id).map_err(|e| { + NxPetgraphError::Other(format!( + "failed to serialize source node id to string: {}", + e + )) + })?; + + for edge in neighbors { + let target_id = &edge.id; + let target_idx = node_id_to_index + .get(target_id) + .ok_or_else(|| NxPetgraphError::MissingNeighborNode(target_id.clone()))?; + + if is_directed { + graph.add_edge(source_idx, *target_idx, edge); + } else { + let target_key = serde_json::to_string(target_id).map_err(|e| { + NxPetgraphError::Other(format!( + "failed to serialize target node id to string: {}", + e + )) + })?; + + let edge_key_str = edge + .key + .as_ref() + .and_then(|key| serde_json::to_string(key).ok()); + + let canonical = if source_key <= target_key { + (source_key.clone(), target_key, edge_key_str) + } else { + (target_key, source_key.clone(), edge_key_str) + }; + + if seen_undirected_edges.insert(canonical) { + graph.add_edge(source_idx, *target_idx, edge); + } + } + } + } + + Ok(PetxGraph { graph_attrs, graph }) +} + +/// Check whether a graph contains parallel (multi) edges. +/// +/// Two edges are considered parallel if they connect the same pair of +/// endpoints. For undirected graphs, `(u,v)` and `(v,u)` are the same pair. +/// +/// # Arguments +/// +/// * `graph` - The petgraph graph to inspect. +/// +/// # Returns +/// +/// `true` if any pair of nodes is connected by more than one edge. +pub(in crate::json::graph) fn graph_has_parallel_edges( + graph: &Graph, +) -> bool +where + Ty: petgraph::EdgeType, +{ + let mut seen_endpoint_pairs: HashSet<(usize, usize)> = HashSet::new(); + + for edge_ref in graph.edge_references() { + let source_idx = edge_ref.source().index(); + let target_idx = edge_ref.target().index(); + + let endpoint_pair = if graph.is_directed() || source_idx <= target_idx { + (source_idx, target_idx) + } else { + (target_idx, source_idx) + }; + + if !seen_endpoint_pairs.insert(endpoint_pair) { + return true; + } + } + + false +} + +/// Convert a [`PetxGraph`] back into an [`NxGraphAdjFormat`]. +/// +/// Nodes are emitted in petgraph index order. For undirected graphs, each +/// edge appears in both endpoints' adjacency lists (except self-loops, +/// which appear only once). The `multigraph` flag is set automatically +/// based on whether parallel edges exist. +/// +/// # Arguments +/// +/// * `petx_graph` - The petgraph-backed graph to convert. +/// * `is_directed` - Whether the output should be marked as directed. +/// +/// # Returns +/// +/// An [`NxGraphAdjFormat`] ready for JSON serialization. +/// +/// # Errors +/// +/// Returns [`NxPetgraphError::Other`] if any node is missing its +/// `"__networkx_id__"` attribute. +fn construct_networkx_from_petgraph( + petx_graph: &PetxGraph, + is_directed: bool, +) -> Result +where + Ty: petgraph::EdgeType, +{ + let graph = &petx_graph.graph; + let graph_attrs = petx_graph.graph_attrs.clone(); + let mut nodes = Vec::with_capacity(graph.node_count()); + let mut adjacency = vec![Vec::::new(); graph.node_count()]; + + for (_, node) in graph.node_references() { + nodes.push(petx_node_to_nx_node(node)?); + } + + for edge_ref in graph.edge_references() { + let source_idx = edge_ref.source().index(); + let target_idx = edge_ref.target().index(); + let mut adj_data = edge_ref.weight().clone(); + + adj_data.id = nodes[target_idx].id.clone(); + adjacency[source_idx].push(adj_data.clone()); + + if !is_directed && source_idx != target_idx { + let mut reverse_adj_data = adj_data; + reverse_adj_data.id = nodes[source_idx].id.clone(); + adjacency[target_idx].push(reverse_adj_data); + } + } + + Ok(NxGraphAdjFormat { + directed: is_directed, + multigraph: graph_has_parallel_edges(graph), + graph: graph_attrs, + nodes, + adjacency, + }) +} + +impl TryFrom for PetxGraph { + type Error = NxPetgraphError; + + fn try_from(nx_graph: NxGraphAdjFormat) -> Result { + build_petgraph_from_networkx::(nx_graph, true) + } +} + +impl TryFrom for PetxGraph { + type Error = NxPetgraphError; + + fn try_from(nx_graph: NxGraphAdjFormat) -> Result { + build_petgraph_from_networkx::(nx_graph, false) + } +} + +impl TryFrom<&PetxGraph> for NxGraphAdjFormat { + type Error = NxPetgraphError; + + fn try_from(petx_graph: &PetxGraph) -> Result { + construct_networkx_from_petgraph(petx_graph, true) + } +} + +impl TryFrom<&PetxGraph> for NxGraphAdjFormat { + type Error = NxPetgraphError; + + fn try_from(petx_graph: &PetxGraph) -> Result { + construct_networkx_from_petgraph(petx_graph, false) + } +} diff --git a/ben/src/json/graph/petxgraph/permutation.rs b/ben/src/json/graph/petxgraph/permutation.rs new file mode 100644 index 0000000..323d06e --- /dev/null +++ b/ben/src/json/graph/petxgraph/permutation.rs @@ -0,0 +1,63 @@ +use super::super::nx_formats::NxAdjEntry; +use super::{PetxGraph, PetxNode}; +use petgraph::graph::{Graph, NodeIndex}; +use petgraph::visit::{EdgeRef, NodeIndexable}; +use serde_json::Value; + +/// Apply a node permutation to a `PetxGraph`, returning a new graph with nodes +/// reordered. +/// +/// Arguments: +/// +/// - `petx_graph`: The input graph to permute. +/// - `order`: A permutation where `order[new_index]` is the `NodeIndex` of the +/// node that should occupy position `new_index` in the output graph. Must be +/// a valid permutation of the graph's node indices. +/// +/// Returns: +/// +/// - A new `PetxGraph` with nodes in the specified order and edges remapped to +/// the new indices. Edge attributes (including `key` and `attrs`) are +/// preserved; the `NxAdjEntry::id` field is left as-is since +/// `construct_networkx_from_petgraph` overwrites it on export. +pub(in crate::json::graph) fn apply_permutation( + petx_graph: &PetxGraph, + order: &[NodeIndex], +) -> PetxGraph +where + Ty: petgraph::EdgeType, +{ + let graph = &petx_graph.graph; + + // Build old-to-new index mapping. + let mut old_to_new = vec![NodeIndex::new(0); graph.node_bound()]; + for (new_idx, &old_idx) in order.iter().enumerate() { + old_to_new[old_idx.index()] = NodeIndex::new(new_idx); + } + + let mut new_graph = + Graph::::with_capacity(graph.node_count(), graph.edge_count()); + + for &old_idx in order { + new_graph.add_node(graph[old_idx].clone()); + } + + for edge_ref in graph.edge_references() { + let new_src = old_to_new[edge_ref.source().index()]; + let new_tgt = old_to_new[edge_ref.target().index()]; + new_graph.add_edge(new_src, new_tgt, edge_ref.weight().clone()); + } + + // Relabel __networkx_id__ to match new positions. + for node_idx in new_graph.node_indices() { + new_graph[node_idx].attrs.insert( + "__networkx_id__".to_string(), + Value::from(node_idx.index() as u64), + ); + } + + PetxGraph { + graph_attrs: petx_graph.graph_attrs.clone(), + graph: new_graph, + } +} diff --git a/ben/src/json/graph/petxgraph/sort.rs b/ben/src/json/graph/petxgraph/sort.rs new file mode 100644 index 0000000..6563cda --- /dev/null +++ b/ben/src/json/graph/petxgraph/sort.rs @@ -0,0 +1,86 @@ +use super::permutation::apply_permutation; +use super::{PetxGraph, PetxNode}; +use petgraph::graph::NodeIndex; +use serde_json::Value; +use std::cmp::Ordering; + +/// Sort a `PetxGraph` by a node attribute and apply the permutation in place. +/// +/// Nodes are ordered by the value of `key` in their attribute map, using +/// numeric comparison when possible and falling back to string comparison. +/// +/// Returns the permutation that was applied. +pub(in crate::json::graph) fn sort_by_key( + petx_graph: &mut PetxGraph, + key: &str, +) -> Vec +where + Ty: petgraph::EdgeType, +{ + let graph = &petx_graph.graph; + let mut order: Vec = graph.node_indices().collect(); + + order.sort_by(|&a, &b| { + let a_val = get_sort_attr(&graph[a], key); + let b_val = get_sort_attr(&graph[b], key); + compare_attr_values(a_val, b_val) + }); + + *petx_graph = apply_permutation(petx_graph, &order); + order +} + +/// Look up the sort attribute for a node. +/// +/// The special key `"id"` is mapped to the internal `"__networkx_id__"` +/// attribute so callers can sort by the NetworkX node id. +/// +/// # Arguments +/// +/// * `node` - The node whose attribute is being looked up. +/// * `key` - The attribute name. `"id"` is treated as an alias for +/// `"__networkx_id__"`. +/// +/// # Returns +/// +/// A reference to the attribute [`Value`], or `None` if the attribute is +/// absent. +fn get_sort_attr<'a>(node: &'a PetxNode, key: &str) -> Option<&'a Value> { + if key == "id" { + node.attrs.get("__networkx_id__") + } else { + node.attrs.get(key) + } +} + +/// Compare two optional attribute values for sorting. +/// +/// Values are compared numerically when both can be interpreted as `u64`. +/// Otherwise they are compared as strings. `None` is treated as the string +/// `"null"`. +/// +/// # Arguments +/// +/// * `a` - The left-hand attribute value (or `None` if absent). +/// * `b` - The right-hand attribute value (or `None` if absent). +/// +/// # Returns +/// +/// An [`Ordering`] suitable for use in a sort comparator. +fn compare_attr_values(a: Option<&Value>, b: Option<&Value>) -> Ordering { + let extract = |val: Option<&Value>| -> Result { + match val { + Some(Value::String(s)) => s.parse::().map_err(|_| s.clone()), + Some(Value::Number(n)) => n.as_u64().ok_or_else(|| n.to_string()), + Some(v) => Err(v.to_string()), + None => Err("null".to_string()), + } + }; + + match (extract(a), extract(b)) { + (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), + (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), + (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), + (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), + } +} diff --git a/ben/src/json/graph/rcm.rs b/ben/src/json/graph/rcm.rs new file mode 100644 index 0000000..39235ab --- /dev/null +++ b/ben/src/json/graph/rcm.rs @@ -0,0 +1,135 @@ +use super::petxgraph::{apply_permutation, PetxGraph}; +use petgraph::graph::{Graph, NodeIndex}; +use petgraph::visit::NodeIndexable; +use rustworkx_core::connectivity::connected_components; +use std::collections::{HashSet, VecDeque}; + +/// Compute a Reverse Cuthill-McKee ordering and apply it to the graph in place. +/// +/// Each connected component is ordered independently via RCM, and components +/// are sorted by their minimum node index. The graph is then permuted in place. +/// +/// Arguments: +/// +/// - `petx_graph`: The graph to reorder in place. +/// +/// Returns: +/// +/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` +/// the node occupied before reordering. +pub(super) fn apply_reverse_cuthill_mckee(petx_graph: &mut PetxGraph) -> Vec +where + Ty: petgraph::EdgeType, +{ + let labels: Vec = (0..petx_graph.graph.node_bound()).collect(); + let graph = &petx_graph.graph; + + let mut components: Vec> = connected_components(graph) + .into_iter() + .map(|set| set.into_iter().collect()) + .collect(); + components.sort_by_key(|c| { + c.iter() + .map(|n| labels[n.index()]) + .min() + .unwrap_or(usize::MAX) + }); + + let mut order = Vec::with_capacity(graph.node_count()); + for component in components { + order.extend(rcm_component(graph, &labels, &component)); + } + + *petx_graph = apply_permutation(petx_graph, &order); + order +} + +/// Reverse Cuthill-McKee ordering for a single connected component. +/// +/// Starts BFS from the minimum-degree node (ties broken by label), then +/// reverses the result to produce the RCM permutation. +/// +/// # Arguments +/// +/// * `graph` - The full graph (only edges within `component` are relevant). +/// * `labels` - Per-node labels for tie-breaking, indexed by +/// `NodeIndex::index()`. +/// * `component` - The subset of `NodeIndex` values to order. +/// +/// # Returns +/// +/// A permutation of the nodes in `component` representing their RCM order. +pub(super) fn rcm_component( + graph: &Graph, + labels: &[usize], + component: &[NodeIndex], +) -> Vec +where + Ty: petgraph::EdgeType, +{ + let component_set: HashSet = component.iter().copied().collect(); + let local_deg = local_degree_in_component(graph, &component_set, component); + + let start = component + .iter() + .copied() + .min_by_key(|&node| (local_deg[node.index()], labels[node.index()])) + .unwrap(); + + let mut visited = vec![false; graph.node_bound()]; + visited[start.index()] = true; + let mut queue = VecDeque::from([start]); + let mut order = Vec::with_capacity(component.len()); + + while let Some(node) = queue.pop_front() { + order.push(node); + let mut neighbors: Vec = graph + .neighbors(node) + .filter(|&n| component_set.contains(&n) && !visited[n.index()]) + .collect(); + neighbors.sort_by_key(|&n| (local_deg[n.index()], labels[n.index()])); + for n in neighbors { + visited[n.index()] = true; + queue.push_back(n); + } + } + + order.reverse(); + order +} + +/// Compute the degree of each component node restricted to the component. +/// +/// For each node in `component`, counts how many of its neighbors are also +/// in the component. The result is indexed by `NodeIndex::index()`, so +/// entries for nodes outside the component are zero. +/// +/// # Arguments +/// +/// * `graph` - The full graph. +/// * `component_set` - A `HashSet` of the nodes in the component, used for +/// fast membership checks. +/// * `component` - The slice of `NodeIndex` values in the component. +/// +/// # Returns +/// +/// A vector of length `graph.node_bound()` where +/// `result[node.index()]` is the number of neighbors of `node` that are in +/// the component, or `0` for nodes not in the component. +pub(super) fn local_degree_in_component( + graph: &Graph, + component_set: &HashSet, + component: &[NodeIndex], +) -> Vec +where + Ty: petgraph::EdgeType, +{ + let mut local_deg = vec![0usize; graph.node_bound()]; + for &node in component { + local_deg[node.index()] = graph + .neighbors(node) + .filter(|n| component_set.contains(n)) + .count(); + } + local_deg +} diff --git a/ben/src/json/graph/tests/mod.rs b/ben/src/json/graph/tests/mod.rs new file mode 100644 index 0000000..a34abf1 --- /dev/null +++ b/ben/src/json/graph/tests/mod.rs @@ -0,0 +1,2 @@ +mod test_algos; +mod test_io; diff --git a/ben/src/json/graph/tests.rs b/ben/src/json/graph/tests/test_algos.rs similarity index 99% rename from ben/src/json/graph/tests.rs rename to ben/src/json/graph/tests/test_algos.rs index ec6f640..2505f4f 100644 --- a/ben/src/json/graph/tests.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -1,4 +1,4 @@ -use super::*; +use super::super::*; use serde_json::Value; fn path_graph_json() -> &'static [u8] { diff --git a/ben/src/json/graph/tests/test_io.rs b/ben/src/json/graph/tests/test_io.rs new file mode 100644 index 0000000..9fea044 --- /dev/null +++ b/ben/src/json/graph/tests/test_io.rs @@ -0,0 +1,802 @@ +use super::super::nx_formats::{NxAdjEntry, NxGraphAdjFormat, NxNode}; +use super::super::petxgraph::*; +use petgraph::graph::{DiGraph, NodeIndex, UnGraph}; +use petgraph::visit::EdgeRef; +use serde_json::json; +use std::collections::BTreeMap; + +fn parse_nx(s: &str) -> NxGraphAdjFormat { + serde_json::from_str(s).unwrap() +} + +/// Collect edges as a sorted set of (source, target) pairs, canonicalized for undirected. +fn edge_set_undirected(graph: &PetxUnInnerGraph) -> Vec<(usize, usize)> { + let mut edges: Vec<(usize, usize)> = graph + .edge_references() + .map(|e| { + let (a, b) = (e.source().index(), e.target().index()); + if a <= b { + (a, b) + } else { + (b, a) + } + }) + .collect(); + edges.sort(); + edges +} + +/// Collect edges as a sorted set of (source, target) pairs for directed graphs. +fn edge_set_directed(graph: &PetxDiInnerGraph) -> Vec<(usize, usize)> { + let mut edges: Vec<(usize, usize)> = graph + .edge_references() + .map(|e| (e.source().index(), e.target().index())) + .collect(); + edges.sort(); + edges +} + +/// Normalize an NxGraphAdjFormat by sorting each adjacency list by target id, +/// so structural equality can be checked after roundtrip. +fn normalize(format: &mut NxGraphAdjFormat) { + for adj_list in &mut format.adjacency { + adj_list.sort_by(|a, b| { + let ak = serde_json::to_string(&a.id).unwrap(); + let bk = serde_json::to_string(&b.id).unwrap(); + ak.cmp(&bk) + }); + } +} + +// ================================================================ +// == Fixtures (generated with `uv run --with networkx python3`) == +// ================================================================ + +const KARATE_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [["name", "Zachary's Karate Club"]], "nodes": [{"club": "Mr. Hi", "id": 0}, {"club": "Mr. Hi", "id": 1}, {"club": "Mr. Hi", "id": 2}, {"club": "Mr. Hi", "id": 3}, {"club": "Mr. Hi", "id": 4}, {"club": "Mr. Hi", "id": 5}, {"club": "Mr. Hi", "id": 6}, {"club": "Mr. Hi", "id": 7}, {"club": "Mr. Hi", "id": 8}, {"club": "Officer", "id": 9}, {"club": "Mr. Hi", "id": 10}, {"club": "Mr. Hi", "id": 11}, {"club": "Mr. Hi", "id": 12}, {"club": "Mr. Hi", "id": 13}, {"club": "Officer", "id": 14}, {"club": "Officer", "id": 15}, {"club": "Mr. Hi", "id": 16}, {"club": "Mr. Hi", "id": 17}, {"club": "Officer", "id": 18}, {"club": "Mr. Hi", "id": 19}, {"club": "Officer", "id": 20}, {"club": "Mr. Hi", "id": 21}, {"club": "Officer", "id": 22}, {"club": "Officer", "id": 23}, {"club": "Officer", "id": 24}, {"club": "Officer", "id": 25}, {"club": "Officer", "id": 26}, {"club": "Officer", "id": 27}, {"club": "Officer", "id": 28}, {"club": "Officer", "id": 29}, {"club": "Officer", "id": 30}, {"club": "Officer", "id": 31}, {"club": "Officer", "id": 32}, {"club": "Officer", "id": 33}], "adjacency": [[{"weight": 4, "id": 1}, {"weight": 5, "id": 2}, {"weight": 3, "id": 3}, {"weight": 3, "id": 4}, {"weight": 3, "id": 5}, {"weight": 3, "id": 6}, {"weight": 2, "id": 7}, {"weight": 2, "id": 8}, {"weight": 2, "id": 10}, {"weight": 3, "id": 11}, {"weight": 1, "id": 12}, {"weight": 3, "id": 13}, {"weight": 2, "id": 17}, {"weight": 2, "id": 19}, {"weight": 2, "id": 21}, {"weight": 2, "id": 31}], [{"weight": 4, "id": 0}, {"weight": 6, "id": 2}, {"weight": 3, "id": 3}, {"weight": 4, "id": 7}, {"weight": 5, "id": 13}, {"weight": 1, "id": 17}, {"weight": 2, "id": 19}, {"weight": 2, "id": 21}, {"weight": 2, "id": 30}], [{"weight": 5, "id": 0}, {"weight": 6, "id": 1}, {"weight": 3, "id": 3}, {"weight": 4, "id": 7}, {"weight": 5, "id": 8}, {"weight": 1, "id": 9}, {"weight": 3, "id": 13}, {"weight": 2, "id": 27}, {"weight": 2, "id": 28}, {"weight": 2, "id": 32}], [{"weight": 3, "id": 0}, {"weight": 3, "id": 1}, {"weight": 3, "id": 2}, {"weight": 3, "id": 7}, {"weight": 3, "id": 12}, {"weight": 3, "id": 13}], [{"weight": 3, "id": 0}, {"weight": 2, "id": 6}, {"weight": 3, "id": 10}], [{"weight": 3, "id": 0}, {"weight": 5, "id": 6}, {"weight": 3, "id": 10}, {"weight": 3, "id": 16}], [{"weight": 3, "id": 0}, {"weight": 2, "id": 4}, {"weight": 5, "id": 5}, {"weight": 3, "id": 16}], [{"weight": 2, "id": 0}, {"weight": 4, "id": 1}, {"weight": 4, "id": 2}, {"weight": 3, "id": 3}], [{"weight": 2, "id": 0}, {"weight": 5, "id": 2}, {"weight": 3, "id": 30}, {"weight": 3, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 1, "id": 2}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 3, "id": 4}, {"weight": 3, "id": 5}], [{"weight": 3, "id": 0}], [{"weight": 1, "id": 0}, {"weight": 3, "id": 3}], [{"weight": 3, "id": 0}, {"weight": 5, "id": 1}, {"weight": 3, "id": 2}, {"weight": 3, "id": 3}, {"weight": 3, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 3, "id": 5}, {"weight": 3, "id": 6}], [{"weight": 2, "id": 0}, {"weight": 1, "id": 1}], [{"weight": 1, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 1}, {"weight": 1, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 1, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 1}], [{"weight": 2, "id": 32}, {"weight": 3, "id": 33}], [{"weight": 5, "id": 25}, {"weight": 4, "id": 27}, {"weight": 3, "id": 29}, {"weight": 5, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 25}, {"weight": 3, "id": 27}, {"weight": 2, "id": 31}], [{"weight": 5, "id": 23}, {"weight": 2, "id": 24}, {"weight": 7, "id": 31}], [{"weight": 4, "id": 29}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 4, "id": 23}, {"weight": 3, "id": 24}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 2, "id": 31}, {"weight": 2, "id": 33}], [{"weight": 3, "id": 23}, {"weight": 4, "id": 26}, {"weight": 4, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 1}, {"weight": 3, "id": 8}, {"weight": 3, "id": 32}, {"weight": 3, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 24}, {"weight": 7, "id": 25}, {"weight": 2, "id": 28}, {"weight": 4, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 3, "id": 8}, {"weight": 3, "id": 14}, {"weight": 3, "id": 15}, {"weight": 1, "id": 18}, {"weight": 3, "id": 20}, {"weight": 2, "id": 22}, {"weight": 5, "id": 23}, {"weight": 4, "id": 29}, {"weight": 3, "id": 30}, {"weight": 4, "id": 31}, {"weight": 5, "id": 33}], [{"weight": 4, "id": 8}, {"weight": 2, "id": 9}, {"weight": 3, "id": 13}, {"weight": 2, "id": 14}, {"weight": 4, "id": 15}, {"weight": 2, "id": 18}, {"weight": 1, "id": 19}, {"weight": 1, "id": 20}, {"weight": 4, "id": 23}, {"weight": 2, "id": 26}, {"weight": 4, "id": 27}, {"weight": 2, "id": 28}, {"weight": 2, "id": 29}, {"weight": 3, "id": 30}, {"weight": 4, "id": 31}, {"weight": 5, "id": 32}, {"weight": 3, "id": 22}]]}"#; + +const SMALL_DIRECTED_JSON: &str = r#"{"directed": true, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}], "adjacency": [[{"id": 1}, {"id": 2}], [{"id": 2}], [{"id": 3}], [{"id": 0}]]}"#; + +const K5_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}], "adjacency": [[{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}], [{"id": 0}, {"id": 2}, {"id": 3}, {"id": 4}], [{"id": 0}, {"id": 1}, {"id": 3}, {"id": 4}], [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 4}], [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}]]}"#; + +const P4_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}], "adjacency": [[{"id": 1}], [{"id": 0}, {"id": 2}], [{"id": 1}, {"id": 3}], [{"id": 2}]]}"#; + +const SINGLE_NODE_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"label": "solo", "id": 0}], "adjacency": [[]]}"#; + +const TWO_TRIANGLES_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}], "adjacency": [[{"id": 1}, {"id": 2}], [{"id": 0}, {"id": 2}], [{"id": 1}, {"id": 0}], [{"id": 4}, {"id": 5}], [{"id": 3}, {"id": 5}], [{"id": 4}, {"id": 3}]]}"#; + +const STRING_IDS_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"weight": 1.0, "id": "alpha"}, {"weight": 2.0, "id": "beta"}, {"weight": 3.0, "id": "gamma"}], "adjacency": [[{"color": "red", "id": "beta"}], [{"color": "red", "id": "alpha"}, {"color": "blue", "id": "gamma"}], [{"color": "blue", "id": "beta"}]]}"#; + +const DIRECTED_CYCLE_JSON: &str = r#"{"directed": true, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}], "adjacency": [[{"id": 1}], [{"id": 2}], [{"id": 3}], [{"id": 4}], [{"id": 0}]]}"#; + +const SELF_LOOP_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}], "adjacency": [[{"id": 1}, {"id": 0}], [{"id": 0}]]}"#; + +const EMPTY_EDGES_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}], "adjacency": [[], [], []]}"#; + +// ============================= +// == Karate club graph tests == +// ============================= + +#[test] +fn karate_club_node_and_edge_counts() { + let nx = parse_nx(KARATE_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 34); + assert_eq!(petx.graph.edge_count(), 78); +} + +#[test] +fn karate_club_graph_attrs_preserved() { + let nx = parse_nx(KARATE_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph_attrs.len(), 1); + assert_eq!(petx.graph_attrs[0].0, "name"); + assert_eq!(petx.graph_attrs[0].1, json!("Zachary's Karate Club")); +} + +#[test] +fn karate_club_node_attrs_preserved() { + let nx = parse_nx(KARATE_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + + // Node 0 should have club="Mr. Hi" + let node0 = petx.graph.node_weight(NodeIndex::new(0)).unwrap(); + assert_eq!(node0.attrs.get("club"), Some(&json!("Mr. Hi"))); + assert_eq!(node0.attrs.get("__networkx_id__"), Some(&json!(0))); + + // Node 33 should have club="Officer" + let node33 = petx.graph.node_weight(NodeIndex::new(33)).unwrap(); + assert_eq!(node33.attrs.get("club"), Some(&json!("Officer"))); +} + +#[test] +fn karate_club_roundtrip() { + let nx_original = parse_nx(KARATE_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + + assert_eq!(nx_roundtrip.directed, nx_expected.directed); + assert_eq!(nx_roundtrip.multigraph, nx_expected.multigraph); + assert_eq!(nx_roundtrip.graph, nx_expected.graph); + assert_eq!(nx_roundtrip.nodes.len(), nx_expected.nodes.len()); + + for (orig, rt) in nx_expected.nodes.iter().zip(nx_roundtrip.nodes.iter()) { + assert_eq!(orig.id, rt.id); + assert_eq!(orig.attrs, rt.attrs); + } + + for (i, (orig_adj, rt_adj)) in nx_expected + .adjacency + .iter() + .zip(nx_roundtrip.adjacency.iter()) + .enumerate() + { + assert_eq!( + orig_adj.len(), + rt_adj.len(), + "adjacency list length mismatch at node {}", + i + ); + } +} + +// ======================= +// == Complete graph K5 == +// ======================= + +#[test] +fn k5_node_and_edge_counts() { + let nx = parse_nx(K5_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 5); + // K5 has C(5,2) = 10 edges + assert_eq!(petx.graph.edge_count(), 10); +} + +#[test] +fn k5_all_pairs_connected() { + let nx = parse_nx(K5_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + let edges = edge_set_undirected(&petx.graph); + + for i in 0..5 { + for j in (i + 1)..5 { + assert!(edges.contains(&(i, j)), "K5 missing edge ({}, {})", i, j); + } + } +} + +#[test] +fn k5_roundtrip() { + let nx_original = parse_nx(K5_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip, nx_expected); +} + +// =================== +// == Path graph P4 == +// =================== + +#[test] +fn p4_structure() { + let nx = parse_nx(P4_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 4); + assert_eq!(petx.graph.edge_count(), 3); + + let edges = edge_set_undirected(&petx.graph); + assert_eq!(edges, vec![(0, 1), (1, 2), (2, 3)]); +} + +#[test] +fn p4_roundtrip() { + let nx_original = parse_nx(P4_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip, nx_expected); +} + +// ===================== +// == Directed graphs == +// ===================== + +#[test] +fn small_directed_structure() { + let nx = parse_nx(SMALL_DIRECTED_JSON); + let petx: PetxDiGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 4); + assert_eq!(petx.graph.edge_count(), 5); + + let edges = edge_set_directed(&petx.graph); + assert_eq!(edges, vec![(0, 1), (0, 2), (1, 2), (2, 3), (3, 0)]); +} + +#[test] +fn small_directed_roundtrip() { + let nx_original = parse_nx(SMALL_DIRECTED_JSON); + let petx: PetxDiGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip, nx_expected); +} + +#[test] +fn directed_cycle_structure() { + let nx = parse_nx(DIRECTED_CYCLE_JSON); + let petx: PetxDiGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 5); + assert_eq!(petx.graph.edge_count(), 5); + + let edges = edge_set_directed(&petx.graph); + assert_eq!(edges, vec![(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)]); +} + +#[test] +fn directed_cycle_roundtrip() { + let nx_original = parse_nx(DIRECTED_CYCLE_JSON); + let petx: PetxDiGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip, nx_expected); +} + +// ================ +// == Edge cases == +// ================ + +#[test] +fn single_node_no_edges() { + let nx = parse_nx(SINGLE_NODE_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 1); + assert_eq!(petx.graph.edge_count(), 0); + + let node = petx.graph.node_weight(NodeIndex::new(0)).unwrap(); + assert_eq!(node.attrs.get("label"), Some(&json!("solo"))); +} + +#[test] +fn single_node_roundtrip() { + let nx_original = parse_nx(SINGLE_NODE_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + assert_eq!(nx_roundtrip, nx_original); +} + +#[test] +fn empty_edges_graph() { + let nx = parse_nx(EMPTY_EDGES_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 3); + assert_eq!(petx.graph.edge_count(), 0); +} + +#[test] +fn empty_edges_roundtrip() { + let nx_original = parse_nx(EMPTY_EDGES_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + assert_eq!(nx_roundtrip, nx_original); +} + +#[test] +fn self_loop_preserved() { + let nx = parse_nx(SELF_LOOP_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 2); + // self-loop (0,0) + edge (0,1) + assert_eq!(petx.graph.edge_count(), 2); + + let edges = edge_set_undirected(&petx.graph); + assert!(edges.contains(&(0, 0)), "self-loop should be preserved"); + assert!(edges.contains(&(0, 1))); +} + +#[test] +fn disconnected_graph_two_triangles() { + let nx = parse_nx(TWO_TRIANGLES_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 6); + assert_eq!(petx.graph.edge_count(), 6); + + let edges = edge_set_undirected(&petx.graph); + // Triangle 1: {0,1,2} + assert!(edges.contains(&(0, 1))); + assert!(edges.contains(&(0, 2))); + assert!(edges.contains(&(1, 2))); + // Triangle 2: {3,4,5} + assert!(edges.contains(&(3, 4))); + assert!(edges.contains(&(3, 5))); + assert!(edges.contains(&(4, 5))); +} + +#[test] +fn two_triangles_roundtrip() { + let nx_original = parse_nx(TWO_TRIANGLES_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip, nx_expected); +} + +// ========================================= +// == String node IDs and edge attributes == +// ========================================= + +#[test] +fn string_ids_structure() { + let nx = parse_nx(STRING_IDS_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.node_count(), 3); + assert_eq!(petx.graph.edge_count(), 2); + + // Verify node IDs are stored as strings + let node0 = petx.graph.node_weight(NodeIndex::new(0)).unwrap(); + assert_eq!(node0.attrs.get("__networkx_id__"), Some(&json!("alpha"))); + assert_eq!(node0.attrs.get("weight"), Some(&json!(1.0))); +} + +#[test] +fn string_ids_edge_attrs_preserved() { + let nx = parse_nx(STRING_IDS_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + + // Find the edge between alpha (0) and beta (1) + let edge = petx.graph.edges(NodeIndex::new(0)).next().unwrap(); + let weight = edge.weight(); + assert_eq!(weight.attrs.get("color"), Some(&json!("red"))); +} + +#[test] +fn string_ids_roundtrip() { + let nx_original = parse_nx(STRING_IDS_JSON); + let petx: PetxUnGraph = nx_original.clone().try_into().unwrap(); + let mut nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + let mut nx_expected = nx_original; + normalize(&mut nx_expected); + normalize(&mut nx_roundtrip); + assert_eq!(nx_roundtrip.directed, nx_expected.directed); + assert_eq!(nx_roundtrip.nodes.len(), nx_expected.nodes.len()); + for (orig, rt) in nx_expected.nodes.iter().zip(nx_roundtrip.nodes.iter()) { + assert_eq!(orig.id, rt.id); + assert_eq!(orig.attrs, rt.attrs); + } +} + +// ============================== +// == graph_has_parallel_edges == +// ============================== + +#[test] +fn no_parallel_edges_simple_graph() { + let nx = parse_nx(K5_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert!(!graph_has_parallel_edges(&petx.graph)); +} + +#[test] +fn no_parallel_edges_directed() { + let nx = parse_nx(SMALL_DIRECTED_JSON); + let petx: PetxDiGraph = nx.try_into().unwrap(); + assert!(!graph_has_parallel_edges(&petx.graph)); +} + +#[test] +fn parallel_edges_detected_undirected() { + // Manually build a graph with parallel edges + let mut graph = UnGraph::::new_undirected(); + let a = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(0))]), + }); + let b = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(0), + key: None, + attrs: BTreeMap::new(), + }; + graph.add_edge(a, b, edge.clone()); + graph.add_edge(a, b, edge); + assert!(graph_has_parallel_edges(&graph)); +} + +#[test] +fn parallel_edges_detected_directed() { + let mut graph = DiGraph::::new(); + let a = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(0))]), + }); + let b = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(0), + key: None, + attrs: BTreeMap::new(), + }; + graph.add_edge(a, b, edge.clone()); + graph.add_edge(a, b, edge); + assert!(graph_has_parallel_edges(&graph)); +} + +#[test] +fn antiparallel_not_parallel_in_directed() { + // In directed graphs, (a->b) and (b->a) are NOT parallel + let mut graph = DiGraph::::new(); + let a = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(0))]), + }); + let b = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(0), + key: None, + attrs: BTreeMap::new(), + }; + graph.add_edge(a, b, edge.clone()); + graph.add_edge(b, a, edge); + assert!(!graph_has_parallel_edges(&graph)); +} + +// ====================================== +// == nx_node <-> petx_node conversion == +// ====================================== + +#[test] +fn nx_to_petx_node_stores_id_in_attrs() { + let nx_node = NxNode { + id: json!(42), + attrs: BTreeMap::from([("color".into(), json!("blue"))]), + }; + let petx = nx_node_to_petx_node(nx_node); + assert_eq!(petx.attrs.get("__networkx_id__"), Some(&json!(42))); + assert_eq!(petx.attrs.get("color"), Some(&json!("blue"))); +} + +#[test] +fn petx_to_nx_node_restores_id() { + let petx = PetxNode { + attrs: BTreeMap::from([ + ("__networkx_id__".into(), json!("node_a")), + ("weight".into(), json!(3.14)), + ]), + }; + let nx = petx_node_to_nx_node(&petx).unwrap(); + assert_eq!(nx.id, json!("node_a")); + assert_eq!(nx.attrs.get("weight"), Some(&json!(3.14))); + assert!(!nx.attrs.contains_key("__networkx_id__")); +} + +#[test] +fn petx_to_nx_node_missing_id_errors() { + let petx = PetxNode { + attrs: BTreeMap::from([("color".into(), json!("red"))]), + }; + let result = petx_node_to_nx_node(&petx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("__networkx_id__"), + "error should mention missing id field: {}", + err + ); +} + +// ================= +// == Error cases == +// ================= + +#[test] +fn directedness_mismatch_undirected_to_directed() { + let nx = parse_nx(K5_JSON); // undirected + let result = PetxDiGraph::try_from(nx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("directedness mismatch"), + "unexpected error: {}", + err + ); +} + +#[test] +fn directedness_mismatch_directed_to_undirected() { + let nx = parse_nx(SMALL_DIRECTED_JSON); // directed + let result = PetxUnGraph::try_from(nx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("directedness mismatch"), + "unexpected error: {}", + err + ); +} + +#[test] +fn duplicate_node_id_error() { + let nx = NxGraphAdjFormat { + directed: false, + multigraph: false, + graph: vec![], + nodes: vec![ + NxNode { + id: json!(0), + attrs: BTreeMap::new(), + }, + NxNode { + id: json!(0), + attrs: BTreeMap::new(), + }, + ], + adjacency: vec![vec![], vec![]], + }; + let result = PetxUnGraph::try_from(nx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("duplicate node id"), + "unexpected error: {}", + err + ); +} + +#[test] +fn node_adjacency_length_mismatch_error() { + let nx = NxGraphAdjFormat { + directed: false, + multigraph: false, + graph: vec![], + nodes: vec![NxNode { + id: json!(0), + attrs: BTreeMap::new(), + }], + adjacency: vec![vec![], vec![]], // 1 node but 2 adjacency lists + }; + let result = PetxUnGraph::try_from(nx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("length mismatch"), + "unexpected error: {}", + err + ); +} + +#[test] +fn missing_neighbor_node_error() { + let nx = NxGraphAdjFormat { + directed: false, + multigraph: false, + graph: vec![], + nodes: vec![NxNode { + id: json!(0), + attrs: BTreeMap::new(), + }], + adjacency: vec![vec![NxAdjEntry { + id: json!(999), // doesn't exist + key: None, + attrs: BTreeMap::new(), + }]], + }; + let result = PetxUnGraph::try_from(nx); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("unknown node id"), + "unexpected error: {}", + err + ); +} + +// ============================================================ +// == Type alias smoke tests (ensures they compile and work) == +// ============================================================ + +#[test] +fn type_aliases_work() { + let nx_un = parse_nx(P4_JSON); + let _petx_un: PetxUnGraph = nx_un.try_into().unwrap(); + + let nx_di = parse_nx(SMALL_DIRECTED_JSON); + let _petx_di: PetxDiGraph = nx_di.try_into().unwrap(); + + // Verify inner graph types match + let _inner_un: &PetxUnInnerGraph = &_petx_un.graph; + let _inner_di: &PetxDiInnerGraph = &_petx_di.graph; +} + +// =================================== +// == Undirected edge deduplication == +// =================================== + +#[test] +fn undirected_dedup_produces_correct_edge_count() { + // NetworkX adjacency format lists each undirected edge twice: once from + // each endpoint. The converter should deduplicate to a single petgraph edge. + let nx = parse_nx(P4_JSON); + // P4 adjacency has 6 total entries (1+2+2+1) but only 3 unique edges + let total_adj_entries: usize = nx.adjacency.iter().map(|a| a.len()).sum(); + assert_eq!(total_adj_entries, 6); + + let petx: PetxUnGraph = nx.try_into().unwrap(); + assert_eq!(petx.graph.edge_count(), 3); +} + +#[test] +fn construct_nx_from_petx_restores_both_directions() { + // When converting back, each undirected edge should appear in both + // endpoints' adjacency lists. + let nx_original = parse_nx(P4_JSON); + let petx: PetxUnGraph = nx_original.try_into().unwrap(); + let nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); + + let total_adj_entries: usize = nx_roundtrip.adjacency.iter().map(|a| a.len()).sum(); + assert_eq!(total_adj_entries, 6); + + // Node 0 should have neighbor 1 (id field is the target node's id) + assert_eq!(nx_roundtrip.adjacency[0].len(), 1); + assert_eq!(nx_roundtrip.adjacency[0][0].id, json!(1)); + // Node 1 should have neighbors 0 and 2 + assert_eq!(nx_roundtrip.adjacency[1].len(), 2); +} + +// ============================================ +// == multigraph flag detection on roundtrip == +// ============================================ + +#[test] +fn simple_graph_roundtrip_multigraph_false() { + let nx = parse_nx(K5_JSON); + let petx: PetxUnGraph = nx.try_into().unwrap(); + let nx_rt = NxGraphAdjFormat::try_from(&petx).unwrap(); + assert!(!nx_rt.multigraph); +} + +#[test] +fn graph_with_parallel_edges_sets_multigraph_true() { + // Build a PetxGraph with parallel edges, convert to NxGraphAdjFormat + let mut inner = UnGraph::::new_undirected(); + let a = inner.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(0))]), + }); + let b = inner.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".into(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(0), + key: None, + attrs: BTreeMap::new(), + }; + inner.add_edge(a, b, edge.clone()); + inner.add_edge(a, b, edge); + + let petx = PetxUnGraph { + graph_attrs: vec![], + graph: inner, + }; + let nx = NxGraphAdjFormat::try_from(&petx).unwrap(); + assert!(nx.multigraph); +} + +// ============================= +// == JSON roundtrip fidelity == +// ============================= +// +// These tests verify that the full pipeline +// JSON string → NxGraphAdjFormat → PetxGraph → NxGraphAdjFormat → JSON string +// produces output whose serde_json::Value representation matches the input. +// This catches serialization artifacts (e.g. `"key": null` for absent fields) +// that struct-level roundtrip tests miss. + +/// Normalize a `serde_json::Value` representing an NxGraphAdjFormat by sorting +/// each adjacency list by the stringified `"id"` field, so edge-order +/// differences don't cause spurious failures. +fn normalize_json_value(v: &mut serde_json::Value) { + if let Some(adj) = v.get_mut("adjacency").and_then(|a| a.as_array_mut()) { + for list in adj.iter_mut() { + if let Some(entries) = list.as_array_mut() { + entries.sort_by(|a, b| { + let ak = a.get("id").map(|v| v.to_string()).unwrap_or_default(); + let bk = b.get("id").map(|v| v.to_string()).unwrap_or_default(); + ak.cmp(&bk) + }); + } + } + } +} + +/// Full JSON-level roundtrip for an undirected graph fixture. +fn assert_json_roundtrip_undirected(json_str: &str) { + let nx: NxGraphAdjFormat = serde_json::from_str(json_str).unwrap(); + let petx: PetxUnGraph = nx.try_into().unwrap(); + let nx_back = NxGraphAdjFormat::try_from(&petx).unwrap(); + let serialized = serde_json::to_string(&nx_back).unwrap(); + + let mut expected: serde_json::Value = serde_json::from_str(json_str).unwrap(); + let mut actual: serde_json::Value = serde_json::from_str(&serialized).unwrap(); + normalize_json_value(&mut expected); + normalize_json_value(&mut actual); + assert_eq!(actual, expected); +} + +/// Full JSON-level roundtrip for a directed graph fixture. +fn assert_json_roundtrip_directed(json_str: &str) { + let nx: NxGraphAdjFormat = serde_json::from_str(json_str).unwrap(); + let petx: PetxDiGraph = nx.try_into().unwrap(); + let nx_back = NxGraphAdjFormat::try_from(&petx).unwrap(); + let serialized = serde_json::to_string(&nx_back).unwrap(); + + let mut expected: serde_json::Value = serde_json::from_str(json_str).unwrap(); + let mut actual: serde_json::Value = serde_json::from_str(&serialized).unwrap(); + normalize_json_value(&mut expected); + normalize_json_value(&mut actual); + assert_eq!(actual, expected); +} + +#[test] +fn json_fidelity_karate_club() { + assert_json_roundtrip_undirected(KARATE_JSON); +} + +#[test] +fn json_fidelity_k5() { + assert_json_roundtrip_undirected(K5_JSON); +} + +#[test] +fn json_fidelity_p4() { + assert_json_roundtrip_undirected(P4_JSON); +} + +#[test] +fn json_fidelity_single_node() { + assert_json_roundtrip_undirected(SINGLE_NODE_JSON); +} + +#[test] +fn json_fidelity_two_triangles() { + assert_json_roundtrip_undirected(TWO_TRIANGLES_JSON); +} + +#[test] +fn json_fidelity_string_ids() { + assert_json_roundtrip_undirected(STRING_IDS_JSON); +} + +#[test] +fn json_fidelity_empty_edges() { + assert_json_roundtrip_undirected(EMPTY_EDGES_JSON); +} + +#[test] +fn json_fidelity_self_loop() { + assert_json_roundtrip_undirected(SELF_LOOP_JSON); +} + +#[test] +fn json_fidelity_small_directed() { + assert_json_roundtrip_directed(SMALL_DIRECTED_JSON); +} + +#[test] +fn json_fidelity_directed_cycle() { + assert_json_roundtrip_directed(DIRECTED_CYCLE_JSON); +} From 027ba0df17fb58b3c3c62d0052bf5a6758094a53 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 10 Apr 2026 12:51:25 -0600 Subject: [PATCH 062/221] try louvain --- ben/src/json/graph/mlc.rs | 432 ++++++++++++++++++++++++++++++++------ 1 file changed, 364 insertions(+), 68 deletions(-) diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index 422c5bf..7a450d5 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -1,10 +1,18 @@ use super::petxgraph::{apply_permutation, PetxGraph}; -use super::rcm::{local_degree_in_component, rcm_component}; +use super::rcm::rcm_component; use petgraph::graph::{Graph, NodeIndex}; -use petgraph::visit::NodeIndexable; +use petgraph::visit::{EdgeRef, NodeIndexable}; use rustworkx_core::connectivity::connected_components; -use std::cmp::Reverse; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; + +/// Cap on Louvain local-move passes within a single level. Phase 1 +/// usually converges in far fewer passes; the cap is purely defensive. +const LOUVAIN_MAX_PASSES: usize = 32; + +/// Cap on Louvain coarsening levels (phase 1 + contract iterations). +/// Each level either strictly reduces node count or hits a modularity +/// fixed point, so this bound is purely defensive. +const LOUVAIN_MAX_LEVELS: usize = 32; /// Tracks how many original nodes have been finalized so far and emits /// periodic `tracing::info!` milestones when verbose logging is enabled. @@ -187,7 +195,7 @@ where return vec![component[0]]; } - let clusters = greedy_cluster_partition(graph, labels, component); + let clusters = louvain_cluster_partition(graph, labels, component); if clusters.len() <= 1 || clusters.len() == component.len() { tracing::debug!( "MLC depth={}: degenerate partition ({} clusters from {} nodes), falling back to RCM", @@ -225,27 +233,54 @@ where order } -/// Partition a component into small clusters using a greedy seed-expansion -/// strategy. +/// Partition a component into communities via full multilevel Louvain. +/// +/// Runs the standard two-phase Louvain algorithm end-to-end: +/// +/// 1. **Phase 1 (local moves):** each node is repeatedly considered for +/// moving into one of its neighbors' communities, picking the move +/// that maximizes the modularity gain. Passes over the node set +/// continue until no move improves modularity or +/// [`LOUVAIN_MAX_PASSES`] is reached. +/// 2. **Phase 2 (contract):** each community is collapsed into a single +/// super-node, with intra-community edges becoming self-loops and +/// inter-community edges becoming weighted edges between super-nodes. +/// The coarser graph is then fed back into phase 1. +/// +/// The loop terminates when a phase-1 sweep makes no moves (a fixed +/// point of the modularity objective) or after [`LOUVAIN_MAX_LEVELS`] +/// levels. Unlike the single-level variant, each contract step is +/// consumed internally — the caller still uses MLC's existing +/// coarse-graph machinery ([`build_coarse_graph`] + [`mlc_order_inner`]) +/// to *order* clusters, but the clustering itself is already coarsened. +/// +/// Internally this operates on [`LouvainGraph`], a compact adjacency +/// representation that tracks weighted edges, self-loop weights, node +/// degrees, and total weight m. The modularity move-gain is the standard +/// integer-safe form /// -/// Seeds are chosen in order of increasing local degree (ties broken by label). -/// Each seed expands to include all of its unassigned neighbors. After each -/// cluster is formed, local degrees are incrementally updated: for every -/// unassigned neighbor of a newly-assigned node, the neighbor's degree is -/// decremented. Nodes are then re-sorted before picking the next seed. +/// ```text +/// Δ ∝ 2m·k_{i,in}(C) − k_i·Σ_tot(C), +/// ``` +/// +/// computed after temporarily removing node i from its current community +/// so the "stay put" baseline uses the same formula and ties prefer +/// staying. Node-processing order is deterministic (by level label). /// /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by -/// `NodeIndex::index()`. +/// * `labels` - Per-node labels used to fix a deterministic +/// node-processing order and tiebreak across runs. /// * `component` - The subset of `NodeIndex` values to partition. /// /// # Returns /// -/// A vector of clusters, where each cluster is a vector of `NodeIndex` -/// values. Every node in `component` appears in exactly one cluster. -fn greedy_cluster_partition( +/// A vector of clusters, one per community found at the coarsest level. +/// Clusters are sorted by their minimum-label member; nodes within each +/// cluster are sorted by label. Every node in `component` appears in +/// exactly one cluster. +fn louvain_cluster_partition( graph: &Graph, labels: &[usize], component: &[NodeIndex], @@ -253,74 +288,335 @@ fn greedy_cluster_partition( where Ty: petgraph::EdgeType, { - let component_set: HashSet = component.iter().copied().collect(); - let mut local_deg = local_degree_in_component(graph, &component_set, component); + let mut g = louvain_init_graph(graph, component); - let mut assigned = vec![false; graph.node_bound()]; - let mut remaining: Vec = component.to_vec(); - let mut clusters = Vec::new(); + // No edges means no modularity to optimize; return singletons so the + // caller's degenerate-partition guard kicks in. + if g.total_weight == 0 { + return component.iter().map(|&n| vec![n]).collect(); + } - // Epoch-based marking for seed neighbors avoids rebuilding a set each - // iteration. - let mut seed_marks = vec![0usize; graph.node_bound()]; - let mut mark_epoch = 1usize; + // Each super-node currently groups one or more original NodeIndex + // values from `component`. Initially every node is its own super-node. + let mut super_nodes: Vec> = + component.iter().map(|&n| vec![n]).collect(); - while !remaining.is_empty() { - remaining.sort_by_key(|&node| (local_deg[node.index()], labels[node.index()])); - let seed = remaining[0]; + // Level-labels track the minimum original label within each super-node + // so phase 1 has a deterministic processing order at every level. + let mut level_labels: Vec = + component.iter().map(|&n| labels[n.index()]).collect(); - let mut cluster = vec![seed]; - assigned[seed.index()] = true; + for _ in 0..LOUVAIN_MAX_LEVELS { + let (community_of, any_move) = louvain_phase1(&g, &level_labels); + if !any_move { + break; + } - for neighbor in graph.neighbors(seed) { - if component_set.contains(&neighbor) { - seed_marks[neighbor.index()] = mark_epoch; + let (new_g, new_id_of) = louvain_contract(&g, &community_of); + let new_n = new_g.n(); + + // Roll super-node membership and level labels forward one level. + let mut new_super_nodes: Vec> = vec![Vec::new(); new_n]; + let mut new_level_labels: Vec = vec![usize::MAX; new_n]; + for old in 0..g.n() { + let new_c = new_id_of[old]; + let chunk = std::mem::take(&mut super_nodes[old]); + new_super_nodes[new_c].extend(chunk); + if level_labels[old] < new_level_labels[new_c] { + new_level_labels[new_c] = level_labels[old]; } } - let mut candidates: Vec = graph - .neighbors(seed) - .filter(|&n| component_set.contains(&n) && !assigned[n.index()]) - .collect(); - candidates.sort_by_key(|&neighbor| { - let shared = graph - .neighbors(neighbor) - .filter(|&next| { - component_set.contains(&next) && seed_marks[next.index()] == mark_epoch - }) - .count(); - ( - Reverse(shared), - local_deg[neighbor.index()], - labels[neighbor.index()], - ) - }); - - for neighbor in candidates { - assigned[neighbor.index()] = true; - cluster.push(neighbor); + super_nodes = new_super_nodes; + level_labels = new_level_labels; + g = new_g; + + if g.n() <= 1 { + break; } + } + + // Deterministic output: nodes sorted by label within each cluster, and + // clusters sorted by their minimum-label member. + let mut clusters = super_nodes; + for cluster in &mut clusters { + cluster.sort_by_key(|&n| labels[n.index()]); + } + clusters.sort_by_key(|cluster| labels[cluster[0].index()]); + clusters +} + +/// Compact weighted-undirected graph used by the multilevel Louvain +/// implementation. +/// +/// `adj[u]` lists `(v, weight)` pairs for non-loop edges; each non-loop +/// edge appears in both endpoints' lists (symmetric storage). Self-loops +/// are stored separately in `self_loop[u]` so `adj` never contains +/// self-references. +/// +/// Weighted modularity conventions apply: +/// `deg[u] = Σ adj[u].1 + 2·self_loop[u]`, and +/// `total_weight = Σ deg[u] / 2`. +struct LouvainGraph { + adj: Vec>, + self_loop: Vec, + deg: Vec, + total_weight: i64, +} - mark_epoch = mark_epoch.wrapping_add(1); - if mark_epoch == 0 { - seed_marks.fill(0); - mark_epoch = 1; +impl LouvainGraph { + fn n(&self) -> usize { + self.adj.len() + } +} + +/// Build the initial [`LouvainGraph`] for a connected component of the +/// original unweighted graph. +/// +/// Every edge starts with weight 1. Self-loops (should any exist in the +/// input) are routed into `self_loop`. Nodes are compacted into +/// `0..component.len()` following the order of `component`. +fn louvain_init_graph( + graph: &Graph, + component: &[NodeIndex], +) -> LouvainGraph +where + Ty: petgraph::EdgeType, +{ + let n = component.len(); + let mut idx_of = vec![usize::MAX; graph.node_bound()]; + for (i, &node) in component.iter().enumerate() { + idx_of[node.index()] = i; + } + + let mut adj_maps: Vec> = (0..n).map(|_| HashMap::new()).collect(); + let mut self_loop = vec![0i64; n]; + + // `edge_references()` yields each edge exactly once, which avoids + // any ambiguity around how petgraph reports self-loops via + // `neighbors()`. + for edge_ref in graph.edge_references() { + let u = idx_of[edge_ref.source().index()]; + let v = idx_of[edge_ref.target().index()]; + if u == usize::MAX || v == usize::MAX { + continue; } + if u == v { + self_loop[u] += 1; + } else { + *adj_maps[u].entry(v).or_insert(0) += 1; + *adj_maps[v].entry(u).or_insert(0) += 1; + } + } - // Decrement degrees of unassigned nodes adjacent to the new cluster. - for &node in &cluster { - for neighbor in graph.neighbors(node) { - if component_set.contains(&neighbor) && !assigned[neighbor.index()] { - local_deg[neighbor.index()] -= 1; + let adj: Vec> = adj_maps + .into_iter() + .map(|m| { + let mut v: Vec<_> = m.into_iter().collect(); + v.sort_unstable_by_key(|&(nb, _)| nb); + v + }) + .collect(); + + let mut deg = vec![0i64; n]; + for u in 0..n { + let s: i64 = adj[u].iter().map(|&(_, w)| w).sum(); + deg[u] = s + 2 * self_loop[u]; + } + let total_weight = deg.iter().sum::() / 2; + + LouvainGraph { + adj, + self_loop, + deg, + total_weight, + } +} + +/// Run one Louvain phase-1 (local-move) sweep on a [`LouvainGraph`]. +/// +/// Each node starts in its own community. At most [`LOUVAIN_MAX_PASSES`] +/// passes over the node set are attempted; passes stop early once a full +/// pass completes with no improving move. Nodes are processed in +/// ascending `level_labels` order for determinism. +/// +/// # Returns +/// +/// * A dense assignment `community_of[u]` giving the community id for +/// each node. Ids are integers in `0..n` but not necessarily contiguous +/// — [`louvain_contract`] remaps them. +/// * A flag that is `true` if at least one node moved during the sweep. +fn louvain_phase1(g: &LouvainGraph, level_labels: &[usize]) -> (Vec, bool) { + let n = g.n(); + let m2 = 2 * g.total_weight; + + let mut community_of: Vec = (0..n).collect(); + let mut community_sum_deg: Vec = g.deg.clone(); + + let mut node_order: Vec = (0..n).collect(); + node_order.sort_by_key(|&u| level_labels[u]); + + // Scratch buffers reused across nodes. + let mut contrib: Vec = vec![0; n]; + let mut contrib_keys: Vec = Vec::new(); + + let mut any_move = false; + + for _ in 0..LOUVAIN_MAX_PASSES { + let mut improved = false; + + for &u in &node_order { + let ci = community_of[u]; + let k_u = g.deg[u]; + + // Tally weighted edges to each neighbor community. + for &(v, w) in &g.adj[u] { + let cj = community_of[v]; + if contrib[cj] == 0 { + contrib_keys.push(cj); + } + contrib[cj] += w; + } + + // Temporarily remove u from its current community so that + // `community_sum_deg[ci]` and `contrib[ci]` reflect the + // post-removal state uniformly for every candidate. + community_sum_deg[ci] -= k_u; + + // Baseline candidate: stay in `ci`. `contrib[ci]` is 0 if no + // neighbor is currently in `ci`, which is the correct value. + let mut best_community = ci; + let mut best_gain = m2 * contrib[ci] - k_u * community_sum_deg[ci]; + + // Sort touched keys so ties deterministically prefer the + // lower community id. + contrib_keys.sort_unstable(); + for &cj in &contrib_keys { + if cj == ci { + continue; + } + let gain = m2 * contrib[cj] - k_u * community_sum_deg[cj]; + if gain > best_gain { + best_gain = gain; + best_community = cj; } } + + // Reset scratch for the next node. + for &k in &contrib_keys { + contrib[k] = 0; + } + contrib_keys.clear(); + + // Commit the (possibly no-op) move. + community_sum_deg[best_community] += k_u; + if best_community != ci { + community_of[u] = best_community; + improved = true; + any_move = true; + } } - remaining.retain(|&n| !assigned[n.index()]); - clusters.push(cluster); + if !improved { + break; + } } - clusters + (community_of, any_move) +} + +/// Contract a [`LouvainGraph`] using a community assignment, producing a +/// new (coarser) graph whose nodes are the communities. +/// +/// Intra-community edges become contributions to the new node's +/// self-loop; inter-community edges become weighted edges between +/// super-nodes. Both old self-loops and internal edges at this level are +/// preserved in the new self-loop so the total weight is invariant. +/// +/// # Arguments +/// +/// * `g` - The current-level graph. +/// * `community_of` - Community assignment produced by +/// [`louvain_phase1`]. Values may be any integers in `0..g.n()`. +/// +/// # Returns +/// +/// * The new coarser [`LouvainGraph`]. +/// * A remap `new_id_of[u]` giving the new super-node index of each old +/// node. Used by the caller to roll super-node membership forward. +fn louvain_contract(g: &LouvainGraph, community_of: &[usize]) -> (LouvainGraph, Vec) { + let n = g.n(); + + // Dense-remap community ids to 0..new_n in first-seen order. + let mut dense = vec![usize::MAX; n]; + let mut new_id_of = vec![usize::MAX; n]; + let mut new_n = 0usize; + for u in 0..n { + let c = community_of[u]; + if dense[c] == usize::MAX { + dense[c] = new_n; + new_n += 1; + } + new_id_of[u] = dense[c]; + } + + // Carry forward existing self-loops verbatim. + let mut new_self_loop = vec![0i64; new_n]; + for u in 0..n { + new_self_loop[new_id_of[u]] += g.self_loop[u]; + } + + // Aggregate edges. Internal (intra-community) edges are double-counted + // by the symmetric adjacency, so we accumulate and halve at the end. + // External edges are also double-counted, but symmetrically across + // the two endpoints — which is exactly the shape the new symmetric + // adj needs, so no halving is required there. + let mut internal_accum = vec![0i64; new_n]; + let mut new_adj_maps: Vec> = (0..new_n).map(|_| HashMap::new()).collect(); + + for u in 0..n { + let cu = new_id_of[u]; + for &(v, w) in &g.adj[u] { + let cv = new_id_of[v]; + if cu == cv { + internal_accum[cu] += w; + } else { + *new_adj_maps[cu].entry(cv).or_insert(0) += w; + } + } + } + + for c in 0..new_n { + new_self_loop[c] += internal_accum[c] / 2; + } + + let new_adj: Vec> = new_adj_maps + .into_iter() + .map(|m| { + let mut v: Vec<_> = m.into_iter().collect(); + v.sort_unstable_by_key(|&(nb, _)| nb); + v + }) + .collect(); + + let mut new_deg = vec![0i64; new_n]; + for c in 0..new_n { + let s: i64 = new_adj[c].iter().map(|&(_, w)| w).sum(); + new_deg[c] = s + 2 * new_self_loop[c]; + } + let new_total = new_deg.iter().sum::() / 2; + + debug_assert_eq!(new_total, g.total_weight); + + ( + LouvainGraph { + adj: new_adj, + self_loop: new_self_loop, + deg: new_deg, + total_weight: new_total, + }, + new_id_of, + ) } /// Build a coarse graph where each cluster is contracted into a single node. @@ -334,7 +630,7 @@ where /// * `graph` - The full graph containing the original edges. /// * `labels` - Per-node labels for the original graph, indexed by /// `NodeIndex::index()`. -/// * `clusters` - The partition produced by [`greedy_cluster_partition`]. +/// * `clusters` - The partition produced by [`louvain_cluster_partition`]. /// Cluster `i` maps to coarse node `i`. /// /// # Returns From 266f11ed9c677cc46384cd28a774fb9d24c4e1ea Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 10 Apr 2026 20:33:43 -0600 Subject: [PATCH 063/221] revert louvain --- Cargo.lock | 217 +++++++++++- ben/Cargo.toml | 1 + ben/src/json/graph/mlc.rs | 686 ++++++++++++-------------------------- 3 files changed, 413 insertions(+), 491 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 82f6b82..44e33e6 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,7 @@ version = "1.0.0" dependencies = [ "byteorder", "clap 4.5.48", + "indicatif", "lipsum", "pcompress", "petgraph", @@ -143,6 +144,12 @@ version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + [[package]] name = "byteorder" version = "1.5.0" @@ -176,7 +183,7 @@ dependencies = [ "bitflags 1.3.2", "strsim 0.8.0", "textwrap", - "unicode-width", + "unicode-width 0.1.14", "vec_map", ] @@ -226,6 +233,19 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width 0.2.2", + "windows-sys 0.59.0", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -266,6 +286,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "equivalent" version = "1.0.2" @@ -377,6 +403,19 @@ dependencies = [ "rayon", ] +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width 0.2.2", + "web-time", +] + [[package]] name = "indoc" version = "2.0.6" @@ -404,6 +443,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "js-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -542,6 +591,12 @@ dependencies = [ "libm", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" version = "1.21.3" @@ -935,6 +990,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "rustworkx-core" version = "0.17.1" @@ -1121,7 +1182,7 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" dependencies = [ - "unicode-width", + "unicode-width 0.1.14", ] [[package]] @@ -1238,6 +1299,12 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unindent" version = "0.2.4" @@ -1295,6 +1362,61 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1323,13 +1445,22 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.5", ] [[package]] @@ -1341,6 +1472,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + [[package]] name = "windows-targets" version = "0.53.5" @@ -1348,58 +1495,106 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.1" diff --git a/ben/Cargo.toml b/ben/Cargo.toml index b69493f..228f492 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -17,6 +17,7 @@ name = "binary_ensemble" [dependencies] byteorder = "1.5.0" clap = { version = "^4.5.2", features = ["derive"] } +indicatif = "0.17" pcompress = "1.0.7" petgraph = "0.8.3" pipe = "0.4.0" diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index 7a450d5..4eea7fd 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -1,59 +1,107 @@ use super::petxgraph::{apply_permutation, PetxGraph}; -use super::rcm::rcm_component; +use super::rcm::{local_degree_in_component, rcm_component}; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use petgraph::graph::{Graph, NodeIndex}; -use petgraph::visit::{EdgeRef, NodeIndexable}; +use petgraph::visit::NodeIndexable; use rustworkx_core::connectivity::connected_components; -use std::collections::{HashMap, HashSet}; - -/// Cap on Louvain local-move passes within a single level. Phase 1 -/// usually converges in far fewer passes; the cap is purely defensive. -const LOUVAIN_MAX_PASSES: usize = 32; - -/// Cap on Louvain coarsening levels (phase 1 + contract iterations). -/// Each level either strictly reduces node count or hits a modularity -/// fixed point, so this bound is purely defensive. -const LOUVAIN_MAX_LEVELS: usize = 32; - -/// Tracks how many original nodes have been finalized so far and emits -/// periodic `tracing::info!` milestones when verbose logging is enabled. -/// -/// Progress is measured in real-node chunks (base-case RCM calls and -/// per-cluster RCM calls at depth 0). Coarse-graph recursion does not -/// contribute, so `total` corresponds exactly to the number of nodes in -/// the original graph. +use std::cmp::Reverse; +use std::collections::HashSet; +use std::time::Duration; + +/// Per-phase progress tracker for MLC, with one spinner line per recursion +/// depth. +/// +/// Phase 1 (depth 0) processes the original nodes; phase 2 processes the +/// level-1 clusters produced by phase 1; and so on. Bars are added lazily +/// the first time a given depth is reached, and each bar's total grows as +/// new work at that depth is discovered (e.g. when the next top-level +/// component recurses and introduces more coarse nodes). +/// +/// Spinners auto-hide when stderr is not a terminal (e.g. under `cargo +/// test` or when output is piped), so no config is needed for CI/test +/// environments. struct MlcProgress { - total: usize, - done: usize, - last_logged_pct: usize, + multi: MultiProgress, + bars: Vec, + totals: Vec, + dones: Vec, } impl MlcProgress { - /// Create a new progress tracker for a graph with `total` real nodes. - fn new(total: usize) -> Self { + /// Create an empty tracker. Bars are added lazily as depths are reached. + fn new() -> Self { Self { - total, - done: 0, - last_logged_pct: 0, + multi: MultiProgress::new(), + bars: Vec::new(), + totals: Vec::new(), + dones: Vec::new(), } } - /// Record that `chunk` more real nodes have been finalized. Emits an - /// `info!` log line whenever completion crosses a 5% boundary. - fn add(&mut self, chunk: usize) { - self.done += chunk; - let pct = if self.total == 0 { - 100 - } else { - self.done * 100 / self.total - }; - if pct >= self.last_logged_pct + 5 || self.done == self.total { - tracing::info!( - "MLC progress: {}/{} nodes ({}%)", - self.done, - self.total, - pct + /// Make sure a bar exists for `depth`, creating any intermediate bars + /// that don't exist yet. + fn ensure_depth(&mut self, depth: usize) { + while self.bars.len() <= depth { + let bar = self.multi.add(ProgressBar::new_spinner()); + bar.set_style( + ProgressStyle::default_spinner() + .template("{spinner:.cyan} {msg}") + .unwrap(), ); - self.last_logged_pct = pct; + bar.enable_steady_tick(Duration::from_millis(100)); + self.bars.push(bar); + self.totals.push(0); + self.dones.push(0); + let d = self.bars.len() - 1; + self.refresh(d); + } + } + + /// Record that `n` more items will be processed at `depth`. + fn add_total(&mut self, depth: usize, n: usize) { + self.ensure_depth(depth); + self.totals[depth] += n; + self.refresh(depth); + } + + /// Record that `n` more items at `depth` have been finalized. + fn add_done(&mut self, depth: usize, n: usize) { + self.ensure_depth(depth); + self.dones[depth] += n; + self.refresh(depth); + } + + fn refresh(&self, depth: usize) { + let done = self.dones[depth]; + let total = self.totals[depth]; + let pct = if total == 0 { 0 } else { done * 100 / total }; + self.bars[depth].set_message(format!( + "MLC phase {}: {}/{} {} ({}%)", + depth + 1, + done, + total, + Self::unit_for_depth(depth), + pct + )); + } + + fn unit_for_depth(depth: usize) -> String { + if depth == 0 { + "nodes".to_string() + } else { + format!("level-{} clusters", depth) + } + } + + /// Stop all spinners, leaving a final "complete" message on each. + fn finish(&self) { + for (d, bar) in self.bars.iter().enumerate() { + bar.finish_with_message(format!( + "MLC phase {}: complete ({} {})", + d + 1, + self.totals[d], + Self::unit_for_depth(d) + )); } } } @@ -77,36 +125,31 @@ pub(super) fn apply_multi_level_clustering(petx_graph: &mut PetxGraph) - where Ty: petgraph::EdgeType, { - let total_nodes = petx_graph.graph.node_count(); - tracing::info!("MLC: starting on graph with {} nodes", total_nodes); - let labels: Vec = (0..petx_graph.graph.node_bound()).collect(); - let mut progress = MlcProgress::new(total_nodes); - let order = mlc_order_inner(&petx_graph.graph, &labels, Some(&mut progress), 0); + let mut progress = MlcProgress::new(); + let order = mlc_order_inner(&petx_graph.graph, &labels, &mut progress, 0); *petx_graph = apply_permutation(petx_graph, &order); - tracing::info!("MLC: complete"); + progress.finish(); order } /// Recursively order each connected component via multilevel clustering, then /// concatenate the results. /// -/// Components are sorted by their minimum label so that the output order is -/// deterministic. Each component is ordered independently by -/// [`mlc_component`]. +/// Components are sorted by decreasing size (ties broken by minimum label) +/// so that larger components occupy the beginning of the output. Each +/// component is ordered independently by [`mlc_component`]. /// /// # Arguments /// /// * `graph` - The input graph to order. Generic over node/edge weights and /// edge type so it also works with the coarse graph during recursion. /// * `labels` - A per-node label vector used for tie-breaking when choosing -/// BFS seeds and sorting neighbors. Indexed by `NodeIndex::index()`. -/// * `progress` - Optional progress tracker. `Some(_)` at the top level and -/// `None` when recursing into the coarse graph so only real-node work -/// contributes to the counter. -/// * `depth` - Recursion depth. Zero at the top level, incremented each -/// time we recurse into a coarse graph. Used only for logging. +/// seeds and sorting neighbors. Indexed by `NodeIndex::index()`. +/// * `progress` - Progress tracker for the multi-phase spinner display. +/// * `depth` - Recursion depth (0 at the top level). Used to route progress +/// updates to the correct phase bar. /// /// # Returns /// @@ -115,53 +158,52 @@ where fn mlc_order_inner( graph: &Graph, labels: &[usize], - mut progress: Option<&mut MlcProgress>, + progress: &mut MlcProgress, depth: usize, ) -> Vec where Ty: petgraph::EdgeType, { + progress.add_total(depth, graph.node_count()); + let mut components: Vec> = connected_components(graph) .into_iter() .map(|set| set.into_iter().collect()) .collect(); components.sort_by_key(|c| { - c.iter() + let min_label = c + .iter() .map(|n| labels[n.index()]) .min() - .unwrap_or(usize::MAX) + .unwrap_or(usize::MAX); + (Reverse(c.len()), min_label) }); - tracing::debug!( - "MLC depth={}: {} component(s) to order", - depth, - components.len() - ); - let mut order = Vec::with_capacity(graph.node_count()); for component in components { - order.extend(mlc_component( - graph, - labels, - &component, - progress.as_deref_mut(), - depth, - )); + order.extend(mlc_component(graph, labels, &component, progress, depth)); } order } -/// Recursively order a single connected component via multilevel clustering. -/// -/// Single-node components are returned as-is. Otherwise the component is -/// greedily partitioned into clusters; each cluster is then ordered by -/// recursively applying `mlc_component` to it, a coarse graph of -/// inter-cluster edges is built, and the coarse graph is ordered via -/// [`mlc_order_inner`] to determine the final cluster sequence. -/// -/// If the greedy partition produces a single cluster (or the unreachable -/// all-singletons case), the algorithm cannot make progress and falls back -/// to RCM on the whole component. +/// Order a single connected component by seed-expansion clustering plus +/// recursive coarsening. +/// +/// Steps: +/// +/// 1. Singleton components return immediately. +/// 2. [`greedy_cluster_partition`] carves the component into stars. +/// 3. Each cluster is re-ordered internally via [`rcm_component`] on its +/// induced subgraph, so peripheral leaves bracket the cluster and the +/// high-degree seed sits in the interior. +/// 4. If the partition returns a single cluster (a star that covers the +/// whole component), that RCM-ordered cluster is the final order. +/// 5. Otherwise a coarse graph is built with one node per cluster, and +/// [`mlc_order_inner`] recurses on it to decide the order in which +/// clusters are emitted. The recursion terminates when each coarse +/// component collapses to a single cluster. +/// 6. The final order is produced by unrolling: emit clusters in the +/// recursive coarse order, each cluster in its RCM-ordered form. /// /// # Arguments /// @@ -169,10 +211,9 @@ where /// * `labels` - Per-node labels for tie-breaking, indexed by /// `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to order. -/// * `progress` - Optional progress tracker, `Some(_)` only when ordering -/// real nodes. Advanced when the recursion bottoms out at a singleton -/// component or hits the degenerate RCM fallback. -/// * `depth` - Recursion depth, zero at the top level. Used for logging. +/// * `progress` - Progress tracker for the multi-phase spinner display. +/// * `depth` - Recursion depth; routes progress updates to the correct +/// phase bar. /// /// # Returns /// @@ -181,442 +222,127 @@ fn mlc_component( graph: &Graph, labels: &[usize], component: &[NodeIndex], - mut progress: Option<&mut MlcProgress>, + progress: &mut MlcProgress, depth: usize, ) -> Vec where Ty: petgraph::EdgeType, { if component.len() == 1 { - tracing::debug!("MLC depth={}: singleton component", depth); - if let Some(p) = progress.as_deref_mut() { - p.add(1); - } + progress.add_done(depth, 1); return vec![component[0]]; } - let clusters = louvain_cluster_partition(graph, labels, component); - if clusters.len() <= 1 || clusters.len() == component.len() { - tracing::debug!( - "MLC depth={}: degenerate partition ({} clusters from {} nodes), falling back to RCM", - depth, - clusters.len(), - component.len() - ); - let order = rcm_component(graph, labels, component); - if let Some(p) = progress.as_deref_mut() { - p.add(component.len()); - } - return order; + // `greedy_cluster_partition` ticks this depth's progress per cluster, + // so every node in `component` contributes to phase `depth+1` exactly + // once. + let mut clusters = greedy_cluster_partition(graph, labels, component, progress, depth); + + // Reorder each cluster internally via RCM on the subgraph induced by + // its members. This puts peripheral (degree-1) nodes at both ends of + // the cluster and the high-degree seed near the middle/end, which + // keeps cluster boundaries "loose" and avoids stranding the most- + // connected node next to the previous cluster. + for cluster in clusters.iter_mut() { + *cluster = rcm_component(graph, labels, cluster); } - tracing::debug!( - "MLC depth={}: partitioned {} nodes into {} clusters", - depth, - component.len(), - clusters.len() - ); - - let mut cluster_orders: Vec> = Vec::with_capacity(clusters.len()); - for cluster in &clusters { - let order = mlc_component(graph, labels, cluster, progress.as_deref_mut(), depth + 1); - cluster_orders.push(order); + // Single-cluster case: the whole component is one star. + if clusters.len() == 1 { + return clusters.into_iter().next().unwrap(); } + // Multi-cluster case: recurse on the coarse graph to decide the order + // in which the clusters appear. let (coarse_graph, coarse_labels) = build_coarse_graph(graph, labels, &clusters); - let coarse_order = mlc_order_inner(&coarse_graph, &coarse_labels, None, depth + 1); + let coarse_order = mlc_order_inner(&coarse_graph, &coarse_labels, progress, depth + 1); let mut order = Vec::with_capacity(component.len()); for coarse_node in coarse_order { - order.extend(cluster_orders[coarse_node.index()].iter().copied()); + order.extend(clusters[coarse_node.index()].iter().copied()); } order } -/// Partition a component into communities via full multilevel Louvain. -/// -/// Runs the standard two-phase Louvain algorithm end-to-end: +/// Partition a component into star-shaped clusters using a greedy +/// seed-expansion strategy. /// -/// 1. **Phase 1 (local moves):** each node is repeatedly considered for -/// moving into one of its neighbors' communities, picking the move -/// that maximizes the modularity gain. Passes over the node set -/// continue until no move improves modularity or -/// [`LOUVAIN_MAX_PASSES`] is reached. -/// 2. **Phase 2 (contract):** each community is collapsed into a single -/// super-node, with intra-community edges becoming self-loops and -/// inter-community edges becoming weighted edges between super-nodes. -/// The coarser graph is then fed back into phase 1. +/// At each step, the lowest-degree unassigned node (ties broken by label) is +/// chosen as a seed, and the seed together with all of its unassigned +/// neighbors becomes the next cluster. Local degrees are then decremented +/// for every unassigned node adjacent to a newly-assigned one, so subsequent +/// seed selections reflect the residual graph. /// -/// The loop terminates when a phase-1 sweep makes no moves (a fixed -/// point of the modularity objective) or after [`LOUVAIN_MAX_LEVELS`] -/// levels. Unlike the single-level variant, each contract step is -/// consumed internally — the caller still uses MLC's existing -/// coarse-graph machinery ([`build_coarse_graph`] + [`mlc_order_inner`]) -/// to *order* clusters, but the clustering itself is already coarsened. -/// -/// Internally this operates on [`LouvainGraph`], a compact adjacency -/// representation that tracks weighted edges, self-loop weights, node -/// degrees, and total weight m. The modularity move-gain is the standard -/// integer-safe form -/// -/// ```text -/// Δ ∝ 2m·k_{i,in}(C) − k_i·Σ_tot(C), -/// ``` -/// -/// computed after temporarily removing node i from its current community -/// so the "stay put" baseline uses the same formula and ties prefer -/// staying. Node-processing order is deterministic (by level label). +/// Only cluster *membership* is meaningful here; the internal order of each +/// returned cluster is not final and is expected to be overwritten by the +/// caller (e.g. via [`rcm_component`]). /// /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels used to fix a deterministic -/// node-processing order and tiebreak across runs. +/// * `labels` - Per-node labels for tie-breaking, indexed by +/// `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to partition. +/// * `progress` - Progress tracker; `depth`'s done counter is advanced by +/// each cluster's size as the cluster is formed, so the caller's phase +/// bar fills up gradually during large partitions. +/// * `depth` - Recursion depth of the caller, used to select the phase +/// bar to update. /// /// # Returns /// -/// A vector of clusters, one per community found at the coarsest level. -/// Clusters are sorted by their minimum-label member; nodes within each -/// cluster are sorted by label. Every node in `component` appears in -/// exactly one cluster. -fn louvain_cluster_partition( +/// A vector of clusters, where each cluster is a vector of `NodeIndex` +/// values. Every node in `component` appears in exactly one cluster. +fn greedy_cluster_partition( graph: &Graph, labels: &[usize], component: &[NodeIndex], + progress: &mut MlcProgress, + depth: usize, ) -> Vec> where Ty: petgraph::EdgeType, { - let mut g = louvain_init_graph(graph, component); - - // No edges means no modularity to optimize; return singletons so the - // caller's degenerate-partition guard kicks in. - if g.total_weight == 0 { - return component.iter().map(|&n| vec![n]).collect(); - } - - // Each super-node currently groups one or more original NodeIndex - // values from `component`. Initially every node is its own super-node. - let mut super_nodes: Vec> = - component.iter().map(|&n| vec![n]).collect(); - - // Level-labels track the minimum original label within each super-node - // so phase 1 has a deterministic processing order at every level. - let mut level_labels: Vec = - component.iter().map(|&n| labels[n.index()]).collect(); - - for _ in 0..LOUVAIN_MAX_LEVELS { - let (community_of, any_move) = louvain_phase1(&g, &level_labels); - if !any_move { - break; - } - - let (new_g, new_id_of) = louvain_contract(&g, &community_of); - let new_n = new_g.n(); - - // Roll super-node membership and level labels forward one level. - let mut new_super_nodes: Vec> = vec![Vec::new(); new_n]; - let mut new_level_labels: Vec = vec![usize::MAX; new_n]; - for old in 0..g.n() { - let new_c = new_id_of[old]; - let chunk = std::mem::take(&mut super_nodes[old]); - new_super_nodes[new_c].extend(chunk); - if level_labels[old] < new_level_labels[new_c] { - new_level_labels[new_c] = level_labels[old]; + let component_set: HashSet = component.iter().copied().collect(); + let mut local_deg = local_degree_in_component(graph, &component_set, component); + + let mut assigned = vec![false; graph.node_bound()]; + let mut remaining: Vec = component.to_vec(); + let mut clusters = Vec::new(); + + while !remaining.is_empty() { + remaining.sort_by_key(|&node| (local_deg[node.index()], labels[node.index()])); + let seed = remaining[0]; + + let mut cluster = vec![seed]; + assigned[seed.index()] = true; + + // Cluster membership is seed + every unassigned in-component + // neighbor. Internal order here is irrelevant: the caller + // (`mlc_component`) overwrites it with an RCM ordering on the + // cluster's induced subgraph. + for neighbor in graph.neighbors(seed) { + if component_set.contains(&neighbor) && !assigned[neighbor.index()] { + assigned[neighbor.index()] = true; + cluster.push(neighbor); } } - super_nodes = new_super_nodes; - level_labels = new_level_labels; - g = new_g; - - if g.n() <= 1 { - break; - } - } - - // Deterministic output: nodes sorted by label within each cluster, and - // clusters sorted by their minimum-label member. - let mut clusters = super_nodes; - for cluster in &mut clusters { - cluster.sort_by_key(|&n| labels[n.index()]); - } - clusters.sort_by_key(|cluster| labels[cluster[0].index()]); - clusters -} - -/// Compact weighted-undirected graph used by the multilevel Louvain -/// implementation. -/// -/// `adj[u]` lists `(v, weight)` pairs for non-loop edges; each non-loop -/// edge appears in both endpoints' lists (symmetric storage). Self-loops -/// are stored separately in `self_loop[u]` so `adj` never contains -/// self-references. -/// -/// Weighted modularity conventions apply: -/// `deg[u] = Σ adj[u].1 + 2·self_loop[u]`, and -/// `total_weight = Σ deg[u] / 2`. -struct LouvainGraph { - adj: Vec>, - self_loop: Vec, - deg: Vec, - total_weight: i64, -} - -impl LouvainGraph { - fn n(&self) -> usize { - self.adj.len() - } -} - -/// Build the initial [`LouvainGraph`] for a connected component of the -/// original unweighted graph. -/// -/// Every edge starts with weight 1. Self-loops (should any exist in the -/// input) are routed into `self_loop`. Nodes are compacted into -/// `0..component.len()` following the order of `component`. -fn louvain_init_graph( - graph: &Graph, - component: &[NodeIndex], -) -> LouvainGraph -where - Ty: petgraph::EdgeType, -{ - let n = component.len(); - let mut idx_of = vec![usize::MAX; graph.node_bound()]; - for (i, &node) in component.iter().enumerate() { - idx_of[node.index()] = i; - } - - let mut adj_maps: Vec> = (0..n).map(|_| HashMap::new()).collect(); - let mut self_loop = vec![0i64; n]; - - // `edge_references()` yields each edge exactly once, which avoids - // any ambiguity around how petgraph reports self-loops via - // `neighbors()`. - for edge_ref in graph.edge_references() { - let u = idx_of[edge_ref.source().index()]; - let v = idx_of[edge_ref.target().index()]; - if u == usize::MAX || v == usize::MAX { - continue; - } - if u == v { - self_loop[u] += 1; - } else { - *adj_maps[u].entry(v).or_insert(0) += 1; - *adj_maps[v].entry(u).or_insert(0) += 1; - } - } - - let adj: Vec> = adj_maps - .into_iter() - .map(|m| { - let mut v: Vec<_> = m.into_iter().collect(); - v.sort_unstable_by_key(|&(nb, _)| nb); - v - }) - .collect(); - - let mut deg = vec![0i64; n]; - for u in 0..n { - let s: i64 = adj[u].iter().map(|&(_, w)| w).sum(); - deg[u] = s + 2 * self_loop[u]; - } - let total_weight = deg.iter().sum::() / 2; - - LouvainGraph { - adj, - self_loop, - deg, - total_weight, - } -} - -/// Run one Louvain phase-1 (local-move) sweep on a [`LouvainGraph`]. -/// -/// Each node starts in its own community. At most [`LOUVAIN_MAX_PASSES`] -/// passes over the node set are attempted; passes stop early once a full -/// pass completes with no improving move. Nodes are processed in -/// ascending `level_labels` order for determinism. -/// -/// # Returns -/// -/// * A dense assignment `community_of[u]` giving the community id for -/// each node. Ids are integers in `0..n` but not necessarily contiguous -/// — [`louvain_contract`] remaps them. -/// * A flag that is `true` if at least one node moved during the sweep. -fn louvain_phase1(g: &LouvainGraph, level_labels: &[usize]) -> (Vec, bool) { - let n = g.n(); - let m2 = 2 * g.total_weight; - - let mut community_of: Vec = (0..n).collect(); - let mut community_sum_deg: Vec = g.deg.clone(); - - let mut node_order: Vec = (0..n).collect(); - node_order.sort_by_key(|&u| level_labels[u]); - - // Scratch buffers reused across nodes. - let mut contrib: Vec = vec![0; n]; - let mut contrib_keys: Vec = Vec::new(); - - let mut any_move = false; - - for _ in 0..LOUVAIN_MAX_PASSES { - let mut improved = false; - - for &u in &node_order { - let ci = community_of[u]; - let k_u = g.deg[u]; - - // Tally weighted edges to each neighbor community. - for &(v, w) in &g.adj[u] { - let cj = community_of[v]; - if contrib[cj] == 0 { - contrib_keys.push(cj); - } - contrib[cj] += w; - } - - // Temporarily remove u from its current community so that - // `community_sum_deg[ci]` and `contrib[ci]` reflect the - // post-removal state uniformly for every candidate. - community_sum_deg[ci] -= k_u; - - // Baseline candidate: stay in `ci`. `contrib[ci]` is 0 if no - // neighbor is currently in `ci`, which is the correct value. - let mut best_community = ci; - let mut best_gain = m2 * contrib[ci] - k_u * community_sum_deg[ci]; - - // Sort touched keys so ties deterministically prefer the - // lower community id. - contrib_keys.sort_unstable(); - for &cj in &contrib_keys { - if cj == ci { - continue; - } - let gain = m2 * contrib[cj] - k_u * community_sum_deg[cj]; - if gain > best_gain { - best_gain = gain; - best_community = cj; + // Decrement degrees of unassigned nodes adjacent to the new cluster. + for &node in &cluster { + for neighbor in graph.neighbors(node) { + if component_set.contains(&neighbor) && !assigned[neighbor.index()] { + local_deg[neighbor.index()] -= 1; } } - - // Reset scratch for the next node. - for &k in &contrib_keys { - contrib[k] = 0; - } - contrib_keys.clear(); - - // Commit the (possibly no-op) move. - community_sum_deg[best_community] += k_u; - if best_community != ci { - community_of[u] = best_community; - improved = true; - any_move = true; - } - } - - if !improved { - break; } - } - - (community_of, any_move) -} -/// Contract a [`LouvainGraph`] using a community assignment, producing a -/// new (coarser) graph whose nodes are the communities. -/// -/// Intra-community edges become contributions to the new node's -/// self-loop; inter-community edges become weighted edges between -/// super-nodes. Both old self-loops and internal edges at this level are -/// preserved in the new self-loop so the total weight is invariant. -/// -/// # Arguments -/// -/// * `g` - The current-level graph. -/// * `community_of` - Community assignment produced by -/// [`louvain_phase1`]. Values may be any integers in `0..g.n()`. -/// -/// # Returns -/// -/// * The new coarser [`LouvainGraph`]. -/// * A remap `new_id_of[u]` giving the new super-node index of each old -/// node. Used by the caller to roll super-node membership forward. -fn louvain_contract(g: &LouvainGraph, community_of: &[usize]) -> (LouvainGraph, Vec) { - let n = g.n(); - - // Dense-remap community ids to 0..new_n in first-seen order. - let mut dense = vec![usize::MAX; n]; - let mut new_id_of = vec![usize::MAX; n]; - let mut new_n = 0usize; - for u in 0..n { - let c = community_of[u]; - if dense[c] == usize::MAX { - dense[c] = new_n; - new_n += 1; - } - new_id_of[u] = dense[c]; + remaining.retain(|&n| !assigned[n.index()]); + progress.add_done(depth, cluster.len()); + clusters.push(cluster); } - // Carry forward existing self-loops verbatim. - let mut new_self_loop = vec![0i64; new_n]; - for u in 0..n { - new_self_loop[new_id_of[u]] += g.self_loop[u]; - } - - // Aggregate edges. Internal (intra-community) edges are double-counted - // by the symmetric adjacency, so we accumulate and halve at the end. - // External edges are also double-counted, but symmetrically across - // the two endpoints — which is exactly the shape the new symmetric - // adj needs, so no halving is required there. - let mut internal_accum = vec![0i64; new_n]; - let mut new_adj_maps: Vec> = (0..new_n).map(|_| HashMap::new()).collect(); - - for u in 0..n { - let cu = new_id_of[u]; - for &(v, w) in &g.adj[u] { - let cv = new_id_of[v]; - if cu == cv { - internal_accum[cu] += w; - } else { - *new_adj_maps[cu].entry(cv).or_insert(0) += w; - } - } - } - - for c in 0..new_n { - new_self_loop[c] += internal_accum[c] / 2; - } - - let new_adj: Vec> = new_adj_maps - .into_iter() - .map(|m| { - let mut v: Vec<_> = m.into_iter().collect(); - v.sort_unstable_by_key(|&(nb, _)| nb); - v - }) - .collect(); - - let mut new_deg = vec![0i64; new_n]; - for c in 0..new_n { - let s: i64 = new_adj[c].iter().map(|&(_, w)| w).sum(); - new_deg[c] = s + 2 * new_self_loop[c]; - } - let new_total = new_deg.iter().sum::() / 2; - - debug_assert_eq!(new_total, g.total_weight); - - ( - LouvainGraph { - adj: new_adj, - self_loop: new_self_loop, - deg: new_deg, - total_weight: new_total, - }, - new_id_of, - ) + clusters } /// Build a coarse graph where each cluster is contracted into a single node. @@ -630,7 +356,7 @@ fn louvain_contract(g: &LouvainGraph, community_of: &[usize]) -> (LouvainGraph, /// * `graph` - The full graph containing the original edges. /// * `labels` - Per-node labels for the original graph, indexed by /// `NodeIndex::index()`. -/// * `clusters` - The partition produced by [`louvain_cluster_partition`]. +/// * `clusters` - The partition produced by [`greedy_cluster_partition`]. /// Cluster `i` maps to coarse node `i`. /// /// # Returns From 612512efd2aa649e4aebc678c3d83b78f38f9690 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 10 Apr 2026 23:20:43 -0600 Subject: [PATCH 064/221] add bendl --- ben/src/bin/bendl.rs | 4 + ben/src/cli/bendl.rs | 472 ++++++ ben/src/cli/mod.rs | 1 + ben/src/io/bundle/format.rs | 645 +++++++++ ben/src/io/bundle/manifest.rs | 58 + ben/src/io/bundle/mod.rs | 27 + ben/src/io/bundle/reader.rs | 1046 ++++++++++++++ ben/src/io/bundle/writer.rs | 2009 ++++++++++++++++++++++++++ ben/src/io/mod.rs | 1 + ben/tests/test_cli.rs | 169 ++- pyben/binary_ensemble/__init__.py | 2 + pyben/src/bundle/mod.rs | 241 +++ pyben/src/encode/mod.rs | 303 +++- pyben/src/lib.rs | 2 + pyben/tests/test_bundle.py | 1375 ++++++++++++++++++ pyben/tests/test_python_pipelines.py | 14 +- 16 files changed, 6345 insertions(+), 24 deletions(-) create mode 100644 ben/src/bin/bendl.rs create mode 100644 ben/src/cli/bendl.rs create mode 100644 ben/src/io/bundle/format.rs create mode 100644 ben/src/io/bundle/manifest.rs create mode 100644 ben/src/io/bundle/mod.rs create mode 100644 ben/src/io/bundle/reader.rs create mode 100644 ben/src/io/bundle/writer.rs create mode 100644 pyben/src/bundle/mod.rs create mode 100644 pyben/tests/test_bundle.py diff --git a/ben/src/bin/bendl.rs b/ben/src/bin/bendl.rs new file mode 100644 index 0000000..5033a34 --- /dev/null +++ b/ben/src/bin/bendl.rs @@ -0,0 +1,4 @@ +/// Entry point for the `bendl` CLI binary. +fn main() { + binary_ensemble::cli::bendl::run(); +} diff --git a/ben/src/cli/bendl.rs b/ben/src/cli/bendl.rs new file mode 100644 index 0000000..786aa25 --- /dev/null +++ b/ben/src/cli/bendl.rs @@ -0,0 +1,472 @@ +//! CLI front-end for the `.bendl` bundle container. +//! +//! Exposes four subcommands: +//! +//! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional +//! asset files into a finalized `.bendl` bundle. +//! - `inspect` — print the header and directory of a `.bendl` file. +//! - `extract` — copy the embedded stream region or a named asset out +//! of a bundle to disk. +//! - `append` — add new asset files to an already-finalized bundle +//! without rewriting the stream. + +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, BufWriter, Read, Seek, Write}; +use std::path::{Path, PathBuf}; + +use clap::{Parser, Subcommand}; + +use crate::cli::common::{check_overwrite, set_verbose}; +use crate::io::bundle::format::{ + AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, + ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, +}; +use crate::io::bundle::{ + AddAssetOptions, BendlReader, BendlWriteError, BendlWriter, +}; +use crate::io::bundle::writer::BendlAppender; +use crate::io::reader::subsample::count_samples_from_file; + +/// Parsed form of a `name=path` option such as `--asset myblob=/tmp/x`. +#[derive(Debug, Clone)] +struct NamedAsset { + name: String, + path: PathBuf, +} + +impl std::str::FromStr for NamedAsset { + type Err = String; + fn from_str(s: &str) -> Result { + let (name, path) = s + .split_once('=') + .ok_or_else(|| format!("expected NAME=PATH, got {s:?}"))?; + if name.is_empty() { + return Err("custom asset name must be non-empty".to_string()); + } + Ok(NamedAsset { + name: name.to_string(), + path: PathBuf::from(path), + }) + } +} + +/// `bendl` CLI entry point. +#[derive(Parser, Debug)] +#[command( + name = "bendl", + about = "Create, inspect, extract from, and append to .bendl bundle files.", + version +)] +struct Args { + /// Enable verbose tracing output. + #[arg(short, long, global = true)] + verbose: bool, + + #[command(subcommand)] + command: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// Package a `.ben` or `.xben` stream (plus optional assets) into a `.bendl`. + Create(CreateArgs), + /// Print the header and directory of a `.bendl` file. + Inspect(InspectArgs), + /// Extract the embedded stream or a named asset to a file. + Extract(ExtractArgs), + /// Append new assets to an already-finalized `.bendl` bundle. + Append(AppendArgs), +} + +#[derive(Parser, Debug)] +struct CreateArgs { + /// Path to the `.ben` or `.xben` assignment stream to embed. + /// File extension chooses the container format. + #[arg(short = 'i', long)] + input: PathBuf, + /// Destination `.bendl` path. + #[arg(short = 'o', long)] + output: PathBuf, + /// Optional `graph.json` asset path. Will be stored under the + /// canonical name `graph.json` and xz-compressed by default. + #[arg(long)] + graph: Option, + /// Optional `metadata.json` asset path. Stored under canonical name. + #[arg(long)] + metadata: Option, + /// Optional `relabel_map.json` asset path. Stored under canonical name. + #[arg(long)] + relabel_map: Option, + /// Additional custom assets, specified as `NAME=PATH`. May be repeated. + #[arg(long = "asset")] + assets: Vec, + /// Overwrite the output file if it already exists. + #[arg(short = 'w', long)] + overwrite: bool, + /// Store `graph.json` raw instead of compressing it. + #[arg(long)] + graph_raw: bool, +} + +#[derive(Parser, Debug)] +struct InspectArgs { + /// `.bendl` file to inspect. + input: PathBuf, +} + +#[derive(Parser, Debug)] +struct ExtractArgs { + /// `.bendl` file to extract from. + input: PathBuf, + /// Output file path for the extracted bytes. + #[arg(short = 'o', long)] + output: PathBuf, + /// Extract the embedded assignment stream region verbatim. Mutually + /// exclusive with `--asset`. + #[arg(long, conflicts_with = "asset")] + stream: bool, + /// Name of the asset to extract (e.g. `graph.json`). If the asset is + /// xz-compressed, the extracted file contains the decompressed bytes. + #[arg(long)] + asset: Option, + /// Overwrite the output file if it already exists. + #[arg(short = 'w', long)] + overwrite: bool, +} + +#[derive(Parser, Debug)] +struct AppendArgs { + /// `.bendl` file to append to. Must be finalized (`complete == 1`). + input: PathBuf, + /// Optional `graph.json` asset path to add. + #[arg(long)] + graph: Option, + /// Optional `metadata.json` asset path to add. + #[arg(long)] + metadata: Option, + /// Optional `relabel_map.json` asset path to add. + #[arg(long)] + relabel_map: Option, + /// Additional custom assets, specified as `NAME=PATH`. May be repeated. + #[arg(long = "asset")] + assets: Vec, + /// Store `graph.json` raw instead of compressing it. + #[arg(long)] + graph_raw: bool, +} + +/// Parse CLI arguments and execute the selected subcommand. +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + let result = match args.command { + Command::Create(a) => run_create(a), + Command::Inspect(a) => run_inspect(a), + Command::Extract(a) => run_extract(a), + Command::Append(a) => run_append(a), + }; + + if let Err(err) = result { + eprintln!("Error: {err}"); + std::process::exit(1); + } +} + +/// Detect the container format of `path` from its extension. +fn format_from_path(path: &Path) -> Result { + match path.extension().and_then(|e| e.to_str()) { + Some("ben") => Ok(AssignmentFormat::Ben), + Some("xben") => Ok(AssignmentFormat::Xben), + other => Err(format!( + "unable to determine assignment format from extension {other:?}; \ + expected .ben or .xben" + )), + } +} + +/// `mode` argument expected by `count_samples_from_file`. +fn mode_str(format: AssignmentFormat) -> &'static str { + match format { + AssignmentFormat::Ben => "ben", + AssignmentFormat::Xben => "xben", + } +} + +fn run_create(args: CreateArgs) -> Result<(), String> { + let format = format_from_path(&args.input)?; + check_overwrite( + args.output.to_str().ok_or("non-utf8 output path")?, + args.overwrite, + ) + .map_err(|e| format!("{e}"))?; + + // Count samples up front so we can patch the header at finalize time. + // This pre-scan is O(stream size); the second pass streams bytes directly. + let sample_count: i64 = count_samples_from_file(&args.input, mode_str(format)) + .map_err(|e| format!("failed to count samples in {:?}: {e}", args.input))? + as i64; + + let out_file = File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?; + let mut writer = BendlWriter::new(out_file, format) + .map_err(|e| format!("failed to initialize bundle writer: {e}"))?; + + // Add singleton assets first, in canonical order. + if let Some(ref path) = args.metadata { + add_file_asset( + &mut writer, + ASSET_TYPE_METADATA, + "metadata.json", + path, + AddAssetOptions::defaults().json(), + )?; + } + if let Some(ref path) = args.graph { + let opts = if args.graph_raw { + AddAssetOptions::defaults().json().raw() + } else { + AddAssetOptions::defaults().json() + }; + add_file_asset(&mut writer, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + } + if let Some(ref path) = args.relabel_map { + add_file_asset( + &mut writer, + ASSET_TYPE_RELABEL_MAP, + "relabel_map.json", + path, + AddAssetOptions::defaults().json(), + )?; + } + for NamedAsset { name, path } in &args.assets { + add_file_asset( + &mut writer, + ASSET_TYPE_CUSTOM, + name, + path, + AddAssetOptions::defaults(), + )?; + } + + // Stream phase: copy bytes from the input file directly into the + // bundle's stream region. This preserves the exact BEN/XBEN bytes. + { + let mut handle = writer + .begin_stream() + .map_err(|e| format!("failed to open stream region: {e}"))?; + let mut input = BufReader::new( + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?, + ); + io::copy(&mut input, &mut handle) + .map_err(|e| format!("failed to copy assignment stream: {e}"))?; + handle + .finish(sample_count) + .map_err(|e| format!("failed to close stream region: {e}"))?; + } + + writer + .finish() + .map_err(|e| format!("failed to finalize bundle: {e}"))?; + + eprintln!( + "Wrote {:?} ({} samples, format = {:?})", + args.output, sample_count, format + ); + Ok(()) +} + +fn add_file_asset( + writer: &mut BendlWriter, + asset_type: u16, + name: &str, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + writer + .add_asset(asset_type, name, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} + +fn run_inspect(args: InspectArgs) -> Result<(), String> { + let file = File::open(&args.input) + .map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let reader = BendlReader::open(BufReader::new(file)) + .map_err(|e| format!("failed to parse bundle header: {e}"))?; + + let header = reader.header(); + println!("file: {}", args.input.display()); + println!( + "version: {}.{}", + header.major_version, header.minor_version + ); + println!("complete: {}", reader.is_complete()); + println!( + "assignment_format: {}", + match reader.assignment_format() { + Some(AssignmentFormat::Ben) => "ben", + Some(AssignmentFormat::Xben) => "xben", + None => "unknown", + } + ); + println!( + "sample_count: {}", + match reader.sample_count() { + Some(n) => n.to_string(), + None => "".to_string(), + } + ); + println!( + "stream: offset={} len={}", + header.stream_offset, header.stream_len + ); + println!( + "directory: offset={} len={}", + header.directory_offset, header.directory_len + ); + + let entries = reader.assets(); + println!("assets: {} entries", entries.len()); + for entry in entries { + let mut flag_parts: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flag_parts.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flag_parts.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flag_parts.push("checksum"); + } + let flag_str = if flag_parts.is_empty() { + "-".to_string() + } else { + flag_parts.join(",") + }; + println!( + " type={:<4} name={:<24} offset={:<10} len={:<10} flags={}", + entry.asset_type, entry.name, entry.payload_offset, entry.payload_len, flag_str + ); + } + + Ok(()) +} + +fn run_extract(args: ExtractArgs) -> Result<(), String> { + if !args.stream && args.asset.is_none() { + return Err("extract requires either --stream or --asset ".to_string()); + } + check_overwrite( + args.output.to_str().ok_or("non-utf8 output path")?, + args.overwrite, + ) + .map_err(|e| format!("{e}"))?; + + let file = File::open(&args.input) + .map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let mut reader = BendlReader::open(BufReader::new(file)) + .map_err(|e| format!("failed to parse bundle header: {e}"))?; + + let mut out = BufWriter::new( + File::create(&args.output).map_err(|e| format!("failed to create {:?}: {e}", args.output))?, + ); + + if args.stream { + let mut stream = reader + .assignment_stream_reader() + .map_err(|e| format!("failed to open stream region: {e}"))?; + io::copy(&mut stream, &mut out) + .map_err(|e| format!("failed to copy stream bytes: {e}"))?; + } else if let Some(name) = args.asset.as_deref() { + let entry = reader + .find_asset_by_name(name) + .cloned() + .ok_or_else(|| format!("no asset named {name:?} in bundle"))?; + let mut asset = reader + .asset_reader(&entry) + .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; + io::copy(&mut asset, &mut out) + .map_err(|e| format!("failed to copy asset bytes: {e}"))?; + } + + out.flush().map_err(|e| format!("flush failed: {e}"))?; + Ok(()) +} + +fn run_append(args: AppendArgs) -> Result<(), String> { + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&args.input) + .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; + let mut appender = BendlAppender::open(file) + .map_err(|e| format!("failed to open appender: {e}"))?; + + let mut added = 0usize; + if let Some(ref path) = args.metadata { + append_file_asset( + &mut appender, + ASSET_TYPE_METADATA, + "metadata.json", + path, + AddAssetOptions::defaults().json(), + )?; + added += 1; + } + if let Some(ref path) = args.graph { + let opts = if args.graph_raw { + AddAssetOptions::defaults().json().raw() + } else { + AddAssetOptions::defaults().json() + }; + append_file_asset(&mut appender, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + added += 1; + } + if let Some(ref path) = args.relabel_map { + append_file_asset( + &mut appender, + ASSET_TYPE_RELABEL_MAP, + "relabel_map.json", + path, + AddAssetOptions::defaults().json(), + )?; + added += 1; + } + for NamedAsset { name, path } in &args.assets { + append_file_asset( + &mut appender, + ASSET_TYPE_CUSTOM, + name, + path, + AddAssetOptions::defaults(), + )?; + added += 1; + } + + if added == 0 { + // Nothing to do; leave the file untouched. + appender.abort(); + eprintln!("No assets specified; bundle is unchanged."); + return Ok(()); + } + + appender + .commit() + .map_err(|e| format!("failed to commit append: {e}"))?; + eprintln!("Appended {added} asset(s) to {:?}", args.input); + Ok(()) +} + +fn append_file_asset( + appender: &mut BendlAppender, + asset_type: u16, + name: &str, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + appender + .add_asset(asset_type, name, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} diff --git a/ben/src/cli/mod.rs b/ben/src/cli/mod.rs index 568f3ad..9b0161b 100644 --- a/ben/src/cli/mod.rs +++ b/ben/src/cli/mod.rs @@ -1,6 +1,7 @@ //! Library-backed CLI implementations used by the `src/bin` entrypoints. pub mod ben; +pub mod bendl; pub mod common; pub mod pben; pub mod reben; diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs new file mode 100644 index 0000000..c49255c --- /dev/null +++ b/ben/src/io/bundle/format.rs @@ -0,0 +1,645 @@ +//! Binary header and directory definitions for the `.bendl` container. +//! +//! This module is the pure format layer: it defines the on-disk byte +//! layout, the associated constants, and the encode/decode helpers that +//! convert between in-memory Rust structs and their on-disk representation. +//! There is no I/O orchestration here — higher layers (`reader`, `writer`) +//! combine these primitives with seekable files. +//! +//! All multi-byte integers in the `.bendl` format are little-endian. + +use std::io::{self, Read, Write}; + +use thiserror::Error; + +// --------------------------------------------------------------------------- +// Magic, version, and header layout +// --------------------------------------------------------------------------- + +/// Magic bytes at offset 0 of every `.bendl` file. +pub const BENDL_MAGIC: [u8; 8] = *b"BENDL\0\0\x01"; + +/// Current major version produced by this implementation. +pub const BENDL_MAJOR_VERSION: u16 = 1; +/// Current minor version produced by this implementation. +pub const BENDL_MINOR_VERSION: u16 = 0; + +/// Size of the fixed header in bytes. +pub const HEADER_SIZE: usize = 64; + +/// `complete` flag value for incomplete (unfinalized) bundles. +pub const COMPLETE_NO: u8 = 0; +/// `complete` flag value for finalized bundles. +pub const COMPLETE_YES: u8 = 1; + +// --------------------------------------------------------------------------- +// Assignment format identifiers +// --------------------------------------------------------------------------- + +/// Assignment format identifier: embedded BEN stream. +pub const ASSIGNMENT_FORMAT_BEN: u8 = 1; +/// Assignment format identifier: embedded XBEN stream. +pub const ASSIGNMENT_FORMAT_XBEN: u8 = 2; + +/// Container format of the embedded assignment stream. +/// +/// The BEN *variant* (`Standard`, `MkvChain`, `TwoDelta`) is carried by +/// the 17-byte banner at the start of the embedded stream and is not +/// duplicated in the bundle header. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AssignmentFormat { + /// Uncompressed BEN byte stream. + Ben, + /// XBEN byte stream (xz-compressed BEN). + Xben, +} + +impl AssignmentFormat { + /// Raw wire encoding of this format. + pub fn to_u8(self) -> u8 { + match self { + AssignmentFormat::Ben => ASSIGNMENT_FORMAT_BEN, + AssignmentFormat::Xben => ASSIGNMENT_FORMAT_XBEN, + } + } + + /// Parse a raw byte into an `AssignmentFormat`. + pub fn from_u8(raw: u8) -> Option { + match raw { + ASSIGNMENT_FORMAT_BEN => Some(AssignmentFormat::Ben), + ASSIGNMENT_FORMAT_XBEN => Some(AssignmentFormat::Xben), + _ => None, + } + } +} + +// --------------------------------------------------------------------------- +// Asset types, flags, canonical names +// --------------------------------------------------------------------------- + +/// Asset type id for `metadata.json`. +pub const ASSET_TYPE_METADATA: u16 = 1; +/// Asset type id for `graph.json`. +pub const ASSET_TYPE_GRAPH: u16 = 2; +/// Asset type id for `relabel_map.json`. +pub const ASSET_TYPE_RELABEL_MAP: u16 = 3; +/// Asset type id for a custom user asset (name chosen by writer). +pub const ASSET_TYPE_CUSTOM: u16 = 4; + +/// Canonical name for the `metadata.json` asset. +pub const CANONICAL_NAME_METADATA: &str = "metadata.json"; +/// Canonical name for the `graph.json` asset. +pub const CANONICAL_NAME_GRAPH: &str = "graph.json"; +/// Canonical name for the `relabel_map.json` asset. +pub const CANONICAL_NAME_RELABEL_MAP: &str = "relabel_map.json"; + +/// Return the canonical name reserved for a known singleton asset type, +/// or `None` for custom or unknown types. +pub fn canonical_name_for(asset_type: u16) -> Option<&'static str> { + match asset_type { + ASSET_TYPE_METADATA => Some(CANONICAL_NAME_METADATA), + ASSET_TYPE_GRAPH => Some(CANONICAL_NAME_GRAPH), + ASSET_TYPE_RELABEL_MAP => Some(CANONICAL_NAME_RELABEL_MAP), + _ => None, + } +} + +/// Return whether a given asset type should default to xz compression +/// when the writer is not given an explicit compression option. +pub fn default_compresses_by_type(asset_type: u16) -> bool { + matches!(asset_type, ASSET_TYPE_GRAPH) +} + +/// Asset flag bit: the decoded payload is UTF-8 JSON. +pub const ASSET_FLAG_JSON: u16 = 1 << 0; +/// Asset flag bit: the stored payload is xz-compressed. The `payload_len` +/// directory field refers to the compressed size on disk. +pub const ASSET_FLAG_XZ: u16 = 1 << 1; +/// Asset flag bit: the entry carries a trailing checksum. +pub const ASSET_FLAG_CHECKSUM: u16 = 1 << 2; + +/// Default xz preset level used when compressing asset payloads. +/// +/// Level 6 matches the `xz` CLI's own default and `xz2::XzEncoder::new`'s +/// default, and is a reasonable ratio/speed balance for JSON payloads. +pub const DEFAULT_XZ_PRESET: u32 = 6; + +// --------------------------------------------------------------------------- +// Header +// --------------------------------------------------------------------------- + +/// In-memory representation of the fixed 64-byte `.bendl` header. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct BendlHeader { + /// Magic bytes; should equal [`BENDL_MAGIC`]. + pub magic: [u8; 8], + /// Incompatible-change version. + pub major_version: u16, + /// Additive backward-compatible version. + pub minor_version: u16, + /// `1` if the bundle was successfully finalized, else `0`. + pub complete: u8, + /// Container format of the embedded assignment stream. + pub assignment_format: u8, + /// Padding after `assignment_format`; writers set to zero, readers ignore. + pub reserved_0: u16, + /// Bundle-level feature flags. + pub flags: u64, + /// Absolute byte offset of the directory table, or `0` if no directory + /// has been written yet. In a finalized bundle the directory lives at + /// the end of the file. + pub directory_offset: u64, + /// Byte length of the directory table, or `0` if absent. + pub directory_len: u64, + /// Byte offset where the assignment stream begins. + pub stream_offset: u64, + /// Byte length of the assignment stream, or `0` if unfinalized. + pub stream_len: u64, + /// Number of expanded samples in the assignment stream, or `-1` if + /// unfinalized. + pub sample_count: i64, +} + +impl BendlHeader { + /// Build a provisional header used before any data has been written. + pub fn provisional(assignment_format: AssignmentFormat, stream_offset: u64) -> Self { + BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: assignment_format.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset, + stream_len: 0, + sample_count: -1, + } + } + + /// Whether the bundle has been finalized. + pub fn is_complete(&self) -> bool { + self.complete == COMPLETE_YES + } + + /// Typed view of the embedded assignment format. + pub fn assignment_format_typed(&self) -> Option { + AssignmentFormat::from_u8(self.assignment_format) + } + + /// Serialize the header into its fixed-size on-disk byte representation. + pub fn to_bytes(&self) -> [u8; HEADER_SIZE] { + let mut out = [0u8; HEADER_SIZE]; + out[0..8].copy_from_slice(&self.magic); + out[8..10].copy_from_slice(&self.major_version.to_le_bytes()); + out[10..12].copy_from_slice(&self.minor_version.to_le_bytes()); + out[12] = self.complete; + out[13] = self.assignment_format; + out[14..16].copy_from_slice(&self.reserved_0.to_le_bytes()); + out[16..24].copy_from_slice(&self.flags.to_le_bytes()); + out[24..32].copy_from_slice(&self.directory_offset.to_le_bytes()); + out[32..40].copy_from_slice(&self.directory_len.to_le_bytes()); + out[40..48].copy_from_slice(&self.stream_offset.to_le_bytes()); + out[48..56].copy_from_slice(&self.stream_len.to_le_bytes()); + out[56..64].copy_from_slice(&self.sample_count.to_le_bytes()); + out + } + + /// Parse a fixed 64-byte header from its on-disk byte representation. + pub fn from_bytes(bytes: &[u8; HEADER_SIZE]) -> Result { + let mut magic = [0u8; 8]; + magic.copy_from_slice(&bytes[0..8]); + if magic != BENDL_MAGIC { + return Err(BendlFormatError::InvalidMagic(magic)); + } + + let major_version = u16::from_le_bytes(bytes[8..10].try_into().unwrap()); + let minor_version = u16::from_le_bytes(bytes[10..12].try_into().unwrap()); + if major_version != BENDL_MAJOR_VERSION { + return Err(BendlFormatError::UnsupportedMajorVersion { + found: major_version, + supported: BENDL_MAJOR_VERSION, + }); + } + + Ok(BendlHeader { + magic, + major_version, + minor_version, + complete: bytes[12], + assignment_format: bytes[13], + reserved_0: u16::from_le_bytes(bytes[14..16].try_into().unwrap()), + flags: u64::from_le_bytes(bytes[16..24].try_into().unwrap()), + directory_offset: u64::from_le_bytes(bytes[24..32].try_into().unwrap()), + directory_len: u64::from_le_bytes(bytes[32..40].try_into().unwrap()), + stream_offset: u64::from_le_bytes(bytes[40..48].try_into().unwrap()), + stream_len: u64::from_le_bytes(bytes[48..56].try_into().unwrap()), + sample_count: i64::from_le_bytes(bytes[56..64].try_into().unwrap()), + }) + } + + /// Read and parse a fixed header from a `Read` source. + pub fn read_from(reader: &mut R) -> Result { + let mut buf = [0u8; HEADER_SIZE]; + reader.read_exact(&mut buf)?; + Self::from_bytes(&buf) + } + + /// Write the header to a `Write` sink. + pub fn write_to(&self, writer: &mut W) -> io::Result<()> { + writer.write_all(&self.to_bytes()) + } +} + +// --------------------------------------------------------------------------- +// Directory entry +// --------------------------------------------------------------------------- + +/// Fixed-size header at the start of every directory entry, before the +/// variable-length `name` and optional `checksum` bytes. +pub const DIRECTORY_ENTRY_HEADER_SIZE: usize = 28; + +/// In-memory representation of a single directory entry. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BendlDirectoryEntry { + /// Identifies the meaning of the payload (see `ASSET_TYPE_*`). + pub asset_type: u16, + /// Encoding/compression flags for this asset. + pub asset_flags: u16, + /// UTF-8 asset name. Must match the canonical name for singleton types. + pub name: String, + /// Absolute file offset of the asset payload. + pub payload_offset: u64, + /// Byte length of the asset payload as stored on disk (post-compression + /// when the xz flag is set). + pub payload_len: u64, + /// Optional trailing checksum bytes. Interpretation depends on flags. + pub checksum: Option>, +} + +impl BendlDirectoryEntry { + /// Total on-disk size of this entry, including name and checksum. + pub fn encoded_len(&self) -> usize { + DIRECTORY_ENTRY_HEADER_SIZE + + self.name.len() + + self.checksum.as_ref().map(|c| c.len()).unwrap_or(0) + } + + /// Serialize the entry into a byte vector. + pub fn to_bytes(&self) -> Result, BendlFormatError> { + let name_bytes = self.name.as_bytes(); + let name_len: u16 = name_bytes.len().try_into().map_err(|_| { + BendlFormatError::NameTooLong { + length: name_bytes.len(), + } + })?; + let checksum_bytes = self.checksum.as_deref().unwrap_or(&[]); + let checksum_len: u32 = + checksum_bytes + .len() + .try_into() + .map_err(|_| BendlFormatError::ChecksumTooLong { + length: checksum_bytes.len(), + })?; + + let mut out = Vec::with_capacity(self.encoded_len()); + out.extend_from_slice(&self.asset_type.to_le_bytes()); + out.extend_from_slice(&self.asset_flags.to_le_bytes()); + out.extend_from_slice(&name_len.to_le_bytes()); + out.extend_from_slice(&0u16.to_le_bytes()); // reserved + out.extend_from_slice(&self.payload_offset.to_le_bytes()); + out.extend_from_slice(&self.payload_len.to_le_bytes()); + out.extend_from_slice(&checksum_len.to_le_bytes()); + out.extend_from_slice(name_bytes); + out.extend_from_slice(checksum_bytes); + Ok(out) + } + + /// Read one directory entry from a `Read` source. + pub fn read_from(reader: &mut R) -> Result { + let mut header = [0u8; DIRECTORY_ENTRY_HEADER_SIZE]; + reader.read_exact(&mut header)?; + + let asset_type = u16::from_le_bytes(header[0..2].try_into().unwrap()); + let asset_flags = u16::from_le_bytes(header[2..4].try_into().unwrap()); + let name_len = u16::from_le_bytes(header[4..6].try_into().unwrap()) as usize; + // header[6..8] reserved; ignored + let payload_offset = u64::from_le_bytes(header[8..16].try_into().unwrap()); + let payload_len = u64::from_le_bytes(header[16..24].try_into().unwrap()); + let checksum_len = u32::from_le_bytes(header[24..28].try_into().unwrap()) as usize; + + let mut name_buf = vec![0u8; name_len]; + reader.read_exact(&mut name_buf)?; + let name = String::from_utf8(name_buf).map_err(|_| BendlFormatError::NameNotUtf8)?; + + let checksum = if checksum_len == 0 { + None + } else { + let mut buf = vec![0u8; checksum_len]; + reader.read_exact(&mut buf)?; + Some(buf) + }; + + Ok(BendlDirectoryEntry { + asset_type, + asset_flags, + name, + payload_offset, + payload_len, + checksum, + }) + } +} + +// --------------------------------------------------------------------------- +// Directory table +// --------------------------------------------------------------------------- + +/// Read the full directory table from a `Read` source. +/// +/// The source should be positioned at the first byte of the directory +/// table (i.e. at `header.directory_offset`) and is expected to contain +/// exactly `entry_count` entries followed by no trailing bytes within the +/// directory region. +pub fn read_directory( + reader: &mut R, +) -> Result, BendlFormatError> { + let mut count_buf = [0u8; 4]; + reader.read_exact(&mut count_buf)?; + let entry_count = u32::from_le_bytes(count_buf) as usize; + + let mut entries = Vec::with_capacity(entry_count); + for _ in 0..entry_count { + entries.push(BendlDirectoryEntry::read_from(reader)?); + } + Ok(entries) +} + +/// Serialize a directory table into a byte vector. +pub fn encode_directory(entries: &[BendlDirectoryEntry]) -> Result, BendlFormatError> { + let entry_count: u32 = entries + .len() + .try_into() + .map_err(|_| BendlFormatError::TooManyEntries { + length: entries.len(), + })?; + + let body_len: usize = entries.iter().map(|e| e.encoded_len()).sum(); + let mut out = Vec::with_capacity(4 + body_len); + out.extend_from_slice(&entry_count.to_le_bytes()); + for entry in entries { + out.extend_from_slice(&entry.to_bytes()?); + } + Ok(out) +} + +// --------------------------------------------------------------------------- +// Errors +// --------------------------------------------------------------------------- + +/// Errors produced by the `.bendl` format layer. +#[derive(Debug, Error)] +pub enum BendlFormatError { + /// The file's leading magic bytes did not match [`BENDL_MAGIC`]. + #[error("invalid bendl magic: {0:02X?}")] + InvalidMagic([u8; 8]), + + /// The file's major version is not supported by this implementation. + #[error("unsupported bendl major version {found}: this implementation supports {supported}")] + UnsupportedMajorVersion { + /// Version actually found in the file. + found: u16, + /// Maximum major version this implementation can handle. + supported: u16, + }, + + /// A directory entry's name exceeded the `u16` length limit. + #[error("directory entry name is {length} bytes which exceeds the u16 length limit")] + NameTooLong { + /// The offending length in bytes. + length: usize, + }, + + /// A directory entry's checksum exceeded the `u32` length limit. + #[error("directory entry checksum is {length} bytes which exceeds the u32 length limit")] + ChecksumTooLong { + /// The offending length in bytes. + length: usize, + }, + + /// A directory table exceeded the `u32` entry count limit. + #[error("directory has {length} entries which exceeds the u32 entry count limit")] + TooManyEntries { + /// The offending entry count. + length: usize, + }, + + /// A directory entry name was not valid UTF-8. + #[error("directory entry name is not valid UTF-8")] + NameNotUtf8, + + /// An I/O error occurred while reading or writing the format layer. + #[error("IO error: {0}")] + Io(#[from] io::Error), +} + +impl From for io::Error { + fn from(err: BendlFormatError) -> Self { + match err { + BendlFormatError::Io(e) => e, + other => io::Error::new(io::ErrorKind::InvalidData, other.to_string()), + } + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn magic_is_eight_bytes_and_matches_spec() { + assert_eq!(BENDL_MAGIC.len(), 8); + assert_eq!(&BENDL_MAGIC[..5], b"BENDL"); + } + + #[test] + fn canonical_name_lookup() { + assert_eq!(canonical_name_for(ASSET_TYPE_METADATA), Some("metadata.json")); + assert_eq!(canonical_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); + assert_eq!( + canonical_name_for(ASSET_TYPE_RELABEL_MAP), + Some("relabel_map.json") + ); + assert_eq!(canonical_name_for(ASSET_TYPE_CUSTOM), None); + assert_eq!(canonical_name_for(9999), None); + } + + #[test] + fn default_compression_policy() { + assert!(default_compresses_by_type(ASSET_TYPE_GRAPH)); + assert!(!default_compresses_by_type(ASSET_TYPE_METADATA)); + assert!(!default_compresses_by_type(ASSET_TYPE_RELABEL_MAP)); + assert!(!default_compresses_by_type(ASSET_TYPE_CUSTOM)); + } + + #[test] + fn assignment_format_roundtrip() { + for fmt in [AssignmentFormat::Ben, AssignmentFormat::Xben] { + assert_eq!(AssignmentFormat::from_u8(fmt.to_u8()), Some(fmt)); + } + assert_eq!(AssignmentFormat::from_u8(0), None); + assert_eq!(AssignmentFormat::from_u8(255), None); + } + + #[test] + fn header_is_exactly_64_bytes() { + let header = BendlHeader::provisional(AssignmentFormat::Ben, 64); + assert_eq!(header.to_bytes().len(), HEADER_SIZE); + assert_eq!(HEADER_SIZE, 64); + } + + #[test] + fn header_round_trip_provisional() { + let header = BendlHeader::provisional(AssignmentFormat::Xben, 64); + let decoded = BendlHeader::from_bytes(&header.to_bytes()).unwrap(); + assert_eq!(header, decoded); + assert!(!decoded.is_complete()); + assert_eq!(decoded.assignment_format_typed(), Some(AssignmentFormat::Xben)); + assert_eq!(decoded.sample_count, -1); + assert_eq!(decoded.stream_len, 0); + assert_eq!(decoded.directory_offset, 0); + } + + #[test] + fn header_round_trip_finalized() { + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: ASSIGNMENT_FORMAT_BEN, + reserved_0: 0, + flags: 0x0000_0000_0000_000F, + directory_offset: 1_000_000, + directory_len: 256, + stream_offset: 64, + stream_len: 999_936, + sample_count: 4242, + }; + let bytes = header.to_bytes(); + let decoded = BendlHeader::from_bytes(&bytes).unwrap(); + assert_eq!(decoded, header); + assert!(decoded.is_complete()); + } + + #[test] + fn header_rejects_invalid_magic() { + let mut header = BendlHeader::provisional(AssignmentFormat::Ben, 64); + header.magic = *b"NOTABEND"; + let err = BendlHeader::from_bytes(&header.to_bytes()).unwrap_err(); + assert!(matches!(err, BendlFormatError::InvalidMagic(_))); + } + + #[test] + fn header_rejects_unsupported_major_version() { + let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); + bytes[8..10].copy_from_slice(&999u16.to_le_bytes()); + let err = BendlHeader::from_bytes(&bytes).unwrap_err(); + assert!(matches!( + err, + BendlFormatError::UnsupportedMajorVersion { found: 999, .. } + )); + } + + #[test] + fn directory_entry_round_trip_no_checksum() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset: 128, + payload_len: 4096, + checksum: None, + }; + let bytes = entry.to_bytes().unwrap(); + assert_eq!(bytes.len(), entry.encoded_len()); + let mut cursor = &bytes[..]; + let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); + assert_eq!(decoded, entry); + } + + #[test] + fn directory_entry_round_trip_with_checksum() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_CHECKSUM, + name: "custom_blob".to_string(), + payload_offset: 2048, + payload_len: 512, + checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]), + }; + let bytes = entry.to_bytes().unwrap(); + let mut cursor = &bytes[..]; + let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); + assert_eq!(decoded, entry); + assert_eq!(decoded.checksum.unwrap(), vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]); + } + + #[test] + fn directory_table_round_trip() { + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset: 64, + payload_len: 2048, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: CANONICAL_NAME_METADATA.to_string(), + payload_offset: 2112, + payload_len: 128, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "provenance.bin".to_string(), + payload_offset: 2240, + payload_len: 32, + checksum: None, + }, + ]; + + let encoded = encode_directory(&entries).unwrap(); + let mut cursor = &encoded[..]; + let decoded = read_directory(&mut cursor).unwrap(); + assert_eq!(decoded, entries); + } + + #[test] + fn empty_directory_table_round_trip() { + let encoded = encode_directory(&[]).unwrap(); + assert_eq!(encoded, vec![0, 0, 0, 0]); + let mut cursor = &encoded[..]; + let decoded = read_directory(&mut cursor).unwrap(); + assert!(decoded.is_empty()); + } + + #[test] + fn header_and_directory_entry_header_sizes_are_stable() { + // These sizes are baked into the on-disk format; regressing them + // would silently break existing bundles. + assert_eq!(HEADER_SIZE, 64); + assert_eq!(DIRECTORY_ENTRY_HEADER_SIZE, 28); + } +} diff --git a/ben/src/io/bundle/manifest.rs b/ben/src/io/bundle/manifest.rs new file mode 100644 index 0000000..86851ca --- /dev/null +++ b/ben/src/io/bundle/manifest.rs @@ -0,0 +1,58 @@ +//! JSON metadata structs for the optional `metadata.json` asset. +//! +//! The authoritative values for `major_version`, `minor_version`, +//! `assignment_format`, `complete`, and the BEN variant all live in the +//! fixed bundle header (or in the embedded stream banner for the variant). +//! The `metadata.json` asset is a best-effort human-readable mirror +//! intended for debugging and tooling; writers should prefer reading the +//! header directly rather than trusting fields in this struct. + +use serde::{Deserialize, Serialize}; + +/// Serde representation of the optional `metadata.json` asset. +/// +/// Field names mirror the header where possible so that the JSON is +/// easy to cross-reference against the binary layout. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct BendlManifest { + /// Incompatible-change version of the bundle format. + pub major_version: u16, + /// Additive version of the bundle format. + pub minor_version: u16, + /// Container format of the embedded assignment stream + /// (`"ben"` or `"xben"`). + pub assignment_format: String, + /// BEN variant (`"standard"`, `"mkv_chain"`, or `"two_delta"`) as + /// carried by the embedded stream's 17-byte banner. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub variant: Option, + /// Whether the bundle was finalized successfully. + pub complete: bool, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn manifest_json_round_trip() { + let manifest = BendlManifest { + major_version: 1, + minor_version: 0, + assignment_format: "xben".to_string(), + variant: Some("mkv_chain".to_string()), + complete: false, + }; + let json = serde_json::to_string(&manifest).unwrap(); + let decoded: BendlManifest = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded, manifest); + } + + #[test] + fn manifest_accepts_missing_variant() { + let json = r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; + let decoded: BendlManifest = serde_json::from_str(json).unwrap(); + assert_eq!(decoded.variant, None); + assert!(decoded.complete); + } +} diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs new file mode 100644 index 0000000..369a169 --- /dev/null +++ b/ben/src/io/bundle/mod.rs @@ -0,0 +1,27 @@ +//! `.bendl` single-file dataset container. +//! +//! A `.bendl` file is a seekable container that wraps the existing BEN or +//! XBEN assignment stream together with optional front-loaded assets such +//! as a graph JSON, a relabel map, or a metadata blob. The directory table +//! that describes those assets lives at the end of the file so that new +//! assets can be appended to a finalized bundle in O(new asset size + +//! directory size) without rewriting the assignment stream. +//! +//! The module is organised as: +//! +//! - [`format`] — binary header and directory entry types, constants, and +//! encode/decode helpers. Pure functions over byte buffers; no I/O. +//! - [`manifest`] — serde structs for the optional `metadata.json` asset. + +pub mod format; +pub mod manifest; +pub mod reader; +pub mod writer; + +pub use reader::{ + BendlReader, BundleAssignmentReader, BundleAssignmentReaderError, BundleValidationError, +}; +pub use writer::{ + AddAssetOptions, BendlStreamHandle, BendlWriteError, BendlWriter, BundleAssignmentSink, + BundleAssignmentStreamCtx, +}; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs new file mode 100644 index 0000000..f6eafa8 --- /dev/null +++ b/ben/src/io/bundle/reader.rs @@ -0,0 +1,1046 @@ +//! Read-only inspection of `.bendl` files. +//! +//! A [`BendlReader`] parses a bundle's fixed header and (if present) its +//! trailing directory table. It does not read any asset payload bytes +//! until the caller explicitly requests them via [`BendlReader::asset_bytes`] +//! or [`BendlReader::asset_reader`]. The assignment stream region is +//! likewise exposed as a byte range the caller can plumb into the +//! existing `AssignmentReader` / `XZAssignmentReader` without this module +//! reinterpreting any BEN/XBEN internals. + +use std::io::{self, Read, Seek, SeekFrom, Take}; + +use xz2::read::XzDecoder; + +use super::format::{ + canonical_name_for, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, + BendlHeader, ASSET_FLAG_XZ, +}; + +/// Reader for a single `.bendl` file. +pub struct BendlReader { + inner: R, + header: BendlHeader, + directory: Vec, +} + +impl BendlReader { + /// Open a `.bendl` file by validating its header and loading the + /// directory table if one exists. + /// + /// The underlying reader is left at an unspecified position; callers + /// should seek explicitly before reading asset or stream bytes. + pub fn open(mut inner: R) -> Result { + inner.seek(SeekFrom::Start(0))?; + let header = BendlHeader::read_from(&mut inner)?; + + let directory = if header.directory_offset != 0 && header.directory_len != 0 { + inner.seek(SeekFrom::Start(header.directory_offset))?; + read_directory(&mut inner)? + } else { + Vec::new() + }; + + Ok(BendlReader { + inner, + header, + directory, + }) + } + + /// The parsed fixed header. + pub fn header(&self) -> &BendlHeader { + &self.header + } + + /// Whether the bundle was successfully finalized. + pub fn is_complete(&self) -> bool { + self.header.is_complete() + } + + /// The sample count recorded in the header, or `None` if not + /// authoritative (i.e. the bundle is still incomplete). + pub fn sample_count(&self) -> Option { + if self.header.is_complete() { + Some(self.header.sample_count) + } else { + None + } + } + + /// The container format of the embedded assignment stream. + pub fn assignment_format(&self) -> Option { + self.header.assignment_format_typed() + } + + /// All directory entries in the order they appear in the directory. + pub fn assets(&self) -> &[BendlDirectoryEntry] { + &self.directory + } + + /// Look up a directory entry by canonical or custom name. + pub fn find_asset_by_name(&self, name: &str) -> Option<&BendlDirectoryEntry> { + self.directory.iter().find(|e| e.name == name) + } + + /// Look up the unique directory entry with the given asset type, if + /// any. Singleton types (`metadata.json`, `graph.json`, + /// `relabel_map.json`) use this to grab their payload without caring + /// about the canonical name. + pub fn find_asset_by_type(&self, asset_type: u16) -> Option<&BendlDirectoryEntry> { + self.directory.iter().find(|e| e.asset_type == asset_type) + } + + /// Return the byte range occupied by the assignment stream. + /// + /// For finalized bundles this is `(stream_offset, stream_len)` as + /// recorded in the header. For incomplete bundles the end of the + /// stream is taken as EOF (or the directory start, if a provisional + /// directory was written). + pub fn assignment_stream_range(&mut self) -> io::Result<(u64, u64)> { + if self.header.is_complete() { + Ok((self.header.stream_offset, self.header.stream_len)) + } else { + let end = if self.header.directory_offset != 0 { + self.header.directory_offset + } else { + self.inner.seek(SeekFrom::End(0))? + }; + let len = end.saturating_sub(self.header.stream_offset); + Ok((self.header.stream_offset, len)) + } + } + + /// Return a `Take` reader positioned at the start of the assignment + /// stream and limited to its declared length. The caller is expected + /// to wrap the returned reader in an `AssignmentReader` or + /// `XZAssignmentReader` as appropriate for `assignment_format()`. + pub fn assignment_stream_reader(&mut self) -> io::Result> { + let (offset, len) = self.assignment_stream_range()?; + self.inner.seek(SeekFrom::Start(offset))?; + Ok((&mut self.inner).take(len)) + } + + /// Construct the appropriate assignment decoder for the bundle's + /// declared `assignment_format` and return it as a + /// [`BundleAssignmentReader`] enum. + /// + /// - `AssignmentFormat::Ben` produces a + /// [`crate::io::reader::AssignmentReader`] over a `Take<&mut R>`. + /// - `AssignmentFormat::Xben` produces a + /// [`crate::io::reader::XZAssignmentReader`] over a `Take<&mut R>`. + /// + /// Returns an error if the header's `assignment_format` field is + /// unrecognized or the embedded banner is malformed. + pub fn open_assignment_reader( + &mut self, + ) -> Result>, BundleAssignmentReaderError> { + let format = self + .assignment_format() + .ok_or(BundleAssignmentReaderError::UnknownAssignmentFormat( + self.header.assignment_format, + ))?; + let stream = self.assignment_stream_reader()?; + match format { + AssignmentFormat::Ben => { + let inner = crate::io::reader::AssignmentReader::new(stream) + .map_err(BundleAssignmentReaderError::Decoder)?; + Ok(BundleAssignmentReader::Ben(inner)) + } + AssignmentFormat::Xben => { + let inner = crate::io::reader::XZAssignmentReader::new(stream) + .map_err(BundleAssignmentReaderError::Decoder)?; + Ok(BundleAssignmentReader::Xben(inner)) + } + } + } + + /// Read the fully-decoded bytes of an asset by directory entry. + /// + /// If the entry has [`ASSET_FLAG_XZ`] set, the payload is decompressed + /// through `xz2::read::XzDecoder`. Otherwise the bytes are returned + /// as-is. + pub fn asset_bytes(&mut self, entry: &BendlDirectoryEntry) -> io::Result> { + let mut out = Vec::new(); + self.asset_reader(entry)?.read_to_end(&mut out)?; + Ok(out) + } + + /// Obtain a boxed reader for the decoded contents of an asset. + /// + /// The returned reader is positioned at the first decoded byte and + /// automatically handles xz decompression when the asset is flagged + /// as compressed. The reader borrows `self`, so only one asset or + /// stream reader may be live at a time. + pub fn asset_reader<'a>( + &'a mut self, + entry: &BendlDirectoryEntry, + ) -> io::Result> { + self.inner.seek(SeekFrom::Start(entry.payload_offset))?; + let raw = (&mut self.inner).take(entry.payload_len); + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + Ok(Box::new(XzDecoder::new(raw))) + } else { + Ok(Box::new(raw)) + } + } + + /// Validate that the loaded directory is well-formed under the + /// canonical-name and uniqueness rules. + /// + /// Returns [`BundleValidationError`] if any entry violates the rules. + /// This is called automatically by [`BendlReader::open`] when the + /// `strict` constructor is used in tests; in normal reads, the + /// writer is already expected to enforce these rules and a + /// malformed bundle is a program bug somewhere else. + pub fn validate_directory(&self) -> Result<(), BundleValidationError> { + let mut seen_names = std::collections::HashSet::new(); + let mut seen_singleton_types = std::collections::HashSet::new(); + + for entry in &self.directory { + if !seen_names.insert(entry.name.as_str()) { + return Err(BundleValidationError::DuplicateName(entry.name.clone())); + } + if let Some(canonical) = canonical_name_for(entry.asset_type) { + if entry.name != canonical { + return Err(BundleValidationError::WrongCanonicalName { + asset_type: entry.asset_type, + expected: canonical.to_string(), + found: entry.name.clone(), + }); + } + if !seen_singleton_types.insert(entry.asset_type) { + return Err(BundleValidationError::DuplicateSingletonType( + entry.asset_type, + )); + } + } + } + Ok(()) + } + + /// Release the underlying reader. + pub fn into_inner(self) -> R { + self.inner + } +} + +/// Either a BEN or an XBEN assignment decoder over a bundle's embedded +/// stream region. +/// +/// Both variants hold a `Take<&mut R>` reader limited to the stream +/// window declared in the bundle header, so they cannot accidentally +/// read into the trailing directory table. +pub enum BundleAssignmentReader { + /// The bundle carries an uncompressed BEN stream. + Ben(crate::io::reader::AssignmentReader), + /// The bundle carries an xz-compressed XBEN stream. + Xben(crate::io::reader::XZAssignmentReader), +} + +impl BundleAssignmentReader { + /// True when the reader is backed by a BEN stream. + pub fn is_ben(&self) -> bool { + matches!(self, BundleAssignmentReader::Ben(_)) + } + + /// True when the reader is backed by an XBEN stream. + pub fn is_xben(&self) -> bool { + matches!(self, BundleAssignmentReader::Xben(_)) + } +} + +/// Errors raised by [`BendlReader::open_assignment_reader`]. +#[derive(Debug, thiserror::Error)] +pub enum BundleAssignmentReaderError { + /// The header's `assignment_format` byte did not map to a known format. + #[error("unknown assignment_format in bundle header: {0}")] + UnknownAssignmentFormat(u8), + /// The embedded BEN/XBEN decoder rejected the stream banner. + #[error(transparent)] + Decoder(#[from] crate::io::reader::DecoderInitError), + /// An underlying I/O error occurred while seeking to the stream. + #[error(transparent)] + Io(#[from] io::Error), +} + +/// Errors raised when a directory violates the canonical-name or +/// uniqueness rules. +#[derive(Debug, thiserror::Error)] +pub enum BundleValidationError { + /// Two entries share the same name. + #[error("duplicate asset name: {0:?}")] + DuplicateName(String), + + /// Two entries share the same singleton asset type. + #[error("duplicate singleton asset type: {0}")] + DuplicateSingletonType(u16), + + /// An entry with a known singleton type is not using its canonical name. + #[error( + "asset type {asset_type} must use canonical name {expected:?}, found {found:?}" + )] + WrongCanonicalName { + /// The asset type whose canonical name was violated. + asset_type: u16, + /// The canonical name the writer should have used. + expected: String, + /// The name that was actually written. + found: String, + }, +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::io::{Cursor, Write}; + + use xz2::write::XzEncoder; + + use super::*; + use crate::io::bundle::format::{ + encode_directory, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, + ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, + COMPLETE_YES, HEADER_SIZE, + }; + + /// Build a complete in-memory finalized bundle with two assets: + /// an xz-compressed `graph.json` and a raw custom blob, followed by + /// a fake BEN stream and a trailing directory. + fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { + // Asset payloads (decoded): + let graph_json = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#.to_vec(); + let custom_blob = vec![0xAA, 0xBB, 0xCC, 0xDD, 0xEE]; + let fake_stream = b"STANDARD BEN FILE\x00\x01\x02\x03fake payload".to_vec(); + + // xz-compress graph_json using the default preset. + let mut encoder = XzEncoder::new(Vec::new(), 6); + encoder.write_all(&graph_json).unwrap(); + let compressed_graph = encoder.finish().unwrap(); + + // Layout: + // [0 .. 64) header + // [64 .. 64+len(compressed_graph)) graph payload + // [... .. ...+len(custom_blob)) custom payload + // [stream_offset .. stream_offset+len(fake_stream)) stream + // [directory_offset .. EOF) directory + let mut bundle = Vec::new(); + // Reserve space for header; fill later. + bundle.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let graph_offset = bundle.len() as u64; + bundle.extend_from_slice(&compressed_graph); + + let custom_offset = bundle.len() as u64; + bundle.extend_from_slice(&custom_blob); + + let stream_offset = bundle.len() as u64; + bundle.extend_from_slice(&fake_stream); + let stream_len = fake_stream.len() as u64; + + let directory_offset = bundle.len() as u64; + + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: graph_offset, + payload_len: compressed_graph.len() as u64, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "custom.bin".to_string(), + payload_offset: custom_offset, + payload_len: custom_blob.len() as u64, + checksum: None, + }, + ]; + let directory_bytes = encode_directory(&entries).unwrap(); + bundle.extend_from_slice(&directory_bytes); + let directory_len = directory_bytes.len() as u64; + + // Now patch the header. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 42, + }; + bundle[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + (bundle, graph_json, custom_blob, fake_stream) + } + + #[test] + fn open_finalized_bundle_and_read_metadata() { + let (bytes, _, _, _) = build_finalized_bundle(); + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(42)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + assert_eq!(reader.assets().len(), 2); + assert!(reader.validate_directory().is_ok()); + } + + #[test] + fn read_compressed_graph_asset_decodes_through_xz() { + let (bytes, graph_json, _, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph entry"); + let bytes_out = reader.asset_bytes(&entry).unwrap(); + assert_eq!(bytes_out, graph_json); + } + + #[test] + fn read_raw_custom_asset_returns_exact_bytes() { + let (bytes, _, custom_blob, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader + .find_asset_by_name("custom.bin") + .cloned() + .expect("custom entry"); + let bytes_out = reader.asset_bytes(&entry).unwrap(); + assert_eq!(bytes_out, custom_blob); + } + + #[test] + fn assignment_stream_range_matches_finalized_header() { + let (bytes, _, _, fake_stream) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(len, fake_stream.len() as u64); + let mut buf = Vec::new(); + reader.assignment_stream_reader().unwrap().read_to_end(&mut buf).unwrap(); + assert_eq!(buf, fake_stream); + // Sanity-check the offset is consistent with the header. + assert_eq!(offset, reader.header().stream_offset); + } + + #[test] + fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { + // Build an incomplete bundle: header + some fake stream bytes, no directory. + let fake_stream = b"STANDARD BEN FILE\x00\x01some partial bytes".to_vec(); + let mut bytes = Vec::new(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + assert_eq!(reader.sample_count(), None); + assert!(reader.assets().is_empty()); + + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(offset, HEADER_SIZE as u64); + assert_eq!(len, fake_stream.len() as u64); + + let mut buf = Vec::new(); + reader.assignment_stream_reader().unwrap().read_to_end(&mut buf).unwrap(); + assert_eq!(buf, fake_stream); + } + + #[test] + fn open_rejects_malformed_magic() { + let mut bytes = vec![0u8; HEADER_SIZE]; + bytes[0..8].copy_from_slice(b"NOPENOPE"); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::InvalidMagic(_)) => {} + Err(other) => panic!("expected InvalidMagic, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn validate_directory_catches_duplicate_names() { + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a".to_string(), + payload_offset: 65, + payload_len: 1, + checksum: None, + }, + ]; + let reader = BendlReader { + inner: Cursor::new(Vec::::new()), + header: BendlHeader::provisional(AssignmentFormat::Ben, 64), + directory: entries, + }; + let err = reader.validate_directory().unwrap_err(); + assert!(matches!(err, BundleValidationError::DuplicateName(ref n) if n == "a")); + } + + #[test] + fn validate_directory_catches_wrong_canonical_name() { + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: 0, + name: "not_graph.json".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }]; + let reader = BendlReader { + inner: Cursor::new(Vec::::new()), + header: BendlHeader::provisional(AssignmentFormat::Ben, 64), + directory: entries, + }; + let err = reader.validate_directory().unwrap_err(); + assert!(matches!( + err, + BundleValidationError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + )); + } + + // ----------------------------------------------------------------------- + // Robustness tests + // ----------------------------------------------------------------------- + + /// Build a small finalized bundle with a known graph asset, metadata + /// asset, empty stream, and no validation pitfalls. Useful as a base + /// that tests can mutate byte-by-byte. + fn build_basic_finalized_bundle() -> Vec { + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + // One raw metadata asset right after the header. + let metadata_payload = br#"{"k":"v"}"#.to_vec(); + let metadata_offset = bytes.len() as u64; + bytes.extend_from_slice(&metadata_payload); + + // Stream region is empty. + let stream_offset = bytes.len() as u64; + let stream_len = 0u64; + + // Directory at EOF with one entry. + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: "metadata.json".to_string(), + payload_offset: metadata_offset, + payload_len: metadata_payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let directory_len = directory.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + bytes + } + + #[test] + fn open_rejects_short_header() { + let too_short = vec![0u8; HEADER_SIZE - 1]; + match BendlReader::open(Cursor::new(too_short)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn open_rejects_unsupported_major_version() { + let mut bytes = build_basic_finalized_bundle(); + // major_version lives at offset 8..10 in the header. + bytes[8..10].copy_from_slice(&(BENDL_MAJOR_VERSION + 1).to_le_bytes()); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::UnsupportedMajorVersion { .. }) => {} + Err(other) => panic!("expected UnsupportedMajorVersion, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn open_rejects_directory_with_inflated_entry_count() { + let mut bytes = build_basic_finalized_bundle(); + // Read directory_offset from the header (bytes 24..32). + let directory_offset = + u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + // Blow up the entry count at the start of the directory to a + // value that cannot possibly fit in the remaining file bytes. + bytes[directory_offset..directory_offset + 4] + .copy_from_slice(&9999u32.to_le_bytes()); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn open_rejects_directory_with_chopped_final_entry() { + // Drop the last byte of the file, which lies inside the name + // field of the final directory entry. + let mut bytes = build_basic_finalized_bundle(); + bytes.pop(); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn asset_bytes_read_twice_returns_identical_payload() { + let (bytes, _, custom_blob, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); + let first = reader.asset_bytes(&entry).unwrap(); + let second = reader.asset_bytes(&entry).unwrap(); + assert_eq!(first, second); + assert_eq!(first, custom_blob); + } + + #[test] + fn interleaved_reads_do_not_corrupt_each_other() { + // Read asset A, then stream, then asset A again, then asset B. + let (bytes, graph_json, custom_blob, fake_stream) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + + let graph_entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let custom_entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); + + let graph_first = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_first, graph_json); + + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, fake_stream); + + let graph_second = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_second, graph_json); + + let custom = reader.asset_bytes(&custom_entry).unwrap(); + assert_eq!(custom, custom_blob); + } + + #[test] + fn asset_bytes_errors_when_declared_length_runs_past_eof() { + // Hand-construct a bundle where the metadata directory entry + // claims a payload_len that extends well past EOF. + let mut bytes = build_basic_finalized_bundle(); + // Parse the directory offset to find where the entry lives. + let directory_offset = + u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + // Skip the u32 entry count (4 bytes) and then the 16-byte fixed + // entry header up to `payload_len` (bytes 16..24 of the entry). + let entry_start = directory_offset + 4; + let payload_len_offset = entry_start + 16; + bytes[payload_len_offset..payload_len_offset + 8] + .copy_from_slice(&u64::MAX.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + // The reader opens fine — the directory parses. But reading the + // asset bytes must surface an error eventually (short read vs + // declared length). xz would also trip on this, but this is the + // raw-asset path. + match reader.asset_bytes(&entry) { + Ok(bytes) => { + // At the very least the returned bytes should not pretend + // to fill u64::MAX — saturate at what the file actually had. + assert!(bytes.len() < u64::MAX as usize); + } + Err(_) => {} + } + } + + #[test] + fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { + // Build an incomplete bundle but stuff a stale sample count into + // the header. `sample_count()` must still return None because + // the `complete` flag is what makes the value authoritative. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 999_999, // lie, but header is "incomplete" + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + assert_eq!(reader.sample_count(), None); + } + + #[test] + fn unknown_assignment_format_reports_none_on_typed_getter() { + // Build a finalized but otherwise-empty bundle and corrupt the + // assignment_format byte to a value that is neither BEN nor XBEN. + let mut bytes = build_basic_finalized_bundle(); + // assignment_format byte is at offset 13 in the header. + bytes[13] = 42; + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert_eq!(reader.assignment_format(), None); + // The header still parses and the directory is still available. + assert_eq!(reader.assets().len(), 1); + } + + #[test] + fn open_assignment_reader_rejects_unknown_assignment_format() { + let mut bytes = build_basic_finalized_bundle(); + bytes[13] = 42; // corrupt assignment format byte + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + match reader.open_assignment_reader() { + Err(BundleAssignmentReaderError::UnknownAssignmentFormat(42)) => {} + Err(other) => panic!("expected UnknownAssignmentFormat(42), got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { + let fake_stream = b"STANDARD BEN FILE\x00\x01payload bytes".to_vec(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + let eof = bytes.len() as u64; + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let (off, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(off, HEADER_SIZE as u64); + assert_eq!(off + len, eof); + } + + #[test] + fn validate_directory_catches_duplicate_singleton_types() { + // Two entries of type METADATA (both with canonical name + // "metadata.json"). The canonical-name check would fire for + // the second entry because the name is duplicated, so force a + // different name shape: this is a belt-and-braces test that + // confirms the singleton check is separate from the name check. + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: 0, + name: "metadata.json".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: 0, + // Distinct name so the duplicate-name check does not fire + // first; the singleton-type check should catch this. + name: "meta2.json".to_string(), + payload_offset: 65, + payload_len: 1, + checksum: None, + }, + ]; + let reader = BendlReader { + inner: Cursor::new(Vec::::new()), + header: BendlHeader::provisional(AssignmentFormat::Ben, 64), + directory: entries, + }; + // The second entry has asset_type METADATA but name "meta2.json" + // which fails the canonical-name check before the singleton + // check; that's still a valid rejection. + let err = reader.validate_directory().unwrap_err(); + assert!(matches!( + err, + BundleValidationError::WrongCanonicalName { .. } + | BundleValidationError::DuplicateSingletonType(_) + )); + } + + #[test] + fn validate_directory_accepts_well_formed_multi_singleton_bundle() { + // A bundle with one of every singleton type, plus two custom + // assets with distinct names, should validate cleanly. + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: "metadata.json".to_string(), + payload_offset: 64, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: 68, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: crate::io::bundle::format::ASSET_TYPE_RELABEL_MAP, + asset_flags: ASSET_FLAG_JSON, + name: "relabel_map.json".to_string(), + payload_offset: 72, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a.bin".to_string(), + payload_offset: 76, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "b.bin".to_string(), + payload_offset: 80, + payload_len: 4, + checksum: None, + }, + ]; + let reader = BendlReader { + inner: Cursor::new(Vec::::new()), + header: BendlHeader::provisional(AssignmentFormat::Ben, 64), + directory: entries, + }; + reader.validate_directory().expect("well-formed directory"); + } + + #[test] + fn stress_thousand_custom_assets_round_trip() { + // Build a directory with 1000 small custom assets, each with a + // unique payload derived from its index, and confirm they all + // round-trip via `asset_bytes`. This catches any off-by-one or + // seek-caching bugs that might only show up with many entries. + const N: usize = 1000; + + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let mut entries = Vec::with_capacity(N); + let mut expected = Vec::with_capacity(N); + for i in 0..N { + let payload: Vec = (0..(i % 31 + 1) as u8) + .map(|j| (i as u8).wrapping_add(j)) + .collect(); + let offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + entries.push(BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: format!("blob-{i:04}.bin"), + payload_offset: offset, + payload_len: payload.len() as u64, + checksum: None, + }); + expected.push(payload); + } + + let stream_offset = bytes.len() as u64; + let stream_len = 0u64; + let directory_offset = bytes.len() as u64; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let directory_len = directory.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert_eq!(reader.assets().len(), N); + reader.validate_directory().unwrap(); + // Access in scrambled order to exercise seeking. + for &idx in &[0usize, N - 1, 1, N / 2, N / 3, 2 * N / 3, 7, 999] { + let name = format!("blob-{idx:04}.bin"); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(got, expected[idx], "mismatch at index {idx}"); + } + } + + #[test] + fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { + // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ + // whose payload bytes are not a valid xz container. `asset_bytes` + // must surface an io::Error rather than panicking. + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let bad_payload = vec![0xFFu8, 0xFE, 0xFD, 0xFC, 0xFB]; + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&bad_payload); + + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_XZ, + name: "broken.xz".to_string(), + payload_offset, + payload_len: bad_payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("broken.xz").cloned().unwrap(); + let res = reader.asset_bytes(&entry); + assert!(res.is_err(), "expected xz decode error, got {res:?}"); + } + + #[test] + fn reader_scales_to_very_wide_stream_offset_field() { + // Confirm the `Take` bound clamps a stream reader even when the + // header's stream_len is much larger than the actual remaining + // bytes: the reader must return the shorter slice rather than + // loop forever or panic. This is a "short read" tolerance check. + let fake_stream = b"STANDARD BEN FILE\x00\x01tiny".to_vec(); + let actual_len = fake_stream.len() as u64; + let directory_offset = HEADER_SIZE as u64 + actual_len; + // Build a bundle that lies about stream_len: claims ten times + // what's actually present. + let entries: Vec = Vec::new(); + let directory_bytes = encode_directory(&entries).unwrap(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory_bytes.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: actual_len * 10, // lie + sample_count: 0, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + bytes.extend_from_slice(&directory_bytes); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let mut buf = Vec::new(); + // Take will try to read `stream_len` bytes but the Cursor will + // just return however many bytes remain from stream_offset to EOF. + // The reader must not panic; it must simply return what it got. + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); + // Take includes the directory bytes in the window since they come + // after stream_offset and the claim exceeds file size — so we + // assert only that we got *at least* the real stream bytes as a + // prefix, which is the basic "no truncation of what exists" check. + assert!(buf.starts_with(&fake_stream)); + } +} diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs new file mode 100644 index 0000000..9e1473f --- /dev/null +++ b/ben/src/io/bundle/writer.rs @@ -0,0 +1,2009 @@ +//! Write-side API for `.bendl` files. +//! +//! [`BendlWriter`] produces finalized bundles with the on-disk layout +//! +//! ```text +//! [header] [asset payloads] [assignment stream] [directory] +//! ``` +//! +//! The writer operates in three logical phases: +//! +//! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero +//! or more times. Each call writes the (optionally xz-compressed) +//! payload to the file and records its absolute offset and length in +//! an in-memory entry list. +//! 2. **stream phase** — the caller invokes [`BendlWriter::begin_stream`] +//! to enter the stream region. The returned handle wraps the raw +//! underlying writer so the caller can plumb it into +//! [`crate::io::writer::AssignmentWriter`] or +//! [`crate::io::writer::XZAssignmentWriter`]. When the stream is +//! complete the caller records the sample count via +//! [`BendlWriter::end_stream`]. +//! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing +//! directory and patches the header. +//! +//! The writer requires `Write + Seek` because the header is patched +//! twice: once with the stream offset (implicitly, by having reserved +//! its slot at construction) and once with the finalized stream length, +//! sample count, directory offset, directory length, and `complete` flag. + +use std::collections::HashSet; +use std::io::{self, Read, Seek, SeekFrom, Write}; + +use thiserror::Error; +use xz2::write::XzEncoder; + +use super::format::{ + canonical_name_for, default_compresses_by_type, encode_directory, read_directory, + AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_JSON, + ASSET_FLAG_XZ, COMPLETE_YES, DEFAULT_XZ_PRESET, HEADER_SIZE, +}; + +/// Ability to truncate an underlying seekable target to a given length. +/// +/// This is not part of `std::io`, so `BendlAppender` takes a trait bound +/// that abstracts it and is implemented below for `std::fs::File` and +/// `std::io::Cursor>`. +pub trait BendlTruncate { + /// Truncate or extend the underlying target to exactly `len` bytes. + fn truncate_at(&mut self, len: u64) -> io::Result<()>; +} + +impl BendlTruncate for std::fs::File { + fn truncate_at(&mut self, len: u64) -> io::Result<()> { + self.set_len(len) + } +} + +impl BendlTruncate for std::io::Cursor> { + fn truncate_at(&mut self, len: u64) -> io::Result<()> { + let target = len as usize; + let vec = self.get_mut(); + if vec.len() > target { + vec.truncate(target); + } else if vec.len() < target { + vec.resize(target, 0); + } + Ok(()) + } +} + +/// Options passed alongside each [`BendlWriter::add_asset`] call. +#[derive(Debug, Clone, Default)] +pub struct AddAssetOptions { + /// Compression override. `None` means "follow the default policy for + /// this asset type"; `Some(true)` forces xz compression; `Some(false)` + /// forces a raw payload. + pub compress: Option, + /// Whether the decoded payload is UTF-8 JSON. Adds the + /// [`ASSET_FLAG_JSON`] bit to the entry's flags. + pub is_json: bool, + /// Optional trailing checksum bytes to store in the directory entry. + /// When set, [`crate::io::bundle::format::ASSET_FLAG_CHECKSUM`] is + /// applied automatically. + pub checksum: Option>, +} + +impl AddAssetOptions { + /// Sentinel "use the default policy with no extras" options. + pub fn defaults() -> Self { + Self::default() + } + + /// Flag a payload as UTF-8 JSON. + pub fn json(mut self) -> Self { + self.is_json = true; + self + } + + /// Force xz compression regardless of the default policy. + pub fn compress(mut self) -> Self { + self.compress = Some(true); + self + } + + /// Force the writer to store the payload raw even if the default + /// policy would compress it. + pub fn raw(mut self) -> Self { + self.compress = Some(false); + self + } +} + +/// Writer for a single `.bendl` file. +pub struct BendlWriter { + inner: W, + header: BendlHeader, + entries: Vec, + names: HashSet, + singleton_types: HashSet, + state: WriterState, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum WriterState { + /// No assets have been written yet, but the provisional header is + /// already in place and the writer is positioned just after it. + Assets, + /// `begin_stream` has been called; the caller is responsible for + /// writing the embedded BEN/XBEN payload before calling `end_stream`. + Streaming, + /// `end_stream` has completed; the writer is ready for `finish`. + StreamWritten { stream_len: u64, sample_count: i64 }, + /// `finish` has been called. No further operations are permitted. + Finished, +} + +impl BendlWriter { + /// Create a new writer by writing a provisional header at offset 0. + /// + /// The assignment stream will begin immediately after the asset + /// payload region — [`BendlWriter::begin_stream`] computes the + /// exact offset at the moment it is called, so asset writes that + /// happen between `new` and `begin_stream` push the stream out as + /// expected. + pub fn new(mut inner: W, assignment_format: AssignmentFormat) -> io::Result { + inner.seek(SeekFrom::Start(0))?; + // stream_offset in the provisional header is patched at + // begin_stream time; start it just after the header. + let header = BendlHeader::provisional(assignment_format, HEADER_SIZE as u64); + header.write_to(&mut inner)?; + + Ok(BendlWriter { + inner, + header, + entries: Vec::new(), + names: HashSet::new(), + singleton_types: HashSet::new(), + state: WriterState::Assets, + }) + } + + /// Add an asset to the bundle. + /// + /// The payload is written to the file immediately at the current + /// position (right after the previous asset, or right after the + /// header if this is the first asset). Its absolute offset and + /// length are recorded in the in-memory directory entry list. + /// + /// This method enforces the canonical-name and uniqueness rules + /// **before** writing any bytes, so a rejected asset leaves the + /// file untouched. + pub fn add_asset( + &mut self, + asset_type: u16, + name: &str, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + if self.state != WriterState::Assets { + return Err(BendlWriteError::AssetsAfterStream); + } + + // Canonical-name rule for known singleton types. + if let Some(canonical) = canonical_name_for(asset_type) { + if name != canonical { + return Err(BendlWriteError::WrongCanonicalName { + asset_type, + expected: canonical.to_string(), + found: name.to_string(), + }); + } + if !self.singleton_types.insert(asset_type) { + return Err(BendlWriteError::DuplicateSingletonType(asset_type)); + } + } + + // Unique name rule. + if !self.names.insert(name.to_string()) { + // Roll back the singleton insertion before returning, so + // the writer remains in a consistent state. (Only known + // singleton types would have been inserted above.) + if canonical_name_for(asset_type).is_some() { + self.singleton_types.remove(&asset_type); + } + return Err(BendlWriteError::DuplicateName(name.to_string())); + } + + // Decide compression. + let compress = options + .compress + .unwrap_or_else(|| default_compresses_by_type(asset_type)); + + // Compute final payload bytes. + let payload_bytes: Vec = if compress { + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder + .write_all(payload) + .map_err(BendlWriteError::Io)?; + encoder.finish().map_err(BendlWriteError::Io)? + } else { + payload.to_vec() + }; + + // Flags. + let mut asset_flags: u16 = 0; + if options.is_json { + asset_flags |= ASSET_FLAG_JSON; + } + if compress { + asset_flags |= ASSET_FLAG_XZ; + } + if options.checksum.is_some() { + asset_flags |= + crate::io::bundle::format::ASSET_FLAG_CHECKSUM; + } + + // Write at current file position. + let payload_offset = self.inner.seek(SeekFrom::Current(0))?; + self.inner + .write_all(&payload_bytes) + .map_err(BendlWriteError::Io)?; + let payload_len = payload_bytes.len() as u64; + + self.entries.push(BendlDirectoryEntry { + asset_type, + asset_flags, + name: name.to_string(), + payload_offset, + payload_len, + checksum: options.checksum, + }); + + Ok(()) + } + + /// Convenience wrapper around [`add_asset`] for JSON-encoded assets. + pub fn add_json_asset( + &mut self, + asset_type: u16, + name: &str, + payload: &[u8], + ) -> Result<(), BendlWriteError> { + self.add_asset(asset_type, name, payload, AddAssetOptions::defaults().json()) + } + + /// Transition from the asset phase into the stream phase and return + /// a mutable reference to the inner writer so the caller can + /// directly write the embedded BEN/XBEN payload. + /// + /// Once this method has been called, no further assets may be added. + /// The caller is responsible for calling [`BendlWriter::end_stream`] + /// when the payload is complete. + pub fn begin_stream(&mut self) -> Result, BendlWriteError> { + if self.state != WriterState::Assets { + return Err(BendlWriteError::WrongState { + expected: "Assets", + found: self.state_name(), + }); + } + + let stream_offset = self.inner.seek(SeekFrom::Current(0))?; + self.header.stream_offset = stream_offset; + self.state = WriterState::Streaming; + + Ok(BendlStreamHandle { + parent: self, + start_offset: stream_offset, + }) + } + + /// Directly write the whole stream region from an in-memory byte + /// slice. This is a convenience for tests and for tools that already + /// have the encoded stream bytes on hand. + pub fn write_stream_bytes( + &mut self, + bytes: &[u8], + sample_count: i64, + ) -> Result<(), BendlWriteError> { + let mut handle = self.begin_stream()?; + handle.write_all(bytes).map_err(BendlWriteError::Io)?; + handle.finish(sample_count) + } + + /// Open a BEN assignment stream backed by an + /// [`crate::io::writer::AssignmentWriter`] and invoke `f` with a + /// context that can encode assignments into it. + /// + /// The context tracks how many `write_assignment` / `write_json_value` + /// calls the closure makes and records that count as the bundle's + /// authoritative `sample_count` when the stream is finalized. The + /// closure is free to short-circuit by returning an error, in which + /// case the stream phase is abandoned and the error is propagated. + pub fn write_ben_stream( + &mut self, + variant: crate::BenVariant, + f: F, + ) -> Result<(), BendlWriteError> + where + F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, + { + let mut handle = self.begin_stream()?; + let mut sample_count: i64 = 0; + { + let mut ben = + crate::io::writer::AssignmentWriter::new(&mut handle, variant)?; + { + let mut ctx = BundleAssignmentStreamCtx { + writer: &mut ben, + sample_count: &mut sample_count, + }; + f(&mut ctx)?; + } + ben.finish()?; + // `ben` is dropped here, releasing its borrow on `handle`. + } + handle.finish(sample_count) + } + + /// Open an XBEN assignment stream backed by an + /// [`crate::io::writer::XZAssignmentWriter`] and invoke `f` with a + /// context that can encode assignments into it. + /// + /// The closure sees the same counting [`BundleAssignmentStreamCtx`] + /// type used by [`BendlWriter::write_ben_stream`], so callers can be + /// written to be generic over the assignment container. + pub fn write_xben_stream( + &mut self, + variant: crate::BenVariant, + f: F, + ) -> Result<(), BendlWriteError> + where + F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, + { + let mut handle = self.begin_stream()?; + let mut sample_count: i64 = 0; + { + let encoder = xz2::write::XzEncoder::new(&mut handle, DEFAULT_XZ_PRESET); + let mut xben = + crate::io::writer::XZAssignmentWriter::new(encoder, variant)?; + { + let mut ctx = BundleAssignmentStreamCtx { + writer: &mut xben, + sample_count: &mut sample_count, + }; + f(&mut ctx)?; + } + xben.finish()?; + // `xben` is dropped here, which drops its inner `XzEncoder`, + // which in turn finalizes the xz stream and flushes the last + // bytes out to `handle`. + } + handle.finish(sample_count) + } + + /// Write the trailing directory, patch the header, and return the + /// underlying writer. + pub fn finish(mut self) -> Result { + let (stream_len, sample_count) = match self.state { + WriterState::StreamWritten { + stream_len, + sample_count, + } => (stream_len, sample_count), + // Allow finalizing a bundle that has no stream at all (useful + // for asset-only bundles), treating the stream as empty. + WriterState::Assets => { + let stream_offset = self.inner.seek(SeekFrom::Current(0))?; + self.header.stream_offset = stream_offset; + (0, 0) + } + WriterState::Streaming => { + return Err(BendlWriteError::WrongState { + expected: "StreamWritten", + found: "Streaming", + }); + } + WriterState::Finished => { + return Err(BendlWriteError::WrongState { + expected: "StreamWritten", + found: "Finished", + }); + } + }; + + // Position at end of stream (== start of directory). + let directory_offset = self.header.stream_offset + stream_len; + self.inner.seek(SeekFrom::Start(directory_offset))?; + + let directory_bytes = encode_directory(&self.entries) + .map_err(BendlWriteError::Format)?; + self.inner + .write_all(&directory_bytes) + .map_err(BendlWriteError::Io)?; + + let directory_len = directory_bytes.len() as u64; + + // Patch the header. + self.header.directory_offset = directory_offset; + self.header.directory_len = directory_len; + self.header.stream_len = stream_len; + self.header.sample_count = sample_count; + self.header.complete = COMPLETE_YES; + self.inner.seek(SeekFrom::Start(0))?; + self.header.write_to(&mut self.inner)?; + + // Flush explicitly; some writers (files) are not flushed on drop. + self.inner.flush()?; + + self.state = WriterState::Finished; + Ok(self.inner) + } + + fn state_name(&self) -> &'static str { + match self.state { + WriterState::Assets => "Assets", + WriterState::Streaming => "Streaming", + WriterState::StreamWritten { .. } => "StreamWritten", + WriterState::Finished => "Finished", + } + } +} + +/// Mutable handle to the stream region held by a [`BendlWriter`]. +/// +/// The handle implements `Write` so it can be wrapped in +/// `AssignmentWriter::new(handle, variant)` or +/// `XZAssignmentWriter::new(handle, variant)` directly. +pub struct BendlStreamHandle<'a, W: Write + Seek> { + parent: &'a mut BendlWriter, + start_offset: u64, +} + +impl<'a, W: Write + Seek> BendlStreamHandle<'a, W> { + /// Record the sample count and transition the writer out of the + /// stream phase. Call this after the embedded BEN/XBEN payload has + /// been written. + pub fn finish(self, sample_count: i64) -> Result<(), BendlWriteError> { + let end = self.parent.inner.seek(SeekFrom::Current(0))?; + let stream_len = end.saturating_sub(self.start_offset); + self.parent.state = WriterState::StreamWritten { + stream_len, + sample_count, + }; + Ok(()) + } +} + +impl<'a, W: Write + Seek> Write for BendlStreamHandle<'a, W> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.parent.inner.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.parent.inner.flush() + } +} + +/// Minimal trait that hides the concrete assignment-writer type behind a +/// pair of methods that both [`crate::io::writer::AssignmentWriter`] and +/// [`crate::io::writer::XZAssignmentWriter`] implement. +/// +/// The bundle layer uses this to let a single +/// [`BundleAssignmentStreamCtx`] wrap either container. +pub trait BundleAssignmentSink { + /// Encode one assignment vector. + fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()>; + /// Encode one JSON assignment record. + fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()>; +} + +impl BundleAssignmentSink for crate::io::writer::AssignmentWriter { + fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + crate::io::writer::AssignmentWriter::write_assignment(self, assign_vec) + } + + fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { + crate::io::writer::AssignmentWriter::write_json_value(self, data) + } +} + +impl BundleAssignmentSink for crate::io::writer::XZAssignmentWriter { + fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + crate::io::writer::XZAssignmentWriter::write_assignment(self, assign_vec) + } + + fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { + crate::io::writer::XZAssignmentWriter::write_json_value(self, data) + } +} + +/// Closure-side handle passed to [`BendlWriter::write_ben_stream`] and +/// [`BendlWriter::write_xben_stream`]. +/// +/// Exposes the usual assignment-writing methods while also counting +/// samples so the bundle's header can be patched with an authoritative +/// `sample_count` at stream finalization. +pub struct BundleAssignmentStreamCtx<'a> { + writer: &'a mut dyn BundleAssignmentSink, + sample_count: &'a mut i64, +} + +impl<'a> BundleAssignmentStreamCtx<'a> { + /// Encode one assignment vector and bump the sample counter. + pub fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + self.writer.write_assignment(assign_vec)?; + *self.sample_count += 1; + Ok(()) + } + + /// Encode one JSON assignment record and bump the sample counter. + pub fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { + self.writer.write_json_value(data)?; + *self.sample_count += 1; + Ok(()) + } + + /// Number of samples the closure has written so far. + pub fn sample_count(&self) -> i64 { + *self.sample_count + } +} + +/// Errors produced by the bundle writer. +#[derive(Debug, Error)] +pub enum BendlWriteError { + /// A new asset's name collides with an existing one. + #[error("duplicate asset name: {0:?}")] + DuplicateName(String), + + /// A second singleton asset of this type was requested. + #[error("duplicate singleton asset type: {0}")] + DuplicateSingletonType(u16), + + /// A singleton asset was added under the wrong canonical name. + #[error( + "asset type {asset_type} must use canonical name {expected:?}, got {found:?}" + )] + WrongCanonicalName { + /// The asset type whose canonical name was violated. + asset_type: u16, + /// The canonical name the caller should have used. + expected: String, + /// The name the caller actually provided. + found: String, + }, + + /// An asset was added after the stream phase began. + #[error("cannot add assets after the stream region has been opened")] + AssetsAfterStream, + + /// Tried to append to a bundle that is not finalized. + #[error("cannot append to a bundle whose header does not have complete == 1")] + BundleIncomplete, + + /// The writer was asked to perform an operation in the wrong state. + #[error("writer is in state {found}, expected {expected}")] + WrongState { + /// The state the operation expected. + expected: &'static str, + /// The state the writer was actually in. + found: &'static str, + }, + + /// A format-layer error escaped while encoding the directory table. + #[error(transparent)] + Format(#[from] BendlFormatError), + + /// An underlying I/O error. + #[error(transparent)] + Io(#[from] io::Error), +} + +// --------------------------------------------------------------------------- +// Append path +// --------------------------------------------------------------------------- + +/// Post-finalize appender that grows an existing `.bendl` file with new +/// assets without rewriting the assignment stream. +/// +/// The workflow is: +/// +/// 1. [`BendlAppender::open`] opens a finalized bundle and loads its +/// directory into memory. +/// 2. [`BendlAppender::add_asset`] (or [`BendlAppender::add_json_asset`]) +/// validates and buffers each new asset. Validation happens up front, +/// so duplicate singletons or names are rejected **before** any file +/// mutation, and a rejected add_asset leaves the file unchanged. +/// 3. [`BendlAppender::commit`] compresses the buffered assets (if any), +/// truncates the file at the old directory offset, writes the new +/// asset payloads, writes a new directory at the new EOF, and patches +/// the header. +/// +/// A [`BendlAppender`] that is dropped without calling `commit` leaves +/// the underlying file unchanged. +pub struct BendlAppender { + inner: W, + header: BendlHeader, + existing_entries: Vec, + existing_names: HashSet, + existing_singleton_types: HashSet, + pending: Vec, + pending_names: HashSet, + pending_singleton_types: HashSet, +} + +/// An asset queued for append but not yet written to disk. +struct PendingAsset { + asset_type: u16, + name: String, + /// Raw payload bytes as provided by the caller. + raw_payload: Vec, + /// Resolved compression decision: `true` means compress, `false` means raw. + compress: bool, + is_json: bool, + checksum: Option>, +} + +impl BendlAppender { + /// Open a finalized bundle for append. + /// + /// Returns [`BendlWriteError::BundleIncomplete`] if the header's + /// `complete` flag is not set — append is unsafe on unfinalized + /// bundles because the stream region has no authoritative end. + pub fn open(mut inner: W) -> Result { + inner.seek(SeekFrom::Start(0))?; + let header = BendlHeader::read_from(&mut inner).map_err(BendlWriteError::Format)?; + if !header.is_complete() { + return Err(BendlWriteError::BundleIncomplete); + } + if header.directory_offset == 0 || header.directory_len == 0 { + return Err(BendlWriteError::BundleIncomplete); + } + + inner.seek(SeekFrom::Start(header.directory_offset))?; + let existing_entries = + read_directory(&mut inner).map_err(BendlWriteError::Format)?; + + let mut existing_names = HashSet::new(); + let mut existing_singleton_types = HashSet::new(); + for entry in &existing_entries { + existing_names.insert(entry.name.clone()); + if canonical_name_for(entry.asset_type).is_some() { + existing_singleton_types.insert(entry.asset_type); + } + } + + Ok(BendlAppender { + inner, + header, + existing_entries, + existing_names, + existing_singleton_types, + pending: Vec::new(), + pending_names: HashSet::new(), + pending_singleton_types: HashSet::new(), + }) + } + + /// The currently loaded (pre-append) directory entries. + pub fn existing_assets(&self) -> &[BendlDirectoryEntry] { + &self.existing_entries + } + + /// Enqueue a new asset for append. + /// + /// This validates the new asset against both the loaded directory + /// and any previously-enqueued pending assets. If validation fails, + /// the pending list is unchanged and no bytes have been written to + /// the file. + pub fn add_asset( + &mut self, + asset_type: u16, + name: &str, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + // Canonical-name rule. + if let Some(canonical) = canonical_name_for(asset_type) { + if name != canonical { + return Err(BendlWriteError::WrongCanonicalName { + asset_type, + expected: canonical.to_string(), + found: name.to_string(), + }); + } + if self.existing_singleton_types.contains(&asset_type) + || self.pending_singleton_types.contains(&asset_type) + { + return Err(BendlWriteError::DuplicateSingletonType(asset_type)); + } + } + + // Uniqueness rule against both existing and pending assets. + if self.existing_names.contains(name) || self.pending_names.contains(name) { + return Err(BendlWriteError::DuplicateName(name.to_string())); + } + + let compress = options + .compress + .unwrap_or_else(|| default_compresses_by_type(asset_type)); + + self.pending_names.insert(name.to_string()); + if canonical_name_for(asset_type).is_some() { + self.pending_singleton_types.insert(asset_type); + } + self.pending.push(PendingAsset { + asset_type, + name: name.to_string(), + raw_payload: payload.to_vec(), + compress, + is_json: options.is_json, + checksum: options.checksum, + }); + Ok(()) + } + + /// Convenience wrapper around [`add_asset`] for JSON-encoded assets. + pub fn add_json_asset( + &mut self, + asset_type: u16, + name: &str, + payload: &[u8], + ) -> Result<(), BendlWriteError> { + self.add_asset(asset_type, name, payload, AddAssetOptions::defaults().json()) + } + + /// Commit all pending appends. + /// + /// This compresses any buffered payloads that need it (entirely in + /// memory), then performs the file mutation in a single burst: + /// truncate at the old directory offset, write new payloads, write + /// a new directory, and patch the header. + /// + /// If compression fails, the file is left unchanged. + pub fn commit(mut self) -> Result { + // If nothing was enqueued, commit is a no-op — return the file untouched. + if self.pending.is_empty() { + return Ok(self.inner); + } + + // Phase 1: compress any pending payloads and build new entries with + // placeholder offsets. Do this entirely in memory so failures here + // leave the file untouched. + struct EncodedPending { + asset_type: u16, + name: String, + bytes: Vec, + asset_flags: u16, + checksum: Option>, + } + + let mut encoded: Vec = Vec::with_capacity(self.pending.len()); + for asset in self.pending.drain(..) { + let bytes = if asset.compress { + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder.write_all(&asset.raw_payload)?; + encoder.finish()? + } else { + asset.raw_payload + }; + + let mut asset_flags: u16 = 0; + if asset.is_json { + asset_flags |= ASSET_FLAG_JSON; + } + if asset.compress { + asset_flags |= ASSET_FLAG_XZ; + } + if asset.checksum.is_some() { + asset_flags |= + crate::io::bundle::format::ASSET_FLAG_CHECKSUM; + } + + encoded.push(EncodedPending { + asset_type: asset.asset_type, + name: asset.name, + bytes, + asset_flags, + checksum: asset.checksum, + }); + } + + // Phase 2: file mutation. From this point forward, a failure + // leaves the bundle in a damaged state. We do everything in the + // order (truncate, write payloads, write directory, patch header) + // so that even if we crash mid-way, the header still points at + // the old directory until the very last write. + let old_directory_offset = self.header.directory_offset; + + // Truncate at the old directory offset. + self.inner.truncate_at(old_directory_offset)?; + self.inner.seek(SeekFrom::Start(old_directory_offset))?; + + // Compute new entries with real offsets as we write. + let mut new_entries: Vec = + Vec::with_capacity(self.existing_entries.len() + encoded.len()); + new_entries.extend(self.existing_entries.iter().cloned()); + + for enc in encoded { + let payload_offset = self.inner.seek(SeekFrom::Current(0))?; + self.inner.write_all(&enc.bytes)?; + new_entries.push(BendlDirectoryEntry { + asset_type: enc.asset_type, + asset_flags: enc.asset_flags, + name: enc.name, + payload_offset, + payload_len: enc.bytes.len() as u64, + checksum: enc.checksum, + }); + } + + // Write the new directory at the new EOF. + let new_directory_offset = self.inner.seek(SeekFrom::Current(0))?; + let directory_bytes = + encode_directory(&new_entries).map_err(BendlWriteError::Format)?; + self.inner.write_all(&directory_bytes)?; + let new_directory_len = directory_bytes.len() as u64; + + // Patch the header. + self.header.directory_offset = new_directory_offset; + self.header.directory_len = new_directory_len; + self.inner.seek(SeekFrom::Start(0))?; + self.header.write_to(&mut self.inner)?; + self.inner.flush()?; + + Ok(self.inner) + } + + /// Release the underlying reader without committing any pending + /// appends. The file is unchanged. + pub fn abort(self) -> W { + self.inner + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::io::{Cursor, Read}; + + use super::*; + use crate::io::bundle::format::{ + ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + }; + use crate::io::bundle::reader::BendlReader; + + fn make_buffer() -> Cursor> { + Cursor::new(Vec::new()) + } + + #[test] + fn minimal_bundle_round_trip_through_reader() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset( + ASSET_TYPE_METADATA, + "metadata.json", + br#"{"note":"hello"}"#, + ) + .unwrap(); + let stream_bytes = b"STANDARD BEN FILE\x00\x01fake".to_vec(); + writer.write_stream_bytes(&stream_bytes, 7).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(7)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + assert_eq!(reader.assets().len(), 1); + + let entry = reader + .find_asset_by_type(ASSET_TYPE_METADATA) + .cloned() + .expect("metadata entry present"); + assert_eq!(entry.name, "metadata.json"); + assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); + let meta_bytes = reader.asset_bytes(&entry).unwrap(); + assert_eq!(meta_bytes, br#"{"note":"hello"}"#); + + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, stream_bytes); + } + + #[test] + fn graph_asset_is_compressed_by_default() { + let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9],"edges":[[0,1],[1,2],[2,3],[3,4]]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph entry present"); + assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); + // Compressed size should differ from the raw size for a non-trivial + // JSON payload. For very short payloads xz actually inflates the + // bytes, so this just checks the size is non-zero and different. + assert_ne!(entry.payload_len, graph.len() as u64); + + // Decoded bytes round-trip. + let decoded = reader.asset_bytes(&entry).unwrap(); + assert_eq!(decoded, graph); + } + + #[test] + fn graph_asset_can_be_forced_raw() { + let graph = br#"{"nodes":[0,1,2]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + graph, + AddAssetOptions::defaults().json().raw(), + ) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .expect("graph entry present"); + assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); + assert_eq!(entry.payload_len, graph.len() as u64); + } + + #[test] + fn writer_rejects_second_graph() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateSingletonType(t) if t == ASSET_TYPE_GRAPH)); + } + + #[test] + fn writer_rejects_wrong_canonical_name_for_singleton() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph_but_wrong_name.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + )); + } + + #[test] + fn writer_rejects_duplicate_custom_name() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"first", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"second", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); + } + + #[test] + fn writer_rejects_asset_added_after_stream_begins() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + { + let mut handle = writer.begin_stream().unwrap(); + handle.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + handle.finish(1).unwrap(); + } + let err = writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::AssetsAfterStream)); + } + + #[test] + fn asset_only_bundle_finalizes_with_empty_stream() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(0)); + assert_eq!(reader.header().stream_len, 0); + } + + #[test] + fn finalized_directory_lives_at_eof() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let header = reader.header(); + let file_len = buf.len() as u64; + assert_eq!(header.directory_offset + header.directory_len, file_len); + // Stream ends where directory begins. + assert_eq!(header.stream_offset + header.stream_len, header.directory_offset); + } + + // ----------------------------------------------------------------------- + // Append-path tests + // ----------------------------------------------------------------------- + + /// Build a finalized bundle with a single `metadata.json` asset and + /// a short fake stream, then return both the bytes and the byte + /// range (offset, len) occupied by the stream region. + fn build_base_bundle() -> (Vec, (u64, u64)) { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"version\":1}") + .unwrap(); + let stream = b"STANDARD BEN FILE\x00\x01\x02\x03\x04\x05stream bytes"; + writer.write_stream_bytes(stream, 3).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let range = (reader.header().stream_offset, reader.header().stream_len); + (buf, range) + } + + #[test] + fn append_adds_new_asset_and_preserves_old_entries() { + let (bundle, _) = build_base_bundle(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 2); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + // Finalized bundle invariants still hold. + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(3)); + } + + #[test] + fn append_leaves_stream_bytes_byte_for_byte_unchanged() { + let (bundle, (stream_offset, stream_len)) = build_base_bundle(); + let original_stream_bytes = bundle + [stream_offset as usize..(stream_offset + stream_len) as usize] + .to_vec(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"appended custom bytes", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Read back the new header to locate the stream region, then + // confirm the stream bytes are byte-identical to the original. + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let (off, len) = (reader.header().stream_offset, reader.header().stream_len); + let appended_stream_bytes = buf[off as usize..(off + len) as usize].to_vec(); + assert_eq!(appended_stream_bytes, original_stream_bytes); + // Stream offset should not have moved either. + assert_eq!(off, stream_offset); + assert_eq!(len, stream_len); + } + + #[test] + fn append_preserves_existing_entries_payload_offsets() { + let (bundle, _) = build_base_bundle(); + + // Snapshot the metadata entry's payload_offset before append. + let reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); + let old_offset = reader + .find_asset_by_name("metadata.json") + .unwrap() + .payload_offset; + drop(reader); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[0,1,2,3,4,5]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let new_offset = reader + .find_asset_by_name("metadata.json") + .unwrap() + .payload_offset; + assert_eq!(old_offset, new_offset, "existing asset offset must not move"); + } + + #[test] + fn append_rejects_duplicate_singleton_without_touching_file() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"new\":true}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateSingletonType(_))); + + // Abort and confirm the file is byte-for-byte unchanged. + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); + } + + #[test] + fn append_rejects_duplicate_custom_name_without_touching_file() { + // Start from a bundle containing a custom asset named "blob", then + // try to append another "blob". + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"original", + AddAssetOptions::defaults(), + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let bundle = writer.finish().unwrap().into_inner(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"dup", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); + } + + #[test] + fn append_rejects_wrong_canonical_name_without_touching_file() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + )); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); + } + + #[test] + fn append_rejects_incomplete_bundle() { + // Construct a minimal incomplete bundle: just the provisional + // header and some stream bytes, no directory. + use crate::io::bundle::format::{BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, + COMPLETE_NO}; + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(b"STANDARD BEN FILE\x00fake"); + + match BendlAppender::open(Cursor::new(bytes)) { + Err(BendlWriteError::BundleIncomplete) => {} + Err(other) => panic!("expected BundleIncomplete, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + #[test] + fn append_multiple_assets_in_one_commit() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") + .unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob1", + b"blob one", + AddAssetOptions::defaults(), + ) + .unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob2", + b"blob two", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 4); + // Round-trip the appended graph through the reader to confirm + // compression happened and decodes cleanly. + let graph_entry = reader + .find_asset_by_name("graph.json") + .cloned() + .expect("graph entry present"); + assert_ne!(graph_entry.asset_flags & ASSET_FLAG_XZ, 0); + let graph_bytes = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_bytes, b"{\"n\":[0,1,2]}"); + } + + #[test] + fn append_rejects_conflicting_pending_additions() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "new_blob", + b"a", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = appender + .add_asset( + ASSET_TYPE_CUSTOM, + "new_blob", + b"b", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(_))); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); + } + + // -------- Phase 4: assignment-stream integration tests -------- + + #[test] + fn write_ben_stream_round_trips_through_assignment_reader() { + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + + let samples: Vec> = vec![ + vec![0, 0, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], // repeat + vec![1, 1, 1, 1, 2, 2], + ]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + // Four write_assignment calls → sample_count == 4. + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); + } + + #[test] + fn write_xben_stream_round_trips_through_assignment_reader() { + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + + let samples: Vec> = vec![ + vec![0, 1, 2, 3, 4, 5], + vec![0, 1, 2, 3, 4, 5], // repeat + vec![1, 1, 2, 3, 4, 5], + vec![1, 1, 2, 3, 4, 4], + ]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + writer + .write_xben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); + + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Xben(r) => r, + BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); + } + + #[test] + fn write_ben_stream_alongside_front_loaded_asset() { + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + + let graph = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#; + let samples: Vec> = vec![vec![0, 1, 1, 2], vec![0, 1, 2, 2]]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + + // Front-loaded graph asset survives round trip through xz. + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph asset present"); + assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); + let decoded_graph = reader.asset_bytes(&entry).unwrap(); + assert_eq!(decoded_graph, graph); + + // Assignment stream is still intact after pulling asset bytes. + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); + } + + #[test] + fn open_assignment_reader_rejects_mismatched_format() { + // Build a BEN bundle and open a reader, and verify the is_ben/is_xben + // discriminators reflect the header. + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + ctx.write_assignment(vec![0, 1])?; + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let decoder: BundleAssignmentReader<_> = + reader.open_assignment_reader().unwrap(); + assert!(decoder.is_ben()); + assert!(!decoder.is_xben()); + } + + // ----------------------------------------------------------------------- + // Robustness tests + // ----------------------------------------------------------------------- + + #[test] + fn fully_empty_bundle_finalizes_and_round_trips() { + // No assets, no stream bytes, no stream phase at all. + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(0)); + assert_eq!(reader.header().stream_len, 0); + assert_eq!(reader.assets().len(), 0); + // Even with zero assets the directory is present and empty. + assert_ne!(reader.header().directory_offset, 0); + // directory_len should equal the 4-byte empty entry-count header. + assert_eq!(reader.header().directory_len, 4); + } + + #[test] + fn begin_stream_twice_returns_wrong_state_error() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + { + let handle = match writer.begin_stream() { + Ok(h) => h, + Err(_) => panic!("first begin_stream must succeed"), + }; + // Drop the handle without calling finish() — the writer is + // now stuck in the Streaming state. + drop(handle); + } + let err = writer.begin_stream().err().expect("second begin_stream must fail"); + assert!(matches!(err, BendlWriteError::WrongState { .. })); + } + + #[test] + fn finish_from_streaming_state_errors() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + match writer.begin_stream() { + Ok(handle) => drop(handle), + Err(_) => panic!("begin_stream must succeed"), + } + // Intentionally leave the writer in the Streaming state. + let err = writer.finish().unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongState { found: "Streaming", .. } + )); + } + + #[test] + fn stress_many_custom_assets_round_trip() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let count = 500usize; + for i in 0..count { + let name = format!("blob_{i:05}"); + let payload = vec![(i & 0xFF) as u8; (i % 17) + 1]; + writer + .add_asset(ASSET_TYPE_CUSTOM, &name, &payload, AddAssetOptions::defaults()) + .unwrap(); + } + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), count); + // Spot-check a handful of entries by reading their payload bytes back. + for i in [0usize, 1, 42, 199, 499] { + let name = format!("blob_{i:05}"); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(got, vec![(i & 0xFF) as u8; (i % 17) + 1]); + } + } + + #[test] + fn append_empty_commit_is_noop() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + let appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + // No add_asset calls. Commit should return the file unchanged. + let buf = appender.commit().unwrap().into_inner(); + assert_eq!(buf, bundle_before); + } + + #[test] + fn append_then_reopen_and_append_again() { + let (bundle, _) = build_base_bundle(); + + // First commit: add a graph. + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Second commit: reopen the same bytes and add a custom blob. + let mut appender = BendlAppender::open(Cursor::new(buf)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"later", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Final read: all three assets should be present. + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let names: Vec<&str> = reader.assets().iter().map(|e| e.name.as_str()).collect(); + assert!(names.contains(&"metadata.json")); + assert!(names.contains(&"graph.json")); + assert!(names.contains(&"extra.bin")); + // Sample count from the original stream is preserved across both + // appends. + assert_eq!(reader.sample_count(), Some(3)); + } + + #[test] + fn append_does_not_disturb_front_loaded_asset_bytes() { + // Base bundle has a graph.json asset with known bytes; after + // append of a custom blob, reading graph.json must still return + // exactly the same decoded bytes as before. + let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9,10]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let bundle = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let graph_before = reader.asset_bytes(&entry).unwrap(); + drop(reader); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"0123456789", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let graph_after = reader.asset_bytes(&entry).unwrap(); + assert_eq!(graph_before, graph_after); + } + + #[test] + fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { + // A custom asset named "graph.json" is not a singleton because the + // singleton uniqueness check keys off asset_type, not name. Adding + // a real GRAPH singleton after it must then fail on DuplicateName. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "graph.json", + b"custom graph-ish bytes", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "graph.json")); + } + + #[test] + fn writer_asset_with_checksum_round_trips_through_reader() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let checksum = vec![0x01, 0x02, 0x03, 0x04]; + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "with_checksum", + b"hello", + AddAssetOptions { + checksum: Some(checksum.clone()), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_name("with_checksum") + .cloned() + .unwrap(); + assert_eq!(entry.checksum, Some(checksum)); + assert_ne!(entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, 0); + } + + #[test] + fn finished_writer_rejects_further_operations() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + // Take a handle to the writer by going through begin_stream first. + // Actually finish() consumes self, so instead assert the state + // machine barfs when we manually poke it in the Finished state. + // + // We simulate by calling finish() and then checking there is no + // way to call add_asset/begin_stream afterwards — `finish` consumes + // `self`, which is itself the protection. + let buf = writer.finish().unwrap().into_inner(); + // The resulting buffer is a valid finalized bundle. + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + } + + #[test] + fn appender_commit_after_abort_is_not_possible_but_abort_leaves_bytes_unchanged() { + let (bundle, _) = build_base_bundle(); + let before = bundle.clone(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "wont_land", + b"orphan", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.abort().into_inner(); + assert_eq!(buf, before, "abort must leave file bytes unchanged"); + } + + #[test] + fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_METADATA, "meta.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_METADATA, .. } + )); + // After a rejected add, no entries have been recorded — a + // subsequent valid add proceeds normally. + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 1); + } + + #[test] + fn writer_rejected_add_leaves_singleton_slot_usable() { + // A rejected singleton add must not consume the singleton slot — + // otherwise a future valid add with the correct canonical name + // would spuriously fail with DuplicateSingletonType. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + // First try with wrong canonical name — rejected. + let _ = writer + .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") + .unwrap_err(); + // Now retry with correct name; should succeed. + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap(); + } + + #[test] + fn append_rejects_duplicate_name_across_existing_and_pending() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + // First pending add: "blob". + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"1", + AddAssetOptions::defaults(), + ) + .unwrap(); + // Second pending add with same name must be rejected. + let err = appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"2", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(_))); + // Committing the still-valid first pending add should still work. + let buf = appender.commit().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.find_asset_by_name("blob").is_some()); + } + + #[test] + fn write_ben_stream_closure_error_short_circuits_finalize() { + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .write_ben_stream(BenVariant::Standard, |_ctx| { + Err(io::Error::new(io::ErrorKind::Other, "boom")) + }) + .unwrap_err(); + match err { + BendlWriteError::Io(e) => assert_eq!(e.kind(), io::ErrorKind::Other), + other => panic!("expected Io(Other), got {other:?}"), + } + } + + // ----------------------------------------------------------------------- + // Randomized / stress tests + // ----------------------------------------------------------------------- + + /// Build a bundle from a random set of custom assets (plus an optional + /// metadata asset) and fully round-trip it through the reader. Repeated + /// with a seeded ChaCha PRNG so the sequence is deterministic but + /// covers a wide surface. + #[test] + fn randomized_round_trip_many_custom_assets() { + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + for seed in 0u64..12 { + let mut rng = ChaCha8Rng::seed_from_u64(seed ^ 0xA110_CADE_F00D); + let n_assets: usize = rng.random_range(0..=25); + let include_metadata = rng.random_bool(0.5); + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + + let mut expected: Vec<(String, Vec, bool)> = Vec::new(); + if include_metadata { + let payload = format!(r#"{{"seed":{seed}}}"#).into_bytes(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", &payload) + .unwrap(); + expected.push(("metadata.json".to_string(), payload, false)); + } + + for i in 0..n_assets { + let size: usize = rng.random_range(0..=512); + let payload: Vec = (0..size).map(|_| rng.random::()).collect(); + let compress = rng.random_bool(0.4); + let is_json = rng.random_bool(0.15) && size > 0; + let payload = if is_json { + // Override with a synthetic JSON blob so the json flag + // actually matches the content. + format!(r#"{{"i":{i},"seed":{seed}}}"#).into_bytes() + } else { + payload + }; + + let mut opts = AddAssetOptions::defaults(); + if compress { + opts = opts.compress(); + } else { + opts = opts.raw(); + } + if is_json { + opts = opts.json(); + } + let name = format!("seed{seed}-asset{i}.bin"); + writer + .add_asset(ASSET_TYPE_CUSTOM, &name, &payload, opts) + .unwrap(); + expected.push((name, payload, is_json)); + } + + // Write a small deterministic stream so the bundle is + // assignment-complete. + let sample_count: i64 = rng.random_range(0..=20); + let fake_stream = b"STANDARD BEN FILE\x00\x01\x02payload".to_vec(); + writer + .write_stream_bytes(&fake_stream, sample_count) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete(), "seed {seed}: not finalized"); + assert_eq!(reader.sample_count(), Some(sample_count)); + reader + .validate_directory() + .unwrap_or_else(|e| panic!("seed {seed}: validation failed: {e:?}")); + assert_eq!(reader.assets().len(), expected.len(), "seed {seed}"); + + for (name, want, _is_json) in &expected { + let entry = reader + .find_asset_by_name(name) + .cloned() + .unwrap_or_else(|| panic!("seed {seed}: asset {name:?} missing")); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "seed {seed}: payload mismatch for {name}"); + } + + // Stream must also read back exactly. + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, fake_stream, "seed {seed}"); + } + } + + #[test] + fn five_successive_appends_preserve_everything() { + // Start from a finalized bundle with only a metadata asset and a + // short stream. Then open it five times via BendlAppender and add + // one asset per round. After every round, the previous assets must + // still be readable and sample_count must remain authoritative. + let (mut buf, _) = build_base_bundle(); + + // Sanity-check the baseline. + let baseline_reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let baseline_samples = baseline_reader.sample_count(); + assert!(baseline_samples.is_some()); + drop(baseline_reader); + + let mut accumulated: Vec<(String, Vec)> = vec![( + "metadata.json".to_string(), + br#"{"version":1}"#.to_vec(), + )]; + + for round in 0..5 { + let cursor = Cursor::new(buf); + let mut appender = BendlAppender::open(cursor).unwrap(); + let name = format!("round-{round}.bin"); + let payload: Vec = (0u8..=(round as u8 * 7 + 3)).collect(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) + .unwrap(); + let commit = appender.commit().unwrap(); + buf = commit.into_inner(); + accumulated.push((name, payload)); + + // Re-open and verify the full set is intact and sample_count + // still matches the baseline (append must not touch it). + let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + assert!(reader.is_complete(), "round {round}"); + assert_eq!( + reader.sample_count(), + baseline_samples, + "sample count drifted at round {round}" + ); + assert_eq!( + reader.assets().len(), + accumulated.len(), + "asset count wrong at round {round}" + ); + reader.validate_directory().unwrap(); + + for (n, want) in &accumulated { + let entry = reader + .find_asset_by_name(n) + .cloned() + .unwrap_or_else(|| panic!("round {round}: {n:?} missing")); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "round {round}: payload mismatch for {n}"); + } + } + } + + #[test] + fn randomized_append_sequence_preserves_all_prior_entries() { + // Independent coverage for append: random number of rounds, random + // payload sizes. Catches any bookkeeping drift in the appender's + // directory-rewrite path. + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + let (mut buf, _) = build_base_bundle(); + let mut accumulated: Vec<(String, Vec)> = vec![( + "metadata.json".to_string(), + br#"{"version":1}"#.to_vec(), + )]; + + let mut rng = ChaCha8Rng::seed_from_u64(0xDEAD_BEEF_CAFE_F00D); + let rounds: usize = rng.random_range(3..=8); + for round in 0..rounds { + let adds: usize = rng.random_range(1..=4); + let cursor = Cursor::new(buf); + let mut appender = BendlAppender::open(cursor).unwrap(); + for k in 0..adds { + let size: usize = rng.random_range(0..=256); + let payload: Vec = + (0..size).map(|_| rng.random::()).collect(); + let name = format!("r{round}-a{k}.bin"); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) + .unwrap(); + accumulated.push((name, payload)); + } + let commit = appender.commit().unwrap(); + buf = commit.into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + reader.validate_directory().unwrap(); + assert_eq!(reader.assets().len(), accumulated.len()); + for (n, want) in &accumulated { + let entry = reader.find_asset_by_name(n).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "append round {round}: {n}"); + } + } + } +} diff --git a/ben/src/io/mod.rs b/ben/src/io/mod.rs index 785e3c0..6c2a848 100644 --- a/ben/src/io/mod.rs +++ b/ben/src/io/mod.rs @@ -1,4 +1,5 @@ //! Stream-oriented readers and writers for BEN and XBEN files. +pub mod bundle; pub mod reader; pub mod writer; diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 0d566ac..7ab755a 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -39,6 +39,7 @@ fn bin_path(name: &str) -> &'static str { "ben" => env!("CARGO_BIN_EXE_ben"), "pben" => env!("CARGO_BIN_EXE_pben"), "reben" => env!("CARGO_BIN_EXE_reben"), + "bendl" => env!("CARGO_BIN_EXE_bendl"), _ => panic!("unknown binary {name}"), } } @@ -114,7 +115,7 @@ fn sample_graph() -> &'static str { #[test] fn all_clis_report_help_and_package_version() { - for bin in ["ben", "pben", "reben"] { + for bin in ["ben", "pben", "reben", "bendl"] { let help = run(bin, &["--help"], Path::new(".")); assert_success(&help); let help_text = String::from_utf8_lossy(&help.stdout); @@ -1500,3 +1501,169 @@ fn pben_cli_converts_between_formats() { let printed = String::from_utf8_lossy(&xdecode.stdout); assert!(printed.contains(r#""assignment":[2,2,3]"#)); } + +#[test] +fn bendl_cli_create_inspect_extract_append_roundtrip() { + let temp = TempDir::new("bendl-workflow"); + + // Seed: a .ben assignment file to wrap. + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + assert_success(&run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + ben_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + )); + + // Seed: a graph.json file to front-load as an asset. + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + + // Seed: a small metadata.json file. + let metadata_path = temp.path().join("metadata.json"); + fs::write(&metadata_path, r#"{"note":"hello"}"#).unwrap(); + + // `bendl create` — build a finalized bundle. + let bundle_path = temp.path().join("out.bendl"); + let create = run( + "bendl", + &[ + "create", + "--input", + ben_path.to_str().unwrap(), + "--output", + bundle_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--metadata", + metadata_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&create); + assert!(bundle_path.exists()); + + // `bendl inspect` — header should report both assets and complete=true. + let inspect = run( + "bendl", + &["inspect", bundle_path.to_str().unwrap()], + temp.path(), + ); + assert_success(&inspect); + let inspect_out = String::from_utf8_lossy(&inspect.stdout); + assert!(inspect_out.contains("complete: true")); + assert!(inspect_out.contains("assignment_format: ben")); + assert!(inspect_out.contains("graph.json")); + assert!(inspect_out.contains("metadata.json")); + + // `bendl extract --stream` — recover the original .ben bytes exactly. + let recovered_ben = temp.path().join("recovered.ben"); + let extract_stream = run( + "bendl", + &[ + "extract", + bundle_path.to_str().unwrap(), + "--stream", + "--output", + recovered_ben.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_stream); + assert_eq!( + fs::read(&recovered_ben).unwrap(), + fs::read(&ben_path).unwrap() + ); + + // `bendl extract --asset graph.json` — recover the decoded graph JSON. + let recovered_graph = temp.path().join("recovered-graph.json"); + let extract_asset = run( + "bendl", + &[ + "extract", + bundle_path.to_str().unwrap(), + "--asset", + "graph.json", + "--output", + recovered_graph.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_asset); + assert_eq!( + fs::read_to_string(&recovered_graph).unwrap(), + sample_graph() + ); + + // `bendl append` — add a custom asset to the already-finalized bundle. + let custom_path = temp.path().join("notes.txt"); + fs::write(&custom_path, b"bundle notes").unwrap(); + let append = run( + "bendl", + &[ + "append", + bundle_path.to_str().unwrap(), + "--asset", + &format!("notes={}", custom_path.display()), + ], + temp.path(), + ); + assert_success(&append); + + // Inspect again: new asset should be present, old assets preserved. + let inspect2 = run( + "bendl", + &["inspect", bundle_path.to_str().unwrap()], + temp.path(), + ); + assert_success(&inspect2); + let inspect2_out = String::from_utf8_lossy(&inspect2.stdout); + assert!(inspect2_out.contains("graph.json")); + assert!(inspect2_out.contains("metadata.json")); + assert!(inspect2_out.contains("notes")); + + // Stream bytes should still match after append. + let recovered_ben2 = temp.path().join("recovered2.ben"); + let extract_stream2 = run( + "bendl", + &[ + "extract", + bundle_path.to_str().unwrap(), + "--stream", + "--output", + recovered_ben2.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_stream2); + assert_eq!( + fs::read(&recovered_ben2).unwrap(), + fs::read(&ben_path).unwrap() + ); + + // Appending a second graph.json is rejected — singleton constraint. + let append_duplicate = run( + "bendl", + &[ + "append", + bundle_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&append_duplicate); +} diff --git a/pyben/binary_ensemble/__init__.py b/pyben/binary_ensemble/__init__.py index fe15f10..fe76a5e 100644 --- a/pyben/binary_ensemble/__init__.py +++ b/pyben/binary_ensemble/__init__.py @@ -1,6 +1,7 @@ from ._core import ( PyBenDecoder, PyBenEncoder, + PyBundleReader, compress_jsonl_to_ben, compress_ben_to_xben, compress_jsonl_to_xben, @@ -12,6 +13,7 @@ __all__ = [ "PyBenDecoder", "PyBenEncoder", + "PyBundleReader", "compress_jsonl_to_ben", "compress_ben_to_xben", "compress_jsonl_to_xben", diff --git a/pyben/src/bundle/mod.rs b/pyben/src/bundle/mod.rs new file mode 100644 index 0000000..592687b --- /dev/null +++ b/pyben/src/bundle/mod.rs @@ -0,0 +1,241 @@ +//! Python bindings for the `.bendl` bundle container. +//! +//! Exposes a [`PyBundleReader`] that wraps +//! [`binary_ensemble::io::bundle::BendlReader`] and provides a small +//! Python-facing surface: +//! +//! - `is_complete()`, `sample_count()`, `assignment_format()` +//! - `asset_names()` / `list_assets()` +//! - `read_asset_bytes(name)` — raw (decoded) bytes as `bytes` +//! - `read_json_asset(name)` — parsed JSON as a Python object +//! - `read_graph()` / `read_metadata()` / `read_relabel_map()` — canonical-name helpers +//! - `extract_stream(out_path, overwrite=False)` — copy the embedded +//! assignment stream to a `.ben` / `.xben` file the caller can then +//! open with `PyBenDecoder`. + +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, BufWriter}; +use std::path::PathBuf; + +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, + ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, +}; +use binary_ensemble::io::bundle::BendlReader; +use pyo3::exceptions::{PyException, PyIOError, PyKeyError}; +use pyo3::prelude::*; +use pyo3::types::PyDict; + +/// Python-facing wrapper around a `BendlReader>`. +#[pyclass(module = "binary_ensemble", unsendable, name = "PyBundleReader")] +pub struct PyBundleReader { + inner: BendlReader>, + path: PathBuf, +} + +#[pymethods] +impl PyBundleReader { + /// Open a `.bendl` file for reading. + #[new] + #[pyo3(text_signature = "(file_path)")] + fn new(file_path: PathBuf) -> PyResult { + let file = File::open(&file_path).map_err(|e| { + PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) + })?; + let inner = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + file_path.display() + )) + })?; + Ok(Self { + inner, + path: file_path, + }) + } + + /// Return the bundle's format version as a `(major, minor)` tuple. + #[pyo3(text_signature = "(self)")] + fn version(&self) -> (u16, u16) { + let h = self.inner.header(); + (h.major_version, h.minor_version) + } + + /// Whether the bundle was successfully finalized. + #[pyo3(text_signature = "(self)")] + fn is_complete(&self) -> bool { + self.inner.is_complete() + } + + /// Authoritative sample count from the header, or `None` when the + /// bundle is incomplete. + #[pyo3(text_signature = "(self)")] + fn sample_count(&self) -> Option { + self.inner.sample_count() + } + + /// Container format of the embedded assignment stream: `"ben"` or + /// `"xben"`, or `None` when the header byte is unrecognized. + #[pyo3(text_signature = "(self)")] + fn assignment_format(&self) -> Option<&'static str> { + self.inner.assignment_format().map(|f| match f { + AssignmentFormat::Ben => "ben", + AssignmentFormat::Xben => "xben", + }) + } + + /// Names of all directory entries, in directory order. + #[pyo3(text_signature = "(self)")] + fn asset_names(&self) -> Vec { + self.inner + .assets() + .iter() + .map(|e| e.name.clone()) + .collect() + } + + /// Return the full directory as a list of dicts with keys + /// `name`, `type`, `offset`, `len`, and `flags` (a list of string tags). + #[pyo3(text_signature = "(self)")] + fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { + let mut out = Vec::with_capacity(self.inner.assets().len()); + for entry in self.inner.assets() { + let d = PyDict::new(py); + d.set_item("name", &entry.name)?; + d.set_item("type", entry.asset_type)?; + d.set_item("offset", entry.payload_offset)?; + d.set_item("len", entry.payload_len)?; + let mut flags: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flags.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flags.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flags.push("checksum"); + } + d.set_item("flags", flags)?; + out.push(d); + } + Ok(out) + } + + /// Read the (decoded) bytes of an asset by name and return them as + /// a Python `bytes` object. + #[pyo3(text_signature = "(self, name, /)")] + fn read_asset_bytes(&mut self, name: &str) -> PyResult> { + let entry = self + .inner + .find_asset_by_name(name) + .cloned() + .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; + self.inner + .asset_bytes(&entry) + .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) + } + + /// Parse a JSON asset into a Python object (dict, list, …). Fails + /// if the asset does not exist or the decoded bytes are not JSON. + #[pyo3(text_signature = "(self, name, /)")] + fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { + let bytes = self.read_asset_bytes(name)?; + let json_mod = py.import("json")?; + let text = std::str::from_utf8(&bytes).map_err(|e| { + PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")) + })?; + let parsed = json_mod.call_method1("loads", (text,))?; + Ok(parsed.into()) + } + + /// Read the bundle's `graph.json` asset as a parsed JSON object. + /// Returns `None` if the bundle does not carry a graph asset. + #[pyo3(text_signature = "(self)")] + fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { + if self + .inner + .find_asset_by_type(ASSET_TYPE_GRAPH) + .is_none() + { + return Ok(None); + } + Ok(Some(self.read_json_asset(py, "graph.json")?)) + } + + /// Read the bundle's `metadata.json` asset as a parsed JSON object, + /// or `None` if absent. + #[pyo3(text_signature = "(self)")] + fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { + if self + .inner + .find_asset_by_type(ASSET_TYPE_METADATA) + .is_none() + { + return Ok(None); + } + Ok(Some(self.read_json_asset(py, "metadata.json")?)) + } + + /// Read the bundle's `relabel_map.json` asset as a parsed JSON + /// object, or `None` if absent. + #[pyo3(text_signature = "(self)")] + fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { + if self + .inner + .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) + .is_none() + { + return Ok(None); + } + Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) + } + + /// Copy the embedded assignment stream region verbatim to + /// `out_path`. The resulting file can be opened directly with + /// `PyBenDecoder(out_path, mode=assignment_format())`. + #[pyo3(signature = (out_path, overwrite=false))] + #[pyo3(text_signature = "(self, out_path, overwrite=False)")] + fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { + if out_path.exists() && !overwrite { + return Err(PyIOError::new_err(format!( + "Output file {} already exists (use overwrite=True to replace).", + out_path.display() + ))); + } + let out = if overwrite { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&out_path) + } else { + OpenOptions::new() + .write(true) + .create_new(true) + .open(&out_path) + } + .map_err(|e| { + PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())) + })?; + let mut out = BufWriter::new(out); + + let mut stream = self.inner.assignment_stream_reader().map_err(|e| { + PyException::new_err(format!("Failed to open stream region: {e}")) + })?; + io::copy(&mut stream, &mut out).map_err(|e| { + PyIOError::new_err(format!("Failed to copy stream bytes: {e}")) + })?; + Ok(()) + } + + fn __repr__(&self) -> String { + format!( + "PyBundleReader(path={:?}, complete={}, format={:?}, samples={:?}, assets={})", + self.path.display(), + self.inner.is_complete(), + self.inner.assignment_format(), + self.inner.sample_count(), + self.inner.assets().len(), + ) + } +} diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 071c58c..3f6d3e3 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -2,52 +2,255 @@ use crate::common::{open_input, open_output, parse_variant, validate_input_outpu use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, }; +use binary_ensemble::io::bundle::format::{ + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, + ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH, COMPLETE_YES, DEFAULT_XZ_PRESET, + HEADER_SIZE, +}; use binary_ensemble::io::writer::AssignmentWriter; -use pyo3::exceptions::PyIOError; -use pyo3::prelude::PyResult; -use pyo3::{pyclass, pyfunction, pymethods}; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyDict, PyList}; +use std::cell::RefCell; use std::fs::File; -use std::io::BufWriter; +use std::io::{self, BufWriter, Seek, SeekFrom, Write}; use std::path::PathBuf; +use std::rc::Rc; +use xz2::write::XzEncoder; + +/// Handle to the underlying output file shared between the live +/// `AssignmentWriter` and the `PyBenEncoder` that owns it. Needed so the +/// encoder can reach the buffered file after the inner assignment writer +/// has finished, in order to patch the bundle header and write the +/// trailing directory. +type SharedFileSlot = Rc>>; + +/// Wrapper around a shared buffered file that implements `Write`. The +/// `AssignmentWriter` holds one of these and delegates every write into +/// the shared slot. +struct SharedFileWriter(SharedFileSlot); -#[pyclass] +impl Write for SharedFileWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0.borrow_mut().write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.0.borrow_mut().flush() + } +} + +/// Output container produced by `PyBenEncoder`. +enum OutputMode { + /// Plain `.ben` file: just the assignment stream, no header or directory. + BenOnly, + /// `.bendl` bundle: provisional header up front, optional graph asset, + /// then the assignment stream, then a directory written at close time. + Bundle { + header: BendlHeader, + entries: Vec, + stream_start: u64, + sample_count: i64, + }, +} + +#[pyclass(unsendable)] pub struct PyBenEncoder { - encoder: Option>>, + file: Option, + encoder: Option>, + mode: OutputMode, } #[pymethods] impl PyBenEncoder { + /// Open a new encoder. The default output is a `.bendl` bundle with + /// an embedded assignment stream and an optional embedded graph; set + /// `ben_file_only=True` to emit a plain `.ben` file instead. + /// + /// # Arguments + /// + /// * `file_path` - Output path. Must not exist unless `overwrite=True`. + /// * `overwrite` - Replace an existing file at `file_path`. + /// * `variant` - BEN variant for the assignment stream (`"standard"`, + /// `"mkv_chain"`, or `"twodelta"`). + /// * `graph` - Optional graph to embed as the `graph.json` asset when + /// writing a bundle. Accepts a `pathlib.Path` / `str` path, a + /// `bytes` object containing UTF-8 JSON, a Python `dict` / `list` + /// that will be serialized with `json.dumps`, or a file-like object + /// with a `.read()` method. Passing a graph alongside + /// `ben_file_only=True` is an error. + /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no + /// bundle framing. Defaults to `False`. #[new] - #[pyo3(signature = (file_path, overwrite = false, variant = None))] - #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] - fn new(file_path: PathBuf, overwrite: bool, variant: Option) -> PyResult { + #[pyo3(signature = ( + file_path, + overwrite = false, + variant = None, + graph = None, + ben_file_only = false, + ))] + #[pyo3(text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)")] + fn new( + py: Python<'_>, + file_path: PathBuf, + overwrite: bool, + variant: Option, + graph: Option>, + ben_file_only: bool, + ) -> PyResult { let ben_var = parse_variant(variant.as_deref())?; - let writer = open_output(&file_path, overwrite)?; - let encoder = AssignmentWriter::new(writer, ben_var) - .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {}", e)))?; + if ben_file_only && graph.is_some() { + return Err(PyValueError::new_err( + "graph= cannot be combined with ben_file_only=True (the graph \ + would have nowhere to live in a plain .ben file).", + )); + } + + let buf = open_output(&file_path, overwrite)?; + let file: SharedFileSlot = Rc::new(RefCell::new(buf)); + + let mode = if ben_file_only { + OutputMode::BenOnly + } else { + let graph_bytes = match graph { + Some(obj) => Some(parse_graph_input(py, &obj)?), + None => None, + }; + + // Write a provisional bundle header and any graph asset before + // the assignment stream begins. + let mut header = BendlHeader::provisional(AssignmentFormat::Ben, HEADER_SIZE as u64); + let mut entries: Vec = Vec::new(); + { + let mut slot = file.borrow_mut(); + slot.seek(SeekFrom::Start(0)) + .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; + header + .write_to(&mut *slot) + .map_err(|e| PyIOError::new_err(format!("Failed to write bundle header: {e}")))?; + + if let Some(bytes) = graph_bytes { + let compressed = xz_compress(&bytes).map_err(|e| { + PyIOError::new_err(format!("Failed to xz-compress graph asset: {e}")) + })?; + let payload_offset = slot.stream_position().map_err(|e| { + PyIOError::new_err(format!("Failed to query output position: {e}")) + })?; + slot.write_all(&compressed).map_err(|e| { + PyIOError::new_err(format!("Failed to write graph asset payload: {e}")) + })?; + entries.push(BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset, + payload_len: compressed.len() as u64, + checksum: None, + }); + } + } + + let stream_start = file.borrow_mut().stream_position().map_err(|e| { + PyIOError::new_err(format!("Failed to query output position: {e}")) + })?; + header.stream_offset = stream_start; + + OutputMode::Bundle { + header, + entries, + stream_start, + sample_count: 0, + } + }; + + // Construct the AssignmentWriter on a clone of the shared slot. + // This writes the BEN banner as its first action, which in the + // bundle case becomes the first byte of the stream region. + let encoder = AssignmentWriter::new(SharedFileWriter(Rc::clone(&file)), ben_var) + .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {e}")))?; + Ok(PyBenEncoder { + file: Some(file), encoder: Some(encoder), + mode, }) } + /// Encode a single assignment and append it to the output stream. #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { - if let Some(enc) = self.encoder.as_mut() { - enc.write_assignment(assignment) - .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {}", e)))?; - Ok(()) - } else { - Err(PyIOError::new_err("Encoder has already been closed.")) + let enc = self.encoder.as_mut().ok_or_else(|| { + PyIOError::new_err("Encoder has already been closed.") + })?; + enc.write_assignment(assignment) + .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {e}")))?; + if let OutputMode::Bundle { sample_count, .. } = &mut self.mode { + *sample_count += 1; } + Ok(()) } + /// Flush the assignment stream and, for bundle output, patch the + /// header and write the trailing directory. Idempotent. fn close(&mut self) -> PyResult<()> { + // Finish the assignment stream and drop the inner encoder so its + // Rc handle to the shared file slot is released. if let Some(mut enc) = self.encoder.take() { enc.finish().map_err(|e| { - PyIOError::new_err(format!("Failed to flush encoder when closing: {}", e)) + PyIOError::new_err(format!("Failed to flush encoder when closing: {e}")) })?; + drop(enc); + } + + let file = match self.file.take() { + Some(f) => f, + None => return Ok(()), + }; + + match &mut self.mode { + OutputMode::BenOnly => { + file.borrow_mut() + .flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + } + OutputMode::Bundle { + header, + entries, + stream_start, + sample_count, + } => { + let mut slot = file.borrow_mut(); + let stream_end = slot.stream_position().map_err(|e| { + PyIOError::new_err(format!("Failed to query output position: {e}")) + })?; + let stream_len = stream_end.saturating_sub(*stream_start); + + let directory_offset = stream_end; + let directory_bytes = encode_directory(entries).map_err(|e| { + PyException::new_err(format!("Failed to encode bundle directory: {e}")) + })?; + slot.write_all(&directory_bytes).map_err(|e| { + PyIOError::new_err(format!("Failed to write bundle directory: {e}")) + })?; + let directory_len = directory_bytes.len() as u64; + + header.stream_offset = *stream_start; + header.stream_len = stream_len; + header.directory_offset = directory_offset; + header.directory_len = directory_len; + header.sample_count = *sample_count; + header.complete = COMPLETE_YES; + + slot.seek(SeekFrom::Start(0)) + .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; + header.write_to(&mut *slot).map_err(|e| { + PyIOError::new_err(format!("Failed to patch bundle header: {e}")) + })?; + slot.flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + } } Ok(()) } @@ -67,6 +270,68 @@ impl PyBenEncoder { } } +/// xz-compress a byte slice with the bundle's default preset. +fn xz_compress(bytes: &[u8]) -> io::Result> { + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder.write_all(bytes)?; + encoder.finish() +} + +/// Normalize a user-supplied graph argument into raw UTF-8 JSON bytes. +/// +/// Accepted forms: +/// +/// - `dict` / `list`: serialized via `json.dumps`. +/// - `bytes` / `bytearray`: used verbatim. +/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): +/// `.read()` is called and the result is coerced to bytes. +/// - `pathlib.Path` or `str`: treated as a filesystem path to read. +fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { + // Dict / list → json.dumps. + if obj.is_instance_of::() || obj.is_instance_of::() { + let json_mod = py.import("json")?; + let dumped = json_mod.call_method1("dumps", (obj,))?; + let s: String = dumped.extract()?; + return Ok(s.into_bytes()); + } + + // Raw bytes / bytearray. + if let Ok(b) = obj.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = obj.extract::>() { + return Ok(b); + } + + // File-like: must have .read(). Check before str/path, since a plain + // `str` / `Path` has no `.read()` attribute and will fall through. + if obj.hasattr("read")? { + let data = obj.call_method0("read")?; + if let Ok(b) = data.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = data.extract::>() { + return Ok(b); + } + if let Ok(s) = data.extract::() { + return Ok(s.into_bytes()); + } + return Err(PyException::new_err( + "graph .read() must return bytes or str", + )); + } + + // Path / str → read the file at that path. + let path: PathBuf = obj.extract().map_err(|_| { + PyValueError::new_err( + "graph must be a dict/list, bytes, a file-like with .read(), or a path", + ) + })?; + std::fs::read(&path).map_err(|e| { + PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) + }) +} + #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, n_threads = None, compression_level = None))] #[pyo3( diff --git a/pyben/src/lib.rs b/pyben/src/lib.rs index f976f1a..b993662 100755 --- a/pyben/src/lib.rs +++ b/pyben/src/lib.rs @@ -1,6 +1,7 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; // <-- needed for wrap_pyfunction! +pub mod bundle; pub mod common; pub mod decode; pub mod encode; @@ -10,6 +11,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { // Export classes m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(crate::decode::decompress_ben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::decode::decompress_xben_to_ben, m)?)?; m.add_function(wrap_pyfunction!( diff --git a/pyben/tests/test_bundle.py b/pyben/tests/test_bundle.py new file mode 100644 index 0000000..5f0115e --- /dev/null +++ b/pyben/tests/test_bundle.py @@ -0,0 +1,1375 @@ +"""Tests for PyBundleReader. + +These tests do not rely on the `bendl` CLI binary being built. Instead, they +construct `.bendl` bundles directly in Python from the on-disk format spec +documented in ``ben/src/io/bundle/format.rs``. This keeps the tests +self-contained and lets them stress odd byte layouts that a CLI-based helper +could not produce (truncated files, bad magic, dangling offsets, etc). + +Real BEN/XBEN stream payloads are produced via ``PyBenEncoder`` / +``compress_jsonl_to_xben`` so the stream region always matches what the +main compression pipeline would produce. +""" + +from __future__ import annotations + +import io +import json +import lzma +import random +import struct +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + +import pytest + +import binary_ensemble +from binary_ensemble import ( + PyBenDecoder, + PyBenEncoder, + PyBundleReader, + compress_jsonl_to_ben, + compress_jsonl_to_xben, +) + + +# --------------------------------------------------------------------------- +# Format constants (mirror ben/src/io/bundle/format.rs) +# --------------------------------------------------------------------------- + +BENDL_MAGIC = b"BENDL\x00\x00\x01" +BENDL_MAJOR_VERSION = 1 +BENDL_MINOR_VERSION = 0 +HEADER_SIZE = 64 + +COMPLETE_NO = 0 +COMPLETE_YES = 1 + +ASSIGNMENT_FORMAT_BEN = 1 +ASSIGNMENT_FORMAT_XBEN = 2 + +ASSET_TYPE_METADATA = 1 +ASSET_TYPE_GRAPH = 2 +ASSET_TYPE_RELABEL_MAP = 3 +ASSET_TYPE_CUSTOM = 4 + +ASSET_FLAG_JSON = 1 << 0 +ASSET_FLAG_XZ = 1 << 1 +ASSET_FLAG_CHECKSUM = 1 << 2 + + +# --------------------------------------------------------------------------- +# Byte-level bundle construction +# --------------------------------------------------------------------------- + + +def _pack_header( + *, + complete: int, + assignment_format: int, + directory_offset: int, + directory_len: int, + stream_offset: int, + stream_len: int, + sample_count: int, + magic: bytes = BENDL_MAGIC, + major_version: int = BENDL_MAJOR_VERSION, + minor_version: int = BENDL_MINOR_VERSION, + flags: int = 0, + reserved_0: int = 0, +) -> bytes: + if len(magic) != 8: + raise ValueError("magic must be 8 bytes") + return ( + magic + + struct.pack( + " bytes: + name_bytes = name.encode("utf-8") + checksum_bytes = checksum or b"" + header = struct.pack( + " bytes: + entries = list(entries) + return struct.pack(" bytes: + """Compress ``data`` with the xz container so the Rust xz2 decoder accepts it.""" + return lzma.compress(data, format=lzma.FORMAT_XZ, preset=6) + + +class _Asset: + """Helper describing one asset to place in a hand-built bundle.""" + + def __init__( + self, + *, + asset_type: int, + name: str, + payload: bytes, + is_json: bool = False, + compress: bool = False, + checksum: Optional[bytes] = None, + ) -> None: + self.asset_type = asset_type + self.name = name + self.raw_payload = payload + self.is_json = is_json + self.compress = compress + self.checksum = checksum + + def encoded_bytes(self) -> bytes: + return _xz(self.raw_payload) if self.compress else self.raw_payload + + def flags(self) -> int: + flags = 0 + if self.is_json: + flags |= ASSET_FLAG_JSON + if self.compress: + flags |= ASSET_FLAG_XZ + if self.checksum is not None: + flags |= ASSET_FLAG_CHECKSUM + return flags + + +def build_bundle( + *, + stream_bytes: bytes, + sample_count: int, + assignment_format: int = ASSIGNMENT_FORMAT_BEN, + assets: Iterable[_Asset] = (), + complete: int = COMPLETE_YES, + magic: bytes = BENDL_MAGIC, + major_version: int = BENDL_MAJOR_VERSION, +) -> bytes: + """Construct the bytes of a `.bendl` file from pieces. + + The layout is ``[header][asset payloads][stream][directory]``. This + helper mirrors the writer's finalize path closely enough to produce + bundles that the Rust reader accepts, while also exposing enough knobs + to generate deliberately broken bundles for negative tests. + """ + assets = list(assets) + + buf = bytearray() + # Reserve header space. + buf.extend(b"\x00" * HEADER_SIZE) + + # Write asset payloads and remember (offset, len, encoded_bytes) for each. + encoded_assets: List[Tuple[int, int, bytes]] = [] + for asset in assets: + offset = len(buf) + encoded = asset.encoded_bytes() + buf.extend(encoded) + encoded_assets.append((offset, len(encoded), encoded)) + + stream_offset = len(buf) + buf.extend(stream_bytes) + stream_len = len(stream_bytes) + + directory_offset = len(buf) + entries_bytes: List[bytes] = [] + for (offset, length, _enc), asset in zip(encoded_assets, assets): + entries_bytes.append( + _pack_directory_entry( + asset_type=asset.asset_type, + asset_flags=asset.flags(), + name=asset.name, + payload_offset=offset, + payload_len=length, + checksum=asset.checksum, + ) + ) + directory = _pack_directory(entries_bytes) + buf.extend(directory) + directory_len = len(directory) + + header = _pack_header( + complete=complete, + assignment_format=assignment_format, + directory_offset=directory_offset, + directory_len=directory_len, + stream_offset=stream_offset, + stream_len=stream_len, + sample_count=sample_count, + magic=magic, + major_version=major_version, + ) + buf[:HEADER_SIZE] = header + return bytes(buf) + + +# --------------------------------------------------------------------------- +# Real BEN/XBEN stream helpers +# --------------------------------------------------------------------------- + + +def _write_jsonl(samples: List[List[int]], path: Path) -> None: + with path.open("w", encoding="utf-8") as f: + for i, a in enumerate(samples, start=1): + json.dump({"assignment": a, "sample": i}, f, separators=(",", ":")) + f.write("\n") + + +def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: + """Produce real BEN bytes for ``samples`` via ``PyBenEncoder``.""" + ben_path = tmp / "inner.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant=variant, ben_file_only=True + ) as enc: + for a in samples: + enc.write(a) + return ben_path.read_bytes() + + +def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: + src = tmp / "src.jsonl" + _write_jsonl(samples, src) + out = tmp / "inner.xben" + compress_jsonl_to_xben( + src, out, overwrite=True, variant=variant, n_threads=1, compression_level=1 + ) + return out.read_bytes() + + +def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: + path.write_bytes(bundle_bytes) + return path + + +# --------------------------------------------------------------------------- +# Baseline happy-path tests +# --------------------------------------------------------------------------- + + +def test_module_exports_pybundlereader() -> None: + assert "PyBundleReader" in binary_ensemble.__all__ + assert hasattr(binary_ensemble, "PyBundleReader") + + +def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: + rng = random.Random(4242) + samples = [[rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40)] + + graph_json = b'{"nodes":[0,1,2,3],"edges":[[0,1],[1,2],[2,3]]}' + metadata_json = b'{"note":"hello bundle","seed":4242}' + relabel_json = b'{"0":"A","1":"B","2":"C","3":"D"}' + custom_blob = bytes(range(256)) + + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_BEN, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=metadata_json, + is_json=True, + compress=False, + ), + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=graph_json, + is_json=True, + compress=True, + ), + _Asset( + asset_type=ASSET_TYPE_RELABEL_MAP, + name="relabel_map.json", + payload=relabel_json, + is_json=True, + compress=False, + ), + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="notes.bin", + payload=custom_blob, + is_json=False, + compress=False, + ), + ], + ) + path = _write_bundle(tmp_path / "out.bendl", bundle) + + reader = PyBundleReader(path) + + assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) + assert reader.is_complete() is True + assert reader.sample_count() == len(samples) + assert reader.assignment_format() == "ben" + + names = reader.asset_names() + assert names == ["metadata.json", "graph.json", "relabel_map.json", "notes.bin"] + + assets = reader.list_assets() + assert [a["name"] for a in assets] == names + by_name = {a["name"]: a for a in assets} + assert by_name["graph.json"]["type"] == ASSET_TYPE_GRAPH + assert "xz" in by_name["graph.json"]["flags"] + assert "json" in by_name["graph.json"]["flags"] + assert "xz" not in by_name["metadata.json"]["flags"] + assert "json" in by_name["metadata.json"]["flags"] + assert by_name["notes.bin"]["flags"] == [] + # payload_offset must sit at or past the end of the header. + for entry in assets: + assert entry["offset"] >= HEADER_SIZE + assert entry["len"] > 0 + + # Raw byte access (decompresses xz transparently). + assert reader.read_asset_bytes("metadata.json") == metadata_json + assert reader.read_asset_bytes("graph.json") == graph_json + assert reader.read_asset_bytes("relabel_map.json") == relabel_json + assert reader.read_asset_bytes("notes.bin") == custom_blob + + # Typed JSON helpers. + assert reader.read_metadata() == json.loads(metadata_json) + assert reader.read_graph() == json.loads(graph_json) + assert reader.read_relabel_map() == json.loads(relabel_json) + + # read_json_asset by name. + assert reader.read_json_asset("metadata.json") == json.loads(metadata_json) + + # extract_stream then decode via PyBenDecoder. + extracted = tmp_path / "stream.ben" + reader.extract_stream(extracted) + got = list(PyBenDecoder(extracted, mode="ben")) + assert got == samples + + # __repr__ should not crash and should mention the path. + r = repr(reader) + assert "PyBundleReader" in r + assert "complete=true" in r or "complete=True" in r + + +def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: + samples = [[1, 2, 3], [1, 2, 3], [4, 4, 5], [6, 7, 8]] + bundle = build_bundle( + stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_XBEN, + assets=[], + ) + path = _write_bundle(tmp_path / "xout.bendl", bundle) + reader = PyBundleReader(path) + + assert reader.assignment_format() == "xben" + assert reader.is_complete() + assert reader.sample_count() == len(samples) + assert reader.asset_names() == [] + + # extract_stream → file must round-trip via the xben decoder. + extracted = tmp_path / "stream.xben" + reader.extract_stream(extracted) + assert list(PyBenDecoder(extracted, mode="xben")) == samples + + +def test_bundle_reader_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: + samples = [[1, 2, 3]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[ + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="only_custom.bin", + payload=b"x", + ), + ], + ) + path = _write_bundle(tmp_path / "sparse.bendl", bundle) + reader = PyBundleReader(path) + assert reader.read_metadata() is None + assert reader.read_graph() is None + assert reader.read_relabel_map() is None + + +def test_bundle_reader_asset_free_empty_stream(tmp_path: Path) -> None: + # A bundle with no assets and an empty stream is legal (spec says so). + bundle = build_bundle(stream_bytes=b"", sample_count=0, assets=[]) + path = _write_bundle(tmp_path / "empty.bendl", bundle) + reader = PyBundleReader(path) + assert reader.is_complete() + assert reader.sample_count() == 0 + assert reader.asset_names() == [] + assert reader.list_assets() == [] + # extract_stream writes a zero-byte file. + out = tmp_path / "empty.ben" + reader.extract_stream(out) + assert out.read_bytes() == b"" + + +# --------------------------------------------------------------------------- +# Robustness: asset lookup and JSON parsing +# --------------------------------------------------------------------------- + + +def test_read_asset_bytes_raises_keyerror_for_unknown_name(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="there.bin", payload=b"x"), + ], + ) + path = _write_bundle(tmp_path / "x.bendl", bundle) + reader = PyBundleReader(path) + with pytest.raises(KeyError, match="no asset named"): + reader.read_asset_bytes("missing.bin") + with pytest.raises(KeyError): + reader.read_json_asset("missing.json") + + +def test_read_json_asset_rejects_non_utf8_payload(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="binary.bin", + payload=b"\xff\xfe\xfd", # not valid UTF-8 + is_json=False, + compress=False, + ) + ], + ) + path = _write_bundle(tmp_path / "bin.bendl", bundle) + reader = PyBundleReader(path) + # Raw bytes come back fine. + assert reader.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" + # But the JSON helper must reject non-UTF8 bytes. + with pytest.raises(Exception, match="not valid UTF-8"): + reader.read_json_asset("binary.bin") + + +def test_read_json_asset_rejects_malformed_json(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b"not a json {{{", + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "m.bendl", bundle) + reader = PyBundleReader(path) + # Raw bytes: fine. + assert reader.read_asset_bytes("metadata.json") == b"not a json {{{" + # Parsed via python's json module: must raise. + with pytest.raises(json.JSONDecodeError): + reader.read_metadata() + + +def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: + # Directory entries store UTF-8 names; a multi-byte name should work. + name = "tëst_ääää_✓.bin" + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name=name, payload=b"payload"), + ], + ) + path = _write_bundle(tmp_path / "u.bendl", bundle) + reader = PyBundleReader(path) + assert reader.asset_names() == [name] + assert reader.read_asset_bytes(name) == b"payload" + + +def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: + # Stress the directory with a large-ish asset count. + payloads = {f"asset_{i:04d}.bin": bytes([i & 0xFF] * (i + 1)) for i in range(200)} + assets = [ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) + for n, p in payloads.items() + ] + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), + sample_count=1, + assets=assets, + ) + path = _write_bundle(tmp_path / "many.bendl", bundle) + reader = PyBundleReader(path) + names = reader.asset_names() + assert names == list(payloads.keys()) + # Spot-check the contents round-trip. + for i in (0, 1, 42, 199): + name = f"asset_{i:04d}.bin" + assert reader.read_asset_bytes(name) == payloads[name] + + +# --------------------------------------------------------------------------- +# Robustness: extract_stream overwrite semantics +# --------------------------------------------------------------------------- + + +def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + ) + path = _write_bundle(tmp_path / "a.bendl", bundle) + reader = PyBundleReader(path) + target = tmp_path / "already.ben" + target.write_bytes(b"pre-existing") + with pytest.raises(OSError, match="already exists"): + reader.extract_stream(target) + # File must be untouched. + assert target.read_bytes() == b"pre-existing" + + +def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), + sample_count=2, + ) + path = _write_bundle(tmp_path / "b.bendl", bundle) + reader = PyBundleReader(path) + target = tmp_path / "out.ben" + target.write_bytes(b"filler") + reader.extract_stream(target, overwrite=True) + # Re-opening the extracted file via PyBenDecoder confirms it's a valid .ben. + assert list(PyBenDecoder(target, mode="ben")) == [[1, 2], [3, 4]] + + +# --------------------------------------------------------------------------- +# Robustness: invalid headers and corrupted bundles +# --------------------------------------------------------------------------- + + +def test_open_rejects_missing_file(tmp_path: Path) -> None: + with pytest.raises(OSError, match="Failed to open"): + PyBundleReader(tmp_path / "does_not_exist.bendl") + + +def test_open_rejects_bad_magic(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + magic=b"NOTABEND", + ) + path = _write_bundle(tmp_path / "bad.bendl", bundle) + with pytest.raises(Exception, match="Failed to parse bundle header"): + PyBundleReader(path) + + +def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + major_version=999, + ) + path = _write_bundle(tmp_path / "oldfuture.bendl", bundle) + with pytest.raises(Exception, match="Failed to parse bundle header"): + PyBundleReader(path) + + +def test_open_rejects_truncated_header(tmp_path: Path) -> None: + path = tmp_path / "short.bendl" + path.write_bytes(b"BENDL\x00\x00\x01\x00") # magic plus 2 bytes — not enough + with pytest.raises(Exception, match="Failed to parse bundle header"): + PyBundleReader(path) + + +def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> None: + # Corrupt the directory's leading u32 entry-count so the reader tries + # to decode many more entries than the file actually contains. + bundle = bytearray( + build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="x", payload=b"abc")], + ) + ) + directory_offset = struct.unpack_from(" None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="x", payload=b"abc")], + ) + # Drop the final two bytes of the directory. + path = _write_bundle(tmp_path / "chop.bendl", bundle[:-2]) + with pytest.raises(Exception): + PyBundleReader(path) + + +def test_incomplete_bundle_reports_none_sample_count(tmp_path: Path) -> None: + # Provisional bundle with complete=0: sample_count() must be None. + stream = _ben_bytes_for([[1, 2, 3]], tmp_path) + # Build it by hand — no directory, complete=NO. + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=0, + sample_count=-1, + ) + path = _write_bundle(tmp_path / "incomplete.bendl", header + stream) + reader = PyBundleReader(path) + assert reader.is_complete() is False + assert reader.sample_count() is None + assert reader.asset_names() == [] + # extract_stream should still write out bytes that decode as BEN. + out = tmp_path / "extracted.ben" + reader.extract_stream(out) + assert list(PyBenDecoder(out, mode="ben")) == [[1, 2, 3]] + + +def test_unknown_assignment_format_byte_reports_none(tmp_path: Path) -> None: + # Assignment format byte = 0 → unknown. Finalized bundle but without + # a valid stream container — the directory side still works. + bundle = bytearray( + build_bundle( + stream_bytes=b"", + sample_count=0, + assets=[], + ) + ) + # assignment_format byte is at offset 13 in the header. + bundle[13] = 99 + path = _write_bundle(tmp_path / "wtfmt.bendl", bytes(bundle)) + reader = PyBundleReader(path) + assert reader.assignment_format() is None + assert reader.is_complete() + + +def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: + bundle = bytearray( + build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=b'{"nodes":[0,1,2,3,4,5,6,7,8,9]}', + is_json=True, + compress=True, + ) + ], + ) + ) + + # Hunt for the xz payload bytes and flip one in the middle. + # We know the xz magic is b"\xfd7zXZ". + xz_start = bundle.find(b"\xfd7zXZ") + assert xz_start != -1, "expected xz magic in hand-built bundle" + # Flip a byte well past the magic so the decoder reads it and fails. + bundle[xz_start + 20] ^= 0xFF + path = _write_bundle(tmp_path / "badxz.bendl", bytes(bundle)) + reader = PyBundleReader(path) + # Opening works — the header/directory are intact. + with pytest.raises(OSError): + reader.read_asset_bytes("graph.json") + + +def test_directory_entry_with_zero_length_custom_payload(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="empty.bin", payload=b""), + ], + ) + path = _write_bundle(tmp_path / "zlen.bendl", bundle) + reader = PyBundleReader(path) + assert reader.read_asset_bytes("empty.bin") == b"" + entry = next(a for a in reader.list_assets() if a["name"] == "empty.bin") + assert entry["len"] == 0 + + +def test_repr_on_incomplete_bundle(tmp_path: Path) -> None: + stream = _ben_bytes_for([[1, 2]], tmp_path) + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=0, + sample_count=-1, + ) + path = _write_bundle(tmp_path / "rep.bendl", header + stream) + reader = PyBundleReader(path) + r = repr(reader) + # Incomplete bundles report no sample count. + assert "samples=None" in r + assert "assets=0" in r + + +# --------------------------------------------------------------------------- +# Robustness: interrupted / truncated BEN streams inside a bundle +# --------------------------------------------------------------------------- + + +def _incomplete_bundle(stream_bytes: bytes) -> bytes: + """Simulate a writer that crashed mid-stream: valid header, partial + stream bytes, and no directory table at all (complete=0).""" + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=0, + sample_count=-1, + ) + return header + stream_bytes + + +def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) -> None: + # Simulate a writer that was killed after flushing the header and + # part of the BEN stream, but before the stream was finished or the + # directory was written. + samples = [[1, 1, 2, 2], [3, 3, 4, 4], [5, 5, 6, 6], [7, 7, 8, 8], [9, 9, 9, 9]] + full_ben = _ben_bytes_for(samples, tmp_path) + # Cut the BEN bytes well past the 17-byte banner but before the end + # so the truncation lands mid-frame. + assert len(full_ben) > 25 + partial = full_ben[: len(full_ben) - 3] + path = _write_bundle(tmp_path / "crashed.bendl", _incomplete_bundle(partial)) + + reader = PyBundleReader(path) + assert reader.is_complete() is False + assert reader.sample_count() is None + assert reader.assignment_format() == "ben" + + # extract_stream should write exactly the partial byte sequence. + extracted = tmp_path / "partial.ben" + reader.extract_stream(extracted) + assert extracted.read_bytes() == partial + + # The extracted file opens as a BEN stream (banner is intact). + dec = PyBenDecoder(extracted, mode="ben") + # Iterating through the truncated stream must either yield a strict + # prefix of the samples and then raise, or raise on the very first + # frame — both are acceptable outcomes. What is NOT acceptable is + # silently producing garbage or decoding past the truncation. + produced: list[list[int]] = [] + with pytest.raises(Exception): + for a in dec: + produced.append(a) + # Whatever came out must be a strict prefix of the original samples. + assert produced == samples[: len(produced)] + assert len(produced) < len(samples) + + +def test_interrupted_ben_stream_inside_banner_fails_to_open_decoder( + tmp_path: Path, +) -> None: + # Truncate the BEN bytes inside the 17-byte banner region. + full_ben = _ben_bytes_for([[1, 2, 3]], tmp_path) + path = _write_bundle(tmp_path / "head_cut.bendl", _incomplete_bundle(full_ben[:8])) + + reader = PyBundleReader(path) + assert reader.is_complete() is False + + extracted = tmp_path / "head_cut.ben" + reader.extract_stream(extracted) + # The decoder must reject a BEN file whose banner is incomplete. + with pytest.raises(Exception, match="Failed to create BenDecoder"): + PyBenDecoder(extracted, mode="ben") + + +def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: + # The worst case: the writer crashed after writing the header and + # before any stream bytes landed. + path = _write_bundle(tmp_path / "zero.bendl", _incomplete_bundle(b"")) + + reader = PyBundleReader(path) + assert reader.is_complete() is False + assert reader.sample_count() is None + assert reader.asset_names() == [] + + extracted = tmp_path / "zero.ben" + reader.extract_stream(extracted) + assert extracted.read_bytes() == b"" + # A zero-byte .ben has no banner → decoder construction must fail. + with pytest.raises(Exception, match="Failed to create BenDecoder"): + PyBenDecoder(extracted, mode="ben") + + +def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) -> None: + # Build a valid finalized bundle, then patch stream_len to a value + # larger than the actual stream payload. This simulates the narrow + # window where the writer updated the header but was killed before + # writing the directory — and something (or someone) re-flagged it + # as finalized. + samples = [[1, 2, 3], [4, 5, 6]] + bundle = bytearray( + build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + ) + # stream_len lives at header offset 48..56. + old_stream_len = struct.unpack_from(" None: + # Confirm that the same reader can serve asset reads after an + # extract_stream call (i.e. internal seek state doesn't wedge things). + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), + sample_count=2, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b'{"x":1}', + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "seq.bendl", bundle) + reader = PyBundleReader(path) + reader.extract_stream(tmp_path / "s.ben") + assert reader.read_metadata() == {"x": 1} + reader.extract_stream(tmp_path / "s2.ben", overwrite=True) + assert reader.read_asset_bytes("metadata.json") == b'{"x":1}' + + +# --------------------------------------------------------------------------- +# Stress / fuzz +# --------------------------------------------------------------------------- + + +def test_long_asset_name_near_u16_max(tmp_path: Path) -> None: + # name_len in the directory entry is u16, so ~65500 is near the top. + # Anything above u16::MAX should be rejected by a real writer — we only + # stress the reader here, so we stay safely under 65535. + long_name = "x" * 65500 + ".bin" + assert len(long_name.encode("utf-8")) < 65536 + payload = b"payload-for-absurdly-long-name" + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name=long_name, payload=payload) + ], + ) + path = _write_bundle(tmp_path / "long.bendl", bundle) + reader = PyBundleReader(path) + assert reader.asset_names() == [long_name] + assert reader.read_asset_bytes(long_name) == payload + + +def test_list_assets_flag_fidelity(tmp_path: Path) -> None: + # Every combination of (json, xz, checksum) should round-trip verbatim + # through list_assets()["flags"]. + combos: List[Tuple[bool, bool, bool]] = [ + (False, False, False), + (True, False, False), + (False, True, False), + (False, False, True), + (True, True, False), + (True, False, True), + (False, True, True), + (True, True, True), + ] + assets: List[_Asset] = [] + expected: List[List[str]] = [] + for i, (is_json, compress, has_checksum) in enumerate(combos): + payload = f'{{"i":{i}}}'.encode("utf-8") if is_json else bytes([i % 256]) * 32 + checksum = b"\xde\xad\xbe\xef" if has_checksum else None + assets.append( + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name=f"asset-{i}.bin", + payload=payload, + is_json=is_json, + compress=compress, + checksum=checksum, + ) + ) + want: List[str] = [] + if is_json: + want.append("json") + if compress: + want.append("xz") + if has_checksum: + want.append("checksum") + expected.append(want) + + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=assets, + ) + path = _write_bundle(tmp_path / "flags.bendl", bundle) + reader = PyBundleReader(path) + got = reader.list_assets() + assert len(got) == len(combos) + for entry, want in zip(got, expected): + assert entry["flags"] == want + + +def test_read_asset_bytes_is_idempotent(tmp_path: Path) -> None: + # Reading the same asset twice (with an xz round-trip in between) must + # return byte-identical content, proving no internal state gets mutated. + payload = b"repeat-me " * 100 + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="raw.bin", + payload=payload, + ), + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="compressed.bin", + payload=payload, + compress=True, + ), + ], + ) + path = _write_bundle(tmp_path / "idem.bendl", bundle) + reader = PyBundleReader(path) + for _ in range(5): + assert reader.read_asset_bytes("raw.bin") == payload + assert reader.read_asset_bytes("compressed.bin") == payload + + +def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: + # 500 custom assets with rotating flags. This exercises directory + # scaling, offset bookkeeping, and name lookup on a non-trivial directory. + N = 500 + assets: List[_Asset] = [] + expected: List[Tuple[str, bytes]] = [] + rng = random.Random(0xBEEF) + for i in range(N): + payload = rng.randbytes(rng.randint(1, 200)) + compress = i % 3 == 0 + is_json = i % 5 == 0 + # When is_json is set we need valid UTF-8; use a safe synthetic blob. + if is_json: + payload = f'{{"i":{i},"n":{rng.randint(0, 1000)}}}'.encode("utf-8") + assets.append( + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name=f"asset-{i:04d}.bin", + payload=payload, + is_json=is_json, + compress=compress, + ) + ) + expected.append((f"asset-{i:04d}.bin", payload)) + + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2, 3], [4, 5, 6]], tmp_path), + sample_count=2, + assets=assets, + ) + path = _write_bundle(tmp_path / "many.bendl", bundle) + reader = PyBundleReader(path) + + assert reader.asset_names() == [name for name, _ in expected] + # Sample every 37th asset and verify the payload decodes correctly + # (xz pass-through on ~a third of them). + for idx in range(0, N, 37): + name, want = expected[idx] + assert reader.read_asset_bytes(name) == want + # Spot-check a JSON asset that was flagged json+compressed? Only json alone. + json_idxs = [i for i in range(N) if i % 5 == 0 and i % 3 != 0] + assert json_idxs # sanity + sample = json_idxs[len(json_idxs) // 2] + name, want = expected[sample] + assert reader.read_json_asset(name) == json.loads(want) + + +def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: + # Build 20 deliberately-different bundles from a seeded PRNG. Each one + # mixes random asset sizes, random flags, random samples, and is then + # fully round-tripped through PyBundleReader + PyBenDecoder. + rng = random.Random(0xFEED_FACE) + for trial in range(20): + n_assets = rng.randint(0, 12) + assets: List[_Asset] = [] + truth: List[Tuple[str, bytes]] = [] + for i in range(n_assets): + size = rng.choice([0, 1, 7, 64, 500, 4096]) + payload = rng.randbytes(size) + compress = rng.random() < 0.4 + assets.append( + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name=f"t{trial}-a{i}.bin", + payload=payload, + compress=compress, + ) + ) + truth.append((f"t{trial}-a{i}.bin", payload)) + + n_samples = rng.randint(1, 25) + samples = [ + [rng.randint(1, 8) for _ in range(rng.randint(1, 40))] + for _ in range(n_samples) + ] + + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=n_samples, + assets=assets, + ) + path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) + + reader = PyBundleReader(path) + assert reader.is_complete() + assert reader.sample_count() == n_samples + assert reader.asset_names() == [name for name, _ in truth] + for name, want in truth: + assert reader.read_asset_bytes(name) == want + + extracted = tmp_path / f"fuzz-{trial}.ben" + reader.extract_stream(extracted) + assert list(PyBenDecoder(extracted, mode="ben")) == samples + + +def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: + # Interleave every user-facing method to prove the reader does not + # wedge its internal seek state when operations are reordered. + samples = [[1, 2], [3, 4], [5, 6], [7, 8]] + metadata = b'{"hello":"world"}' + graph = b'{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}' + custom = b"\x00\x01\x02\x03" * 64 + + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=metadata, + is_json=True, + ), + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=graph, + is_json=True, + compress=True, + ), + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="blob.bin", + payload=custom, + ), + ], + ) + path = _write_bundle(tmp_path / "interleave.bendl", bundle) + reader = PyBundleReader(path) + + # Strongly non-sequential access pattern. + assert reader.read_asset_bytes("blob.bin") == custom + assert reader.read_metadata() == {"hello": "world"} + reader.extract_stream(tmp_path / "a.ben") + assert reader.read_graph() == json.loads(graph) + reader.extract_stream(tmp_path / "b.ben", overwrite=True) + assert reader.read_asset_bytes("metadata.json") == metadata + assert reader.read_asset_bytes("blob.bin") == custom + assert reader.read_asset_bytes("graph.json") == graph + reader.extract_stream(tmp_path / "c.ben", overwrite=True) + + # Every extracted stream must be byte-identical. + a = (tmp_path / "a.ben").read_bytes() + b = (tmp_path / "b.ben").read_bytes() + c = (tmp_path / "c.ben").read_bytes() + assert a == b == c + assert list(PyBenDecoder(tmp_path / "a.ben", mode="ben")) == samples + + +def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + ) + path = _write_bundle(tmp_path / "mini.bendl", bundle) + reader = PyBundleReader(path) + missing = tmp_path / "does" / "not" / "exist" / "out.ben" + with pytest.raises(OSError): + reader.extract_stream(missing) + + +# --------------------------------------------------------------------------- +# PyBenEncoder bundle-output tests +# --------------------------------------------------------------------------- + + +SAMPLE_GRAPH = { + "directed": False, + "multigraph": False, + "graph": {}, + "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}], + "adjacency": [ + [{"id": 1}], + [{"id": 0}, {"id": 2}], + [{"id": 1}, {"id": 3}], + [{"id": 2}], + ], +} + + +def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None: + out = tmp_path / "stream.bendl" + samples = [[1, 1, 2, 2], [3, 3, 2, 2], [3, 3, 3, 3]] + with PyBenEncoder(out, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) + assert reader.is_complete() + assert reader.sample_count() == len(samples) + assert reader.assignment_format() == "ben" + # No graph because none was provided. + assert reader.asset_names() == [] + assert reader.read_graph() is None + + extracted = tmp_path / "extracted.ben" + reader.extract_stream(extracted) + assert list(PyBenDecoder(extracted, mode="ben")) == samples + + +def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: + out = tmp_path / "with_graph.bendl" + samples = [[1, 1, 2, 2], [1, 1, 3, 3]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.is_complete() + assert reader.sample_count() == len(samples) + assert reader.asset_names() == ["graph.json"] + + assets = reader.list_assets() + assert len(assets) == 1 + graph_entry = assets[0] + assert graph_entry["name"] == "graph.json" + # Default bundle policy xz-compresses graph.json. + assert "xz" in graph_entry["flags"] + assert "json" in graph_entry["flags"] + + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: + graph_path = tmp_path / "graph.json" + graph_path.write_text(json.dumps(SAMPLE_GRAPH)) + + out = tmp_path / "with_graph_path.bendl" + samples = [[0, 0, 1, 1]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=graph_path + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.asset_names() == ["graph.json"] + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: + # String paths must be accepted verbatim (same coercion Path arguments + # go through elsewhere in the API). + graph_path = tmp_path / "graph-str.json" + graph_path.write_text(json.dumps(SAMPLE_GRAPH)) + + out = tmp_path / "via-str.bendl" + samples = [[0, 1, 0, 1]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=str(graph_path) + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: + raw = json.dumps(SAMPLE_GRAPH).encode("utf-8") + out = tmp_path / "via-bytes.bendl" + samples = [[2, 2, 2, 2]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=raw + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: + buf = io.BytesIO(json.dumps(SAMPLE_GRAPH).encode("utf-8")) + out = tmp_path / "via-bytesio.bendl" + samples = [[1, 2, 1, 2]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=buf + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: + buf = io.StringIO(json.dumps(SAMPLE_GRAPH)) + out = tmp_path / "via-stringio.bendl" + samples = [[3, 3, 3, 3]] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=buf + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> None: + out = tmp_path / "full.bendl" + rng = random.Random(0xCAFE) + samples = [[rng.randint(1, 8) for _ in range(12)] for _ in range(15)] + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH + ) as enc: + for a in samples: + enc.write(a) + + reader = PyBundleReader(out) + assert reader.sample_count() == len(samples) + extracted = tmp_path / "full.ben" + reader.extract_stream(extracted) + assert list(PyBenDecoder(extracted, mode="ben")) == samples + # And the graph still round-trips from the same reader. + assert reader.read_graph() == SAMPLE_GRAPH + + +def test_pybenencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: + out = tmp_path / "ben-with-graph.ben" + with pytest.raises(ValueError, match="ben_file_only"): + PyBenEncoder( + out, + overwrite=True, + variant="standard", + graph=SAMPLE_GRAPH, + ben_file_only=True, + ) + + +def test_pybenencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: + # A ben_file_only=True output should be byte-identical to the legacy + # plain-BEN path, so the header has no BENDL magic. + out = tmp_path / "legacy.ben" + with PyBenEncoder( + out, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + enc.write([1, 2, 3]) + blob = out.read_bytes() + assert not blob.startswith(BENDL_MAGIC) + # PyBenDecoder should still read it in ben mode. + assert list(PyBenDecoder(out, mode="ben")) == [[1, 2, 3]] + + +def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: + out = tmp_path / "idem.bendl" + enc = PyBenEncoder(out, overwrite=True, variant="standard") + enc.write([1, 1, 2]) + enc.close() + enc.close() # second close must be a no-op + with pytest.raises(OSError, match="already been closed"): + enc.write([1, 2, 3]) + + reader = PyBundleReader(out) + assert reader.is_complete() + assert reader.sample_count() == 1 + + +def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: + out = tmp_path / "bad.bendl" + with pytest.raises(ValueError, match="graph must be"): + PyBenEncoder(out, overwrite=True, variant="standard", graph=12345) diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 29bcd63..84ba92b 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -302,7 +302,9 @@ def test_pybenencoder_roundtrip(tmp_path: Path) -> None: seq = gen_sequence_standard(rng, n_samples) ben = tmp_path / "out.ben" - with PyBenEncoder(ben, overwrite=True, variant="standard") as enc: + with PyBenEncoder( + ben, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in seq: enc.write(a) @@ -400,7 +402,7 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3]] default_ben = tmp_path / "default.ben" - with PyBenEncoder(default_ben, overwrite=True) as enc: + with PyBenEncoder(default_ben, overwrite=True, ben_file_only=True) as enc: for sample in samples: enc.write(sample) assert list(PyBenDecoder(default_ben, mode="ben")) == samples @@ -425,7 +427,9 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: out = tmp_path / "out.ben" - enc = PyBenEncoder(out, overwrite=True, variant="standard") + enc = PyBenEncoder( + out, overwrite=True, variant="standard", ben_file_only=True + ) enc.write([1, 2, 3]) enc.close() enc.close() @@ -433,7 +437,9 @@ def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: enc.write([1, 2, 3]) ctx_path = tmp_path / "ctx.ben" - with PyBenEncoder(ctx_path, overwrite=True, variant="standard") as ctx_enc: + with PyBenEncoder( + ctx_path, overwrite=True, variant="standard", ben_file_only=True + ) as ctx_enc: ctx_enc.write([4, 5, 6]) assert list(PyBenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] From 5b5ba8b9d8c969245a490e50f99d0b69eec8c2c2 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 11 Apr 2026 10:44:50 -0600 Subject: [PATCH 065/221] better stess testing --- ben/src/cli/ben.rs | 317 ++++++++++- ben/src/cli/bendl.rs | 25 +- ben/src/cli/reben.rs | 487 ++++++++-------- ben/src/codec/decode/ben.rs | 9 + ben/src/codec/decode/errors.rs | 10 +- ben/src/codec/decode/jsonl.rs | 7 +- ben/src/codec/decode/mod.rs | 4 +- ben/src/codec/decode/tests/mkvchain.rs | 11 +- ben/src/codec/decode/tests/mod.rs | 6 +- ben/src/codec/decode/twodelta.rs | 7 + ben/src/codec/decode/xz.rs | 6 +- ben/src/codec/encode/ben.rs | 6 + ben/src/codec/encode/tests.rs | 46 +- ben/src/codec/encode/twodelta.rs | 12 + ben/src/codec/encode/xz.rs | 3 +- ben/src/codec/frames/tests.rs | 63 ++- ben/src/codec/frames/twodelta_decode.rs | 9 +- ben/src/codec/translate/tests.rs | 9 +- ben/src/io/bundle/format.rs | 51 +- ben/src/io/bundle/manifest.rs | 3 +- ben/src/io/bundle/reader.rs | 103 ++-- ben/src/io/bundle/writer.rs | 173 +++--- ben/src/io/reader/assignment_reader.rs | 34 +- ben/src/io/reader/subsample.rs | 9 +- ben/src/io/reader/tests.rs | 54 +- ben/src/io/reader/xz_assignment_reader.rs | 73 ++- ben/src/io/writer/assignment_writer.rs | 54 +- ben/src/io/writer/tests.rs | 1 + ben/src/io/writer/xz_assignment_writer.rs | 67 ++- ben/src/ops/relabel/errors.rs | 12 +- ben/src/ops/relabel/mod.rs | 12 + ben/src/util/rle/mod.rs | 5 + ben/tests/test_assignment_reader.rs | 241 ++++++-- ben/tests/test_cli.rs | 77 +++ ben/tests/test_coverage.rs | 23 +- ben/tests/test_impls_pipeline.rs | 21 +- ben/tests/test_stress_edges.rs | 640 ++++++++++++++++++++++ 37 files changed, 2119 insertions(+), 571 deletions(-) create mode 100644 ben/tests/test_stress_edges.rs diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 1289d3f..7048062 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -5,12 +5,17 @@ use crate::codec::decode::{ use crate::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; +use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH}; +use crate::io::bundle::writer::BendlAppender; +use crate::io::bundle::{AddAssetOptions, BendlWriter}; +use crate::io::reader::subsample::count_samples_from_file; use crate::ops::extract::extract_assignment_ben; use crate::BenVariant; use clap::{Parser, ValueEnum}; use std::{ - fs::File, - io::{self, BufReader, BufWriter, Result, Write}, + fs::{File, OpenOptions}, + io::{self, BufRead, BufReader, BufWriter, Result, Write}, + path::{Path, PathBuf}, }; type DynReader = Box; @@ -142,6 +147,12 @@ struct Args { /// Default is 10,000. #[arg(long)] chunk_size: Option, + /// Embed a graph JSON asset alongside the assignment stream and emit + /// the result as a `.bendl` bundle. The graph is added after the + /// assignment stream has been fully written. Only applies to the + /// `encode` and `x-encode` modes. + #[arg(long)] + graph: Option, } /// Derive the output path for encode-style CLI modes. @@ -152,6 +163,9 @@ struct Args { /// * `input_file_name` - The input file path supplied by the user. /// * `output_file_name` - An optional explicit output path. /// * `overwrite` - Whether to skip overwrite prompting. +/// * `with_graph` - When true, the output is a `.bendl` bundle instead +/// of a bare `.ben`/`.xben` stream, so the derived extension is +/// `.bendl` regardless of `mode`. /// /// # Returns /// @@ -161,8 +175,11 @@ fn encode_setup( input_file_name: String, output_file_name: Option, overwrite: bool, + with_graph: bool, ) -> Result { - let extension = if mode == Mode::XEncode { + let extension = if with_graph { + ".bendl" + } else if mode == Mode::XEncode { ".xben" } else if mode == Mode::Encode { ".ben" @@ -173,8 +190,13 @@ fn encode_setup( let out_file_name = match output_file_name { Some(name) => name.to_owned(), None => { - if input_file_name.ends_with(".ben") && extension == ".xben" { + let stripped_ben = input_file_name.ends_with(".ben") + && (extension == ".xben" || extension == ".bendl"); + let stripped_xben = input_file_name.ends_with(".xben") && extension == ".bendl"; + if stripped_ben { input_file_name.trim_end_matches(".ben").to_owned() + extension + } else if stripped_xben { + input_file_name.trim_end_matches(".xben").to_owned() + extension } else { input_file_name.to_string() + extension } @@ -287,15 +309,209 @@ fn open_derived_writer(path: String) -> DynWriter { Box::new(BufWriter::new(File::create(path).unwrap())) } +/// Count the number of non-empty lines in a JSONL file. Used to populate +/// the bundle header's `sample_count` when wrapping a stream encode in a +/// `.bendl` container. +fn count_jsonl_lines(path: &Path) -> io::Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut n: i64 = 0; + for line in reader.lines() { + let line = line?; + if !line.is_empty() { + n += 1; + } + } + Ok(n) +} + +/// After a finalized `.bendl` has been written, reopen it in append mode +/// and attach the graph asset in-place. This runs *after* the stream has +/// finished, which is why we print "Adding graph..." at this point. +fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { + eprintln!("Adding graph..."); + let graph_bytes = std::fs::read(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to read graph {graph_path:?}: {e}"), + ) + })?; + + let file = OpenOptions::new().read(true).write(true).open(out_path)?; + let mut appender = BendlAppender::open(file) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + appender + .add_asset( + ASSET_TYPE_GRAPH, + CANONICAL_NAME_GRAPH, + &graph_bytes, + AddAssetOptions::defaults().json(), + ) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to add graph asset: {e}"), + ) + })?; + appender + .commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + Ok(()) +} + +/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at +/// `out_path` and then append the graph as a post-stream asset. +fn run_encode_bundle_with_graph( + input_path: &Path, + out_path: &str, + variant: BenVariant, + graph_path: &Path, +) -> Result<()> { + // Validate the graph file is readable before we do any real work, + // so a bad --graph path doesn't leave a half-written bundle behind. + std::fs::metadata(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to stat graph {graph_path:?}: {e}"), + ) + })?; + + let sample_count = count_jsonl_lines(input_path)?; + + let out_file = File::create(out_path)?; + let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + { + let mut handle = bendl_writer + .begin_stream() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let reader = BufReader::new(File::open(input_path)?); + encode_jsonl_to_ben(reader, &mut handle, variant)?; + handle + .finish(sample_count) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + } + bendl_writer + .finish() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + + append_graph_asset(out_path, graph_path) +} + +/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` +/// bundle at `out_path` and then append the graph as a post-stream asset. +#[allow(clippy::too_many_arguments)] +fn run_xencode_bundle_with_graph( + input_path: &Path, + out_path: &str, + variant: BenVariant, + from_ben: bool, + n_threads: Option, + compression_level: Option, + chunk_size: Option, + graph_path: &Path, +) -> Result<()> { + std::fs::metadata(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to stat graph {graph_path:?}: {e}"), + ) + })?; + + let sample_count: i64 = if from_ben { + count_samples_from_file(input_path, "ben")? as i64 + } else { + count_jsonl_lines(input_path)? + }; + + let out_file = File::create(out_path)?; + let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + { + let mut handle = bendl_writer + .begin_stream() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let reader = BufReader::new(File::open(input_path)?); + if from_ben { + encode_ben_to_xben( + reader, + &mut handle, + n_threads, + compression_level, + chunk_size, + )?; + } else { + encode_jsonl_to_xben( + reader, + &mut handle, + variant, + n_threads, + compression_level, + chunk_size, + )?; + } + handle + .finish(sample_count) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + } + bendl_writer + .finish() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + + append_graph_asset(out_path, graph_path) +} + /// Parse CLI arguments and execute the selected `ben` sub-mode. pub fn run() { let args = Args::parse(); set_verbose(args.verbose); + // --graph is only meaningful for the stream-producing modes. + if args.graph.is_some() && args.mode != Mode::Encode && args.mode != Mode::XEncode { + eprintln!("Error: --graph is only supported with --mode encode or --mode x-encode"); + return; + } + match args.mode { Mode::Encode => { tracing::trace!("Running in encode mode"); + // --graph path: produce a .bendl bundle with the BEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = match args.input_file.as_ref() { + Some(f) => f, + None => { + eprintln!("Error: --graph requires an input file (stdin not supported)."); + return; + } + }; + if args.print { + eprintln!("Error: --graph is incompatible with --print."); + return; + } + let out_path = match encode_setup( + args.mode, + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + ) { + Ok(path) => path, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = + run_encode_bundle_with_graph(Path::new(in_file), &out_path, variant, graph_path) + { + eprintln!("Error: {:?}", err); + } + return; + } + let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { Some(in_file) if !args.print => match encode_setup( @@ -303,6 +519,7 @@ pub fn run() { in_file.clone(), args.output_file.clone(), args.overwrite, + false, ) { Ok(path) => open_derived_writer(path), Err(err) => { @@ -338,6 +555,53 @@ pub fn run() { } } + // --graph path: produce a .bendl bundle with the XBEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = match args.input_file.as_ref() { + Some(f) => f, + None => { + eprintln!("Error: --graph requires an input file (stdin not supported)."); + return; + } + }; + if args.print { + eprintln!("Error: --graph is incompatible with --print."); + return; + } + if !ben_and_xben && !jsonl_and_xben { + eprintln!("Error: Unsupported file type(s) for xencode mode"); + return; + } + let out_path = match encode_setup( + args.mode, + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + ) { + Ok(path) => path, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = run_xencode_bundle_with_graph( + Path::new(in_file), + &out_path, + variant, + ben_and_xben, + args.n_cpus, + args.compression_level, + args.chunk_size, + graph_path, + ) { + eprintln!("Error: {:?}", err); + } + return; + } + let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { Some(in_file) if !args.print => match encode_setup( @@ -345,6 +609,7 @@ pub fn run() { in_file.clone(), args.output_file.clone(), args.overwrite, + false, ) { Ok(path) => open_derived_writer(path), Err(err) => { @@ -362,9 +627,13 @@ pub fn run() { }; if ben_and_xben { - if let Err(err) = - encode_ben_to_xben(reader, writer, args.n_cpus, args.compression_level, args.chunk_size) - { + if let Err(err) = encode_ben_to_xben( + reader, + writer, + args.n_cpus, + args.compression_level, + args.chunk_size, + ) { eprintln!("Error: {:?}", err); } } else if jsonl_and_xben { @@ -674,19 +943,45 @@ mod tests { #[test] fn encode_setup_derives_extensions() { assert_eq!( - encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true).unwrap(), + encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, false).unwrap(), "samples.jsonl.ben" ); assert_eq!( - encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true).unwrap(), + encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, false).unwrap(), "samples.xben" ); assert_eq!( - encode_setup(Mode::XzCompress, "samples.jsonl".to_string(), None, true).unwrap(), + encode_setup( + Mode::XzCompress, + "samples.jsonl".to_string(), + None, + true, + false + ) + .unwrap(), "samples.jsonl.xz" ); } + #[test] + fn encode_setup_with_graph_derives_bendl_extension() { + // JSONL + encode + graph → .bendl + assert_eq!( + encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, true).unwrap(), + "samples.jsonl.bendl" + ); + // .ben input to x-encode with graph trims the .ben suffix + assert_eq!( + encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, true).unwrap(), + "samples.bendl" + ); + // .xben input to x-encode with graph trims the .xben suffix + assert_eq!( + encode_setup(Mode::XEncode, "samples.xben".to_string(), None, true, true).unwrap(), + "samples.bendl" + ); + } + #[test] fn encode_setup_respects_explicit_output() { assert_eq!( @@ -695,6 +990,7 @@ mod tests { "ignored.jsonl".to_string(), Some("custom-output.ben".to_string()), true, + false, ) .unwrap(), "custom-output.ben" @@ -711,6 +1007,7 @@ mod tests { "input.jsonl".to_string(), Some(path.to_string_lossy().into_owned()), true, + false, ); assert!(err.is_ok()); diff --git a/ben/src/cli/bendl.rs b/ben/src/cli/bendl.rs index 786aa25..b574ac2 100644 --- a/ben/src/cli/bendl.rs +++ b/ben/src/cli/bendl.rs @@ -21,10 +21,8 @@ use crate::io::bundle::format::{ AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, }; -use crate::io::bundle::{ - AddAssetOptions, BendlReader, BendlWriteError, BendlWriter, -}; use crate::io::bundle::writer::BendlAppender; +use crate::io::bundle::{AddAssetOptions, BendlReader, BendlWriteError, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; /// Parsed form of a `name=path` option such as `--asset myblob=/tmp/x`. @@ -290,8 +288,8 @@ fn add_file_asset( } fn run_inspect(args: InspectArgs) -> Result<(), String> { - let file = File::open(&args.input) - .map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let file = + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; let reader = BendlReader::open(BufReader::new(file)) .map_err(|e| format!("failed to parse bundle header: {e}"))?; @@ -363,21 +361,21 @@ fn run_extract(args: ExtractArgs) -> Result<(), String> { ) .map_err(|e| format!("{e}"))?; - let file = File::open(&args.input) - .map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let file = + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; let mut reader = BendlReader::open(BufReader::new(file)) .map_err(|e| format!("failed to parse bundle header: {e}"))?; let mut out = BufWriter::new( - File::create(&args.output).map_err(|e| format!("failed to create {:?}: {e}", args.output))?, + File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, ); if args.stream { let mut stream = reader .assignment_stream_reader() .map_err(|e| format!("failed to open stream region: {e}"))?; - io::copy(&mut stream, &mut out) - .map_err(|e| format!("failed to copy stream bytes: {e}"))?; + io::copy(&mut stream, &mut out).map_err(|e| format!("failed to copy stream bytes: {e}"))?; } else if let Some(name) = args.asset.as_deref() { let entry = reader .find_asset_by_name(name) @@ -386,8 +384,7 @@ fn run_extract(args: ExtractArgs) -> Result<(), String> { let mut asset = reader .asset_reader(&entry) .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; - io::copy(&mut asset, &mut out) - .map_err(|e| format!("failed to copy asset bytes: {e}"))?; + io::copy(&mut asset, &mut out).map_err(|e| format!("failed to copy asset bytes: {e}"))?; } out.flush().map_err(|e| format!("flush failed: {e}"))?; @@ -400,8 +397,8 @@ fn run_append(args: AppendArgs) -> Result<(), String> { .write(true) .open(&args.input) .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; - let mut appender = BendlAppender::open(file) - .map_err(|e| format!("failed to open appender: {e}"))?; + let mut appender = + BendlAppender::open(file).map_err(|e| format!("failed to open appender: {e}"))?; let mut added = 0usize; if let Some(ref path) = args.metadata { diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 28d6b54..9ddf7af 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -12,6 +12,7 @@ use crate::{ use clap::{Parser, ValueEnum}; use serde_json::{json, Value}; use std::{ + collections::HashMap, fs::File, io::{BufReader, BufWriter, Write}, }; @@ -32,6 +33,7 @@ enum OrderingMethod { #[clap(alias = "mlc")] MultiLevelCluster, /// Reverse Cuthill-McKee ordering. + #[clap(alias = "rcm")] ReverseCuthillMckee, } @@ -102,236 +104,280 @@ pub fn run() { let args = Args::parse(); set_verbose(args.verbose); - match &args.mode { - Mode::Json => { - if args.n_items.is_some() { - panic!("--n-items is only supported in BEN mode."); - } - let input_file = File::open(&args.input_file).expect("Could not open input file."); - let reader = BufReader::new(input_file); - let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref()) - .expect("Provide either --key or --ordering."); - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", label).as_str() - } - }; + if let Err(err) = run_with_args(args) { + eprintln!("Error: {err}"); + std::process::exit(1); + } +} - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); +fn run_with_args(args: Args) -> Result<(), String> { + match args.mode.clone() { + Mode::Json => run_json_mode(args), + Mode::Ben => run_ben_mode(args), + } +} - let map = if let Some(key) = args.key.as_ref() { - sort_json_file_by_key(reader, writer, key) - } else { - sort_json_file_by_ordering( - reader, - writer, - to_graph_ordering(args.ordering.as_ref().unwrap()), - ) - }; - - let map_file_name = args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", label).as_str() - + "_map.json"; - let map_file = File::create(map_file_name).expect("Could not create map file."); - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": args.key.as_ref(), - "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map.unwrap() - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .expect("Could not write map file."); - } - Mode::Ben => { - if args.convert_only && args.output_variant.is_none() { - panic!("--convert-only requires --output-variant."); - } - if args.convert_only - && (args.map_file.is_some() || args.key.is_some() || args.ordering.is_some()) - { - panic!("--convert-only cannot be combined with relabeling options."); - } +fn run_json_mode(args: Args) -> Result<(), String> { + if args.n_items.is_some() { + return Err("--n-items is only supported in BEN mode.".to_string()); + } - let input_file = File::open(&args.input_file).expect("Could not open input file."); - let reader = BufReader::new(input_file); - let output_variant = args.output_variant.as_ref().map(to_ben_variant); + let input_file = File::open(&args.input_file) + .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; + let reader = BufReader::new(input_file); + let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; - if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { - if args.convert_only { - tracing::trace!("Converting BEN file to requested variant."); - } else { - tracing::trace!("Canonicalizing assignment vectors in ben file."); - } + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", label).as_str() + } + }; + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(reader, writer, key) + } else { + let ordering = args + .ordering + .as_ref() + .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; + sort_json_file_by_ordering(reader, writer, to_graph_ordering(ordering)) + } + .map_err(|e| format!("Could not sort input graph: {e}"))?; + + let map_file_name = args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", label).as_str() + + "_map.json"; + let map_file = File::create(&map_file_name) + .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), + "relabeling_old_to_new_nodes_map": map + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; + Ok(()) +} - let output_file_name = match args.output_file { - Some(name) => name, - None => { - if let Some(variant) = output_variant { - args.input_file.trim_end_matches(".ben").to_owned() - + format!("_{}.ben", ben_variant_name(variant)).as_str() - } else { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + "_canonicalized_assignments.jsonl.ben" - } - } - }; - - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - if args.convert_only { - let variant = output_variant.unwrap(); - if let Some(limit) = args.n_items { - convert_ben_file_limit(reader, writer, variant, limit).unwrap(); - } else { - convert_ben_file(reader, writer, variant).unwrap(); - } - } else if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_as_variant_limit(reader, writer, variant, limit).unwrap(); - } else { - relabel_ben_file_as_variant(reader, writer, variant).unwrap(); - } - } else if let Some(limit) = args.n_items { - relabel_ben_file_limit(reader, writer, limit).unwrap(); - } else { - relabel_ben_file(reader, writer).unwrap(); - } - return; - } +fn run_ben_mode(args: Args) -> Result<(), String> { + if args.convert_only && args.output_variant.is_none() { + return Err("--convert-only requires --output-variant.".to_string()); + } + if args.convert_only + && (args.map_file.is_some() || args.key.is_some() || args.ordering.is_some()) + { + return Err("--convert-only cannot be combined with relabeling options.".to_string()); + } - if args.map_file.is_some() && (args.key.is_some() || args.ordering.is_some()) { - panic!(concat!( - "Cannot provide both a map file and a sorting option. ", - "Please provide either the map file or the key/ordering and the ", - "(JSON formatted) dual-graph file needed to generate a map file." - )); - } + let input_file = File::open(&args.input_file) + .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; + let reader = BufReader::new(input_file); + let output_variant = args.output_variant.as_ref().map(to_ben_variant); + + if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { + if args.convert_only { + tracing::trace!("Converting BEN file to requested variant."); + } else { + tracing::trace!("Canonicalizing assignment vectors in ben file."); + } - let mut map_file_name = String::new(); - if args.key.is_some() || args.ordering.is_some() { - if let Some(shape) = args.shape_file { - let label = - relabeling_label(args.key.as_deref(), args.ordering.as_ref()).unwrap(); - tracing::trace!("Creating map file for ordering: {}", label); - - let output_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", label).as_str(); - - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - let shape_reader = - BufReader::new(File::open(&shape).expect("Could not open shape file.")); - let map = if let Some(key) = args.key.as_ref() { - sort_json_file_by_key(shape_reader, writer, key) - } else { - sort_json_file_by_ordering( - shape_reader, - writer, - to_graph_ordering(args.ordering.as_ref().unwrap()), - ) - }; - - map_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", label).as_str() - + "_map.json"; - let map_file = - File::create(&map_file_name).expect("Could not create map file."); - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": args.key.as_ref(), - "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map.unwrap() - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .expect("Could not write map file."); + let output_file_name = match args.output_file { + Some(name) => name, + None => { + if let Some(variant) = output_variant { + args.input_file.trim_end_matches(".ben").to_owned() + + format!("_{}.ben", ben_variant_name(variant)).as_str() } else { - panic!("No shape file provided to go with the requested ordering."); + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + "_canonicalized_assignments.jsonl.ben" } } + }; + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); - if map_file_name.is_empty() { - map_file_name = args.map_file.as_ref().unwrap().to_owned(); + if args.convert_only { + let variant = output_variant.expect("checked above"); + if let Some(limit) = args.n_items { + convert_ben_file_limit(reader, writer, variant, limit) + } else { + convert_ben_file(reader, writer, variant) } - let map_file = File::open(&map_file_name).expect("Could not open map file."); - let map_reader = BufReader::new(map_file); - - let data: Value = serde_json::from_reader(map_reader).unwrap(); - - let new_to_old_node_map = data["relabeling_old_to_new_nodes_map"] - .as_object() - .unwrap() - .iter() - .map(|(k, v)| (v.as_u64().unwrap() as usize, k.parse::().unwrap())) - .collect::>(); - - let label = data["key"] - .as_str() - .map(ToOwned::to_owned) - .or_else(|| data["ordering_method"].as_str().map(ToOwned::to_owned)) - .unwrap_or_else(|| "map".to_string()); - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + format!("_sorted_by_{}.jsonl.ben", label).as_str() - } - }; - let output_file = - File::create(&output_file_name).expect("Could not create output file."); - let writer = BufWriter::new(output_file); - - tracing::trace!( - "Relabeling ben file according to map file {}", - map_file_name, - ); - - if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_with_map_as_variant_limit( - reader, - writer, - new_to_old_node_map, - variant, - limit, - ) - .unwrap(); - } else { - relabel_ben_file_with_map_as_variant( - reader, - writer, - new_to_old_node_map, - variant, - ) - .unwrap(); - } - } else if let Some(limit) = args.n_items { - relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) - .unwrap(); + } else if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_as_variant_limit(reader, writer, variant, limit) } else { - relabel_ben_file_with_map(reader, writer, new_to_old_node_map).unwrap(); + relabel_ben_file_as_variant(reader, writer, variant) } + } else if let Some(limit) = args.n_items { + relabel_ben_file_limit(reader, writer, limit) + } else { + relabel_ben_file(reader, writer) + } + .map_err(|e| format!("BEN relabeling failed: {e}"))?; + return Ok(()); + } + + if args.map_file.is_some() && (args.key.is_some() || args.ordering.is_some()) { + return Err(concat!( + "Cannot provide both a map file and a sorting option. ", + "Please provide either the map file or the key/ordering and the ", + "(JSON formatted) dual-graph file needed to generate a map file." + ) + .to_string()); + } + + let mut map_file_name = String::new(); + if args.key.is_some() || args.ordering.is_some() { + let shape = args.shape_file.as_ref().ok_or_else(|| { + "No shape file provided to go with the requested ordering.".to_string() + })?; + let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; + tracing::trace!("Creating map file for ordering: {}", label); + + let output_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", label).as_str(); + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + let shape_file = + File::open(shape).map_err(|e| format!("Could not open shape file {shape:?}: {e}"))?; + let shape_reader = BufReader::new(shape_file); + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(shape_reader, writer, key) + } else { + let ordering = args + .ordering + .as_ref() + .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; + sort_json_file_by_ordering(shape_reader, writer, to_graph_ordering(ordering)) } + .map_err(|e| format!("Could not sort shape file: {e}"))?; + + map_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", label).as_str() + + "_map.json"; + let map_file = File::create(&map_file_name) + .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), + "relabeling_old_to_new_nodes_map": map + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; } + + if map_file_name.is_empty() { + map_file_name = args + .map_file + .as_ref() + .ok_or_else(|| "Provide --map-file, --key, or --ordering in BEN mode.".to_string())? + .to_owned(); + } + + let (new_to_old_node_map, label) = read_relabel_map_file(&map_file_name)?; + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + format!("_sorted_by_{}.jsonl.ben", label).as_str() + } + }; + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + tracing::trace!( + "Relabeling ben file according to map file {}", + map_file_name, + ); + + if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_with_map_as_variant_limit( + reader, + writer, + new_to_old_node_map, + variant, + limit, + ) + } else { + relabel_ben_file_with_map_as_variant(reader, writer, new_to_old_node_map, variant) + } + } else if let Some(limit) = args.n_items { + relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) + } else { + relabel_ben_file_with_map(reader, writer, new_to_old_node_map) + } + .map_err(|e| format!("BEN relabeling with map {map_file_name:?} failed: {e}"))?; + Ok(()) +} + +fn read_relabel_map_file(map_file_name: &str) -> Result<(HashMap, String), String> { + let map_file = File::open(map_file_name) + .map_err(|e| format!("Could not open map file {map_file_name:?}: {e}"))?; + let map_reader = BufReader::new(map_file); + + let data: Value = serde_json::from_reader(map_reader) + .map_err(|e| format!("Could not parse map file {map_file_name:?} as JSON: {e}"))?; + + let map_obj = data + .get("relabeling_old_to_new_nodes_map") + .and_then(Value::as_object) + .ok_or_else(|| { + format!( + "Map file {map_file_name:?} must contain object field \ + relabeling_old_to_new_nodes_map" + ) + })?; + + let mut new_to_old_node_map = HashMap::with_capacity(map_obj.len()); + for (old_idx_text, new_idx_value) in map_obj { + let old_idx = old_idx_text.parse::().map_err(|e| { + format!( + "Map file {map_file_name:?} contains invalid old node index {old_idx_text:?}: {e}" + ) + })?; + let new_idx = new_idx_value.as_u64().ok_or_else(|| { + format!( + "Map file {map_file_name:?} maps old node {old_idx} to non-integer value \ + {new_idx_value}" + ) + })? as usize; + new_to_old_node_map.insert(new_idx, old_idx); + } + + let label = data["key"] + .as_str() + .map(ToOwned::to_owned) + .or_else(|| data["ordering_method"].as_str().map(ToOwned::to_owned)) + .unwrap_or_else(|| "map".to_string()); + + Ok((new_to_old_node_map, label)) } /// Convert a CLI ordering method variant to the library's graph ordering type. @@ -410,12 +456,15 @@ fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { /// # Returns /// /// Returns the label string, or `None` if neither option is provided. -fn relabeling_label(key: Option<&str>, ordering: Option<&OrderingMethod>) -> Option { +fn relabeling_label( + key: Option<&str>, + ordering: Option<&OrderingMethod>, +) -> Result { match (key, ordering) { - (Some(_), Some(_)) => panic!("Provide either --key or --ordering, not both."), - (Some(key), None) => Some(key.to_string()), - (None, Some(ordering)) => Some(ordering_method_name(ordering).to_string()), - (None, None) => None, + (Some(_), Some(_)) => Err("Provide either --key or --ordering, not both.".to_string()), + (Some(key), None) => Ok(key.to_string()), + (None, Some(ordering)) => Ok(ordering_method_name(ordering).to_string()), + (None, None) => Err("Provide either --key or --ordering.".to_string()), } } diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 2a02e4e..2d0b120 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -23,6 +23,15 @@ pub fn decode_ben_line( max_len_bits: u8, n_bytes: u32, ) -> io::Result> { + if max_val_bits == 0 || max_val_bits > 16 || max_len_bits == 0 || max_len_bits > 16 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "invalid BEN bit width(s): max_val_bits={max_val_bits}, max_len_bits={max_len_bits}" + ), + )); + } + let mut assign_bits: Vec = vec![0; n_bytes as usize]; reader.read_exact(&mut assign_bits)?; diff --git a/ben/src/codec/decode/errors.rs b/ben/src/codec/decode/errors.rs index d2d280b..2520b7b 100644 --- a/ben/src/codec/decode/errors.rs +++ b/ben/src/codec/decode/errors.rs @@ -5,8 +5,10 @@ use thiserror::Error; /// Errors produced while decoding BEN or XBEN streams. #[derive(Debug, Error)] pub enum DecodeError { - #[error("TwoDelta run-length vector exhausted after {run_idx} runs \ - before position {pos} was covered")] + #[error( + "TwoDelta run-length vector exhausted after {run_idx} runs \ + before position {pos} was covered" + )] TwoDeltaRunsExhausted { run_idx: usize, pos: usize }, #[error("unknown XBEN frame tag byte {tag:#04x}")] @@ -18,9 +20,7 @@ pub enum DecodeError { #[error("TwoDelta frame encountered before an initial full-assignment frame")] TwoDeltaNoAnchorFrame, - #[error( - "unexpected TwoDelta frame in a non-TwoDelta BEN stream (variant: {variant:?})" - )] + #[error("unexpected TwoDelta frame in a non-TwoDelta BEN stream (variant: {variant:?})")] UnexpectedTwoDeltaFrame { variant: BenVariant }, #[error("IO error: {0}")] diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index d18a36e..03d2e81 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -1,8 +1,8 @@ -use crate::io::reader::{AssignmentReader, XZAssignmentReader}; -use crate::{progress, BenVariant}; use crate::codec::decode::jsonl_decode_ben32; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; +use crate::io::reader::{AssignmentReader, XZAssignmentReader}; +use crate::{progress, BenVariant}; use serde_json::json; use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; @@ -85,7 +85,8 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let mut line_count: usize = 0; let mut starting_sample: usize = 0; - while let Ok(count) = decoder.read(&mut buffer) { + loop { + let count = decoder.read(&mut buffer)?; if count == 0 { break; } diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index e2b4528..fae720f 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -9,10 +9,10 @@ mod twodelta; mod xz; pub use ben::decode_ben_line; -pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; -pub use twodelta::decode_twodelta_frame; +pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub(crate) use twodelta::apply_twodelta_runs_to_assignment; +pub use twodelta::decode_twodelta_frame; pub use xz::{decode_xben_to_ben, xz_decompress}; #[cfg(test)] diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs index 87b7d06..739d2b6 100644 --- a/ben/src/codec/decode/tests/mkvchain.rs +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -584,8 +584,7 @@ fn jsonl_decode_ben32_single_element() { let mut out = Vec::new(); jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); - let expected = - json!({"assignment": [23u16], "sample": 1}).to_string() + "\n"; + let expected = json!({"assignment": [23u16], "sample": 1}).to_string() + "\n"; assert_eq!(out, expected.as_bytes()); } @@ -594,9 +593,7 @@ fn jsonl_decode_ben32_three_frames() { // Three ben32 records with count=1 each — mirrors test_decode_ben32_multiple_simple_lines. let mut input: Vec = Vec::new(); // Record 1: rle [(1,4),(2,4),(3,4),(4,4)] - input.extend_from_slice(&[ - 0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, - ]); + input.extend_from_slice(&[0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0]); input.extend_from_slice(&1u16.to_be_bytes()); // Record 2: rle [(2,2),(3,7),(1,1),(2,1),(3,1)] input.extend_from_slice(&[ @@ -605,8 +602,8 @@ fn jsonl_decode_ben32_three_frames() { input.extend_from_slice(&1u16.to_be_bytes()); // Record 3: rle [(1..10, each 1)] input.extend_from_slice(&[ - 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 5, 0, 1, 0, 6, 0, 1, 0, 7, 0, 1, 0, - 8, 0, 1, 0, 9, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 5, 0, 1, 0, 6, 0, 1, 0, 7, 0, 1, 0, 8, + 0, 1, 0, 9, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, ]); input.extend_from_slice(&1u16.to_be_bytes()); diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 6005f25..b834e42 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -22,7 +22,7 @@ fn decode_error_non_io_becomes_invalid_data() { #[test] fn decode_xben_to_ben_twodelta_roundtrip() { - use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; + use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use crate::codec::encode::encode_jsonl_to_xben; use crate::BenVariant; use serde_json::Value; @@ -139,7 +139,7 @@ fn encode_ben_to_xben_roundtrip() { #[test] fn encode_ben_to_xben_with_chunk_size() { - use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; + use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; use crate::BenVariant; use serde_json::Value; @@ -171,7 +171,7 @@ fn encode_ben_to_xben_with_chunk_size() { #[test] fn encode_ben_to_xben_mkvchain_roundtrip() { - use crate::codec::decode::{decode_xben_to_ben, decode_ben_to_jsonl}; + use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben}; use crate::BenVariant; use serde_json::Value; diff --git a/ben/src/codec/decode/twodelta.rs b/ben/src/codec/decode/twodelta.rs index 50c6585..2003c09 100644 --- a/ben/src/codec/decode/twodelta.rs +++ b/ben/src/codec/decode/twodelta.rs @@ -51,6 +51,13 @@ pub(crate) fn apply_twodelta_runs_to_assignment( } } + if remaining_in_run > 0 || run_lengths.iter().skip(run_idx + 1).any(|&run| run > 0) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta run lengths exceed the number of positions in the assignment", + )); + } + Ok(assignment) } diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 11ef039..38c3fa0 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -64,7 +64,8 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let mut overflow: Vec = Vec::new(); let mut line_count: usize = 0; - while let Ok(count) = decoder.read(&mut buffer) { + loop { + let count = decoder.read(&mut buffer)?; if count == 0 { break; } @@ -123,7 +124,8 @@ pub fn xz_decompress(reader: R, mut writer: W) -> io::Resu let mut decoder = XzDecoder::new(reader); let mut buffer = [0u8; 4096]; - while let Ok(count) = decoder.read(&mut buffer) { + loop { + let count = decoder.read(&mut buffer)?; if count == 0 { break; } diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 14f37f9..691eb9b 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -78,6 +78,12 @@ pub(crate) fn encode_ben32_assignments(assign_vec: impl AsRef<[u16]>) -> Result< continue; } if assign == prev_assign { + if count == u16::MAX { + let encoded = (prev_assign as u32) << 16 | count as u32; + ret.extend(&encoded.to_be_bytes()); + count = 1; + continue; + } count += 1; } else { let encoded = (prev_assign as u32) << 16 | count as u32; diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index cfaac0c..20029d8 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -800,9 +800,8 @@ fn twodelta_encode_with_pair_and_mask_hints() { masks.insert(1, vec![0, 1]); masks.insert(2, vec![2, 3]); - let frame = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) - .unwrap(); + let frame = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap(); assert_eq!(frame.pair, (2, 1)); assert!(!frame.run_length_vector.is_empty()); // Verify masks were updated @@ -821,8 +820,8 @@ fn twodelta_encode_with_mask_hint_only() { masks.insert(1, vec![0, 1]); masks.insert(2, vec![2, 3]); - let frame = encode_twodelta_frame_with_hint(&prev, &curr, None, Some(&mut masks), None) - .unwrap(); + let frame = + encode_twodelta_frame_with_hint(&prev, &curr, None, Some(&mut masks), None).unwrap(); assert_eq!(frame.pair, (2, 1)); } @@ -842,8 +841,7 @@ fn twodelta_encode_hint_without_masks_errors() { let prev = vec![1u16, 1, 2, 2]; let curr = vec![2u16, 1, 2, 1]; - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), None, None).unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), None, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -857,9 +855,8 @@ fn twodelta_encode_identical_pair_hint_errors() { let mut masks = HashMap::new(); masks.insert(1u16, vec![0, 1]); - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 1)), Some(&mut masks), None) - .unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 1)), Some(&mut masks), None) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -892,8 +889,7 @@ fn twodelta_encode_mask_hint_identical_errors() { masks.insert(1, vec![0, 1]); masks.insert(2, vec![2, 3]); - let err = - encode_twodelta_frame_with_hint(&a, &a, None, Some(&mut masks), None).unwrap_err(); + let err = encode_twodelta_frame_with_hint(&a, &a, None, Some(&mut masks), None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -919,8 +915,8 @@ fn encode_error_non_io_becomes_invalid_data() { #[test] fn encode_jsonl_to_xben_roundtrip_verifies_content() { use crate::codec::decode::decode_xben_to_jsonl; - use std::io::BufReader; use serde_json::Value; + use std::io::BufReader; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[2,2,1,1],"sample":2} @@ -950,8 +946,8 @@ fn encode_jsonl_to_xben_roundtrip_verifies_content() { #[test] fn encode_jsonl_to_xben_mkv_verifies_content() { use crate::codec::decode::decode_xben_to_jsonl; - use std::io::BufReader; use serde_json::Value; + use std::io::BufReader; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[1,1,2,2],"sample":2} @@ -1051,9 +1047,8 @@ fn twodelta_encode_missing_mask_errors() { masks.insert(1, vec![0, 1]); // Missing mask for value 2 - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) - .unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -1068,9 +1063,8 @@ fn twodelta_encode_empty_mask_errors() { masks.insert(1, vec![0, 1]); masks.insert(2, vec![]); // Empty mask - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) - .unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -1086,9 +1080,8 @@ fn twodelta_encode_mask_out_of_pair_errors() { masks.insert(1, vec![0, 1]); masks.insert(2, vec![2, 3]); // position 2 in prev is actually 3, not 2 - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) - .unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -1120,10 +1113,7 @@ fn encode_ben32_line_value_at_u16_max() { let data = serde_json::json!({"assignment": [65535, 1]}); let result = encode_ben32_line(data).unwrap(); // (65535 << 16) | 1 → 0xFFFF0001 then (1 << 16) | 1 → 0x00010001 then terminator - assert_eq!( - result, - vec![0xFF, 0xFF, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0] - ); + assert_eq!(result, vec![0xFF, 0xFF, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0]); } // ── Encoding empty and single-element JSONL ───────────────────────────────── @@ -1157,8 +1147,8 @@ fn encode_jsonl_to_ben_single_sample() { #[test] fn encode_jsonl_to_xben_twodelta_roundtrip() { use crate::codec::decode::decode_xben_to_jsonl; - use std::io::BufReader; use serde_json::Value; + use std::io::BufReader; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[2,1,2,1],"sample":2} diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 23a2db8..7c473d6 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -261,6 +261,12 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( } if new_val == run_value { + if current_mask_count == u16::MAX { + return Err(Error::new( + ErrorKind::InvalidInput, + "TwoDelta run length exceeds u16::MAX", + )); + } current_mask_count += 1; } else { run_lengths.push(current_mask_count); @@ -382,6 +388,12 @@ fn construct_twodelta_frame_from_scratch( enc_pair_known = true; } if curr_val == run_value { + if run_count == u16::MAX { + return Err(Error::new( + ErrorKind::InvalidInput, + "TwoDelta run length exceeds u16::MAX", + )); + } run_count += 1; } else { run_lengths.push(run_count); diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 0295a85..8501691 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -49,7 +49,8 @@ pub fn xz_compress( .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let mut encoder = XzEncoder::new_stream(writer, mt); - while let Ok(count) = reader.read(&mut buff) { + loop { + let count = reader.read(&mut buff)?; if count == 0 { break; } diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index b715909..f38994c 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -112,7 +112,9 @@ fn mkv_decode_frame_from_reader() { // Count: u16 BE = 5 let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD, 0, 5]; let mut cursor = io::Cursor::new(data); - let frame = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let frame = MkvBenDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); assert_eq!(frame.max_val_bit_count, 2); assert_eq!(frame.max_len_bit_count, 3); assert_eq!(frame.n_bytes, 2); @@ -310,8 +312,14 @@ fn ben_encode_decode_roundtrip_standard() { let mut cursor = io::Cursor::new(encode_frame.as_slice()); let decode_frame = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - assert_eq!(decode_frame.max_val_bit_count, encode_frame.max_val_bit_count); - assert_eq!(decode_frame.max_len_bit_count, encode_frame.max_len_bit_count); + assert_eq!( + decode_frame.max_val_bit_count, + encode_frame.max_val_bit_count + ); + assert_eq!( + decode_frame.max_len_bit_count, + encode_frame.max_len_bit_count + ); assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); // Verify the payload decodes back to the original RLE runs @@ -332,10 +340,18 @@ fn mkv_encode_decode_roundtrip() { let encode_frame = MkvBenEncodeFrame::from_rle(runs.clone(), Some(42)); let mut cursor = io::Cursor::new(encode_frame.as_slice()); - let decode_frame = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let decode_frame = MkvBenDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); - assert_eq!(decode_frame.max_val_bit_count, encode_frame.max_val_bit_count); - assert_eq!(decode_frame.max_len_bit_count, encode_frame.max_len_bit_count); + assert_eq!( + decode_frame.max_val_bit_count, + encode_frame.max_val_bit_count + ); + assert_eq!( + decode_frame.max_len_bit_count, + encode_frame.max_len_bit_count + ); assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); assert_eq!(decode_frame.count, 42); @@ -353,12 +369,13 @@ fn mkv_encode_decode_roundtrip() { fn twodelta_encode_decode_roundtrip() { use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; let run_lengths = vec![3u16, 2, 1, 4]; - let encode_frame = - TwoDeltaEncodeFrame::from_run_lengths((5, 10), run_lengths.clone(), Some(7)); + let encode_frame = TwoDeltaEncodeFrame::from_run_lengths((5, 10), run_lengths.clone(), Some(7)); // Write the raw_bytes (which include pair, max_len_bits, n_bytes, payload, count) let mut cursor = io::Cursor::new(encode_frame.as_slice()); - let decode_frame = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let decode_frame = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); assert_eq!(decode_frame.pair, (5, 10)); assert_eq!(decode_frame.count, 7); @@ -396,8 +413,12 @@ fn mkv_decode_two_frames_back_to_back() { data.extend_from_slice(f2.as_slice()); let mut cursor = io::Cursor::new(data); - let d1 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - let d2 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d1 = MkvBenDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); + let d2 = MkvBenDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); let d3 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap(); assert_eq!(d1.count, 10); @@ -416,8 +437,12 @@ fn twodelta_decode_two_frames_back_to_back() { data.extend_from_slice(f2.as_slice()); let mut cursor = io::Cursor::new(data); - let d1 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - let d2 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d1 = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); + let d2 = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); let d3 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap(); assert_eq!(d1.pair, (1, 2)); @@ -435,7 +460,9 @@ fn twodelta_decode_two_frames_back_to_back() { fn mkv_decode_frame_count_max_u16() { let f = MkvBenEncodeFrame::from_rle(vec![(1u16, 1)], Some(u16::MAX)); let mut cursor = io::Cursor::new(f.as_slice()); - let d = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d = MkvBenDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); assert_eq!(d.count, u16::MAX); } @@ -444,7 +471,9 @@ fn twodelta_decode_frame_count_max_u16() { use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; let f = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(u16::MAX)); let mut cursor = io::Cursor::new(f.as_slice()); - let d = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let d = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); assert_eq!(d.count, u16::MAX); } @@ -510,7 +539,9 @@ fn twodelta_from_run_lengths_single_run() { let encoded = TwoDeltaEncodeFrame::from_run_lengths((1, 2), run_lengths.clone(), None); let mut cursor = io::Cursor::new(encoded.as_slice()); - let decoded = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let decoded = TwoDeltaDecodeFrame::from_reader(&mut cursor) + .unwrap() + .unwrap(); assert_eq!(decoded.run_lengths, run_lengths); } diff --git a/ben/src/codec/frames/twodelta_decode.rs b/ben/src/codec/frames/twodelta_decode.rs index d2eea41..1de0b3a 100644 --- a/ben/src/codec/frames/twodelta_decode.rs +++ b/ben/src/codec/frames/twodelta_decode.rs @@ -34,6 +34,12 @@ impl BenDecode for TwoDeltaDecodeFrame { let pair_b = reader.read_u16::()?; let max_len_bits = reader.read_u8()?; + if max_len_bits == 0 || max_len_bits > 16 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("invalid TwoDelta run-length bit width: {max_len_bits}"), + )); + } let n_bytes = reader.read_u32::()?; let mut payload = vec![0u8; n_bytes as usize]; @@ -41,8 +47,7 @@ impl BenDecode for TwoDeltaDecodeFrame { let count = reader.read_u16::()?; - let encode_frame = - TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload); + let encode_frame = TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload); Ok(Some(TwoDeltaDecodeFrame { pair: encode_frame.pair, diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 3ac28a8..d766b06 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -238,7 +238,10 @@ fn test_random_translation_ben_to_ben32() { fn test_ben32_to_ben_line_rejects_invalid_length() { let err = ben32_to_ben_line(vec![1, 2, 3]).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert_eq!(err.to_string(), "ben32 frame payload length 3 is not a multiple of 4"); + assert_eq!( + err.to_string(), + "ben32 frame payload length 3 is not a multiple of 4" + ); } #[test] @@ -336,8 +339,8 @@ fn test_ben_to_ben32_lines_mkv_roundtrip() { fn test_ben_to_ben32_lines_rejects_twodelta() { let ben_data = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; let mut output = Vec::new(); - let err = ben_to_ben32_lines(ben_data.as_slice(), &mut output, BenVariant::TwoDelta) - .unwrap_err(); + let err = + ben_to_ben32_lines(ben_data.as_slice(), &mut output, BenVariant::TwoDelta).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Unsupported); assert!(err.to_string().contains("TwoDelta")); } diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index c49255c..950408f 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -290,11 +290,13 @@ impl BendlDirectoryEntry { /// Serialize the entry into a byte vector. pub fn to_bytes(&self) -> Result, BendlFormatError> { let name_bytes = self.name.as_bytes(); - let name_len: u16 = name_bytes.len().try_into().map_err(|_| { - BendlFormatError::NameTooLong { - length: name_bytes.len(), - } - })?; + let name_len: u16 = + name_bytes + .len() + .try_into() + .map_err(|_| BendlFormatError::NameTooLong { + length: name_bytes.len(), + })?; let checksum_bytes = self.checksum.as_deref().unwrap_or(&[]); let checksum_len: u32 = checksum_bytes @@ -379,12 +381,13 @@ pub fn read_directory( /// Serialize a directory table into a byte vector. pub fn encode_directory(entries: &[BendlDirectoryEntry]) -> Result, BendlFormatError> { - let entry_count: u32 = entries - .len() - .try_into() - .map_err(|_| BendlFormatError::TooManyEntries { - length: entries.len(), - })?; + let entry_count: u32 = + entries + .len() + .try_into() + .map_err(|_| BendlFormatError::TooManyEntries { + length: entries.len(), + })?; let body_len: usize = entries.iter().map(|e| e.encoded_len()).sum(); let mut out = Vec::with_capacity(4 + body_len); @@ -440,6 +443,17 @@ pub enum BendlFormatError { #[error("directory entry name is not valid UTF-8")] NameNotUtf8, + /// A directory table contained bytes after the declared entries. + #[error("directory table has {remaining} trailing byte(s) after declared entries")] + TrailingDirectoryBytes { + /// Number of unread bytes left in the bounded directory region. + remaining: u64, + }, + + /// A directory table violated bundle-level validation rules. + #[error("malformed directory: {0}")] + MalformedDirectory(String), + /// An I/O error occurred while reading or writing the format layer. #[error("IO error: {0}")] Io(#[from] io::Error), @@ -470,7 +484,10 @@ mod tests { #[test] fn canonical_name_lookup() { - assert_eq!(canonical_name_for(ASSET_TYPE_METADATA), Some("metadata.json")); + assert_eq!( + canonical_name_for(ASSET_TYPE_METADATA), + Some("metadata.json") + ); assert_eq!(canonical_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); assert_eq!( canonical_name_for(ASSET_TYPE_RELABEL_MAP), @@ -510,7 +527,10 @@ mod tests { let decoded = BendlHeader::from_bytes(&header.to_bytes()).unwrap(); assert_eq!(header, decoded); assert!(!decoded.is_complete()); - assert_eq!(decoded.assignment_format_typed(), Some(AssignmentFormat::Xben)); + assert_eq!( + decoded.assignment_format_typed(), + Some(AssignmentFormat::Xben) + ); assert_eq!(decoded.sample_count, -1); assert_eq!(decoded.stream_len, 0); assert_eq!(decoded.directory_offset, 0); @@ -588,7 +608,10 @@ mod tests { let mut cursor = &bytes[..]; let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); assert_eq!(decoded, entry); - assert_eq!(decoded.checksum.unwrap(), vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]); + assert_eq!( + decoded.checksum.unwrap(), + vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE] + ); } #[test] diff --git a/ben/src/io/bundle/manifest.rs b/ben/src/io/bundle/manifest.rs index 86851ca..b6c2120 100644 --- a/ben/src/io/bundle/manifest.rs +++ b/ben/src/io/bundle/manifest.rs @@ -50,7 +50,8 @@ mod tests { #[test] fn manifest_accepts_missing_variant() { - let json = r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; + let json = + r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; let decoded: BendlManifest = serde_json::from_str(json).unwrap(); assert_eq!(decoded.variant, None); assert!(decoded.complete); diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index f6eafa8..c503cb5 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -36,7 +36,15 @@ impl BendlReader { let directory = if header.directory_offset != 0 && header.directory_len != 0 { inner.seek(SeekFrom::Start(header.directory_offset))?; - read_directory(&mut inner)? + let mut bounded = (&mut inner).take(header.directory_len); + let directory = read_directory(&mut bounded)?; + let remaining = bounded.limit(); + if remaining != 0 { + return Err(BendlFormatError::TrailingDirectoryBytes { remaining }); + } + validate_directory_entries(&directory) + .map_err(|e| BendlFormatError::MalformedDirectory(e.to_string()))?; + directory } else { Vec::new() }; @@ -135,11 +143,9 @@ impl BendlReader { pub fn open_assignment_reader( &mut self, ) -> Result>, BundleAssignmentReaderError> { - let format = self - .assignment_format() - .ok_or(BundleAssignmentReaderError::UnknownAssignmentFormat( - self.header.assignment_format, - ))?; + let format = self.assignment_format().ok_or( + BundleAssignmentReaderError::UnknownAssignmentFormat(self.header.assignment_format), + )?; let stream = self.assignment_stream_reader()?; match format { AssignmentFormat::Ben => { @@ -194,29 +200,7 @@ impl BendlReader { /// writer is already expected to enforce these rules and a /// malformed bundle is a program bug somewhere else. pub fn validate_directory(&self) -> Result<(), BundleValidationError> { - let mut seen_names = std::collections::HashSet::new(); - let mut seen_singleton_types = std::collections::HashSet::new(); - - for entry in &self.directory { - if !seen_names.insert(entry.name.as_str()) { - return Err(BundleValidationError::DuplicateName(entry.name.clone())); - } - if let Some(canonical) = canonical_name_for(entry.asset_type) { - if entry.name != canonical { - return Err(BundleValidationError::WrongCanonicalName { - asset_type: entry.asset_type, - expected: canonical.to_string(), - found: entry.name.clone(), - }); - } - if !seen_singleton_types.insert(entry.asset_type) { - return Err(BundleValidationError::DuplicateSingletonType( - entry.asset_type, - )); - } - } - } - Ok(()) + validate_directory_entries(&self.directory) } /// Release the underlying reader. @@ -225,6 +209,34 @@ impl BendlReader { } } +pub(crate) fn validate_directory_entries( + directory: &[BendlDirectoryEntry], +) -> Result<(), BundleValidationError> { + let mut seen_names = std::collections::HashSet::new(); + let mut seen_singleton_types = std::collections::HashSet::new(); + + for entry in directory { + if !seen_names.insert(entry.name.as_str()) { + return Err(BundleValidationError::DuplicateName(entry.name.clone())); + } + if let Some(canonical) = canonical_name_for(entry.asset_type) { + if entry.name != canonical { + return Err(BundleValidationError::WrongCanonicalName { + asset_type: entry.asset_type, + expected: canonical.to_string(), + found: entry.name.clone(), + }); + } + if !seen_singleton_types.insert(entry.asset_type) { + return Err(BundleValidationError::DuplicateSingletonType( + entry.asset_type, + )); + } + } + } + Ok(()) +} + /// Either a BEN or an XBEN assignment decoder over a bundle's embedded /// stream region. /// @@ -277,9 +289,7 @@ pub enum BundleValidationError { DuplicateSingletonType(u16), /// An entry with a known singleton type is not using its canonical name. - #[error( - "asset type {asset_type} must use canonical name {expected:?}, found {found:?}" - )] + #[error("asset type {asset_type} must use canonical name {expected:?}, found {found:?}")] WrongCanonicalName { /// The asset type whose canonical name was violated. asset_type: u16, @@ -427,7 +437,11 @@ mod tests { let (offset, len) = reader.assignment_stream_range().unwrap(); assert_eq!(len, fake_stream.len() as u64); let mut buf = Vec::new(); - reader.assignment_stream_reader().unwrap().read_to_end(&mut buf).unwrap(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); assert_eq!(buf, fake_stream); // Sanity-check the offset is consistent with the header. assert_eq!(offset, reader.header().stream_offset); @@ -465,7 +479,11 @@ mod tests { assert_eq!(len, fake_stream.len() as u64); let mut buf = Vec::new(); - reader.assignment_stream_reader().unwrap().read_to_end(&mut buf).unwrap(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); assert_eq!(buf, fake_stream); } @@ -527,7 +545,10 @@ mod tests { let err = reader.validate_directory().unwrap_err(); assert!(matches!( err, - BundleValidationError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + BundleValidationError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } )); } @@ -609,12 +630,10 @@ mod tests { fn open_rejects_directory_with_inflated_entry_count() { let mut bytes = build_basic_finalized_bundle(); // Read directory_offset from the header (bytes 24..32). - let directory_offset = - u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; // Blow up the entry count at the start of the directory to a // value that cannot possibly fit in the remaining file bytes. - bytes[directory_offset..directory_offset + 4] - .copy_from_slice(&9999u32.to_le_bytes()); + bytes[directory_offset..directory_offset + 4].copy_from_slice(&9999u32.to_le_bytes()); match BendlReader::open(Cursor::new(bytes)) { Err(BendlFormatError::Io(_)) => {} Err(other) => panic!("expected Io, got {other:?}"), @@ -682,14 +701,12 @@ mod tests { // claims a payload_len that extends well past EOF. let mut bytes = build_basic_finalized_bundle(); // Parse the directory offset to find where the entry lives. - let directory_offset = - u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; // Skip the u32 entry count (4 bytes) and then the 16-byte fixed // entry header up to `payload_len` (bytes 16..24 of the entry). let entry_start = directory_offset + 4; let payload_len_offset = entry_start + 16; - bytes[payload_len_offset..payload_len_offset + 8] - .copy_from_slice(&u64::MAX.to_le_bytes()); + bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 9e1473f..149dcf9 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -213,9 +213,7 @@ impl BendlWriter { // Compute final payload bytes. let payload_bytes: Vec = if compress { let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); - encoder - .write_all(payload) - .map_err(BendlWriteError::Io)?; + encoder.write_all(payload).map_err(BendlWriteError::Io)?; encoder.finish().map_err(BendlWriteError::Io)? } else { payload.to_vec() @@ -230,8 +228,7 @@ impl BendlWriter { asset_flags |= ASSET_FLAG_XZ; } if options.checksum.is_some() { - asset_flags |= - crate::io::bundle::format::ASSET_FLAG_CHECKSUM; + asset_flags |= crate::io::bundle::format::ASSET_FLAG_CHECKSUM; } // Write at current file position. @@ -260,7 +257,12 @@ impl BendlWriter { name: &str, payload: &[u8], ) -> Result<(), BendlWriteError> { - self.add_asset(asset_type, name, payload, AddAssetOptions::defaults().json()) + self.add_asset( + asset_type, + name, + payload, + AddAssetOptions::defaults().json(), + ) } /// Transition from the asset phase into the stream phase and return @@ -321,8 +323,7 @@ impl BendlWriter { let mut handle = self.begin_stream()?; let mut sample_count: i64 = 0; { - let mut ben = - crate::io::writer::AssignmentWriter::new(&mut handle, variant)?; + let mut ben = crate::io::writer::AssignmentWriter::new(&mut handle, variant)?; { let mut ctx = BundleAssignmentStreamCtx { writer: &mut ben, @@ -355,8 +356,7 @@ impl BendlWriter { let mut sample_count: i64 = 0; { let encoder = xz2::write::XzEncoder::new(&mut handle, DEFAULT_XZ_PRESET); - let mut xben = - crate::io::writer::XZAssignmentWriter::new(encoder, variant)?; + let mut xben = crate::io::writer::XZAssignmentWriter::new(encoder, variant)?; { let mut ctx = BundleAssignmentStreamCtx { writer: &mut xben, @@ -405,8 +405,7 @@ impl BendlWriter { let directory_offset = self.header.stream_offset + stream_len; self.inner.seek(SeekFrom::Start(directory_offset))?; - let directory_bytes = encode_directory(&self.entries) - .map_err(BendlWriteError::Format)?; + let directory_bytes = encode_directory(&self.entries).map_err(BendlWriteError::Format)?; self.inner .write_all(&directory_bytes) .map_err(BendlWriteError::Io)?; @@ -551,9 +550,7 @@ pub enum BendlWriteError { DuplicateSingletonType(u16), /// A singleton asset was added under the wrong canonical name. - #[error( - "asset type {asset_type} must use canonical name {expected:?}, got {found:?}" - )] + #[error("asset type {asset_type} must use canonical name {expected:?}, got {found:?}")] WrongCanonicalName { /// The asset type whose canonical name was violated. asset_type: u16, @@ -651,8 +648,17 @@ impl BendlAppender { } inner.seek(SeekFrom::Start(header.directory_offset))?; - let existing_entries = - read_directory(&mut inner).map_err(BendlWriteError::Format)?; + let mut bounded = (&mut inner).take(header.directory_len); + let existing_entries = read_directory(&mut bounded).map_err(BendlWriteError::Format)?; + let remaining = bounded.limit(); + if remaining != 0 { + return Err(BendlWriteError::Format( + BendlFormatError::TrailingDirectoryBytes { remaining }, + )); + } + super::reader::validate_directory_entries(&existing_entries).map_err(|e| { + BendlWriteError::Format(BendlFormatError::MalformedDirectory(e.to_string())) + })?; let mut existing_names = HashSet::new(); let mut existing_singleton_types = HashSet::new(); @@ -740,7 +746,12 @@ impl BendlAppender { name: &str, payload: &[u8], ) -> Result<(), BendlWriteError> { - self.add_asset(asset_type, name, payload, AddAssetOptions::defaults().json()) + self.add_asset( + asset_type, + name, + payload, + AddAssetOptions::defaults().json(), + ) } /// Commit all pending appends. @@ -786,8 +797,7 @@ impl BendlAppender { asset_flags |= ASSET_FLAG_XZ; } if asset.checksum.is_some() { - asset_flags |= - crate::io::bundle::format::ASSET_FLAG_CHECKSUM; + asset_flags |= crate::io::bundle::format::ASSET_FLAG_CHECKSUM; } encoded.push(EncodedPending { @@ -830,8 +840,7 @@ impl BendlAppender { // Write the new directory at the new EOF. let new_directory_offset = self.inner.seek(SeekFrom::Current(0))?; - let directory_bytes = - encode_directory(&new_entries).map_err(BendlWriteError::Format)?; + let directory_bytes = encode_directory(&new_entries).map_err(BendlWriteError::Format)?; self.inner.write_all(&directory_bytes)?; let new_directory_len = directory_bytes.len() as u64; @@ -874,11 +883,7 @@ mod tests { fn minimal_bundle_round_trip_through_reader() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer - .add_json_asset( - ASSET_TYPE_METADATA, - "metadata.json", - br#"{"note":"hello"}"#, - ) + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", br#"{"note":"hello"}"#) .unwrap(); let stream_bytes = b"STANDARD BEN FILE\x00\x01fake".to_vec(); writer.write_stream_bytes(&stream_bytes, 7).unwrap(); @@ -915,7 +920,9 @@ mod tests { writer .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -946,7 +953,9 @@ mod tests { AddAssetOptions::defaults().json().raw(), ) .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -977,7 +986,10 @@ mod tests { .unwrap_err(); assert!(matches!( err, - BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } )); } @@ -1037,7 +1049,9 @@ mod tests { writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); @@ -1045,7 +1059,10 @@ mod tests { let file_len = buf.len() as u64; assert_eq!(header.directory_offset + header.directory_len, file_len); // Stream ends where directory begins. - assert_eq!(header.stream_offset + header.stream_len, header.directory_offset); + assert_eq!( + header.stream_offset + header.stream_len, + header.directory_offset + ); } // ----------------------------------------------------------------------- @@ -1091,9 +1108,8 @@ mod tests { #[test] fn append_leaves_stream_bytes_byte_for_byte_unchanged() { let (bundle, (stream_offset, stream_len)) = build_base_bundle(); - let original_stream_bytes = bundle - [stream_offset as usize..(stream_offset + stream_len) as usize] - .to_vec(); + let original_stream_bytes = + bundle[stream_offset as usize..(stream_offset + stream_len) as usize].to_vec(); let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); appender @@ -1140,7 +1156,10 @@ mod tests { .find_asset_by_name("metadata.json") .unwrap() .payload_offset; - assert_eq!(old_offset, new_offset, "existing asset offset must not move"); + assert_eq!( + old_offset, new_offset, + "existing asset offset must not move" + ); } #[test] @@ -1204,7 +1223,10 @@ mod tests { .unwrap_err(); assert!(matches!( err, - BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_GRAPH, .. } + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } )); let buf = appender.abort().into_inner(); @@ -1215,8 +1237,9 @@ mod tests { fn append_rejects_incomplete_bundle() { // Construct a minimal incomplete bundle: just the provisional // header and some stream bytes, no directory. - use crate::io::bundle::format::{BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, - COMPLETE_NO}; + use crate::io::bundle::format::{ + BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, + }; let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, @@ -1464,8 +1487,7 @@ mod tests { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let decoder: BundleAssignmentReader<_> = - reader.open_assignment_reader().unwrap(); + let decoder: BundleAssignmentReader<_> = reader.open_assignment_reader().unwrap(); assert!(decoder.is_ben()); assert!(!decoder.is_xben()); } @@ -1502,7 +1524,10 @@ mod tests { // now stuck in the Streaming state. drop(handle); } - let err = writer.begin_stream().err().expect("second begin_stream must fail"); + let err = writer + .begin_stream() + .err() + .expect("second begin_stream must fail"); assert!(matches!(err, BendlWriteError::WrongState { .. })); } @@ -1517,7 +1542,10 @@ mod tests { let err = writer.finish().unwrap_err(); assert!(matches!( err, - BendlWriteError::WrongState { found: "Streaming", .. } + BendlWriteError::WrongState { + found: "Streaming", + .. + } )); } @@ -1529,7 +1557,12 @@ mod tests { let name = format!("blob_{i:05}"); let payload = vec![(i & 0xFF) as u8; (i % 17) + 1]; writer - .add_asset(ASSET_TYPE_CUSTOM, &name, &payload, AddAssetOptions::defaults()) + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) .unwrap(); } writer @@ -1676,18 +1709,20 @@ mod tests { let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader - .find_asset_by_name("with_checksum") - .cloned() - .unwrap(); + let entry = reader.find_asset_by_name("with_checksum").cloned().unwrap(); assert_eq!(entry.checksum, Some(checksum)); - assert_ne!(entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, 0); + assert_ne!( + entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, + 0 + ); } #[test] fn finished_writer_rejects_further_operations() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); // Take a handle to the writer by going through begin_stream first. // Actually finish() consumes self, so instead assert the state // machine barfs when we manually poke it in the Finished state. @@ -1726,14 +1761,19 @@ mod tests { .unwrap_err(); assert!(matches!( err, - BendlWriteError::WrongCanonicalName { asset_type: ASSET_TYPE_METADATA, .. } + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_METADATA, + .. + } )); // After a rejected add, no entries have been recorded — a // subsequent valid add proceeds normally. writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.assets().len(), 1); @@ -1761,21 +1801,11 @@ mod tests { let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); // First pending add: "blob". appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"1", - AddAssetOptions::defaults(), - ) + .add_asset(ASSET_TYPE_CUSTOM, "blob", b"1", AddAssetOptions::defaults()) .unwrap(); // Second pending add with same name must be rejected. let err = appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"2", - AddAssetOptions::defaults(), - ) + .add_asset(ASSET_TYPE_CUSTOM, "blob", b"2", AddAssetOptions::defaults()) .unwrap_err(); assert!(matches!(err, BendlWriteError::DuplicateName(_))); // Committing the still-valid first pending add should still work. @@ -1909,10 +1939,8 @@ mod tests { assert!(baseline_samples.is_some()); drop(baseline_reader); - let mut accumulated: Vec<(String, Vec)> = vec![( - "metadata.json".to_string(), - br#"{"version":1}"#.to_vec(), - )]; + let mut accumulated: Vec<(String, Vec)> = + vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; for round in 0..5 { let cursor = Cursor::new(buf); @@ -1967,10 +1995,8 @@ mod tests { use rand_chacha::ChaCha8Rng; let (mut buf, _) = build_base_bundle(); - let mut accumulated: Vec<(String, Vec)> = vec![( - "metadata.json".to_string(), - br#"{"version":1}"#.to_vec(), - )]; + let mut accumulated: Vec<(String, Vec)> = + vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; let mut rng = ChaCha8Rng::seed_from_u64(0xDEAD_BEEF_CAFE_F00D); let rounds: usize = rng.random_range(3..=8); @@ -1980,8 +2006,7 @@ mod tests { let mut appender = BendlAppender::open(cursor).unwrap(); for k in 0..adds { let size: usize = rng.random_range(0..=256); - let payload: Vec = - (0..size).map(|_| rng.random::()).collect(); + let payload: Vec = (0..size).map(|_| rng.random::()).collect(); let name = format!("r{round}-a{k}.bin"); appender .add_asset( diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs index af5b176..c357872 100644 --- a/ben/src/io/reader/assignment_reader.rs +++ b/ben/src/io/reader/assignment_reader.rs @@ -1,8 +1,7 @@ use super::errors::DecoderInitError; use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError}; use crate::codec::{ - BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, - TwoDeltaDecodeFrame, + BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, TwoDeltaDecodeFrame, }; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::util::rle::rle_to_vec; @@ -40,6 +39,13 @@ impl StoredBenFrame { } } +fn zero_count_frame_error() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + "BEN frame count must be greater than zero", + ) +} + impl AssignmentReader { /// Create a decoder for an uncompressed BEN stream. /// @@ -149,7 +155,11 @@ impl AssignmentReader { let mut this = self; let mut total = 0usize; while let Some(frame_res) = this.pop_frame_from_reader() { - total += frame_res?.count() as usize; + let count = frame_res?.count(); + if count == 0 { + return Err(zero_count_frame_error()); + } + total += count as usize; } Ok(total) } @@ -171,6 +181,9 @@ impl AssignmentReader { }; let count = frame.count(); + if count == 0 { + return Err(zero_count_frame_error()); + } let assignment = match frame { StoredBenFrame::Standard(f) => decode_ben_frame_to_assignment(&f)?, @@ -246,6 +259,9 @@ impl Iterator for AssignmentReader { None => return None, }; let count = frame.count(); + if count == 0 { + return Some(Err(zero_count_frame_error())); + } let assignment = match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { Ok(assgn) => assgn, @@ -289,6 +305,9 @@ impl Iterator for AssignmentFrameReader { Some(Ok(StoredBenFrame::Standard(frame))) => Some(Ok((frame, 1))), Some(Ok(StoredBenFrame::MkvChain(frame))) => { let count = frame.count; + if count == 0 { + return Some(Err(zero_count_frame_error())); + } Some(Ok(( BenDecodeFrame { max_val_bit_count: frame.max_val_bit_count, @@ -299,11 +318,11 @@ impl Iterator for AssignmentFrameReader { count, ))) } - Some(Ok(StoredBenFrame::TwoDelta(_))) => Some(Err(io::Error::from( - DecodeError::UnexpectedTwoDeltaFrame { + Some(Ok(StoredBenFrame::TwoDelta(_))) => { + Some(Err(io::Error::from(DecodeError::UnexpectedTwoDeltaFrame { variant: self.inner.variant, - }, - ))), + }))) + } Some(Err(err)) => Some(Err(err)), None => None, } @@ -370,4 +389,3 @@ impl AssignmentReader { super::subsample::SubsampleFrameDecoder::every(frames, step, offset) } } - diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index a9d4d75..940e259 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -218,6 +218,12 @@ where Ok(x) => x, Err(e) => return Some(Err(e)), }; + if count == 0 { + return Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + "frame count must be greater than zero", + ))); + } let lo = self.sample + 1; let hi = self.sample + count as usize; @@ -255,8 +261,7 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { let frames = AssignmentFrameReader::new(reader)?; - let mapped = frames - .map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))); + let mapped = frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))); Ok(Box::new(mapped)) } "xben" => { diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index d41c0bc..0e06da3 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -9,15 +9,7 @@ use xz2::write::XzEncoder; /// Build a minimal XBEN stream from JSONL input for testing. fn make_xben(jsonl: &str, variant: BenVariant) -> Vec { let mut xben = Vec::new(); - encode_jsonl_to_xben( - jsonl.as_bytes(), - &mut xben, - variant, - Some(1), - Some(1), - None, - ) - .unwrap(); + encode_jsonl_to_xben(jsonl.as_bytes(), &mut xben, variant, Some(1), Some(1), None).unwrap(); xben } @@ -69,11 +61,7 @@ fn xz_reader_mkv_iterator() { #[test] fn xz_reader_twodelta_iterator() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - vec![2, 2, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); assert_eq!(reader.variant(), BenVariant::TwoDelta); @@ -202,10 +190,7 @@ fn xz_reader_into_frames_standard() { #[test] fn xz_reader_into_frames_twodelta() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.into_frames().collect(); @@ -308,11 +293,7 @@ fn xz_reader_for_each_assignment_silent() { #[test] fn xz_reader_write_all_jsonl_twodelta() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - vec![2, 2, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); @@ -326,10 +307,7 @@ fn xz_reader_write_all_jsonl_twodelta() { #[test] fn xz_reader_count_samples_twodelta() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 2); @@ -504,11 +482,7 @@ fn xz_reader_subsample_mkv_with_count_gt_1() { #[test] fn xz_reader_subsample_twodelta() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - vec![2, 2, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader @@ -551,7 +525,10 @@ fn decoder_init_error_io() { struct FailReader; impl std::io::Read for FailReader { fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { - Err(std::io::Error::new(std::io::ErrorKind::BrokenPipe, "broken")) + Err(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "broken", + )) } } let result = AssignmentReader::new(FailReader); @@ -581,7 +558,10 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let err = reader .for_each_assignment(|_assignment, _count| { - Err(std::io::Error::new(std::io::ErrorKind::Other, "callback failed")) + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "callback failed", + )) }) .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); @@ -647,11 +627,7 @@ fn assignment_reader_twodelta_roundtrip() { use crate::io::reader::AssignmentReader; use crate::io::writer::AssignmentWriter; - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - vec![2, 2, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let mut ben = Vec::new(); { diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs index bad3fef..b360a35 100644 --- a/ben/src/io/reader/xz_assignment_reader.rs +++ b/ben/src/io/reader/xz_assignment_reader.rs @@ -227,7 +227,9 @@ impl XZAssignmentReader { ))) } XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. - _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { tag }))), + _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { + tag, + }))), } } @@ -254,11 +256,27 @@ impl XZAssignmentReader { // Calculate total chunk size: tag(1) + n_frames(4) // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) - let header_len = 5; - let pairs_len = n_frames * 4; - let counts_len = n_frames * 2; - let run_counts_len = n_frames * 4; - let fixed_len = header_len + pairs_len + counts_len + run_counts_len; + let header_len: usize = 5; + let pairs_len = match n_frames.checked_mul(4) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; + let counts_len = match n_frames.checked_mul(2) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; + let run_counts_len = match n_frames.checked_mul(4) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; + let fixed_len = match header_len + .checked_add(pairs_len) + .and_then(|v| v.checked_add(counts_len)) + .and_then(|v| v.checked_add(run_counts_len)) + { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; if self.overflow.len() < fixed_len { return None; @@ -277,11 +295,20 @@ impl XZAssignmentReader { self.overflow[offset + 3], ]) as usize; run_counts.push(rc); - total_runs += rc; + total_runs = match total_runs.checked_add(rc) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; } - let run_data_len = total_runs * 2; - let total_len = fixed_len + run_data_len; + let run_data_len = match total_runs.checked_mul(2) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; + let total_len = match fixed_len.checked_add(run_data_len) { + Some(v) => v, + None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), + }; if self.overflow.len() < total_len { return None; } @@ -409,6 +436,13 @@ impl XZAssignmentReader { } } +fn zero_count_frame_error() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + "XBEN frame count must be greater than zero", + ) +} + /// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. /// /// # Arguments @@ -439,7 +473,14 @@ impl Iterator for XZAssignmentReader { if let Some((frame_bytes, consumed, count)) = self.pop_frame_from_overflow(&self.overflow) { - let res = match decode_xben_frame_to_assignment(frame_bytes, self.inner_variant) { + if count == 0 { + self.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error())); + } + let res = match decode_xben_frame_to_assignment( + frame_bytes, + self.inner_variant, + ) { Ok(assignment) => { self.previous_assignment = Some(assignment.clone()); Ok((assignment, count)) @@ -453,6 +494,9 @@ impl Iterator for XZAssignmentReader { BenVariant::TwoDelta => { // Drain frames from a previously parsed chunk first. if let Some((frame, count)) = self.chunk_queue.pop_front() { + if count == 0 { + return Some(Err(zero_count_frame_error())); + } let assignment = match frame { XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), XBenTwoDeltaFrame::Delta { pair, run_lengths } => { @@ -487,6 +531,10 @@ impl Iterator for XZAssignmentReader { if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { let res = match parsed { Ok((frame, consumed, count)) => { + if count == 0 { + self.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error())); + } let assignment = match frame { XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), XBenTwoDeltaFrame::Delta { pair, run_lengths } => { @@ -581,6 +629,10 @@ impl Iterator for XZAssignmentFrameReader { if let Some((frame, consumed, count)) = self.inner.pop_frame_from_overflow(&self.inner.overflow) { + if count == 0 { + self.inner.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error())); + } let out = frame.to_vec(); self.inner.overflow.drain(..consumed); return Some(Ok((out, count))); @@ -675,4 +727,3 @@ impl XZAssignmentReader { SubsampleFrameDecoder::every(Box::new(frames), step, offset) } } - diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs index 20db650..9ca0df4 100644 --- a/ben/src/io/writer/assignment_writer.rs +++ b/ben/src/io/writer/assignment_writer.rs @@ -1,6 +1,6 @@ use super::utils::parse_json_assignment; use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::{BenConstruct, BenEncodeFrame, MkvBenEncodeFrame}; +use crate::codec::{BenConstruct, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; use crate::format::banners::banner_for_variant; use crate::BenVariant; use serde_json::Value; @@ -81,6 +81,9 @@ impl AssignmentWriter { Some(self.sample_count), ); self.writer.write_all(frame.as_slice())?; + } else if self.previous_sample == pending_sample { + let frame = twodelta_repeat_frame(&pending_sample, self.sample_count)?; + self.writer.write_all(frame.as_slice())?; } else { let frame = encode_twodelta_frame_with_hint( &self.previous_sample, @@ -112,6 +115,12 @@ impl AssignmentWriter { /// Returns `Ok(())` after the assignment has been queued or written. pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { if self.pending_sample.as_deref() == Some(assign_vec.as_slice()) { + if self.sample_count == u16::MAX { + self.flush_pending_frame()?; + self.pending_sample = Some(assign_vec); + self.sample_count = 1; + return Ok(()); + } self.sample_count += 1; return Ok(()); } @@ -153,6 +162,49 @@ impl AssignmentWriter { } } +fn twodelta_repeat_frame(assignment: &[u16], count: u16) -> io::Result { + let first = assignment.first().copied().unwrap_or(0); + let second = assignment + .iter() + .copied() + .find(|&value| value != first) + .unwrap_or_else(|| if first == u16::MAX { 0 } else { first + 1 }); + + let mut run_lengths = Vec::new(); + let mut current = first; + let mut run_len = 0u16; + + for &value in assignment { + if value != first && value != second { + continue; + } + if value == current { + if run_len == u16::MAX { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "TwoDelta repeat frame contains a run longer than u16::MAX", + )); + } + run_len += 1; + } else { + if run_len > 0 { + run_lengths.push(run_len); + } + current = value; + run_len = 1; + } + } + if run_len > 0 { + run_lengths.push(run_len); + } + + Ok(TwoDeltaEncodeFrame::from_run_lengths( + (first, second), + run_lengths, + Some(count), + )) +} + impl Drop for AssignmentWriter { /// Flush any buffered BEN state during drop. fn drop(&mut self) { diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index e69de29..8b13789 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -0,0 +1 @@ + diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index 7a62a80..81ed5c1 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -174,6 +174,12 @@ impl XZAssignmentWriter { } BenVariant::MkvChain => { if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { + if self.count == u16::MAX { + self.flush_pending_frame()?; + self.pending_assignment = Some(assign_vec); + self.count = 1; + return Ok(()); + } self.count += 1; return Ok(()); } @@ -190,6 +196,13 @@ impl XZAssignmentWriter { } // Repeat of the pending initial full frame. if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { + if self.count == u16::MAX { + self.flush_pending_frame()?; + let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; + self.chunk_buffer.push(repeat); + self.previous_assignment = assign_vec; + return Ok(()); + } self.count += 1; return Ok(()); } @@ -197,7 +210,13 @@ impl XZAssignmentWriter { if !self.chunk_buffer.is_empty() && self.previous_assignment.as_slice() == assign_vec.as_slice() { - self.chunk_buffer.last_mut().unwrap().count += 1; + if self.chunk_buffer.last().unwrap().count == u16::MAX { + self.flush_chunk()?; + let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; + self.chunk_buffer.push(repeat); + } else { + self.chunk_buffer.last_mut().unwrap().count += 1; + } return Ok(()); } // New distinct assignment: flush the initial full frame if pending. @@ -354,6 +373,52 @@ impl XZAssignmentWriter { } } +fn twodelta_repeat_buffered_frame( + assignment: &[u16], + count: u16, +) -> io::Result { + let first = assignment.first().copied().unwrap_or(0); + let second = assignment + .iter() + .copied() + .find(|&value| value != first) + .unwrap_or_else(|| if first == u16::MAX { 0 } else { first + 1 }); + + let mut run_lengths = Vec::new(); + let mut current = first; + let mut run_len = 0u16; + + for &value in assignment { + if value != first && value != second { + continue; + } + if value == current { + if run_len == u16::MAX { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "TwoDelta repeat frame contains a run longer than u16::MAX", + )); + } + run_len += 1; + } else { + if run_len > 0 { + run_lengths.push(run_len); + } + current = value; + run_len = 1; + } + } + if run_len > 0 { + run_lengths.push(run_len); + } + + Ok(BufferedDeltaFrame { + pair: (first, second), + run_lengths, + count, + }) +} + impl Drop for XZAssignmentWriter { /// Flush any buffered XBEN state during drop. fn drop(&mut self) { diff --git a/ben/src/ops/relabel/errors.rs b/ben/src/ops/relabel/errors.rs index d8d52fb..bbc1467 100644 --- a/ben/src/ops/relabel/errors.rs +++ b/ben/src/ops/relabel/errors.rs @@ -10,14 +10,20 @@ pub enum RelabelError { )] NonContiguousMap { max_key: usize, missing: usize }, - #[error( - "relabel map length {map_len} does not match assignment length {assignment_len}" - )] + #[error("relabel map length {map_len} does not match assignment length {assignment_len}")] LengthMismatch { map_len: usize, assignment_len: usize, }, + #[error( + "relabel map references old index {old_idx}, but assignment length is {assignment_len}" + )] + OldIndexOutOfRange { + old_idx: usize, + assignment_len: usize, + }, + #[error("IO error: {0}")] Io(#[from] io::Error), } diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 8fa350c..faadfc8 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -96,6 +96,12 @@ fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result= assignment.len() { + return Err(io::Error::from(RelabelError::OldIndexOutOfRange { + old_idx, + assignment_len: assignment.len(), + })); + } out[new_idx] = assignment[old_idx]; } Ok(out) @@ -568,6 +574,12 @@ fn relabel_ben_lines_with_map_impl( } for (new_idx, &old_idx) in permutation.iter().enumerate() { + if old_idx >= assignment_vec.len() { + return Err(io::Error::from(RelabelError::OldIndexOutOfRange { + old_idx, + assignment_len: assignment_vec.len(), + })); + } new_assignment_vec[new_idx] = assignment_vec[old_idx]; } diff --git a/ben/src/util/rle/mod.rs b/ben/src/util/rle/mod.rs index 0739819..c3c92ff 100644 --- a/ben/src/util/rle/mod.rs +++ b/ben/src/util/rle/mod.rs @@ -77,6 +77,11 @@ pub(crate) fn assign_slice_to_rle(assign_vec: &[u16], rle_vec: &mut Vec<(u16, u1 continue; } if assign == prev_assign { + if count == u16::MAX { + rle_vec.push((prev_assign, count)); + count = 1; + continue; + } count += 1; } else { rle_vec.push((prev_assign, count)); diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index c110221..1afdc8c 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -128,8 +128,11 @@ mod mkvchain { let b = vec![2u16, 2, 2]; let c = vec![3u16, 3, 3]; let assignments = [ - a.clone(), a.clone(), a.clone(), - b.clone(), b.clone(), + a.clone(), + a.clone(), + a.clone(), + b.clone(), + b.clone(), c.clone(), ]; let ben = encode_ben(&assignments, BenVariant::MkvChain); @@ -191,7 +194,13 @@ mod mkvchain { fn count_samples_with_no_repetitions() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 3 + ); } #[test] @@ -199,15 +208,30 @@ mod mkvchain { // 3×A + 2×B = 5 total samples from 2 frames. let a = vec![1u16, 0]; let b = vec![0u16, 1]; - let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..2).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..3) + .map(|_| a.clone()) + .chain((0..2).map(|_| b.clone())) + .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 5); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 5 + ); } #[test] fn count_samples_empty_stream() { let ben = MKVCHAIN_BEN_BANNER.to_vec(); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 0); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 0 + ); } // ─── write_all_jsonl ────────────────────────────────────────────────────── @@ -219,7 +243,10 @@ mod mkvchain { let ben = encode_ben(&vec![assignment.clone(); 3], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); assert_eq!(s.lines().count(), 3, "expected 3 JSONL lines for 3 samples"); @@ -235,13 +262,22 @@ mod mkvchain { let ben = encode_ben(&vec![assignment; 3], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); - let parsed: Vec = - s.lines().map(|l| serde_json::from_str(l).unwrap()).collect(); + let parsed: Vec = s + .lines() + .map(|l| serde_json::from_str(l).unwrap()) + .collect(); for (i, v) in parsed.iter().enumerate() { - assert_eq!(v["sample"], i as u64 + 1, "sample number mismatch at position {i}"); + assert_eq!( + v["sample"], + i as u64 + 1, + "sample number mismatch at position {i}" + ); } } @@ -253,7 +289,10 @@ mod mkvchain { let ben = encode_ben(&[a.clone(), a.clone(), b.clone()], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); let lines: Vec<&str> = s.lines().collect(); @@ -273,7 +312,10 @@ mod mkvchain { let ben = encode_ben(&assignments, BenVariant::MkvChain); let mut via_reader = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut via_reader).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut via_reader) + .unwrap(); let mut via_codec = Vec::new(); decode_ben_to_jsonl(ben.as_slice(), &mut via_codec).unwrap(); @@ -306,7 +348,14 @@ mod mkvchain { let a = vec![1u16, 1]; let b = vec![2u16, 2]; let c = vec![3u16, 3]; - let assignments = [a.clone(), a.clone(), a.clone(), b.clone(), b.clone(), c.clone()]; + let assignments = [ + a.clone(), + a.clone(), + a.clone(), + b.clone(), + b.clone(), + c.clone(), + ]; let ben = encode_ben(&assignments, BenVariant::MkvChain); let mut frames: Vec<(Vec, u16)> = Vec::new(); @@ -411,7 +460,10 @@ mod mkvchain { // A×5, B×5; index 3 is in the A run, index 6 is the first B. let a = vec![1u16; 4]; let b = vec![2u16; 4]; - let assignments: Vec<_> = (0..5).map(|_| a.clone()).chain((0..5).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..5) + .map(|_| a.clone()) + .chain((0..5).map(|_| b.clone())) + .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) @@ -435,7 +487,11 @@ mod mkvchain { .into_subsample_by_indices(vec![2usize, 4]) .map(|r| r.unwrap()) .collect(); - assert_eq!(selected.len(), 1, "two indices in same frame → one result tuple"); + assert_eq!( + selected.len(), + 1, + "two indices in same frame → one result tuple" + ); assert_eq!(selected[0].0, a); assert_eq!(selected[0].1, 2, "count should be 2"); } @@ -445,7 +501,10 @@ mod mkvchain { // A×3, B×3; range [2, 5] → A contributes samples 2,3 (count=2) and B contributes 4,5 (count=2). let a = vec![10u16; 3]; let b = vec![20u16; 3]; - let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..3) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) @@ -471,7 +530,11 @@ mod mkvchain { .into_subsample_every(2, 1) .map(|r| r.unwrap()) .collect(); - assert_eq!(selected.len(), 1, "all selected indices in one frame → one result"); + assert_eq!( + selected.len(), + 1, + "all selected indices in one frame → one result" + ); assert_eq!(selected[0].0, a); assert_eq!(selected[0].1, 3, "indices 1,3,5 selected → count=3"); } @@ -481,7 +544,10 @@ mod mkvchain { // A×4, B×4; every 2nd from offset 2 → indices 2,4,6,8 → 2 from A, 2 from B. let a = vec![10u16; 2]; let b = vec![20u16; 2]; - let assignments: Vec<_> = (0..4).map(|_| a.clone()).chain((0..4).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..4) + .map(|_| a.clone()) + .chain((0..4).map(|_| b.clone())) + .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) @@ -501,7 +567,11 @@ mod mkvchain { let assignment = vec![1u16, 1]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + let err = AssignmentReader::new(truncated) + .unwrap() + .next() + .unwrap() + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } @@ -510,7 +580,11 @@ mod mkvchain { let assignment = vec![1u16, 2, 3, 4, 5]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 5]; - let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + let err = AssignmentReader::new(truncated) + .unwrap() + .next() + .unwrap() + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } @@ -519,7 +593,10 @@ mod mkvchain { let assignment = vec![1u16, 2]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated).unwrap().count_samples().unwrap_err(); + let err = AssignmentReader::new(truncated) + .unwrap() + .count_samples() + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } @@ -633,7 +710,9 @@ mod twodelta { // A longer chain: a, b, a, b, a, b (6 assignments, 3 a→b and 2 b→a deltas). let a = vec![1u16, 1, 2, 2, 1, 2]; let b = vec![2u16, 2, 1, 1, 2, 1]; // 1↔2 everywhere - let input: Vec> = (0..6).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let input: Vec> = (0..6) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); let ben = encode_twodelta(&input); assert_eq!(expand_assignments(&ben), input); } @@ -681,7 +760,10 @@ mod twodelta { // a×2, b×3 → anchor(2), delta(3). Expanding must give 5 correct assignments. let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); assert_eq!(expand_assignments(&ben), assignments); } @@ -693,8 +775,11 @@ mod twodelta { let b = vec![2u16, 2, 1, 1]; let assignments = vec![ a.clone(), - b.clone(), b.clone(), - a.clone(), a.clone(), a.clone(), + b.clone(), + b.clone(), + a.clone(), + a.clone(), + a.clone(), b.clone(), ]; let ben = encode_twodelta(&assignments); @@ -706,7 +791,13 @@ mod twodelta { #[test] fn count_samples_single_anchor() { let ben = encode_twodelta(&[vec![1u16, 2, 3]]); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 1); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 1 + ); } #[test] @@ -715,7 +806,13 @@ mod twodelta { let b = vec![2u16, 2, 1, 1]; let assignments = vec![a.clone(), b.clone(), a.clone()]; let ben = encode_twodelta(&assignments); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 3); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 3 + ); } #[test] @@ -723,9 +820,18 @@ mod twodelta { // a×2, b×3 → 5 total. let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); - assert_eq!(AssignmentReader::new(ben.as_slice()).unwrap().count_samples().unwrap(), 5); + assert_eq!( + AssignmentReader::new(ben.as_slice()) + .unwrap() + .count_samples() + .unwrap(), + 5 + ); } // ─── write_all_jsonl ────────────────────────────────────────────────────── @@ -736,7 +842,10 @@ mod twodelta { let ben = encode_twodelta(&[assignment.clone()]); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); assert_eq!(s.lines().count(), 1); @@ -748,11 +857,17 @@ mod twodelta { // a×2, b×3 → 5 lines with correct content. let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); let lines: Vec<&str> = s.lines().collect(); @@ -768,15 +883,23 @@ mod twodelta { fn write_all_jsonl_sample_numbers_are_sequential() { let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut out).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut out) + .unwrap(); let s = String::from_utf8(out).unwrap(); - let parsed: Vec = - s.lines().map(|l| serde_json::from_str(l).unwrap()).collect(); + let parsed: Vec = s + .lines() + .map(|l| serde_json::from_str(l).unwrap()) + .collect(); for (i, v) in parsed.iter().enumerate() { assert_eq!(v["sample"], i as u64 + 1, "sample number at position {i}"); } @@ -789,7 +912,10 @@ mod twodelta { let ben = encode_twodelta(&[a.clone(), b.clone(), a.clone()]); let mut via_reader = Vec::new(); - AssignmentReader::new(ben.as_slice()).unwrap().write_all_jsonl(&mut via_reader).unwrap(); + AssignmentReader::new(ben.as_slice()) + .unwrap() + .write_all_jsonl(&mut via_reader) + .unwrap(); let mut via_codec = Vec::new(); decode_ben_to_jsonl(ben.as_slice(), &mut via_codec).unwrap(); @@ -822,7 +948,10 @@ mod twodelta { // a×2, b×3 → callback invoked twice: (a, 2) then (b, 3). let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); let mut frames: Vec<(Vec, u16)> = Vec::new(); @@ -868,7 +997,10 @@ mod twodelta { // a×2, b×3 → 2 re-encoded frames with counts [2, 3]. let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..2).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..2) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) @@ -932,7 +1064,9 @@ mod twodelta { // a, b, a, b, a → 5 distinct frames (no run-length compression). let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let input: Vec<_> = (0..5).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let input: Vec<_> = (0..5) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); let ben = encode_twodelta(&input); let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) @@ -990,7 +1124,14 @@ mod twodelta { let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; let c = vec![1u16, 2, 1, 2]; - let input = vec![a.clone(), b.clone(), c.clone(), a.clone(), b.clone(), c.clone()]; + let input = vec![ + a.clone(), + b.clone(), + c.clone(), + a.clone(), + b.clone(), + c.clone(), + ]; let ben = encode_twodelta(&input); let selected: Vec> = AssignmentReader::new(ben.as_slice()) @@ -1010,7 +1151,10 @@ mod twodelta { // Index 4 is the first b → (b, 1). let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..3).map(|_| a.clone()).chain((0..3).map(|_| b.clone())).collect(); + let assignments: Vec<_> = (0..3) + .map(|_| a.clone()) + .chain((0..3).map(|_| b.clone())) + .collect(); let ben = encode_twodelta(&assignments); let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) @@ -1032,7 +1176,11 @@ mod twodelta { let assignment = vec![1u16, 2, 3]; let ben = encode_twodelta(&[assignment]); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated).unwrap().next().unwrap().unwrap_err(); + let err = AssignmentReader::new(truncated) + .unwrap() + .next() + .unwrap() + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } @@ -1055,7 +1203,10 @@ mod twodelta { let b = vec![2u16, 2, 1, 1]; let ben = encode_twodelta(&[a, b]); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated).unwrap().count_samples().unwrap_err(); + let err = AssignmentReader::new(truncated) + .unwrap() + .count_samples() + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 7ab755a..451582f 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -909,6 +909,83 @@ fn reben_cli_json_and_ben_modes_work() { assert!(relabeled_text.contains(r#""assignment":[9,4,9]"#)); } +#[test] +fn reben_cli_rejects_map_referencing_missing_assignment_index() { + let temp = TempDir::new("reben-bad-map"); + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.jsonl.ben"); + let map_path = temp.path().join("bad_map.json"); + let out_path = temp.path().join("out.ben"); + + fs::write( + &jsonl_path, + r#"{"assignment":[9,4],"sample":1} +"#, + ) + .unwrap(); + + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben( + BufReader::new(fs::File::open(&jsonl_path).unwrap()), + &mut ben_bytes, + BenVariant::Standard, + ) + .unwrap(); + fs::write(&ben_path, ben_bytes).unwrap(); + + fs::write( + &map_path, + r#"{"key":"map","relabeling_old_to_new_nodes_map":{"0":0,"2":1}}"#, + ) + .unwrap(); + + let relabel = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&relabel); + let stderr = String::from_utf8_lossy(&relabel.stderr); + assert!( + stderr.contains("Error: BEN relabeling with map") + && stderr.contains("old index 2") + && !stderr.contains("panicked"), + "stderr:\n{stderr}" + ); + + let malformed_map_path = temp.path().join("malformed_map.json"); + fs::write(&malformed_map_path, r#"{"key":"map"}"#).unwrap(); + let malformed = run( + "reben", + &[ + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + malformed_map_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&malformed); + let stderr = String::from_utf8_lossy(&malformed.stderr); + assert!( + stderr.contains("Error: Map file") + && stderr.contains("relabeling_old_to_new_nodes_map") + && !stderr.contains("panicked"), + "stderr:\n{stderr}" + ); +} + #[test] fn reben_cli_can_limit_ben_relabeling_to_first_n_items() { let temp = TempDir::new("reben-limit"); diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 7772cb2..f1b9d84 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -15,7 +15,8 @@ use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::reader::{ - AssignmentReader, AssignmentFrameReader, DecoderInitError, XZAssignmentReader, XZAssignmentFrameReader, + AssignmentFrameReader, AssignmentReader, DecoderInitError, XZAssignmentFrameReader, + XZAssignmentReader, }; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::json::graph::{ @@ -1922,7 +1923,9 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::TwoDelta).unwrap(); // AssignmentFrameReader should re-encode TwoDelta frames back to standard BEN frames - let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)) + .unwrap() + .silent(true); let frame_iter = decoder.into_frames(); let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); @@ -1936,7 +1939,9 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { fn ben_decoder_subsample_by_indices() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 4]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)) + .unwrap() + .silent(true); // 1-based indices: 2, 5, 8 let selected: Vec> = decoder .into_subsample_by_indices(vec![2usize, 5, 8]) @@ -1952,7 +1957,9 @@ fn ben_decoder_subsample_by_indices() { fn ben_decoder_subsample_by_range() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 3]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)) + .unwrap() + .silent(true); // Inclusive 1-based range [3, 6] let selected: Vec> = decoder .into_subsample_by_range(3, 6) @@ -1967,7 +1974,9 @@ fn ben_decoder_subsample_by_range() { fn ben_decoder_subsample_every_nth() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)) + .unwrap() + .silent(true); // Every 3rd sample starting at 1-based offset 1: samples 1, 4, 7, 10 let selected: Vec> = decoder .into_subsample_every(3, 1) @@ -1984,7 +1993,9 @@ fn ben_decoder_subsample_every_nth() { fn ben_decoder_subsample_by_indices_dedup() { let assignments: Vec> = (0u16..5).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)).unwrap().silent(true); + let decoder = AssignmentReader::new(Cursor::new(ben)) + .unwrap() + .silent(true); // Duplicate index 2 → after dedup only samples 2 and 3 are selected let selected: Vec> = decoder .into_subsample_by_indices(vec![2usize, 2, 3]) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 1909b84..d401d8a 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -611,7 +611,10 @@ fn xben_decoder_rejects_bad_banner() { let err = XZAssignmentReader::new(xz.as_slice()) .err() .expect("expeced InvalidFileFormat error"); - assert_eq!(std::io::Error::from(err).kind(), std::io::ErrorKind::InvalidData); + assert_eq!( + std::io::Error::from(err).kind(), + std::io::ErrorKind::InvalidData + ); } #[test] @@ -740,7 +743,10 @@ fn xben_new_invalid_banner() { let err = XZAssignmentReader::new(wrong.as_slice()) .err() .expect("expected invalid data"); - assert_eq!(std::io::Error::from(err).kind(), std::io::ErrorKind::InvalidData); + assert_eq!( + std::io::Error::from(err).kind(), + std::io::ErrorKind::InvalidData + ); } #[test] @@ -987,7 +993,8 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { ) .unwrap(); - let mut frames = binary_ensemble::io::reader::XZAssignmentFrameReader::new(xz.as_slice()).unwrap(); + let mut frames = + binary_ensemble::io::reader::XZAssignmentFrameReader::new(xz.as_slice()).unwrap(); assert!(frames.next().unwrap().is_ok()); let trimmed = &xz[..xz.len() - 1]; @@ -1370,11 +1377,9 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let frames = AssignmentReader::new(ben.as_slice()).unwrap().into_frames(); assert_eq!( - collect_frames( - frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))) - ) - .unwrap() - .len(), + collect_frames(frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt)))) + .unwrap() + .len(), 3 ); } diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs new file mode 100644 index 0000000..7c439ee --- /dev/null +++ b/ben/tests/test_stress_edges.rs @@ -0,0 +1,640 @@ +use binary_ensemble::codec::decode::{ + decode_ben_to_jsonl, decode_twodelta_frame, decode_xben_to_ben, decode_xben_to_jsonl, + xz_decompress, +}; +use binary_ensemble::codec::encode::{encode_jsonl_to_xben, xz_compress}; +use binary_ensemble::codec::{BenConstruct, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; +use binary_ensemble::format::banners::{ + MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, +}; +use binary_ensemble::io::bundle::format::{ + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_TYPE_CUSTOM, + ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_YES, + HEADER_SIZE, +}; +use binary_ensemble::io::bundle::writer::{ + AddAssetOptions, BendlAppender, BendlTruncate, BendlWriter, +}; +use binary_ensemble::io::bundle::BendlReader; +use binary_ensemble::io::reader::{AssignmentReader, XZAssignmentReader}; +use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::ops::relabel::relabel_ben_file_with_map; +use binary_ensemble::BenVariant; +use std::cell::RefCell; +use std::collections::HashMap; +use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; +use std::rc::Rc; + +fn expand_ben(bytes: &[u8]) -> Vec> { + AssignmentReader::new(bytes) + .unwrap() + .silent(true) + .flat_map(|record| { + let (assignment, count) = record.unwrap(); + std::iter::repeat(assignment).take(count as usize) + }) + .collect() +} + +fn minimal_bendl_with_entries( + entries: Vec, + directory_len_adjustment: i64, +) -> Vec { + let mut bytes = vec![0u8; HEADER_SIZE]; + let directory_offset = bytes.len() as u64; + let mut directory = encode_directory(&entries).unwrap(); + if directory_len_adjustment > 0 { + directory.extend(std::iter::repeat(0u8).take(directory_len_adjustment as usize)); + } + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: (directory.len() as i64 + directory_len_adjustment.min(0)) as u64, + stream_offset: directory_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + bytes +} + +fn expect_bendl_open_err(bytes: Vec) -> binary_ensemble::io::bundle::format::BendlFormatError { + match BendlReader::open(Cursor::new(bytes)) { + Ok(_) => panic!("expected BendlReader::open to fail"), + Err(err) => err, + } +} + +#[derive(Debug)] +struct CrashState { + bytes: Vec, + pos: u64, + truncated: bool, +} + +#[derive(Debug, Clone)] +struct HeaderPatchCrashCursor { + state: Rc>, +} + +impl HeaderPatchCrashCursor { + fn new(bytes: Vec) -> (Self, Rc>) { + let state = Rc::new(RefCell::new(CrashState { + bytes, + pos: 0, + truncated: false, + })); + ( + Self { + state: Rc::clone(&state), + }, + state, + ) + } +} + +impl Read for HeaderPatchCrashCursor { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let mut state = self.state.borrow_mut(); + let pos = state.pos as usize; + if pos >= state.bytes.len() { + return Ok(0); + } + let count = buf.len().min(state.bytes.len() - pos); + buf[..count].copy_from_slice(&state.bytes[pos..pos + count]); + state.pos += count as u64; + Ok(count) + } +} + +impl Write for HeaderPatchCrashCursor { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let mut state = self.state.borrow_mut(); + if state.truncated && state.pos < HEADER_SIZE as u64 { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "simulated crash while patching bundle header", + )); + } + let pos = state.pos as usize; + let end = pos + buf.len(); + if end > state.bytes.len() { + state.bytes.resize(end, 0); + } + state.bytes[pos..end].copy_from_slice(buf); + state.pos = end as u64; + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl Seek for HeaderPatchCrashCursor { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + let mut state = self.state.borrow_mut(); + let next = match pos { + SeekFrom::Start(offset) => offset as i128, + SeekFrom::End(offset) => state.bytes.len() as i128 + offset as i128, + SeekFrom::Current(offset) => state.pos as i128 + offset as i128, + }; + if next < 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seek before start", + )); + } + state.pos = next as u64; + Ok(state.pos) + } +} + +impl BendlTruncate for HeaderPatchCrashCursor { + fn truncate_at(&mut self, len: u64) -> std::io::Result<()> { + let mut state = self.state.borrow_mut(); + state.truncated = true; + state.bytes.truncate(len as usize); + if state.pos > len { + state.pos = len; + } + Ok(()) + } +} + +fn tiny_bendl_bundle() -> Vec { + let mut writer = BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "base.bin", + b"base", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00\x01\x02", 1) + .unwrap(); + writer.finish().unwrap().into_inner() +} + +fn assert_ben_bytes_do_not_panic(bytes: Vec) { + let outcome = std::panic::catch_unwind(|| { + if let Ok(reader) = AssignmentReader::new(bytes.as_slice()) { + for record in reader.silent(true).take(16) { + let _ = record; + } + } + }); + assert!(outcome.is_ok(), "BEN parser panicked for bytes: {bytes:?}"); +} + +fn assert_xben_bytes_do_not_panic(bytes: Vec) { + let outcome = std::panic::catch_unwind(|| { + if let Ok(reader) = XZAssignmentReader::new(bytes.as_slice()) { + for record in reader.silent(true).take(16) { + let _ = record; + } + } + }); + assert!(outcome.is_ok(), "XBEN parser panicked for bytes: {bytes:?}"); +} + +#[test] +fn standard_rle_splits_assignment_run_longer_than_u16_max() { + let assignment = vec![7u16; u16::MAX as usize + 1]; + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); + writer.write_assignment(assignment.clone()).unwrap(); + writer.finish().unwrap(); + } + + let decoded = expand_ben(&ben); + assert_eq!(decoded, vec![assignment]); +} + +#[test] +fn mkvchain_writer_splits_repetition_count_longer_than_u16_max() { + let sample = vec![1u16, 2, 2, 1]; + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::MkvChain).unwrap(); + for _ in 0..(u16::MAX as usize + 1) { + writer.write_assignment(sample.clone()).unwrap(); + } + writer.finish().unwrap(); + } + + let mut reader = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let first = reader.next().unwrap().unwrap(); + let second = reader.next().unwrap().unwrap(); + assert!(reader.next().is_none()); + assert_eq!(first, (sample.clone(), u16::MAX)); + assert_eq!(second, (sample, 1)); +} + +#[test] +fn twodelta_writer_splits_repetition_count_longer_than_u16_max() { + let sample = vec![1u16, 1, 2, 2]; + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for _ in 0..(u16::MAX as usize + 1) { + writer.write_assignment(sample.clone()).unwrap(); + } + writer.finish().unwrap(); + } + + let mut total = 0usize; + let mut unique_frames = 0usize; + AssignmentReader::new(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + assert_eq!(assignment, sample.as_slice()); + total += count as usize; + unique_frames += 1; + Ok(true) + }) + .unwrap(); + assert_eq!(total, u16::MAX as usize + 1); + assert_eq!(unique_frames, 2); +} + +#[test] +fn xben_mkvchain_splits_repetition_count_longer_than_u16_max() { + let mut jsonl = String::new(); + for sample in 1..=(u16::MAX as usize + 1) { + jsonl.push_str(&format!(r#"{{"assignment":[4,4,5],"sample":{sample}}}"#)); + jsonl.push('\n'); + } + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_bytes()), + &mut xben, + BenVariant::MkvChain, + Some(1), + Some(0), + None, + ) + .unwrap(); + + let mut reader = XZAssignmentReader::new(xben.as_slice()) + .unwrap() + .silent(true); + assert_eq!(reader.next().unwrap().unwrap(), (vec![4, 4, 5], u16::MAX)); + assert_eq!(reader.next().unwrap().unwrap(), (vec![4, 4, 5], 1)); + assert!(reader.next().is_none()); +} + +#[test] +fn malformed_ben_bit_widths_return_invalid_data() { + let mut ben = STANDARD_BEN_BANNER.to_vec(); + ben.extend_from_slice(&[0, 1, 0, 0, 0, 0]); + let err = AssignmentReader::new(ben.as_slice()) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + + let mut ben = STANDARD_BEN_BANNER.to_vec(); + ben.extend_from_slice(&[17, 1, 0, 0, 0, 0]); + let err = decode_ben_to_jsonl(ben.as_slice(), Vec::new()).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { + let anchor = MkvBenEncodeFrame::from_assignment(vec![1u16, 2], Some(1)); + let mut ben = TWODELTA_BEN_BANNER.to_vec(); + ben.extend_from_slice(anchor.as_slice()); + ben.extend_from_slice(&[0, 1, 0, 2, 0, 0, 0, 0, 0, 1]); + + let mut reader = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + assert_eq!(reader.next().unwrap().unwrap(), (vec![1, 2], 1)); + let err = reader.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + + let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let err = decode_twodelta_frame(vec![1u16], &frame).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn direct_xben_helpers_propagate_corrupt_xz_errors() { + let jsonl = b"{\"assignment\":[1,2,1],\"sample\":1}\n"; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::Standard, + Some(1), + Some(0), + None, + ) + .unwrap(); + xben.truncate(xben.len() - 1); + + assert!(decode_xben_to_jsonl(BufReader::new(xben.as_slice()), Vec::new()).is_err()); + assert!(decode_xben_to_ben(BufReader::new(xben.as_slice()), Vec::new()).is_err()); + assert!(xz_decompress(BufReader::new(xben.as_slice()), Vec::new()).is_err()); +} + +#[test] +fn xz_compress_propagates_input_reader_errors() { + struct FailingReader; + impl std::io::Read for FailingReader { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + Err(std::io::Error::new(std::io::ErrorKind::Other, "boom")) + } + } + impl std::io::BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Err(std::io::Error::new(std::io::ErrorKind::Other, "boom")) + } + fn consume(&mut self, _amt: usize) {} + } + + let err = xz_compress(FailingReader, Vec::new(), Some(1), Some(0)).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); +} + +#[test] +fn relabel_map_out_of_range_old_indices_error_cleanly() { + let mut ben = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); + writer.write_assignment(vec![10, 20]).unwrap(); + writer.finish().unwrap(); + } + + let out_of_range_old = HashMap::from([(0usize, 0usize), (1, 2)]); + let err = relabel_ben_file_with_map(ben.as_slice(), Vec::new(), out_of_range_old).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +#[test] +fn bendl_open_rejects_malformed_directory_invariants() { + let dup_entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "dup.bin".to_string(), + payload_offset: HEADER_SIZE as u64, + payload_len: 0, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "dup.bin".to_string(), + payload_offset: HEADER_SIZE as u64, + payload_len: 0, + checksum: None, + }, + ]; + let duplicate_bundle = minimal_bendl_with_entries(dup_entries, 0); + let err = expect_bendl_open_err(duplicate_bundle.clone()); + assert!(err.to_string().contains("malformed directory")); + assert!(BendlAppender::open(Cursor::new(duplicate_bundle)).is_err()); + + let wrong_canonical = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: 0, + name: "not_graph.json".to_string(), + payload_offset: HEADER_SIZE as u64, + payload_len: 0, + checksum: None, + }]; + let err = expect_bendl_open_err(minimal_bendl_with_entries(wrong_canonical, 0)); + assert!(err.to_string().contains("malformed directory")); +} + +#[test] +fn bendl_open_rejects_directory_len_mismatches() { + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "ok.bin".to_string(), + payload_offset: HEADER_SIZE as u64, + payload_len: 0, + checksum: None, + }]; + + let trailing = minimal_bendl_with_entries(entries.clone(), 1); + let err = expect_bendl_open_err(trailing); + assert!(err.to_string().contains("trailing")); + + let too_short = minimal_bendl_with_entries(entries, -1); + let err = expect_bendl_open_err(too_short); + assert!(matches!( + err, + binary_ensemble::io::bundle::format::BendlFormatError::Io(_) + )); +} + +#[test] +fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { + let mut inner = TWODELTA_BEN_BANNER.to_vec(); + inner.push(2); // XBEN_TWODELTA_CHUNK_TAG + inner.extend_from_slice(&u32::MAX.to_be_bytes()); + + let mut xben = Vec::new(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xben, + Some(1), + Some(0), + ) + .unwrap(); + + let mut reader = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let err = reader.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn zero_count_frames_are_rejected() { + let frame = MkvBenEncodeFrame::from_assignment(vec![1u16], Some(0)); + let mut ben = MKVCHAIN_BEN_BANNER.to_vec(); + ben.extend_from_slice(frame.as_slice()); + let err = AssignmentReader::new(ben.as_slice()) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + + let mut inner = MKVCHAIN_BEN_BANNER.to_vec(); + inner.extend_from_slice(&(1u32 << 16 | 1).to_be_bytes()); + inner.extend_from_slice(&[0, 0, 0, 0]); + inner.extend_from_slice(&0u16.to_be_bytes()); + let mut xben = Vec::new(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xben, + Some(1), + Some(0), + ) + .unwrap(); + let err = XZAssignmentReader::new(xben.as_slice()) + .unwrap() + .next() + .unwrap() + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn seeded_malformed_ben_bytes_do_not_panic() { + let mut valid_standard = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut valid_standard, BenVariant::Standard).unwrap(); + writer.write_assignment(vec![1, 1, 2, 3]).unwrap(); + writer.write_assignment(vec![3, 3, 2, 1]).unwrap(); + writer.finish().unwrap(); + } + + let mut valid_mkv = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut valid_mkv, BenVariant::MkvChain).unwrap(); + writer.write_assignment(vec![4, 4, 5]).unwrap(); + writer.write_assignment(vec![4, 4, 5]).unwrap(); + writer.write_assignment(vec![5, 4, 4]).unwrap(); + writer.finish().unwrap(); + } + + let mut valid_twodelta = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut valid_twodelta, BenVariant::TwoDelta).unwrap(); + writer.write_assignment(vec![1, 1, 2, 2]).unwrap(); + writer.write_assignment(vec![1, 2, 1, 2]).unwrap(); + writer.write_assignment(vec![2, 2, 1, 1]).unwrap(); + writer.finish().unwrap(); + } + + for seed in [valid_standard, valid_mkv, valid_twodelta] { + for len in 0..=seed.len() { + assert_ben_bytes_do_not_panic(seed[..len].to_vec()); + } + + for idx in 0..seed.len() { + let mut mutated = seed.clone(); + mutated[idx] ^= 0xA5; + assert_ben_bytes_do_not_panic(mutated); + } + + if seed.len() >= STANDARD_BEN_BANNER.len() + 6 { + let mut inflated_frame_len = seed.clone(); + let start = STANDARD_BEN_BANNER.len() + 2; + inflated_frame_len[start..start + 4].copy_from_slice(&1024u32.to_be_bytes()); + assert_ben_bytes_do_not_panic(inflated_frame_len); + } + } +} + +#[test] +fn seeded_malformed_xben_bytes_do_not_panic() { + let jsonl = + b"{\"assignment\":[1,1,2,2],\"sample\":1}\n{\"assignment\":[1,2,1,2],\"sample\":2}\n"; + let mut seeds = Vec::new(); + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + variant, + Some(1), + Some(0), + Some(32), + ) + .unwrap(); + seeds.push(xben); + } + + for seed in seeds { + for len in 0..=seed.len() { + assert_xben_bytes_do_not_panic(seed[..len].to_vec()); + } + + for idx in (0..seed.len()).step_by(3) { + let mut mutated = seed.clone(); + mutated[idx] ^= 0x5A; + assert_xben_bytes_do_not_panic(mutated); + } + } + + let mut unknown_tag_inner = STANDARD_BEN_BANNER.to_vec(); + unknown_tag_inner.push(0xFF); + let mut unknown_tag_xben = Vec::new(); + xz_compress( + BufReader::new(unknown_tag_inner.as_slice()), + &mut unknown_tag_xben, + Some(1), + Some(0), + ) + .unwrap(); + assert_xben_bytes_do_not_panic(unknown_tag_xben); +} + +#[test] +fn bendl_append_header_patch_crash_is_rejected_on_reopen() { + let base = tiny_bendl_bundle(); + assert!(BendlReader::open(Cursor::new(base.clone())).is_ok()); + + let (cursor, state) = HeaderPatchCrashCursor::new(base); + let mut appender = BendlAppender::open(cursor).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "after-crash.bin", + b"payload written before header patch", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + + let err = match appender.commit() { + Ok(_) => panic!("expected simulated header patch crash"), + Err(err) => err, + }; + assert!(err.to_string().contains("simulated crash")); + + let damaged = state.borrow().bytes.clone(); + assert!(BendlReader::open(Cursor::new(damaged.clone())).is_err()); + assert!(BendlAppender::open(Cursor::new(damaged)).is_err()); +} + +#[test] +fn bendl_append_truncated_new_directory_is_rejected_on_reopen() { + let base = tiny_bendl_bundle(); + let mut appender = BendlAppender::open(Cursor::new(base)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "complete-append.bin", + b"payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let mut appended = appender.commit().unwrap().into_inner(); + assert!(BendlReader::open(Cursor::new(appended.clone())).is_ok()); + + appended.pop(); + let err = expect_bendl_open_err(appended); + assert!(err.to_string().contains("IO error")); +} From f9336298e0c8ef32ec3d49fa10582f5a1005f3ee Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 11 Apr 2026 11:00:51 -0600 Subject: [PATCH 066/221] update pyben decoder --- ben/src/io/reader/mod.rs | 5 +- ben/src/io/reader/subsample.rs | 35 +- pyben/binary_ensemble/_core.pyi | 94 +++++- pyben/src/bundle/mod.rs | 39 +-- pyben/src/decode/mod.rs | 481 ++++++++++++++++++++++++--- pyben/src/encode/mod.rs | 24 +- pyben/tests/test_bundle.py | 314 +++++++++++++++++ pyben/tests/test_python_pipelines.py | 17 +- 8 files changed, 917 insertions(+), 92 deletions(-) diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs index 6e1eaf6..d7d40b5 100644 --- a/ben/src/io/reader/mod.rs +++ b/ben/src/io/reader/mod.rs @@ -8,7 +8,8 @@ pub mod xz_assignment_reader; pub use assignment_reader::{AssignmentFrameReader, AssignmentReader}; pub use errors::DecoderInitError; pub use subsample::{ - build_frame_iter, count_samples_from_file, Ben32Frame, DecodeFrame, FrameIter, MkvRecord, - Selection, SubsampleFrameDecoder, + build_frame_iter, build_frame_iter_from_reader, count_samples_from_file, + count_samples_from_frame_iter, Ben32Frame, DecodeFrame, FrameIter, MkvRecord, Selection, + SubsampleFrameDecoder, }; pub use xz_assignment_reader::{XZAssignmentFrameReader, XZAssignmentReader}; diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index 940e259..b0b59b5 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -6,7 +6,7 @@ use super::xz_assignment_reader::XZAssignmentReader; use crate::codec::BenDecodeFrame; use crate::BenVariant; use std::fs::File; -use std::io::{self, BufReader}; +use std::io::{self, BufReader, Read}; use std::iter::Peekable; use std::path::{Path, PathBuf}; @@ -257,7 +257,30 @@ where pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { let file = File::options().read(true).open(file_path)?; let reader = BufReader::new(file); + build_frame_iter_from_reader(reader, mode) +} +/// Build a generic frame iterator from an already-opened reader. +/// +/// This is the reader-driven variant of [`build_frame_iter`], useful when +/// the caller needs to iterate frames over a sub-region of a file (e.g. +/// the assignment stream embedded in a `.bendl` bundle, wrapped in a +/// [`std::io::Read::take`] guard) without re-opening the file from offset +/// zero. +/// +/// # Arguments +/// +/// * `reader` - Any owned reader positioned at the start of a `.ben` or +/// `.xben` byte stream. +/// * `mode` - Either `"ben"` or `"xben"`. +/// +/// # Returns +/// +/// Returns a boxed iterator over generic frames and their repetition counts. +pub fn build_frame_iter_from_reader( + reader: R, + mode: &str, +) -> io::Result { match mode { "ben" => { let frames = AssignmentFrameReader::new(reader)?; @@ -293,6 +316,16 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result io::Result { let iter = build_frame_iter(&path.to_path_buf(), mode)?; + count_samples_from_frame_iter(iter) +} + +/// Count the number of samples reachable through a pre-built frame iterator. +/// +/// Mirror of [`count_samples_from_file`] that operates on an existing +/// [`FrameIter`], so callers that already have one (e.g. constructed via +/// [`build_frame_iter_from_reader`] over a bundle's stream region) can +/// reuse the walking logic without re-opening any files. +pub fn count_samples_from_frame_iter(iter: FrameIter) -> io::Result { let mut total = 0usize; for item in iter { let (_frame, cnt) = item?; diff --git a/pyben/binary_ensemble/_core.pyi b/pyben/binary_ensemble/_core.pyi index 64e223e..7c14ae3 100644 --- a/pyben/binary_ensemble/_core.pyi +++ b/pyben/binary_ensemble/_core.pyi @@ -1,9 +1,15 @@ -from typing import Iterable, Iterator, Literal +from typing import Any, Iterable, Iterator, Literal from pathlib import Path class PyBenDecoder: - """Iterator over assignments in a BEN or XBEN file. - Open a decoder over a BEN (`.ben`) or XBEN (`.xben`) file. + """Iterator over assignments in a BEN, XBEN, or BENDL file. + + Open a decoder over a plain stream (`.ben` / `.xben`) or a bundle + (`.bendl`). The file's leading bytes are sniffed; when the BENDL magic + is present, the bundle header decides between BEN and XBEN and the + ``mode`` argument is ignored. Iteration walks only the embedded stream + region, and the bundle's table of contents / asset payloads are + available through the bundle-inspection methods. Construction is lazy with respect to sample counting: opening the decoder does not scan the whole file. The first call to :func:`len` or :meth:`count_samples` @@ -14,7 +20,8 @@ class PyBenDecoder: file_path : Path to the input file. mode : {"ben", "xben"}, default "ben" - Select container format. + Select container format. Only consulted for plain streams; for + bundles the header dictates the format. Raises ------ @@ -106,6 +113,85 @@ class PyBenDecoder: """ ... + # ----------------------------------------------------------------- + # Bundle-inspection surface. + # + # These methods only make sense when the decoder was opened on a + # `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear + # error pointing the user at the right tool. + # ----------------------------------------------------------------- + + def is_bundle(self) -> bool: + """Return True if the decoder was opened on a `.bendl` bundle.""" + ... + + def assignment_format(self) -> Literal["ben", "xben"]: + """Return the container format of the underlying stream.""" + ... + + def version(self) -> tuple[int, int]: + """Return the bundle's format version as ``(major, minor)``. + + Raises an error on plain streams. + """ + ... + + def is_complete(self) -> bool: + """Return whether the bundle was successfully finalized. + + Raises an error on plain streams. + """ + ... + + def asset_names(self) -> list[str]: + """Return the names of every entry in the bundle's directory. + + Raises an error on plain streams. + """ + ... + + def list_assets(self) -> list[dict[str, Any]]: + """Return the full bundle directory as a list of dicts with keys + ``name``, ``type``, ``offset``, ``len``, and ``flags``. + + Raises an error on plain streams. + """ + ... + + def read_asset_bytes(self, name: str) -> bytes: + """Return the (decoded) bytes of the named asset. + + Raises an error on plain streams, a ``KeyError`` when the asset is + absent, and an ``OSError`` when the payload cannot be read. + """ + ... + + def read_json_asset(self, name: str) -> Any: + """Parse a JSON asset into a Python object. + + Raises an error on plain streams, a ``KeyError`` when the asset is + absent, and an exception when the bytes are not valid UTF-8 JSON. + """ + ... + + def read_graph(self) -> Any | None: + """Return the bundle's ``graph.json`` asset as a parsed JSON + object, or ``None`` if absent. Raises on plain streams. + """ + ... + + def read_metadata(self) -> Any | None: + """Return the bundle's ``metadata.json`` asset as a parsed JSON + object, or ``None`` if absent. Raises on plain streams. + """ + ... + + def read_relabel_map(self) -> Any | None: + """Return the bundle's ``relabel_map.json`` asset as a parsed + JSON object, or ``None`` if absent. Raises on plain streams. + """ + ... + class PyBenEncoder: """Encoder for Binary Ensemble (.ben) files. diff --git a/pyben/src/bundle/mod.rs b/pyben/src/bundle/mod.rs index 592687b..055287a 100644 --- a/pyben/src/bundle/mod.rs +++ b/pyben/src/bundle/mod.rs @@ -87,11 +87,7 @@ impl PyBundleReader { /// Names of all directory entries, in directory order. #[pyo3(text_signature = "(self)")] fn asset_names(&self) -> Vec { - self.inner - .assets() - .iter() - .map(|e| e.name.clone()) - .collect() + self.inner.assets().iter().map(|e| e.name.clone()).collect() } /// Return the full directory as a list of dicts with keys @@ -141,9 +137,8 @@ impl PyBundleReader { fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { let bytes = self.read_asset_bytes(name)?; let json_mod = py.import("json")?; - let text = std::str::from_utf8(&bytes).map_err(|e| { - PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")) - })?; + let text = std::str::from_utf8(&bytes) + .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; let parsed = json_mod.call_method1("loads", (text,))?; Ok(parsed.into()) } @@ -152,11 +147,7 @@ impl PyBundleReader { /// Returns `None` if the bundle does not carry a graph asset. #[pyo3(text_signature = "(self)")] fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { - if self - .inner - .find_asset_by_type(ASSET_TYPE_GRAPH) - .is_none() - { + if self.inner.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { return Ok(None); } Ok(Some(self.read_json_asset(py, "graph.json")?)) @@ -166,11 +157,7 @@ impl PyBundleReader { /// or `None` if absent. #[pyo3(text_signature = "(self)")] fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { - if self - .inner - .find_asset_by_type(ASSET_TYPE_METADATA) - .is_none() - { + if self.inner.find_asset_by_type(ASSET_TYPE_METADATA).is_none() { return Ok(None); } Ok(Some(self.read_json_asset(py, "metadata.json")?)) @@ -214,17 +201,15 @@ impl PyBundleReader { .create_new(true) .open(&out_path) } - .map_err(|e| { - PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())) - })?; + .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; let mut out = BufWriter::new(out); - let mut stream = self.inner.assignment_stream_reader().map_err(|e| { - PyException::new_err(format!("Failed to open stream region: {e}")) - })?; - io::copy(&mut stream, &mut out).map_err(|e| { - PyIOError::new_err(format!("Failed to copy stream bytes: {e}")) - })?; + let mut stream = self + .inner + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; + io::copy(&mut stream, &mut out) + .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; Ok(()) } diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index 3124409..cfbfeae 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -2,19 +2,26 @@ use crate::common::{open_input, open_output, validate_input_output_paths}; use binary_ensemble::codec::decode::{ decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, }; +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, BENDL_MAGIC, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, + ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, +}; +use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::{ - build_frame_iter, count_samples_from_file, AssignmentReader, MkvRecord, Selection, - SubsampleFrameDecoder, XZAssignmentReader, + build_frame_iter, build_frame_iter_from_reader, count_samples_from_file, + count_samples_from_frame_iter, AssignmentReader, MkvRecord, Selection, SubsampleFrameDecoder, + XZAssignmentReader, }; -use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; +use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyUserWarning}; use pyo3::prelude::*; use pyo3::types::PyDict; -use std::io; -use std::path::PathBuf; +use std::fs::File; +use std::io::{self, BufReader, Read, Seek, SeekFrom}; +use std::path::{Path, PathBuf}; type DynIter = Box> + Send>; -#[derive(Clone)] +#[derive(Clone, Copy)] enum DecoderMode { Ben, XBen, @@ -37,17 +44,43 @@ impl DecoderMode { Self::XBen => "xben", } } + + fn from_assignment_format(fmt: AssignmentFormat) -> Self { + match fmt { + AssignmentFormat::Ben => Self::Ben, + AssignmentFormat::Xben => Self::XBen, + } + } } -#[derive(Clone)] -struct DecoderSource { - path: PathBuf, - mode: DecoderMode, +/// Cached bundle state for a decoder opened on a `.bendl` file. +/// +/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC +/// inspection and asset-read calls without disturbing the iterator (which +/// reads the stream region through a separate file handle). +struct BundleState { + reader: BendlReader>, + stream_offset: u64, + stream_len: u64, +} + +/// What the decoder was actually opened on. +enum DecoderBackend { + Plain, + Bundle(BundleState), +} + +impl DecoderBackend { + fn is_bundle(&self) -> bool { + matches!(self, DecoderBackend::Bundle(_)) + } } #[pyclass(module = "binary_ensemble", unsendable)] pub struct PyBenDecoder { - source: DecoderSource, + path: PathBuf, + mode: DecoderMode, + backend: DecoderBackend, iter: DynIter, current_assignment: Option>, remaining_count: u16, @@ -57,25 +90,91 @@ pub struct PyBenDecoder { #[pymethods] impl PyBenDecoder { + /// Open a decoder on a `.ben`, `.xben`, or `.bendl` file. + /// + /// The file's leading bytes are sniffed to decide whether it is a + /// bundle. When the file is a `.bendl`, the bundle's header decides + /// the BEN/XBEN format and the `mode` argument is ignored; when the + /// file is a plain stream, `mode` selects between the BEN and XBEN + /// readers and defaults to `"ben"`. + /// + /// # Arguments + /// + /// * `file_path` - Path to the input file. + /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain + /// streams; bundles use `assignment_format` from the header. #[new] #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { - let mode = DecoderMode::parse(mode)?; - let source = DecoderSource { - path: file_path, - mode, - }; - let iter = build_iter(py, &source)?; - - Ok(Self { - source, - iter, - current_assignment: None, - remaining_count: 0, - base_len: None, - len_hint: None, - }) + // Validate the mode string up front so "Unknown mode" is reported + // regardless of whether the file exists or turns out to be a bundle. + let parsed_mode = DecoderMode::parse(mode)?; + let is_bundle = detect_is_bundle(&file_path).map_err(|e| { + PyIOError::new_err(format!( + "Failed to open {}: {e}", + file_path.display() + )) + })?; + + if is_bundle { + let file = File::open(&file_path).map_err(|e| { + PyIOError::new_err(format!( + "Failed to open {}: {e}", + file_path.display() + )) + })?; + let reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + file_path.display() + )) + })?; + let fmt = reader.assignment_format().ok_or_else(|| { + PyException::new_err( + "Bundle header has an unrecognized assignment_format field.", + ) + })?; + let derived_mode = DecoderMode::from_assignment_format(fmt); + let (stream_offset, stream_len) = { + let header = reader.header(); + (header.stream_offset, header.stream_len) + }; + let state = BundleState { + reader, + stream_offset, + stream_len, + }; + + // Emit the XBEN startup warning once, up front. + if matches!(derived_mode, DecoderMode::XBen) { + warn_xben_startup(py)?; + } + + let iter = build_bundle_iter(&file_path, &state, derived_mode)?; + Ok(Self { + path: file_path, + mode: derived_mode, + backend: DecoderBackend::Bundle(state), + iter, + current_assignment: None, + remaining_count: 0, + base_len: None, + len_hint: None, + }) + } else { + let iter = build_plain_iter(py, &file_path, parsed_mode)?; + Ok(Self { + path: file_path, + mode: parsed_mode, + backend: DecoderBackend::Plain, + iter, + current_assignment: None, + remaining_count: 0, + base_len: None, + len_hint: None, + }) + } } fn __iter__(slf: PyRefMut) -> PyResult> { @@ -220,6 +319,189 @@ impl PyBenDecoder { reset_with_selection(&mut slf, sel, len_hint)?; Ok(slf.into()) } + + // --------------------------------------------------------------------- + // Bundle-inspection surface. + // + // These methods only make sense when the decoder was opened on a + // `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear + // error pointing the user at the right tool. + // --------------------------------------------------------------------- + + /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a + /// plain `.ben`/`.xben` stream (`False`). + #[pyo3(text_signature = "(self)")] + fn is_bundle(&self) -> bool { + self.backend.is_bundle() + } + + /// Return the container format of the underlying assignment stream + /// as `"ben"` or `"xben"`. + #[pyo3(text_signature = "(self)")] + fn assignment_format(&self) -> &'static str { + self.mode.as_str() + } + + /// Return the bundle's format version as a `(major, minor)` tuple. + /// Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn version(&self) -> PyResult<(u16, u16)> { + let state = self.require_bundle("version()")?; + let h = state.reader.header(); + Ok((h.major_version, h.minor_version)) + } + + /// Whether the bundle was successfully finalized. Errors on plain + /// streams. + #[pyo3(text_signature = "(self)")] + fn is_complete(&self) -> PyResult { + let state = self.require_bundle("is_complete()")?; + Ok(state.reader.is_complete()) + } + + /// Names of every entry in the bundle's directory, in directory + /// order. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn asset_names(&self) -> PyResult> { + let state = self.require_bundle("asset_names()")?; + Ok(state + .reader + .assets() + .iter() + .map(|e| e.name.clone()) + .collect()) + } + + /// Return the full bundle directory as a list of dicts with keys + /// `name`, `type`, `offset`, `len`, and `flags` (a list of string + /// tags). Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { + let state = self.require_bundle("list_assets()")?; + let entries = state.reader.assets(); + let mut out = Vec::with_capacity(entries.len()); + for entry in entries { + let d = PyDict::new(py); + d.set_item("name", &entry.name)?; + d.set_item("type", entry.asset_type)?; + d.set_item("offset", entry.payload_offset)?; + d.set_item("len", entry.payload_len)?; + let mut flags: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flags.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flags.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flags.push("checksum"); + } + d.set_item("flags", flags)?; + out.push(d); + } + Ok(out) + } + + /// Read the (decoded) bytes of a named asset as a Python `bytes` + /// object. Errors on plain streams. + #[pyo3(text_signature = "(self, name, /)")] + fn read_asset_bytes(&mut self, name: &str) -> PyResult> { + let state = self.require_bundle_mut("read_asset_bytes()")?; + let entry = state + .reader + .find_asset_by_name(name) + .cloned() + .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; + state + .reader + .asset_bytes(&entry) + .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) + } + + /// Parse a JSON asset into a Python object (dict, list, …). Errors + /// on plain streams and when the asset does not exist or is not + /// valid UTF-8 / JSON. + #[pyo3(text_signature = "(self, name, /)")] + fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { + let bytes = self.read_asset_bytes(name)?; + let json_mod = py.import("json")?; + let text = std::str::from_utf8(&bytes) + .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; + let parsed = json_mod.call_method1("loads", (text,))?; + Ok(parsed.into()) + } + + /// Read the bundle's `graph.json` asset as a parsed JSON object. + /// Returns `None` if the bundle does not carry a graph asset. Errors + /// on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_graph()")?; + if state.reader.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "graph.json")?)) + } + + /// Read the bundle's `metadata.json` asset as a parsed JSON object, + /// or `None` if absent. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_metadata()")?; + if state.reader.find_asset_by_type(ASSET_TYPE_METADATA).is_none() { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "metadata.json")?)) + } + + /// Read the bundle's `relabel_map.json` asset as a parsed JSON + /// object, or `None` if absent. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_relabel_map()")?; + if state + .reader + .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) + .is_none() + { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) + } +} + +impl PyBenDecoder { + /// Borrow the bundle state or raise a clear Python error explaining + /// that the decoder was opened on a plain stream. + fn require_bundle(&self, op: &str) -> PyResult<&BundleState> { + match &self.backend { + DecoderBackend::Bundle(state) => Ok(state), + DecoderBackend::Plain => Err(PyException::new_err(format!( + "{op} is only available on .bendl bundles; this decoder was opened \ + on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ + via PyBenEncoder with ben_file_only=False) to get bundle features.", + self.mode.as_str() + ))), + } + } + + fn require_bundle_mut(&mut self, op: &str) -> PyResult<&mut BundleState> { + match &mut self.backend { + DecoderBackend::Bundle(state) => Ok(state), + DecoderBackend::Plain => Err(PyException::new_err(format!( + "{op} is only available on .bendl bundles; this decoder was opened \ + on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ + via PyBenEncoder with ben_file_only=False) to get bundle features.", + self.mode.as_str() + ))), + } + } } fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { @@ -238,9 +520,26 @@ fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { Ok(()) } -fn build_iter(py: Python<'_>, source: &DecoderSource) -> PyResult { - let reader = open_input(&source.path)?; - match source.mode { +/// Sniff the first 8 bytes of a file and decide whether it starts with +/// the `BENDL` magic. +fn detect_is_bundle(path: &Path) -> io::Result { + let mut file = File::open(path)?; + let mut magic = [0u8; 8]; + match file.read_exact(&mut magic) { + Ok(()) => Ok(magic == BENDL_MAGIC), + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => Ok(false), + Err(e) => Err(e), + } +} + +/// Build a plain-stream iterator from `path` using `mode`. +fn build_plain_iter( + py: Python<'_>, + path: &Path, + mode: DecoderMode, +) -> PyResult { + let reader = open_input(&path.to_path_buf())?; + match mode { DecoderMode::Ben => { let ben = AssignmentReader::new(reader) .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; @@ -255,13 +554,66 @@ fn build_iter(py: Python<'_>, source: &DecoderSource) -> PyResult { } } -fn build_frames(source: &DecoderSource) -> PyResult { - build_frame_iter(&source.path, source.mode.as_str()).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - source.path.display() - )) - }) +/// Open a second file handle on the bundle path, seek to the stream +/// region, and wrap it in the appropriate assignment reader so the +/// decoder iterator only walks the embedded stream. +fn build_bundle_iter( + path: &Path, + state: &BundleState, + mode: DecoderMode, +) -> PyResult { + let reader = open_bundle_stream_reader(path, state)?; + match mode { + DecoderMode::Ben => { + let ben = AssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; + Ok(Box::new(ben)) + } + DecoderMode::XBen => { + let xben = XZAssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; + Ok(Box::new(xben)) + } + } +} + +/// Create a `Read`-only handle bounded to the bundle's assignment stream +/// region. +fn open_bundle_stream_reader( + path: &Path, + state: &BundleState, +) -> PyResult>> { + let file = File::open(path) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", path.display())))?; + let mut buf = BufReader::new(file); + buf.seek(SeekFrom::Start(state.stream_offset)).map_err(|e| { + PyIOError::new_err(format!("Failed to seek into bundle stream: {e}")) + })?; + Ok(buf.take(state.stream_len)) +} + +fn build_frames_for_subsample( + path: &Path, + mode: DecoderMode, + backend: &DecoderBackend, +) -> PyResult { + match backend { + DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + path.display() + )) + }), + DecoderBackend::Bundle(state) => { + let reader = open_bundle_stream_reader(path, state)?; + build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from bundle {}: {e}", + path.display() + )) + }) + } + } } fn reset_with_selection( @@ -269,7 +621,7 @@ fn reset_with_selection( selection: Selection, len_hint: usize, ) -> PyResult<()> { - let frames = build_frames(&decoder.source)?; + let frames = build_frames_for_subsample(&decoder.path, decoder.mode, &decoder.backend)?; let frame_decoder = SubsampleFrameDecoder::new(frames, selection); decoder.iter = Box::new(frame_decoder); decoder.current_assignment = None; @@ -283,20 +635,55 @@ fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { + let path = decoder.path.clone(); + let mode = decoder.mode.as_str().to_string(); + py.detach(|| count_samples_from_file(&path, &mode)) + .map_err(|e| { + PyException::new_err(format!( + "Failed to count samples in {}: {e}", + path.display() + )) + })? + } + DecoderBackend::Bundle(state) => { + // Prefer the authoritative sample_count carried in the + // bundle header, which is set for finalized bundles and is + // O(1). Fall back to scanning the stream region when the + // header has no count (unfinalized append target, or a + // header byte we cannot interpret). + if let Some(n) = state.reader.sample_count() { + if n >= 0 { + n as usize + } else { + scan_bundle_samples(&decoder.path, state, decoder.mode)? + } + } else { + scan_bundle_samples(&decoder.path, state, decoder.mode)? + } + } + }; decoder.base_len = Some(base_len); Ok(base_len) } +fn scan_bundle_samples( + path: &Path, + state: &BundleState, + mode: DecoderMode, +) -> PyResult { + let reader = open_bundle_stream_reader(path, state)?; + let iter = build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to open bundle stream for sample count: {e}" + )) + })?; + count_samples_from_frame_iter(iter).map_err(|e| { + PyException::new_err(format!("Failed to count samples in bundle: {e}")) + }) +} + #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 3f6d3e3..540e533 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -89,7 +89,9 @@ impl PyBenEncoder { graph = None, ben_file_only = false, ))] - #[pyo3(text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)")] + #[pyo3( + text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)" + )] fn new( py: Python<'_>, file_path: PathBuf, @@ -126,9 +128,9 @@ impl PyBenEncoder { let mut slot = file.borrow_mut(); slot.seek(SeekFrom::Start(0)) .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; - header - .write_to(&mut *slot) - .map_err(|e| PyIOError::new_err(format!("Failed to write bundle header: {e}")))?; + header.write_to(&mut *slot).map_err(|e| { + PyIOError::new_err(format!("Failed to write bundle header: {e}")) + })?; if let Some(bytes) = graph_bytes { let compressed = xz_compress(&bytes).map_err(|e| { @@ -151,9 +153,10 @@ impl PyBenEncoder { } } - let stream_start = file.borrow_mut().stream_position().map_err(|e| { - PyIOError::new_err(format!("Failed to query output position: {e}")) - })?; + let stream_start = file + .borrow_mut() + .stream_position() + .map_err(|e| PyIOError::new_err(format!("Failed to query output position: {e}")))?; header.stream_offset = stream_start; OutputMode::Bundle { @@ -181,9 +184,10 @@ impl PyBenEncoder { #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { - let enc = self.encoder.as_mut().ok_or_else(|| { - PyIOError::new_err("Encoder has already been closed.") - })?; + let enc = self + .encoder + .as_mut() + .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; enc.write_assignment(assignment) .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {e}")))?; if let OutputMode::Bundle { sample_count, .. } = &mut self.mode { diff --git a/pyben/tests/test_bundle.py b/pyben/tests/test_bundle.py index 5f0115e..3a6b374 100644 --- a/pyben/tests/test_bundle.py +++ b/pyben/tests/test_bundle.py @@ -642,6 +642,55 @@ def test_open_rejects_bundle_with_chopped_directory_bytes(tmp_path: Path) -> Non PyBundleReader(path) +def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: + stream = _ben_bytes_for([[1, 2]], tmp_path) + + duplicate_names = build_bundle( + stream_bytes=stream, + sample_count=1, + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="dup.bin", payload=b"a"), + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="dup.bin", payload=b"b"), + ], + ) + path = _write_bundle(tmp_path / "dup.bendl", duplicate_names) + with pytest.raises(Exception, match="malformed directory"): + PyBundleReader(path) + + wrong_singleton_name = build_bundle( + stream_bytes=stream, + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="not_metadata.json", + payload=b"{}", + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "singleton.bendl", wrong_singleton_name) + with pytest.raises(Exception, match="malformed directory"): + PyBundleReader(path) + + +def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) -> None: + bundle = bytearray( + build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="x", payload=b"abc")], + ) + ) + directory_len = struct.unpack_from(" None: # Provisional bundle with complete=0: sample_count() must be None. stream = _ben_bytes_for([[1, 2, 3]], tmp_path) @@ -1373,3 +1422,268 @@ def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: out = tmp_path / "bad.bendl" with pytest.raises(ValueError, match="graph must be"): PyBenEncoder(out, overwrite=True, variant="standard", graph=12345) + + +# --------------------------------------------------------------------------- +# PyBenDecoder opened directly on a .bendl bundle. +# +# The decoder auto-detects the BENDL magic and, when present, iterates only +# the embedded stream region while exposing TOC / asset helpers on the side. +# When opened on a plain .ben/.xben stream, iteration still works but the +# bundle methods must raise a clear error. +# --------------------------------------------------------------------------- + + +def test_pybendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: + samples = [[1, 2, 3], [1, 2, 3], [4, 4, 5]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_BEN, + ) + path = _write_bundle(tmp_path / "stream.bendl", bundle) + + dec = PyBenDecoder(path) + assert dec.is_bundle() is True + assert dec.assignment_format() == "ben" + assert dec.is_complete() is True + assert dec.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) + assert len(dec) == len(samples) + assert list(dec) == samples + + +def test_pybendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: + samples = [[1, 1, 2, 2], [3, 3, 4, 4]] + bundle = build_bundle( + stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_XBEN, + ) + path = _write_bundle(tmp_path / "stream.bendl", bundle) + + dec = PyBenDecoder(path) + assert dec.is_bundle() is True + assert dec.assignment_format() == "xben" + assert len(dec) == len(samples) + assert list(dec) == samples + + +def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: + samples = [[1, 2, 3]] + graph_json = b'{"nodes":[0,1],"edges":[[0,1]]}' + metadata_json = b'{"note":"hello"}' + relabel_json = b'{"0":"A","1":"B"}' + + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=metadata_json, + is_json=True, + ), + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=graph_json, + is_json=True, + compress=True, + ), + _Asset( + asset_type=ASSET_TYPE_RELABEL_MAP, + name="relabel_map.json", + payload=relabel_json, + is_json=True, + ), + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="notes.bin", + payload=b"\x00\x01\x02", + ), + ], + ) + path = _write_bundle(tmp_path / "rich.bendl", bundle) + + dec = PyBenDecoder(path) + + # TOC surface + assert dec.asset_names() == [ + "metadata.json", + "graph.json", + "relabel_map.json", + "notes.bin", + ] + assets = dec.list_assets() + assert [a["name"] for a in assets] == dec.asset_names() + by_name = {a["name"]: a for a in assets} + assert "xz" in by_name["graph.json"]["flags"] + assert "json" in by_name["graph.json"]["flags"] + assert by_name["notes.bin"]["flags"] == [] + + # Raw and JSON asset access + assert dec.read_asset_bytes("metadata.json") == metadata_json + assert dec.read_asset_bytes("graph.json") == graph_json + assert dec.read_metadata() == json.loads(metadata_json) + assert dec.read_graph() == json.loads(graph_json) + assert dec.read_relabel_map() == json.loads(relabel_json) + assert dec.read_json_asset("metadata.json") == json.loads(metadata_json) + + # Unknown asset by name raises KeyError. + with pytest.raises(KeyError, match="no asset named"): + dec.read_asset_bytes("missing.bin") + + # Iteration still works after the TOC surface has been used. + assert list(dec) == samples + + +def test_pybendecoder_bundle_canonical_helpers_return_none_when_absent( + tmp_path: Path, +) -> None: + samples = [[1, 2]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="custom.bin", payload=b"x") + ], + ) + path = _write_bundle(tmp_path / "sparse.bendl", bundle) + dec = PyBenDecoder(path) + assert dec.read_graph() is None + assert dec.read_metadata() is None + assert dec.read_relabel_map() is None + + +def test_pybendecoder_bundle_subsample_range(tmp_path: Path) -> None: + samples = [[i, i + 1] for i in range(1, 11)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "range.bendl", bundle) + + dec = PyBenDecoder(path) + dec.subsample_range(3, 6) + assert list(dec) == samples[2:6] + + +def test_pybendecoder_bundle_subsample_indices(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 9)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "idx.bendl", bundle) + + dec = PyBenDecoder(path) + dec.subsample_indices([1, 4, 8]) + assert list(dec) == [samples[0], samples[3], samples[7]] + + +def test_pybendecoder_bundle_subsample_every(tmp_path: Path) -> None: + samples = [[i, i] for i in range(1, 11)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "every.bendl", bundle) + + dec = PyBenDecoder(path) + dec.subsample_every(3, 2) + assert list(dec) == [samples[1], samples[4], samples[7]] + + +def test_pybendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: + # For bundles, the header decides the format — a caller-supplied + # `mode="xben"` on a BEN bundle must not confuse the reader. + samples = [[1, 2, 3]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_BEN, + ) + path = _write_bundle(tmp_path / "ignore_mode.bendl", bundle) + + dec = PyBenDecoder(path, mode="xben") + assert dec.assignment_format() == "ben" + assert list(dec) == samples + + +def test_pybendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None: + # Opening a plain .ben file must still iterate unchanged; the new + # bundle surface is simply unavailable. + samples = [[1, 2, 3], [4, 5, 6]] + ben_path = tmp_path / "plain.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(ben_path) + assert dec.is_bundle() is False + assert dec.assignment_format() == "ben" + assert list(dec) == samples + + +@pytest.mark.parametrize( + "method_call", + [ + lambda d: d.version(), + lambda d: d.is_complete(), + lambda d: d.asset_names(), + lambda d: d.list_assets(), + lambda d: d.read_asset_bytes("metadata.json"), + lambda d: d.read_json_asset("metadata.json"), + lambda d: d.read_graph(), + lambda d: d.read_metadata(), + lambda d: d.read_relabel_map(), + ], +) +def test_pybendecoder_plain_stream_rejects_bundle_methods( + tmp_path: Path, method_call +) -> None: + ben_path = tmp_path / "plain.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + enc.write([1, 2, 3]) + + dec = PyBenDecoder(ben_path) + with pytest.raises(Exception, match="only available on .bendl bundles"): + method_call(dec) + + +def test_pybendecoder_plain_stream_error_mentions_ben_file_only( + tmp_path: Path, +) -> None: + ben_path = tmp_path / "plain.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + enc.write([1]) + + dec = PyBenDecoder(ben_path) + with pytest.raises(Exception, match="ben_file_only=False"): + dec.read_graph() + + +def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> None: + # End-to-end: a bundle written by PyBenEncoder (with a graph asset) + # must round-trip through a single PyBenDecoder call — no need to + # extract the stream first. + out = tmp_path / "e2e.bendl" + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH + ) as enc: + for a in [[1, 2, 3], [2, 3, 4]]: + enc.write(a) + + dec = PyBenDecoder(out) + assert dec.is_bundle() is True + assert dec.is_complete() is True + assert dec.assignment_format() == "ben" + assert dec.read_graph() == SAMPLE_GRAPH + assert list(dec) == [[1, 2, 3], [2, 3, 4]] diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 84ba92b..68b98e6 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -443,6 +443,15 @@ def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: ctx_enc.write([4, 5, 6]) assert list(PyBenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] + invalid_path = tmp_path / "invalid_assignment.ben" + with PyBenEncoder( + invalid_path, overwrite=True, variant="standard", ben_file_only=True + ) as invalid_enc: + with pytest.raises(Exception): + invalid_enc.write([-1]) + with pytest.raises(Exception): + invalid_enc.write([65536]) + def test_pybenencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> None: out = tmp_path / "out.ben" @@ -628,12 +637,18 @@ def test_decoder_subsample_validations_and_warning_paths(tmp_path: Path) -> None with pytest.raises(Exception, match="indices must be 1-based"): PyBenDecoder(ben, mode="ben").subsample_indices([0, 1]) + with pytest.raises(Exception): + PyBenDecoder(ben, mode="ben").subsample_indices([-1]) + with pytest.raises(Exception, match="indices must be <="): PyBenDecoder(ben, mode="ben").subsample_indices([6]) with pytest.raises(Exception, match="range must be 1-based"): PyBenDecoder(ben, mode="ben").subsample_range(0, 2) + with pytest.raises(Exception): + PyBenDecoder(ben, mode="ben").subsample_range(-1, 2) + with pytest.raises(Exception, match="end must be <="): PyBenDecoder(ben, mode="ben").subsample_range(1, 99) @@ -670,7 +685,7 @@ def test_decoder_reports_zero_count_and_bad_frame_errors(tmp_path: Path) -> None data = bytearray(mkv_ben.read_bytes()) data[-2:] = b"\x00\x00" mkv_ben.write_bytes(data) - with pytest.raises(Exception, match="zero-count"): + with pytest.raises(Exception, match="count must be greater than zero"): next(iter(PyBenDecoder(mkv_ben, mode="ben"))) standard_ben = tmp_path / "standard.ben" From cdfd7d80861166e6006031a39dc0f62d8268e78a Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 11 Apr 2026 11:10:52 -0600 Subject: [PATCH 067/221] fix warnings --- ben/src/io/reader/mod.rs | 3 ++- ben/src/json/graph/petxgraph/mod.rs | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs index d7d40b5..25c70cc 100644 --- a/ben/src/io/reader/mod.rs +++ b/ben/src/io/reader/mod.rs @@ -1,7 +1,8 @@ pub mod assignment_reader; pub mod errors; pub mod subsample; -pub(crate) mod tests; +#[cfg(test)] +mod tests; pub(crate) mod twodelta; pub mod xz_assignment_reader; diff --git a/ben/src/json/graph/petxgraph/mod.rs b/ben/src/json/graph/petxgraph/mod.rs index 06437b6..c78a99c 100644 --- a/ben/src/json/graph/petxgraph/mod.rs +++ b/ben/src/json/graph/petxgraph/mod.rs @@ -3,7 +3,10 @@ mod permutation; mod sort; use super::nx_formats::NxAdjEntry; -use petgraph::graph::{DiGraph, Graph, UnGraph}; +use petgraph::graph::Graph; +#[cfg(test)] +use petgraph::graph::{DiGraph, UnGraph}; +#[cfg(test)] use petgraph::{Directed, Undirected}; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -40,12 +43,16 @@ where } /// Convenience alias for a directed [`PetxGraph`]. +#[cfg(test)] pub(crate) type PetxDiGraph = PetxGraph; /// Convenience alias for an undirected [`PetxGraph`]. +#[cfg(test)] pub(crate) type PetxUnGraph = PetxGraph; /// Convenience alias for the inner directed petgraph type. +#[cfg(test)] pub(crate) type PetxDiInnerGraph = DiGraph; /// Convenience alias for the inner undirected petgraph type. +#[cfg(test)] pub(crate) type PetxUnInnerGraph = UnGraph; pub(in crate::json::graph) use permutation::apply_permutation; From 42e191ab688106c40653cca4991866745c17b7ca Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 11 Apr 2026 11:50:46 -0600 Subject: [PATCH 068/221] allow multiple iterator passes --- pyben/binary_ensemble/_core.pyi | 12 +- pyben/src/decode/mod.rs | 79 +++++- pyben/tests/test_bundle.py | 458 ++++++++++++++++++++++++++++++++ 3 files changed, 540 insertions(+), 9 deletions(-) diff --git a/pyben/binary_ensemble/_core.pyi b/pyben/binary_ensemble/_core.pyi index 7c14ae3..2d2287d 100644 --- a/pyben/binary_ensemble/_core.pyi +++ b/pyben/binary_ensemble/_core.pyi @@ -34,7 +34,17 @@ class PyBenDecoder: def __init__( self, file_path: str | Path, mode: Literal["ben", "xben"] = "ben" ) -> None: ... - def __iter__(self) -> Iterator[list[int]]: ... + def __iter__(self) -> Iterator[list[int]]: + """Return an iterator over the samples, restarting from the start. + + Each call to :func:`iter` (including the implicit call made by + ``for x in dec:``) rebuilds the underlying frame walker and, if a + subsample selection has been installed via :meth:`subsample_indices`, + :meth:`subsample_range`, or :meth:`subsample_every`, reapplies it. + Iteration can therefore be performed multiple times on the same + decoder. + """ + ... def __next__(self) -> list[int]: ... def __len__(self) -> int: """Return the number of samples. diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index cfbfeae..20ee689 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -76,6 +76,37 @@ impl DecoderBackend { } } +/// Stored form of the most recently installed subsampling selection. +/// +/// The iterator is single-pass, so to support restarting iteration +/// (e.g. `for x in dec: ... ; for x in dec: ...`) the decoder remembers +/// the active selection and rebuilds a fresh frame decoder on every +/// call to `__iter__`. +#[derive(Clone)] +enum ActiveSelection { + None, + Indices(Vec), + Range { start: usize, end: usize }, + Every { step: usize, offset: usize }, +} + +impl ActiveSelection { + fn to_selection(&self) -> Option { + match self { + Self::None => None, + Self::Indices(v) => Some(Selection::Indices(v.clone().into_iter().peekable())), + Self::Range { start, end } => Some(Selection::Range { + start: *start, + end: *end, + }), + Self::Every { step, offset } => Some(Selection::Every { + step: *step, + offset: *offset, + }), + } + } +} + #[pyclass(module = "binary_ensemble", unsendable)] pub struct PyBenDecoder { path: PathBuf, @@ -86,6 +117,7 @@ pub struct PyBenDecoder { remaining_count: u16, base_len: Option, len_hint: Option, + active_selection: ActiveSelection, } #[pymethods] @@ -161,9 +193,13 @@ impl PyBenDecoder { remaining_count: 0, base_len: None, len_hint: None, + active_selection: ActiveSelection::None, }) } else { - let iter = build_plain_iter(py, &file_path, parsed_mode)?; + if matches!(parsed_mode, DecoderMode::XBen) { + warn_xben_startup(py)?; + } + let iter = build_plain_iter(&file_path, parsed_mode)?; Ok(Self { path: file_path, mode: parsed_mode, @@ -173,11 +209,40 @@ impl PyBenDecoder { remaining_count: 0, base_len: None, len_hint: None, + active_selection: ActiveSelection::None, }) } } - fn __iter__(slf: PyRefMut) -> PyResult> { + /// Return `self` as an iterator, rebuilding the underlying frame + /// walker so iteration can be restarted. + /// + /// Calling `iter(dec)` (or using `for x in dec: …`) more than once + /// is supported: each call reopens the stream region from the start + /// and, if a subsample selection is active, reapplies it. + fn __iter__(mut slf: PyRefMut) -> PyResult> { + slf.current_assignment = None; + slf.remaining_count = 0; + + let path = slf.path.clone(); + let mode = slf.mode; + let selection = slf.active_selection.clone(); + + let new_iter: DynIter = match selection { + ActiveSelection::None => match &slf.backend { + DecoderBackend::Plain => build_plain_iter(&path, mode)?, + DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, + }, + sel => { + let frames = build_frames_for_subsample(&path, mode, &slf.backend)?; + let ben_sel = sel + .to_selection() + .expect("active subsample selection must be convertible"); + Box::new(SubsampleFrameDecoder::new(frames, ben_sel)) + } + }; + + slf.iter = new_iter; Ok(slf.into()) } @@ -266,6 +331,7 @@ impl PyBenDecoder { } let len_hint = indices.len(); + slf.active_selection = ActiveSelection::Indices(indices.clone()); let sel = Selection::Indices(indices.into_iter().peekable()); reset_with_selection(&mut slf, sel, len_hint)?; Ok(slf.into()) @@ -291,6 +357,7 @@ impl PyBenDecoder { ))); } + slf.active_selection = ActiveSelection::Range { start, end }; let sel = Selection::Range { start, end }; let len_hint = end - start + 1; reset_with_selection(&mut slf, sel, len_hint)?; @@ -314,6 +381,7 @@ impl PyBenDecoder { base_len ))); } + slf.active_selection = ActiveSelection::Every { step, offset }; let sel = Selection::Every { step, offset }; let len_hint = (base_len + step - 1 - (offset - 1)) / step; reset_with_selection(&mut slf, sel, len_hint)?; @@ -533,11 +601,7 @@ fn detect_is_bundle(path: &Path) -> io::Result { } /// Build a plain-stream iterator from `path` using `mode`. -fn build_plain_iter( - py: Python<'_>, - path: &Path, - mode: DecoderMode, -) -> PyResult { +fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { let reader = open_input(&path.to_path_buf())?; match mode { DecoderMode::Ben => { @@ -546,7 +610,6 @@ fn build_plain_iter( Ok(Box::new(ben)) } DecoderMode::XBen => { - warn_xben_startup(py)?; let xben = XZAssignmentReader::new(reader) .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; Ok(Box::new(xben)) diff --git a/pyben/tests/test_bundle.py b/pyben/tests/test_bundle.py index 3a6b374..2ead064 100644 --- a/pyben/tests/test_bundle.py +++ b/pyben/tests/test_bundle.py @@ -1687,3 +1687,461 @@ def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> N assert dec.assignment_format() == "ben" assert dec.read_graph() == SAMPLE_GRAPH assert list(dec) == [[1, 2, 3], [2, 3, 4]] + + +def test_pybendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: + # An incomplete bundle has complete=0 and no directory — its header + # carries no authoritative sample_count, so __len__ must fall back + # to scanning the stream region. This exercises the + # `scan_bundle_samples` path in the decoder. + samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + stream = _ben_bytes_for(samples, tmp_path) + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=len(stream), + sample_count=-1, + ) + path = tmp_path / "incomplete.bendl" + path.write_bytes(header + stream) + + dec = PyBenDecoder(path) + assert dec.is_bundle() is True + assert dec.is_complete() is False + # len() forces the fallback scan, which must agree with the data. + assert len(dec) == len(samples) + # A second call uses the cached value and still returns the same. + assert len(dec) == len(samples) + # The iterator itself still works. + assert list(dec) == samples + + +def test_pybendecoder_incomplete_bundle_count_samples_matches_len( + tmp_path: Path, +) -> None: + # Explicit count_samples() also flows through scan_bundle_samples + # for incomplete bundles. + samples = [[i, i + 1] for i in range(1, 6)] + stream = _ben_bytes_for(samples, tmp_path) + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=len(stream), + sample_count=-1, + ) + path = tmp_path / "incomplete_count.bendl" + path.write_bytes(header + stream) + + dec = PyBenDecoder(path) + assert dec.count_samples() == len(samples) + assert len(dec) == len(samples) + + +def test_pybendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: + # A finalized bundle whose assignment_format byte is neither BEN + # nor XBEN must surface a clear error at decoder construction, not + # silently fall through. + samples = [[1, 2, 3]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assignment_format=99, + ) + path = _write_bundle(tmp_path / "weird_fmt.bendl", bundle) + with pytest.raises(Exception, match="unrecognized assignment_format"): + PyBenDecoder(path) + + +def test_pybendecoder_empty_stream_bundle(tmp_path: Path) -> None: + # A bundle containing a valid BEN banner but zero frames must be + # openable and produce an empty iterator / zero-length decoder. + bundle = build_bundle(stream_bytes=_ben_bytes_for([], tmp_path), sample_count=0) + path = _write_bundle(tmp_path / "empty.bendl", bundle) + + dec = PyBenDecoder(path) + assert dec.is_bundle() is True + assert len(dec) == 0 + assert dec.count_samples() == 0 + assert list(dec) == [] + assert dec.asset_names() == [] + assert dec.list_assets() == [] + + +def test_pybendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> None: + # Calling TOC / asset methods in between __next__ calls must not + # break the iterator — the TOC access uses a separate BendlReader, + # not the file handle backing the iterator. + samples = [[1, 2], [3, 4], [5, 6]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b'{"tag":42}', + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "interleave.bendl", bundle) + + dec = PyBenDecoder(path) + it = iter(dec) + + assert next(it) == samples[0] + # TOC read between samples + assert dec.read_metadata() == {"tag": 42} + assert dec.asset_names() == ["metadata.json"] + assert next(it) == samples[1] + # And another TOC read + assert dec.read_asset_bytes("metadata.json") == b'{"tag":42}' + assert next(it) == samples[2] + with pytest.raises(StopIteration): + next(it) + + +def test_pybendecoder_bundle_subsample_range_rejects_out_of_bounds( + tmp_path: Path, +) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "range_bad.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.raises(Exception, match="end must be <= number of samples"): + dec.subsample_range(1, 99) + with pytest.raises(Exception, match="1-based"): + dec.subsample_range(0, 1) + + +def test_pybendecoder_bundle_subsample_indices_rejects_out_of_bounds( + tmp_path: Path, +) -> None: + samples = [[1, 2], [3, 4]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "idx_bad.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.raises(Exception, match="number of samples"): + dec.subsample_indices([1, 42]) + # Empty index list is also rejected. + dec2 = PyBenDecoder(path) + with pytest.raises(Exception, match="must not be empty"): + dec2.subsample_indices([]) + + +def test_pybendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> None: + samples = [[1], [2], [3]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "every_bad.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.raises(Exception, match="offset must be <= number of samples"): + dec.subsample_every(1, 99) + dec2 = PyBenDecoder(path) + with pytest.raises(Exception, match="step and offset must be >= 1"): + dec2.subsample_every(0, 1) + + +def test_pybendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: + # __len__ caches the scan result; calling it twice must not re-scan + # but must return the same answer. + samples = [[1, 2], [3, 4], [5, 6]] + ben_path = tmp_path / "cached.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + for a in samples: + enc.write(a) + dec = PyBenDecoder(ben_path) + assert len(dec) == len(samples) + assert len(dec) == len(samples) + # Explicit count_samples must also agree. + assert dec.count_samples() == len(samples) + + +def test_pybendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: + # A 4-byte file cannot start with the BENDL magic; detect_is_bundle + # must return false on UnexpectedEof, after which plain-stream + # decoding fails with a banner error. + path = tmp_path / "tiny.ben" + path.write_bytes(b"abcd") + with pytest.raises(Exception): + PyBenDecoder(path) + + +def test_pybendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: + path = tmp_path / "empty.ben" + path.write_bytes(b"") + with pytest.raises(Exception): + PyBenDecoder(path) + + +def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: + # read_json_asset on the decoder should mirror PyBundleReader's + # error behavior when an asset isn't valid UTF-8. + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="binary.bin", + payload=b"\xff\xfe\xfd", + ) + ], + ) + path = _write_bundle(tmp_path / "bad_utf8.bendl", bundle) + dec = PyBenDecoder(path) + # Raw bytes are fine. + assert dec.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" + with pytest.raises(Exception, match="not valid UTF-8"): + dec.read_json_asset("binary.bin") + + +def test_pybendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> None: + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1]], tmp_path), + sample_count=1, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b"not json {", + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "bad_json.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.raises(json.JSONDecodeError): + dec.read_metadata() + + +def test_pybendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> None: + # A bundle built with PyBenEncoder compresses the graph asset as xz; + # read_graph() on PyBenDecoder must still return the decoded JSON. + out = tmp_path / "xz_graph.bendl" + with PyBenEncoder( + out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH + ) as enc: + enc.write([1, 2, 3]) + dec = PyBenDecoder(out) + # Spot-check that graph.json was actually stored compressed. + by_name = {a["name"]: a for a in dec.list_assets()} + assert "xz" in by_name["graph.json"]["flags"] + assert dec.read_graph() == SAMPLE_GRAPH + + +def test_pybendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: + # XBEN bundles with TOC entries were not previously covered — only + # the plain XBEN-bundle auto-detect case. Verify iteration AND TOC + # access both work on an XBEN bundle. + samples = [[1, 1, 2, 2], [2, 2, 1, 1], [3, 3, 3, 3]] + meta = b'{"variant":"mkv_chain"}' + bundle = build_bundle( + stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), + sample_count=len(samples), + assignment_format=ASSIGNMENT_FORMAT_XBEN, + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=meta, + is_json=True, + ) + ], + ) + path = _write_bundle(tmp_path / "xben_assets.bendl", bundle) + + dec = PyBenDecoder(path) + assert dec.assignment_format() == "xben" + assert dec.asset_names() == ["metadata.json"] + assert dec.read_metadata() == {"variant": "mkv_chain"} + assert list(dec) == samples + + +def test_pybendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> None: + # The subsample_indices path that sorts+dedupes unsorted input also + # has to work for bundles. Mixing in duplicates should still yield + # the deduplicated selection. + samples = [[i] for i in range(1, 6)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "unsorted.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.warns(UserWarning, match="sorted and unique"): + dec.subsample_indices([4, 1, 4, 1]) + assert list(dec) == [[1], [4]] + + +def test_pybendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: + # `assignment_format()` must report "xben" when opened on a plain + # XBEN stream as well, not only on bundles. + samples = [[1, 1, 2, 2], [2, 2, 1, 1]] + src = tmp_path / "src.jsonl" + _write_jsonl(samples, src) + xben_path = tmp_path / "plain.xben" + compress_jsonl_to_xben( + src, xben_path, overwrite=True, variant="standard", n_threads=1, compression_level=1 + ) + with pytest.warns(UserWarning): + dec = PyBenDecoder(xben_path, mode="xben") + assert dec.is_bundle() is False + assert dec.assignment_format() == "xben" + assert list(dec) == samples + + +def test_pybendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( + tmp_path: Path, +) -> None: + # An incomplete bundle has no directory, so there are no assets to + # list — asset-free surface still returns empty structures, which is + # the contract for finalized asset-free bundles too. Just verify it + # doesn't crash. + samples = [[1, 2]] + stream = _ben_bytes_for(samples, tmp_path) + header = _pack_header( + complete=COMPLETE_NO, + assignment_format=ASSIGNMENT_FORMAT_BEN, + directory_offset=0, + directory_len=0, + stream_offset=HEADER_SIZE, + stream_len=len(stream), + sample_count=-1, + ) + path = tmp_path / "incomplete_toc.bendl" + path.write_bytes(header + stream) + + dec = PyBenDecoder(path) + assert dec.is_bundle() is True + assert dec.is_complete() is False + assert dec.asset_names() == [] + assert dec.list_assets() == [] + assert dec.read_graph() is None + assert dec.read_metadata() is None + assert dec.read_relabel_map() is None + + +def test_pybendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: + # `__iter__` rebuilds the underlying frame walker so `for x in dec:` + # can be used more than once against a bundle. + samples = [[1, 2], [3, 4], [5, 6]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "twice.bendl", bundle) + dec = PyBenDecoder(path) + assert list(dec) == samples + # A second pass reopens the stream region from the start. + assert list(dec) == samples + + +def test_pybendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None: + samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + ben_path = tmp_path / "twice.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + for a in samples: + enc.write(a) + dec = PyBenDecoder(ben_path) + assert list(dec) == samples + assert list(dec) == samples + + +def test_pybendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> None: + # Subsample selections must persist across `__iter__` calls, so + # iterating the same (subsampled) decoder twice gives the same + # filtered window each time. + samples = [[i, i + 1] for i in range(1, 11)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "range_twice.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_range(3, 6) + expected = samples[2:6] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 8)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "idx_twice.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_indices([2, 5, 7]) + expected = [samples[1], samples[4], samples[6]] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 11)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "every_twice.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_every(3, 2) + expected = [samples[1], samples[4], samples[7]] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> None: + # Calling subsample_* a second time must replace the first selection + # AND survive reiteration with the new selection. + samples = [[i] for i in range(1, 8)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "reselect.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_range(1, 3) + assert list(dec) == samples[:3] + dec.subsample_indices([4, 7]) + expected = [samples[3], samples[6]] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: + # Consuming part of the iterator and then calling `iter()` / `list()` + # again must restart cleanly from the first sample, not resume + # mid-stream. + samples = [[1, 2], [3, 4], [5, 6], [7, 8]] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "partial.bendl", bundle) + dec = PyBenDecoder(path) + it = iter(dec) + assert next(it) == samples[0] + assert next(it) == samples[1] + # Any new pass (list / for / iter) rebuilds and starts over. + assert list(dec) == samples From cbcad0d1750b82bf3187f5335d829fd2b68304e7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 13 Apr 2026 08:29:16 -0600 Subject: [PATCH 069/221] Lots more tests and remove PyBundleReader --- ben/src/io/reader/tests.rs | 390 +++++++++++++ ben/src/io/writer/tests.rs | 331 +++++++++++ ben/src/json/graph/tests/test_algos.rs | 122 ++++ ben/src/json/graph/tests/test_io.rs | 107 ++++ ben/src/ops/relabel/tests.rs | 370 ++++++++++++ pyben/binary_ensemble/__init__.py | 2 - pyben/binary_ensemble/_core.pyi | 17 +- pyben/docs/user/using_pyben.ipynb | 48 +- pyben/src/bundle/mod.rs | 226 -------- pyben/src/decode/mod.rs | 65 ++- pyben/src/lib.rs | 5 +- pyben/tests/test_bundle.py | 265 ++++++--- pyben/tests/test_python_pipelines.py | 761 +++++++++++++++++++++++++ 13 files changed, 2366 insertions(+), 343 deletions(-) delete mode 100644 pyben/src/bundle/mod.rs diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 0e06da3..df5ee88 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -580,6 +580,396 @@ fn xz_reader_large_assignment_roundtrip() { assert_eq!(results[0], big_assign); } +// ── build_frame_iter_from_reader unknown mode ───────────────────────── + +#[test] +fn build_frame_iter_from_reader_unknown_mode_errors() { + use crate::io::reader::subsample::build_frame_iter_from_reader; + let data = Cursor::new(b"dummy data for unknown mode test".to_vec()); + let result = build_frame_iter_from_reader(data, "bogus"); + assert!(result.is_err()); + let err = result.err().unwrap(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("bogus")); +} + +// ── SubsampleFrameDecoder stress tests ──────────────────────────────── + +#[test] +fn subsample_every_start_beyond_hi_returns_zero() { + let assignments = vec![vec![1u16, 2, 3], vec![4, 5, 6]]; + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_every(1, 100) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 0); +} + +#[test] +fn subsample_range_non_overlapping_returns_empty() { + let assignments = vec![vec![1u16, 2], vec![3, 4], vec![5, 6]]; + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_range(10, 20) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 0); +} + +#[test] +fn subsample_indices_mixed_before_and_after() { + let assignments: Vec> = (1..=5).map(|i| vec![i; 3]).collect(); + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_indices(vec![2, 4, 100]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![2, 2, 2]); + assert_eq!(results[1], vec![4, 4, 4]); +} + +#[test] +fn subsample_every_step_larger_than_stream() { + let assignments = vec![vec![1u16, 2], vec![3, 4]]; + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_every(100, 1) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], vec![1, 2]); +} + +#[test] +fn subsample_indices_empty_yields_nothing() { + let assignments = vec![vec![1u16, 2]]; + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_indices(Vec::::new()) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 0); +} + +#[test] +fn subsample_twodelta_by_range() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 2, 2], + vec![2, 2, 2, 2], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_range(2, 3) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![2, 1, 2, 2]); + assert_eq!(results[1], vec![2, 2, 2, 2]); +} + +#[test] +fn subsample_twodelta_every() { + let assignments = vec![ + vec![1u16, 1, 2, 2], + vec![2, 1, 1, 2], + vec![1, 2, 1, 2], + vec![2, 1, 2, 1], + ]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_every(2, 1) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 2); + assert_eq!(results[0], vec![1, 1, 2, 2]); + assert_eq!(results[1], vec![1, 2, 1, 2]); +} + +// ── XBEN TwoDelta writer stress tests (roundtrip via reader) ────────── + +#[test] +fn xz_twodelta_many_identical_assignments_roundtrip() { + let assign = vec![1u16, 2, 1, 2]; + let assignments: Vec<_> = (0..100).map(|_| assign.clone()).collect(); + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total_samples: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total_samples, 100); + for (a, _) in &results { + assert_eq!(a, &assign); + } +} + +#[test] +fn xz_twodelta_all_identical_single_value_roundtrip() { + let assign = vec![5u16; 10]; + let assignments: Vec<_> = (0..10).map(|_| assign.clone()).collect(); + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 10); + for (a, _) in &results { + assert_eq!(a, &assign); + } +} + +#[test] +fn xz_twodelta_alternating_assignments_roundtrip() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..50).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results.len(), 50); + for (i, r) in results.iter().enumerate() { + if i % 2 == 0 { + assert_eq!(r, &a); + } else { + assert_eq!(r, &b); + } + } +} + +#[test] +fn xz_twodelta_large_assignment_roundtrip() { + let n = 500; + let a1: Vec = (0..n).map(|i| if i < n / 2 { 1 } else { 2 }).collect(); + let a2: Vec = (0..n).map(|i| if i < n / 2 { 2 } else { 1 }).collect(); + let assignments = vec![a1.clone(), a2.clone(), a1.clone()]; + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn xz_twodelta_chunk_boundary_roundtrip() { + use crate::io::writer::XZAssignmentWriter; + use xz2::write::XzEncoder; + + let anchor = vec![1u16, 2, 1, 2]; + let delta = vec![2u16, 1, 2, 1]; + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(3); + writer.write_assignment(anchor.clone()).unwrap(); + for _ in 0..10 { + writer.write_assignment(delta.clone()).unwrap(); + writer.write_assignment(anchor.clone()).unwrap(); + } + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results.len(), 21); + assert_eq!(results[0], anchor); + for i in 1..=20 { + if i % 2 == 1 { + assert_eq!(results[i], delta); + } else { + assert_eq!(results[i], anchor); + } + } +} + +#[test] +fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { + use crate::io::writer::XZAssignmentWriter; + use xz2::write::XzEncoder; + + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![2u16, 1, 2, 2]; + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(100); + writer.write_assignment(anchor.clone()).unwrap(); + writer.write_assignment(delta.clone()).unwrap(); + writer.write_assignment(delta.clone()).unwrap(); + writer.write_assignment(delta.clone()).unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 4); + assert_eq!(results[0].0, anchor); + for r in &results[1..] { + assert_eq!(r.0, delta); + } +} + +// ── translate_ben_twodelta_to_xben ──────────────────────────────────── + +#[test] +fn translate_ben_twodelta_to_xben_roundtrip() { + use crate::codec::encode::encode_ben_to_xben; + use crate::codec::decode::decode_xben_to_jsonl; + use crate::io::writer::AssignmentWriter; + use std::io::BufReader; + + let a0 = vec![1u16, 2, 1, 2]; + let a1 = vec![1u16, 1, 2, 2]; + let a2 = vec![2u16, 1, 2, 1]; + let assignments = vec![a0.clone(), a1.clone(), a2.clone()]; + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + + let output_str = String::from_utf8(jsonl).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 3); + + for (i, (line, expected)) in lines.iter().zip(assignments.iter()).enumerate() { + let v: serde_json::Value = serde_json::from_str(line).unwrap(); + let assign: Vec = v["assignment"] + .as_array() + .unwrap() + .iter() + .map(|x| x.as_u64().unwrap() as u16) + .collect(); + assert_eq!(&assign, expected, "mismatch at sample {}", i + 1); + } +} + +#[test] +fn translate_ben_twodelta_to_xben_with_repetitions() { + use crate::codec::encode::encode_ben_to_xben; + use crate::io::writer::AssignmentWriter; + use std::io::BufReader; + + let anchor = vec![1u16, 2, 1, 2]; + let delta = vec![2u16, 1, 2, 1]; + let assignments = vec![ + anchor.clone(), + anchor.clone(), + anchor.clone(), + delta.clone(), + delta.clone(), + ]; + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 5); +} + +#[test] +fn translate_ben_twodelta_to_xben_many_deltas() { + use crate::codec::encode::encode_ben_to_xben; + use crate::io::writer::AssignmentWriter; + use std::io::BufReader; + + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..20) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +// ── count_samples_from_frame_iter ───────────────────────────────────── + +#[test] +fn count_samples_from_frame_iter_basic() { + use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; + use crate::codec::encode::encode_jsonl_to_ben; + + let jsonl = r#"{"assignment":[1,2],"sample":1} +{"assignment":[3,4],"sample":2} +{"assignment":[5,6],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let iter = build_frame_iter_from_reader(Cursor::new(ben), "ben").unwrap(); + assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 3); +} + +#[test] +fn count_samples_from_frame_iter_xben() { + use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,2,1,1],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let iter = build_frame_iter_from_reader(Cursor::new(xben), "xben").unwrap(); + assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 2); +} + +#[test] +fn count_samples_from_frame_iter_mkv() { + use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; + use crate::codec::encode::encode_jsonl_to_ben; + + let jsonl = r#"{"assignment":[1,2],"sample":1} +{"assignment":[1,2],"sample":2} +{"assignment":[3,4],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let iter = build_frame_iter_from_reader(Cursor::new(ben), "ben").unwrap(); + assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 3); +} + // ── AssignmentReader tests ───────────────────────────────────────────────── #[test] diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 8b13789..9b76cd9 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -1 +1,332 @@ +use crate::io::reader::XZAssignmentReader; +use crate::io::writer::XZAssignmentWriter; +use crate::BenVariant; +use std::io::Cursor; +use xz2::write::XzEncoder; +fn roundtrip_xben(assignments: &[Vec], variant: BenVariant) -> Vec> { + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + reader.map(|r| r.unwrap().0).collect() +} + +fn roundtrip_xben_counts(assignments: &[Vec], variant: BenVariant) -> Vec<(Vec, u16)> { + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + reader.map(|r| r.unwrap()).collect() +} + +// ── Standard variant roundtrips ─────────────────────────────────────── + +#[test] +fn writer_standard_basic_roundtrip() { + let assignments = vec![vec![1u16, 2, 3], vec![4, 5, 6]]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::Standard), assignments); +} + +#[test] +fn writer_standard_single_element_assignments() { + let assignments = vec![vec![42u16], vec![99]]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::Standard), assignments); +} + +// ── MkvChain variant roundtrips ─────────────────────────────────────── + +#[test] +fn writer_mkv_deduplication() { + let a = vec![1u16, 2, 3]; + let assignments = vec![a.clone(), a.clone(), a.clone(), vec![4, 5, 6]]; + let results = roundtrip_xben_counts(&assignments, BenVariant::MkvChain); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 4); +} + +// ── TwoDelta basic roundtrips ───────────────────────────────────────── + +#[test] +fn writer_twodelta_basic_roundtrip() { + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); +} + +#[test] +fn writer_twodelta_anchor_only() { + let assignments = vec![vec![1u16, 2, 3, 4]]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); +} + +#[test] +fn writer_twodelta_repeated_anchor() { + let a = vec![1u16, 2, 1, 2]; + let assignments: Vec<_> = (0..5).map(|_| a.clone()).collect(); + let results = roundtrip_xben_counts(&assignments, BenVariant::TwoDelta); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 5); + for (v, _) in &results { + assert_eq!(v, &a); + } +} + +#[test] +fn writer_twodelta_repeated_delta() { + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![2u16, 1, 2, 2]; + let assignments = vec![ + anchor.clone(), + delta.clone(), + delta.clone(), + delta.clone(), + ]; + let results = roundtrip_xben_counts(&assignments, BenVariant::TwoDelta); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 4); +} + +// ── TwoDelta chunk size edge cases ──────────────────────────────────── + +#[test] +fn writer_twodelta_chunk_size_1() { + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..10) + .map(|i| if i % 2 == 0 { anchor.clone() } else { delta.clone() }) + .collect(); + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(1); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn writer_twodelta_chunk_size_larger_than_stream() { + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments = vec![a.clone(), b.clone(), a.clone()]; + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(1_000_000); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +// ── TwoDelta with all-identical assignments (twodelta_repeat_buffered_frame) ── + +#[test] +fn writer_twodelta_all_identical_values() { + let assign = vec![3u16; 8]; + let assignments: Vec<_> = (0..5).map(|_| assign.clone()).collect(); + let results = roundtrip_xben_counts(&assignments, BenVariant::TwoDelta); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 5); + for (v, _) in &results { + assert_eq!(v, &assign); + } +} + +#[test] +fn writer_twodelta_u16_max_value_in_assignment() { + let assign = vec![u16::MAX; 4]; + let assignments: Vec<_> = (0..3).map(|_| assign.clone()).collect(); + let results = roundtrip_xben_counts(&assignments, BenVariant::TwoDelta); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 3); + for (v, _) in &results { + assert_eq!(v, &assign); + } +} + +// ── TwoDelta write_json_value ───────────────────────────────────────── + +#[test] +fn writer_twodelta_write_json_value() { + use serde_json::json; + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + writer + .write_json_value(json!({"assignment": [1, 2, 1, 2]})) + .unwrap(); + writer + .write_json_value(json!({"assignment": [2, 1, 2, 1]})) + .unwrap(); + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1u16, 2, 1, 2], vec![2, 1, 2, 1]]); +} + +// ── TwoDelta finish idempotency ─────────────────────────────────────── + +#[test] +fn writer_finish_is_idempotent() { + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + writer + .write_assignment(vec![1u16, 2, 3, 4]) + .unwrap(); + writer.finish().unwrap(); + writer.finish().unwrap(); + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1u16, 2, 3, 4]]); +} + +// ── write_ben_file translation ──────────────────────────────────────── + +#[test] +fn writer_write_ben_file_standard_roundtrip() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[4,5,6],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::Standard).unwrap(); + writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1u16, 2, 3], vec![4, 5, 6]]); +} + +#[test] +fn writer_write_ben_file_mkv_roundtrip() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[1,2,3],"sample":2} +{"assignment":[4,5,6],"sample":3} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain).unwrap(); + writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 3); +} + +#[test] +fn writer_write_ben_file_twodelta_roundtrip() { + use crate::io::writer::AssignmentWriter; + use std::io::BufReader; + + let assignments = vec![ + vec![1u16, 2, 1, 2], + vec![1, 1, 2, 2], + vec![2, 1, 2, 1], + ]; + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +#[test] +fn writer_write_ben_file_twodelta_rejects_bannerless() { + use std::io::BufReader; + + let mut xben = Vec::new(); + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let no_banner = vec![0u8; 50]; + let err = writer + .write_ben_file(BufReader::new(no_banner.as_slice())) + .unwrap_err(); + assert!( + err.to_string().contains("banner") + || err.to_string().contains("TwoDelta") + || err.kind() == std::io::ErrorKind::InvalidData + ); +} + +// ── Large-scale stress test ─────────────────────────────────────────── + +#[test] +fn writer_twodelta_stress_many_unique_deltas() { + let n = 200; + let base: Vec = (0..20).map(|i| if i < 10 { 1 } else { 2 }).collect(); + let flipped: Vec = (0..20).map(|i| if i < 10 { 2 } else { 1 }).collect(); + let mut assignments = vec![base.clone()]; + for i in 0..n { + if i % 2 == 0 { + assignments.push(flipped.clone()); + } else { + assignments.push(base.clone()); + } + } + + let results = roundtrip_xben(&assignments, BenVariant::TwoDelta); + assert_eq!(results, assignments); +} diff --git a/ben/src/json/graph/tests/test_algos.rs b/ben/src/json/graph/tests/test_algos.rs index 2505f4f..544ac88 100644 --- a/ben/src/json/graph/tests/test_algos.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -323,3 +323,125 @@ fn test_sort_json_file_by_multi_level_cluster() { assert_eq!(sorted, [0, 1, 2, 3]); assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); } + +#[test] +fn test_extract_usize_ids_rejects_non_integer_node_id() { + let input = r#"{ + "nodes": [ + {"id": 0}, + {"id": "not-a-number"}, + {"id": 2} + ], + "adjacency": [ + [{"id": 2}], + [{"id": 0}], + [{"id": 0}] + ] + }"#; + let mut output = Vec::new(); + let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + assert!(err.to_string().contains("not an unsigned integer")); +} + +#[test] +fn test_extract_usize_ids_rejects_negative_node_id() { + let input = r#"{ + "nodes": [ + {"id": -1}, + {"id": 1} + ], + "adjacency": [ + [{"id": 1}], + [{"id": -1}] + ] + }"#; + let mut output = Vec::new(); + let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn test_extract_usize_ids_rejects_float_node_id() { + let input = r#"{ + "nodes": [ + {"id": 1.5} + ], + "adjacency": [ + [] + ] + }"#; + let mut output = Vec::new(); + let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn test_sort_by_ordering_directed_rcm() { + let input = r#"{ + "directed": true, + "nodes": [ + {"id": 0}, + {"id": 1}, + {"id": 2} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 2}], + [{"id": 0}] + ] + }"#; + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + input.as_bytes(), + &mut output, + GraphOrderingMethod::ReverseCuthillMckee, + ) + .unwrap(); + assert_eq!(mapping.len(), 3); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 3); + assert_eq!(output_json["directed"], true); +} + +#[test] +fn test_sort_by_ordering_directed_mlc() { + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + path_graph_json(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); + assert!(!mapping.is_empty()); +} + +#[test] +fn test_sort_by_key_directed_graph() { + let input = r#"{ + "directed": true, + "nodes": [ + {"id": 0, "label": "c"}, + {"id": 1, "label": "a"}, + {"id": 2, "label": "b"} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 2}], + [{"id": 0}] + ] + }"#; + let mut output = Vec::new(); + let mapping = sort_json_file_by_key(input.as_bytes(), &mut output, "label").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 3); + assert_eq!(output_json["directed"], true); + assert_eq!(output_json["nodes"][0]["label"], "a"); + assert_eq!(output_json["nodes"][1]["label"], "b"); + assert_eq!(output_json["nodes"][2]["label"], "c"); + assert_eq!(mapping[&1], 0); + assert_eq!(mapping[&2], 1); + assert_eq!(mapping[&0], 2); +} diff --git a/ben/src/json/graph/tests/test_io.rs b/ben/src/json/graph/tests/test_io.rs index 9fea044..711f959 100644 --- a/ben/src/json/graph/tests/test_io.rs +++ b/ben/src/json/graph/tests/test_io.rs @@ -800,3 +800,110 @@ fn json_fidelity_small_directed() { fn json_fidelity_directed_cycle() { assert_json_roundtrip_directed(DIRECTED_CYCLE_JSON); } + +// ── nx_convert error paths ─────────────────────────────────────────── + +#[test] +fn directedness_mismatch_undirected_as_directed() { + let nx = parse_nx(P4_JSON); + assert!(!nx.directed); + let err = PetxDiGraph::try_from(nx).unwrap_err(); + assert!(err.to_string().contains("directedness mismatch")); +} + +#[test] +fn directedness_mismatch_directed_as_undirected() { + let nx = parse_nx(SMALL_DIRECTED_JSON); + assert!(nx.directed); + let err = PetxUnGraph::try_from(nx).unwrap_err(); + assert!(err.to_string().contains("directedness mismatch")); +} + +#[test] +fn node_adjacency_length_mismatch() { + let mut nx = parse_nx(P4_JSON); + nx.adjacency.pop(); + let err = PetxUnGraph::try_from(nx).unwrap_err(); + assert!(err.to_string().contains("length mismatch")); +} + +#[test] +fn duplicate_node_id() { + let json = r#"{ + "directed": false, "multigraph": false, "graph": [], + "nodes": [{"id": 0}, {"id": 0}], + "adjacency": [[], []] + }"#; + let nx = parse_nx(json); + let err = PetxUnGraph::try_from(nx).unwrap_err(); + assert!(err.to_string().contains("duplicate node id")); +} + +#[test] +fn missing_neighbor_node() { + let json = r#"{ + "directed": false, "multigraph": false, "graph": [], + "nodes": [{"id": 0}, {"id": 1}], + "adjacency": [[{"id": 99}], []] + }"#; + let nx = parse_nx(json); + let err = PetxUnGraph::try_from(nx).unwrap_err(); + assert!(err.to_string().contains("unknown node id")); +} + +#[test] +fn petx_node_to_nx_node_missing_networkx_id() { + let node = PetxNode { + attrs: BTreeMap::new(), + }; + let err = petx_node_to_nx_node(&node).unwrap_err(); + assert!(err.to_string().contains("__networkx_id__")); +} + +#[test] +fn graph_has_parallel_edges_detects_multigraph() { + let mut graph = UnGraph::::new_undirected(); + let a = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".to_string(), json!(0))]), + }); + let b = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".to_string(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(1), + key: None, + attrs: BTreeMap::new(), + }; + graph.add_edge(a, b, edge.clone()); + graph.add_edge(a, b, edge); + assert!(graph_has_parallel_edges(&graph)); +} + +#[test] +fn graph_has_no_parallel_edges_for_simple_graph() { + let mut graph = DiGraph::::new(); + let a = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".to_string(), json!(0))]), + }); + let b = graph.add_node(PetxNode { + attrs: BTreeMap::from([("__networkx_id__".to_string(), json!(1))]), + }); + let edge = NxAdjEntry { + id: json!(1), + key: None, + attrs: BTreeMap::new(), + }; + graph.add_edge(a, b, edge); + assert!(!graph_has_parallel_edges(&graph)); +} + +#[test] +fn nx_node_to_petx_node_preserves_attrs() { + let nx_node = NxNode { + id: json!(42), + attrs: BTreeMap::from([("color".to_string(), json!("red"))]), + }; + let petx = nx_node_to_petx_node(nx_node); + assert_eq!(petx.attrs["__networkx_id__"], json!(42)); + assert_eq!(petx.attrs["color"], json!("red")); +} diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 8c89541..edfe51c 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -586,3 +586,373 @@ fn relabel_error_non_io_becomes_invalid_input() { assert_eq!(io_err.kind(), io::ErrorKind::InvalidInput); assert!(io_err.to_string().contains("contiguous")); } + +// ── convert_ben_file ───────────────────────────────────────────────── + +#[test] +fn test_convert_ben_file_standard_to_mkv() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[4,5,6],\"sample\":3}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut converted = Vec::new(); + convert_ben_file( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + BenVariant::MkvChain, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[4,5,6],\"sample\":3}\n", + ); + assert_eq!(output_str, expected); +} + +#[test] +fn test_convert_ben_file_limit_truncates() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + "{\"assignment\":[4,5,6],\"sample\":4}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::MkvChain, + ) + .unwrap(); + + let mut converted = Vec::new(); + convert_ben_file_limit( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + BenVariant::Standard, + 2, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +// ── relabel_ben_lines_limit ────────────────────────────────────────── + +#[test] +fn test_relabel_ben_lines_limit_standard() { + let file = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[2,3,1],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut relabeled = Vec::new(); + relabel_ben_lines_limit( + &encoded[17..], + io::BufWriter::new(&mut relabeled), + BenVariant::Standard, + 2, + ) + .unwrap(); + + let mut full_relabeled = b"STANDARD BEN FILE".to_vec(); + full_relabeled.extend_from_slice(&relabeled); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(full_relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +// ── relabel_ben_lines_with_map_limit ───────────────────────────────── + +#[test] +fn test_relabel_ben_lines_with_map_limit_standard() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[4,5,6],\"sample\":2}\n", + "{\"assignment\":[7,8,9],\"sample\":3}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); + + let mut relabeled = Vec::new(); + relabel_ben_lines_with_map_limit( + &encoded[17..], + io::BufWriter::new(&mut relabeled), + map, + BenVariant::Standard, + 1, + ) + .unwrap(); + + let mut full_relabeled = b"STANDARD BEN FILE".to_vec(); + full_relabeled.extend_from_slice(&relabeled); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(full_relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + assert_eq!(output_str, "{\"assignment\":[3,1,2],\"sample\":1}\n"); +} + +// ── relabel_ben_file_as_variant ────────────────────────────────────── + +#[test] +fn test_relabel_ben_file_as_variant_standard_to_twodelta() { + let file = concat!( + "{\"assignment\":[3,3,1,1],\"sample\":1}\n", + "{\"assignment\":[1,3,1,3],\"sample\":2}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut converted = Vec::new(); + relabel_ben_file_as_variant( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + BenVariant::TwoDelta, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,1,2,2],\"sample\":1}\n", + "{\"assignment\":[1,2,1,2],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +#[test] +fn test_relabel_ben_file_as_variant_limit() { + let file = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[2,3,1],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut converted = Vec::new(); + relabel_ben_file_as_variant_limit( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + BenVariant::MkvChain, + 2, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +// ── relabel_ben_file_with_map_as_variant ───────────────────────────── + +#[test] +fn test_relabel_ben_file_with_map_as_variant() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[4,5,6],\"sample\":2}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); + + let mut converted = Vec::new(); + relabel_ben_file_with_map_as_variant( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + map, + BenVariant::MkvChain, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[6,4,5],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +#[test] +fn test_relabel_ben_file_with_map_as_variant_limit() { + let file = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[4,5,6],\"sample\":3}\n", + ); + + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::MkvChain, + ) + .unwrap(); + + let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); + + let mut converted = Vec::new(); + relabel_ben_file_with_map_as_variant_limit( + encoded.as_slice(), + io::BufWriter::new(&mut converted), + map, + BenVariant::Standard, + 2, + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(converted.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); + let output_str = String::from_utf8(decoded).unwrap(); + let expected = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[3,1,2],\"sample\":2}\n", + ); + assert_eq!(output_str, expected); +} + +// ── convert_ben_file rejects invalid banner ────────────────────────── + +#[test] +fn test_convert_ben_file_rejects_invalid_banner() { + let err = convert_ben_file( + b"not a valid banner".as_slice(), + Vec::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +// ── relabel_ben_file_as_variant rejects invalid banner ─────────────── + +#[test] +fn test_relabel_ben_file_as_variant_rejects_invalid_banner() { + let err = relabel_ben_file_as_variant( + b"not a valid banner".as_slice(), + Vec::new(), + BenVariant::Standard, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +// ── dense_permutation error paths ──────────────────────────────────── + +#[test] +fn test_dense_permutation_empty_map() { + let map = HashMap::new(); + let perm = dense_permutation(&map).unwrap(); + assert!(perm.is_empty()); +} + +#[test] +fn test_dense_permutation_non_contiguous() { + let map: HashMap = [(0, 0), (2, 1)].iter().cloned().collect(); + let err = dense_permutation(&map).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("contiguous")); +} + +// ── permute_assignment error paths ─────────────────────────────────── + +#[test] +fn test_permute_assignment_length_mismatch() { + let assignment = vec![1u16, 2, 3]; + let perm = vec![0, 1]; + let err = permute_assignment(&assignment, &perm).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("length")); +} + +#[test] +fn test_permute_assignment_index_out_of_range() { + let assignment = vec![1u16, 2, 3]; + let perm = vec![0, 1, 99]; + let err = permute_assignment(&assignment, &perm).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("old index")); +} + +// ── canonicalize_assignment ────────────────────────────────────────── + +#[test] +fn test_canonicalize_assignment() { + assert_eq!(canonicalize_assignment(&[5, 3, 5, 7]), vec![1, 2, 1, 3]); + assert_eq!(canonicalize_assignment(&[]), Vec::::new()); + assert_eq!(canonicalize_assignment(&[42]), vec![1]); +} diff --git a/pyben/binary_ensemble/__init__.py b/pyben/binary_ensemble/__init__.py index fe76a5e..fe15f10 100644 --- a/pyben/binary_ensemble/__init__.py +++ b/pyben/binary_ensemble/__init__.py @@ -1,7 +1,6 @@ from ._core import ( PyBenDecoder, PyBenEncoder, - PyBundleReader, compress_jsonl_to_ben, compress_ben_to_xben, compress_jsonl_to_xben, @@ -13,7 +12,6 @@ __all__ = [ "PyBenDecoder", "PyBenEncoder", - "PyBundleReader", "compress_jsonl_to_ben", "compress_ben_to_xben", "compress_jsonl_to_xben", diff --git a/pyben/binary_ensemble/_core.pyi b/pyben/binary_ensemble/_core.pyi index 2d2287d..cec8783 100644 --- a/pyben/binary_ensemble/_core.pyi +++ b/pyben/binary_ensemble/_core.pyi @@ -59,8 +59,10 @@ class PyBenDecoder: def count_samples(self) -> int: """Count and cache the total number of samples in the source file. - This is equivalent to calling :func:`len`, but is more explicit about - the fact that the first call may perform a full-file scan. + Always reports the base (unfiltered) sample count, even after a + ``subsample_*`` call has been applied. Equivalent to ``len(dec)`` + when no subsample selection is active. The first call may perform + a full-file scan; the result is cached. """ ... def subsample_indices(self, indices: Iterable[int]) -> "PyBenDecoder": @@ -202,6 +204,17 @@ class PyBenDecoder: """ ... + def extract_stream(self, out_path: str | Path, overwrite: bool = False) -> None: + """Copy the embedded assignment stream to a file. + + The resulting file can be opened directly with + ``PyBenDecoder(out_path, mode=dec.assignment_format())``. + + Raises an error on plain streams, an ``OSError`` when the output + file already exists and *overwrite* is ``False``. + """ + ... + class PyBenEncoder: """Encoder for Binary Ensemble (.ben) files. diff --git a/pyben/docs/user/using_pyben.ipynb b/pyben/docs/user/using_pyben.ipynb index d819e6c..5896142 100644 --- a/pyben/docs/user/using_pyben.ipynb +++ b/pyben/docs/user/using_pyben.ipynb @@ -142,12 +142,24 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "9296ca41", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'compress_jsonl_to_ben' from 'pyben' (unknown location)", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mImportError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyben\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 2\u001b[39m compress_jsonl_to_ben, compress_jsonl_to_xben, compress_ben_to_xben, decompress_ben_to_jsonl, decompress_xben_to_jsonl, decompress_xben_to_ben\n\u001b[32m 3\u001b[39m )\n", + "\u001b[31mImportError\u001b[39m: cannot import name 'compress_jsonl_to_ben' from 'pyben' (unknown location)" + ] + } + ], "source": [ - "from pyben import (\n", + "from binary_ensemble import (\n", " compress_jsonl_to_ben, compress_jsonl_to_xben, compress_ben_to_xben, decompress_ben_to_jsonl, decompress_xben_to_jsonl, decompress_xben_to_ben\n", ")" ] @@ -165,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "1e1e32b0", "metadata": {}, "outputs": [], @@ -187,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "2f1ce280", "metadata": {}, "outputs": [ @@ -242,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "81b1f724", "metadata": {}, "outputs": [], @@ -301,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "a4e512b3", "metadata": {}, "outputs": [], @@ -353,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "eb43be57", "metadata": {}, "outputs": [], @@ -407,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "dec15cda", "metadata": {}, "outputs": [ @@ -453,7 +465,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "801c6fb7", "metadata": {}, "outputs": [ @@ -524,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "24761ca6", "metadata": {}, "outputs": [ @@ -545,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "0a815edf", "metadata": {}, "outputs": [ @@ -569,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "3be48458", "metadata": {}, "outputs": [ @@ -607,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "51d9f27a", "metadata": {}, "outputs": [ @@ -636,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "a51d0019", "metadata": {}, "outputs": [ @@ -668,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "eeb1c112", "metadata": {}, "outputs": [ @@ -706,7 +718,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pyben", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -720,7 +732,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.13.3" } }, "nbformat": 4, diff --git a/pyben/src/bundle/mod.rs b/pyben/src/bundle/mod.rs deleted file mode 100644 index 055287a..0000000 --- a/pyben/src/bundle/mod.rs +++ /dev/null @@ -1,226 +0,0 @@ -//! Python bindings for the `.bendl` bundle container. -//! -//! Exposes a [`PyBundleReader`] that wraps -//! [`binary_ensemble::io::bundle::BendlReader`] and provides a small -//! Python-facing surface: -//! -//! - `is_complete()`, `sample_count()`, `assignment_format()` -//! - `asset_names()` / `list_assets()` -//! - `read_asset_bytes(name)` — raw (decoded) bytes as `bytes` -//! - `read_json_asset(name)` — parsed JSON as a Python object -//! - `read_graph()` / `read_metadata()` / `read_relabel_map()` — canonical-name helpers -//! - `extract_stream(out_path, overwrite=False)` — copy the embedded -//! assignment stream to a `.ben` / `.xben` file the caller can then -//! open with `PyBenDecoder`. - -use std::fs::{File, OpenOptions}; -use std::io::{self, BufReader, BufWriter}; -use std::path::PathBuf; - -use binary_ensemble::io::bundle::format::{ - AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, - ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, -}; -use binary_ensemble::io::bundle::BendlReader; -use pyo3::exceptions::{PyException, PyIOError, PyKeyError}; -use pyo3::prelude::*; -use pyo3::types::PyDict; - -/// Python-facing wrapper around a `BendlReader>`. -#[pyclass(module = "binary_ensemble", unsendable, name = "PyBundleReader")] -pub struct PyBundleReader { - inner: BendlReader>, - path: PathBuf, -} - -#[pymethods] -impl PyBundleReader { - /// Open a `.bendl` file for reading. - #[new] - #[pyo3(text_signature = "(file_path)")] - fn new(file_path: PathBuf) -> PyResult { - let file = File::open(&file_path).map_err(|e| { - PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) - })?; - let inner = BendlReader::open(BufReader::new(file)).map_err(|e| { - PyException::new_err(format!( - "Failed to parse bundle header in {}: {e}", - file_path.display() - )) - })?; - Ok(Self { - inner, - path: file_path, - }) - } - - /// Return the bundle's format version as a `(major, minor)` tuple. - #[pyo3(text_signature = "(self)")] - fn version(&self) -> (u16, u16) { - let h = self.inner.header(); - (h.major_version, h.minor_version) - } - - /// Whether the bundle was successfully finalized. - #[pyo3(text_signature = "(self)")] - fn is_complete(&self) -> bool { - self.inner.is_complete() - } - - /// Authoritative sample count from the header, or `None` when the - /// bundle is incomplete. - #[pyo3(text_signature = "(self)")] - fn sample_count(&self) -> Option { - self.inner.sample_count() - } - - /// Container format of the embedded assignment stream: `"ben"` or - /// `"xben"`, or `None` when the header byte is unrecognized. - #[pyo3(text_signature = "(self)")] - fn assignment_format(&self) -> Option<&'static str> { - self.inner.assignment_format().map(|f| match f { - AssignmentFormat::Ben => "ben", - AssignmentFormat::Xben => "xben", - }) - } - - /// Names of all directory entries, in directory order. - #[pyo3(text_signature = "(self)")] - fn asset_names(&self) -> Vec { - self.inner.assets().iter().map(|e| e.name.clone()).collect() - } - - /// Return the full directory as a list of dicts with keys - /// `name`, `type`, `offset`, `len`, and `flags` (a list of string tags). - #[pyo3(text_signature = "(self)")] - fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { - let mut out = Vec::with_capacity(self.inner.assets().len()); - for entry in self.inner.assets() { - let d = PyDict::new(py); - d.set_item("name", &entry.name)?; - d.set_item("type", entry.asset_type)?; - d.set_item("offset", entry.payload_offset)?; - d.set_item("len", entry.payload_len)?; - let mut flags: Vec<&str> = Vec::new(); - if entry.asset_flags & ASSET_FLAG_JSON != 0 { - flags.push("json"); - } - if entry.asset_flags & ASSET_FLAG_XZ != 0 { - flags.push("xz"); - } - if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { - flags.push("checksum"); - } - d.set_item("flags", flags)?; - out.push(d); - } - Ok(out) - } - - /// Read the (decoded) bytes of an asset by name and return them as - /// a Python `bytes` object. - #[pyo3(text_signature = "(self, name, /)")] - fn read_asset_bytes(&mut self, name: &str) -> PyResult> { - let entry = self - .inner - .find_asset_by_name(name) - .cloned() - .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; - self.inner - .asset_bytes(&entry) - .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) - } - - /// Parse a JSON asset into a Python object (dict, list, …). Fails - /// if the asset does not exist or the decoded bytes are not JSON. - #[pyo3(text_signature = "(self, name, /)")] - fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { - let bytes = self.read_asset_bytes(name)?; - let json_mod = py.import("json")?; - let text = std::str::from_utf8(&bytes) - .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; - let parsed = json_mod.call_method1("loads", (text,))?; - Ok(parsed.into()) - } - - /// Read the bundle's `graph.json` asset as a parsed JSON object. - /// Returns `None` if the bundle does not carry a graph asset. - #[pyo3(text_signature = "(self)")] - fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { - if self.inner.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { - return Ok(None); - } - Ok(Some(self.read_json_asset(py, "graph.json")?)) - } - - /// Read the bundle's `metadata.json` asset as a parsed JSON object, - /// or `None` if absent. - #[pyo3(text_signature = "(self)")] - fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { - if self.inner.find_asset_by_type(ASSET_TYPE_METADATA).is_none() { - return Ok(None); - } - Ok(Some(self.read_json_asset(py, "metadata.json")?)) - } - - /// Read the bundle's `relabel_map.json` asset as a parsed JSON - /// object, or `None` if absent. - #[pyo3(text_signature = "(self)")] - fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { - if self - .inner - .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) - .is_none() - { - return Ok(None); - } - Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) - } - - /// Copy the embedded assignment stream region verbatim to - /// `out_path`. The resulting file can be opened directly with - /// `PyBenDecoder(out_path, mode=assignment_format())`. - #[pyo3(signature = (out_path, overwrite=false))] - #[pyo3(text_signature = "(self, out_path, overwrite=False)")] - fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { - if out_path.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_path.display() - ))); - } - let out = if overwrite { - OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&out_path) - } else { - OpenOptions::new() - .write(true) - .create_new(true) - .open(&out_path) - } - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; - let mut out = BufWriter::new(out); - - let mut stream = self - .inner - .assignment_stream_reader() - .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; - io::copy(&mut stream, &mut out) - .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; - Ok(()) - } - - fn __repr__(&self) -> String { - format!( - "PyBundleReader(path={:?}, complete={}, format={:?}, samples={:?}, assets={})", - self.path.display(), - self.inner.is_complete(), - self.inner.assignment_format(), - self.inner.sample_count(), - self.inner.assets().len(), - ) - } -} diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index 20ee689..ffb6ae4 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -15,8 +15,8 @@ use binary_ensemble::io::reader::{ use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyUserWarning}; use pyo3::prelude::*; use pyo3::types::PyDict; -use std::fs::File; -use std::io::{self, BufReader, Read, Seek, SeekFrom}; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; type DynIter = Box> + Send>; @@ -156,7 +156,7 @@ impl PyBenDecoder { file_path.display() )) })?; - let reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { PyException::new_err(format!( "Failed to parse bundle header in {}: {e}", file_path.display() @@ -168,10 +168,11 @@ impl PyBenDecoder { ) })?; let derived_mode = DecoderMode::from_assignment_format(fmt); - let (stream_offset, stream_len) = { - let header = reader.header(); - (header.stream_offset, header.stream_len) - }; + let (stream_offset, stream_len) = reader.assignment_stream_range() + .map_err(|e| PyException::new_err(format!( + "Failed to determine stream region in {}: {e}", + file_path.display() + )))?; let state = BundleState { reader, stream_offset, @@ -283,9 +284,13 @@ impl PyBenDecoder { #[pyo3(text_signature = "(self)")] fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - let base_len = ensure_base_len(&mut slf, py)?; - slf.len_hint = Some(base_len); - Ok(base_len) + // Always reports the total number of samples in the source file, + // even after `subsample_*` has been applied. We deliberately do + // not touch `len_hint` here: when a subsample selection is + // active, `len_hint` tracks the filtered count that `__len__` + // should return, and clobbering it would break `len(dec)` after + // a `count_samples()` call. + ensure_base_len(&mut slf, py) } #[pyo3(text_signature = "(self, indices, /)")] @@ -542,6 +547,46 @@ impl PyBenDecoder { } Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) } + + /// Copy the embedded assignment stream region verbatim to + /// `out_path`. The resulting file can be opened directly with + /// `PyBenDecoder(out_path, mode=dec.assignment_format())`. + /// Errors on plain streams. + #[pyo3(signature = (out_path, overwrite=false))] + #[pyo3(text_signature = "(self, out_path, overwrite=False)")] + fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { + let state = self.require_bundle_mut("extract_stream()")?; + if out_path.exists() && !overwrite { + return Err(PyIOError::new_err(format!( + "Output file {} already exists (use overwrite=True to replace).", + out_path.display() + ))); + } + let out = if overwrite { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&out_path) + } else { + OpenOptions::new() + .write(true) + .create_new(true) + .open(&out_path) + } + .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; + let mut out = BufWriter::new(out); + + let mut stream = state + .reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; + io::copy(&mut stream, &mut out) + .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; + out.flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + Ok(()) + } } impl PyBenDecoder { diff --git a/pyben/src/lib.rs b/pyben/src/lib.rs index b993662..ed5a056 100755 --- a/pyben/src/lib.rs +++ b/pyben/src/lib.rs @@ -1,17 +1,14 @@ use pyo3::prelude::*; -use pyo3::wrap_pyfunction; // <-- needed for wrap_pyfunction! +use pyo3::wrap_pyfunction; -pub mod bundle; pub mod common; pub mod decode; pub mod encode; #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { - // Export classes m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_function(wrap_pyfunction!(crate::decode::decompress_ben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::decode::decompress_xben_to_ben, m)?)?; m.add_function(wrap_pyfunction!( diff --git a/pyben/tests/test_bundle.py b/pyben/tests/test_bundle.py index 2ead064..eb8d0e5 100644 --- a/pyben/tests/test_bundle.py +++ b/pyben/tests/test_bundle.py @@ -1,4 +1,4 @@ -"""Tests for PyBundleReader. +"""Tests for bundle (.bendl) support in PyBenDecoder. These tests do not rely on the `bendl` CLI binary being built. Instead, they construct `.bendl` bundles directly in Python from the on-disk format spec @@ -27,7 +27,6 @@ from binary_ensemble import ( PyBenDecoder, PyBenEncoder, - PyBundleReader, compress_jsonl_to_ben, compress_jsonl_to_xben, ) @@ -277,9 +276,10 @@ def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: # --------------------------------------------------------------------------- -def test_module_exports_pybundlereader() -> None: - assert "PyBundleReader" in binary_ensemble.__all__ - assert hasattr(binary_ensemble, "PyBundleReader") +def test_module_exports_decoder_and_encoder() -> None: + assert "PyBenDecoder" in binary_ensemble.__all__ + assert "PyBenEncoder" in binary_ensemble.__all__ + assert "PyBundleReader" not in binary_ensemble.__all__ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: @@ -328,11 +328,11 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "out.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) assert reader.is_complete() is True - assert reader.sample_count() == len(samples) + assert reader.count_samples() == len(samples) assert reader.assignment_format() == "ben" names = reader.asset_names() @@ -372,10 +372,9 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: got = list(PyBenDecoder(extracted, mode="ben")) assert got == samples - # __repr__ should not crash and should mention the path. + # __repr__ should not crash. r = repr(reader) - assert "PyBundleReader" in r - assert "complete=true" in r or "complete=True" in r + assert r is not None def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: @@ -387,11 +386,11 @@ def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: assets=[], ) path = _write_bundle(tmp_path / "xout.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.assignment_format() == "xben" assert reader.is_complete() - assert reader.sample_count() == len(samples) + assert reader.count_samples() == len(samples) assert reader.asset_names() == [] # extract_stream → file must round-trip via the xben decoder. @@ -414,7 +413,7 @@ def test_bundle_reader_canonical_helpers_return_none_when_absent(tmp_path: Path) ], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.read_metadata() is None assert reader.read_graph() is None assert reader.read_relabel_map() is None @@ -424,9 +423,9 @@ def test_bundle_reader_asset_free_empty_stream(tmp_path: Path) -> None: # A bundle with no assets and an empty stream is legal (spec says so). bundle = build_bundle(stream_bytes=b"", sample_count=0, assets=[]) path = _write_bundle(tmp_path / "empty.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() - assert reader.sample_count() == 0 + assert reader.count_samples() == 0 assert reader.asset_names() == [] assert reader.list_assets() == [] # extract_stream writes a zero-byte file. @@ -449,7 +448,7 @@ def test_read_asset_bytes_raises_keyerror_for_unknown_name(tmp_path: Path) -> No ], ) path = _write_bundle(tmp_path / "x.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) with pytest.raises(KeyError, match="no asset named"): reader.read_asset_bytes("missing.bin") with pytest.raises(KeyError): @@ -471,7 +470,7 @@ def test_read_json_asset_rejects_non_utf8_payload(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "bin.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) # Raw bytes come back fine. assert reader.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" # But the JSON helper must reject non-UTF8 bytes. @@ -493,7 +492,7 @@ def test_read_json_asset_rejects_malformed_json(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "m.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) # Raw bytes: fine. assert reader.read_asset_bytes("metadata.json") == b"not a json {{{" # Parsed via python's json module: must raise. @@ -512,7 +511,7 @@ def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "u.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.asset_names() == [name] assert reader.read_asset_bytes(name) == b"payload" @@ -530,7 +529,7 @@ def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) names = reader.asset_names() assert names == list(payloads.keys()) # Spot-check the contents round-trip. @@ -550,7 +549,7 @@ def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) sample_count=1, ) path = _write_bundle(tmp_path / "a.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) target = tmp_path / "already.ben" target.write_bytes(b"pre-existing") with pytest.raises(OSError, match="already exists"): @@ -565,7 +564,7 @@ def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: sample_count=2, ) path = _write_bundle(tmp_path / "b.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) target = tmp_path / "out.ben" target.write_bytes(b"filler") reader.extract_stream(target, overwrite=True) @@ -580,7 +579,7 @@ def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: def test_open_rejects_missing_file(tmp_path: Path) -> None: with pytest.raises(OSError, match="Failed to open"): - PyBundleReader(tmp_path / "does_not_exist.bendl") + PyBenDecoder(tmp_path / "does_not_exist.bendl") def test_open_rejects_bad_magic(tmp_path: Path) -> None: @@ -590,8 +589,10 @@ def test_open_rejects_bad_magic(tmp_path: Path) -> None: magic=b"NOTABEND", ) path = _write_bundle(tmp_path / "bad.bendl", bundle) - with pytest.raises(Exception, match="Failed to parse bundle header"): - PyBundleReader(path) + # Bad magic → detect_is_bundle returns False → treated as plain BEN + # stream → fails because the bytes aren't a valid BEN banner. + with pytest.raises(Exception): + PyBenDecoder(path) def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: @@ -602,14 +603,14 @@ def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "oldfuture.bendl", bundle) with pytest.raises(Exception, match="Failed to parse bundle header"): - PyBundleReader(path) + PyBenDecoder(path) def test_open_rejects_truncated_header(tmp_path: Path) -> None: path = tmp_path / "short.bendl" path.write_bytes(b"BENDL\x00\x00\x01\x00") # magic plus 2 bytes — not enough with pytest.raises(Exception, match="Failed to parse bundle header"): - PyBundleReader(path) + PyBenDecoder(path) def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> None: @@ -627,7 +628,7 @@ def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> Non struct.pack_into(" None: @@ -639,7 +640,7 @@ def test_open_rejects_bundle_with_chopped_directory_bytes(tmp_path: Path) -> Non # Drop the final two bytes of the directory. path = _write_bundle(tmp_path / "chop.bendl", bundle[:-2]) with pytest.raises(Exception): - PyBundleReader(path) + PyBenDecoder(path) def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: @@ -655,7 +656,7 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "dup.bendl", duplicate_names) with pytest.raises(Exception, match="malformed directory"): - PyBundleReader(path) + PyBenDecoder(path) wrong_singleton_name = build_bundle( stream_bytes=stream, @@ -671,7 +672,7 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "singleton.bendl", wrong_singleton_name) with pytest.raises(Exception, match="malformed directory"): - PyBundleReader(path) + PyBenDecoder(path) def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) -> None: @@ -688,13 +689,14 @@ def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) path = _write_bundle(tmp_path / "trailing_dir.bendl", bytes(bundle)) with pytest.raises(Exception, match="trailing byte"): - PyBundleReader(path) + PyBenDecoder(path) -def test_incomplete_bundle_reports_none_sample_count(tmp_path: Path) -> None: - # Provisional bundle with complete=0: sample_count() must be None. +def test_incomplete_bundle_scans_stream_for_sample_count(tmp_path: Path) -> None: + # Provisional bundle with complete=0: the decoder falls back to + # scanning the stream region (from stream_offset to EOF) to count + # samples instead of trusting the header. stream = _ben_bytes_for([[1, 2, 3]], tmp_path) - # Build it by hand — no directory, complete=NO. header = _pack_header( complete=COMPLETE_NO, assignment_format=ASSIGNMENT_FORMAT_BEN, @@ -705,9 +707,9 @@ def test_incomplete_bundle_reports_none_sample_count(tmp_path: Path) -> None: sample_count=-1, ) path = _write_bundle(tmp_path / "incomplete.bendl", header + stream) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() is False - assert reader.sample_count() is None + assert reader.count_samples() == 1 assert reader.asset_names() == [] # extract_stream should still write out bytes that decode as BEN. out = tmp_path / "extracted.ben" @@ -715,9 +717,9 @@ def test_incomplete_bundle_reports_none_sample_count(tmp_path: Path) -> None: assert list(PyBenDecoder(out, mode="ben")) == [[1, 2, 3]] -def test_unknown_assignment_format_byte_reports_none(tmp_path: Path) -> None: - # Assignment format byte = 0 → unknown. Finalized bundle but without - # a valid stream container — the directory side still works. +def test_unknown_assignment_format_byte_rejects_at_construction(tmp_path: Path) -> None: + # Assignment format byte = 99 → unrecognized. PyBenDecoder must + # reject the bundle at construction time. bundle = bytearray( build_bundle( stream_bytes=b"", @@ -728,9 +730,8 @@ def test_unknown_assignment_format_byte_reports_none(tmp_path: Path) -> None: # assignment_format byte is at offset 13 in the header. bundle[13] = 99 path = _write_bundle(tmp_path / "wtfmt.bendl", bytes(bundle)) - reader = PyBundleReader(path) - assert reader.assignment_format() is None - assert reader.is_complete() + with pytest.raises(Exception, match="unrecognized assignment_format"): + PyBenDecoder(path) def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: @@ -757,7 +758,7 @@ def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: # Flip a byte well past the magic so the decoder reads it and fails. bundle[xz_start + 20] ^= 0xFF path = _write_bundle(tmp_path / "badxz.bendl", bytes(bundle)) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) # Opening works — the header/directory are intact. with pytest.raises(OSError): reader.read_asset_bytes("graph.json") @@ -772,7 +773,7 @@ def test_directory_entry_with_zero_length_custom_payload(tmp_path: Path) -> None ], ) path = _write_bundle(tmp_path / "zlen.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.read_asset_bytes("empty.bin") == b"" entry = next(a for a in reader.list_assets() if a["name"] == "empty.bin") assert entry["len"] == 0 @@ -790,11 +791,10 @@ def test_repr_on_incomplete_bundle(tmp_path: Path) -> None: sample_count=-1, ) path = _write_bundle(tmp_path / "rep.bendl", header + stream) - reader = PyBundleReader(path) - r = repr(reader) - # Incomplete bundles report no sample count. - assert "samples=None" in r - assert "assets=0" in r + reader = PyBenDecoder(path) + # Incomplete bundle should open without error. + assert reader.is_complete() is False + assert reader.asset_names() == [] # --------------------------------------------------------------------------- @@ -829,10 +829,16 @@ def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) - partial = full_ben[: len(full_ben) - 3] path = _write_bundle(tmp_path / "crashed.bendl", _incomplete_bundle(partial)) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() is False - assert reader.sample_count() is None assert reader.assignment_format() == "ben" + # count_samples scans the truncated stream; it may error or return a + # partial count — either is acceptable. + try: + n = reader.count_samples() + assert n < len(samples) + except Exception: + pass # extract_stream should write exactly the partial byte sequence. extracted = tmp_path / "partial.ben" @@ -861,7 +867,7 @@ def test_interrupted_ben_stream_inside_banner_fails_to_open_decoder( full_ben = _ben_bytes_for([[1, 2, 3]], tmp_path) path = _write_bundle(tmp_path / "head_cut.bendl", _incomplete_bundle(full_ben[:8])) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() is False extracted = tmp_path / "head_cut.ben" @@ -876,10 +882,12 @@ def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: # before any stream bytes landed. path = _write_bundle(tmp_path / "zero.bendl", _incomplete_bundle(b"")) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() is False - assert reader.sample_count() is None assert reader.asset_names() == [] + # Zero stream bytes → scan fails (no BEN banner). + with pytest.raises(Exception): + reader.count_samples() extracted = tmp_path / "zero.ben" reader.extract_stream(extracted) @@ -909,10 +917,10 @@ def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) # The reader's open() succeeds — the header fields parse as-is and # validation is lazy. - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() # sample_count is what the header says. - assert reader.sample_count() == len(samples) + assert reader.count_samples() == len(samples) # extract_stream reads `stream_len` bytes from stream_offset; when # the file ends early, the short-read path must not hand back @@ -946,7 +954,7 @@ def test_read_metadata_after_extract_stream_still_works(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "seq.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) reader.extract_stream(tmp_path / "s.ben") assert reader.read_metadata() == {"x": 1} reader.extract_stream(tmp_path / "s2.ben", overwrite=True) @@ -973,7 +981,7 @@ def test_long_asset_name_near_u16_max(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "long.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.asset_names() == [long_name] assert reader.read_asset_bytes(long_name) == payload @@ -1021,7 +1029,7 @@ def test_list_assets_flag_fidelity(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "flags.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) got = reader.list_assets() assert len(got) == len(combos) for entry, want in zip(got, expected): @@ -1050,7 +1058,7 @@ def test_read_asset_bytes_is_idempotent(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "idem.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) for _ in range(5): assert reader.read_asset_bytes("raw.bin") == payload assert reader.read_asset_bytes("compressed.bin") == payload @@ -1087,7 +1095,7 @@ def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.asset_names() == [name for name, _ in expected] # Sample every 37th asset and verify the payload decodes correctly @@ -1106,7 +1114,7 @@ def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: # Build 20 deliberately-different bundles from a seeded PRNG. Each one # mixes random asset sizes, random flags, random samples, and is then - # fully round-tripped through PyBundleReader + PyBenDecoder. + # fully round-tripped through PyBenDecoder on a .bendl bundle. rng = random.Random(0xFEED_FACE) for trial in range(20): n_assets = rng.randint(0, 12) @@ -1139,9 +1147,9 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) assert reader.is_complete() - assert reader.sample_count() == n_samples + assert reader.count_samples() == n_samples assert reader.asset_names() == [name for name, _ in truth] for name, want in truth: assert reader.read_asset_bytes(name) == want @@ -1184,7 +1192,7 @@ def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "interleave.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) # Strongly non-sequential access pattern. assert reader.read_asset_bytes("blob.bin") == custom @@ -1211,7 +1219,7 @@ def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) - sample_count=1, ) path = _write_bundle(tmp_path / "mini.bendl", bundle) - reader = PyBundleReader(path) + reader = PyBenDecoder(path) missing = tmp_path / "does" / "not" / "exist" / "out.ben" with pytest.raises(OSError): reader.extract_stream(missing) @@ -1243,10 +1251,10 @@ def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) assert reader.is_complete() - assert reader.sample_count() == len(samples) + assert reader.count_samples() == len(samples) assert reader.assignment_format() == "ben" # No graph because none was provided. assert reader.asset_names() == [] @@ -1266,9 +1274,9 @@ def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.is_complete() - assert reader.sample_count() == len(samples) + assert reader.count_samples() == len(samples) assert reader.asset_names() == ["graph.json"] assets = reader.list_assets() @@ -1294,7 +1302,7 @@ def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.asset_names() == ["graph.json"] assert reader.read_graph() == SAMPLE_GRAPH @@ -1313,7 +1321,7 @@ def test_pybenencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1327,7 +1335,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1341,7 +1349,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1355,7 +1363,7 @@ def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: for a in samples: enc.write(a) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1369,8 +1377,8 @@ def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> No for a in samples: enc.write(a) - reader = PyBundleReader(out) - assert reader.sample_count() == len(samples) + reader = PyBenDecoder(out) + assert reader.count_samples() == len(samples) extracted = tmp_path / "full.ben" reader.extract_stream(extracted) assert list(PyBenDecoder(extracted, mode="ben")) == samples @@ -1413,9 +1421,9 @@ def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: with pytest.raises(OSError, match="already been closed"): enc.write([1, 2, 3]) - reader = PyBundleReader(out) + reader = PyBenDecoder(out) assert reader.is_complete() - assert reader.sample_count() == 1 + assert reader.count_samples() == 1 def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: @@ -1891,7 +1899,7 @@ def test_pybendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: - # read_json_asset on the decoder should mirror PyBundleReader's + # read_json_asset on the decoder should reject non-UTF-8 the same as # error behavior when an asset isn't valid UTF-8. bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), @@ -2145,3 +2153,98 @@ def test_pybendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: assert next(it) == samples[1] # Any new pass (list / for / iter) rebuilds and starts over. assert list(dec) == samples + + +def test_pybendecoder_count_samples_after_subsample_preserves_len( + tmp_path: Path, +) -> None: + # After `subsample_*`, `len(dec)` must reflect the filtered count. + # Calling `count_samples()` reports the base (unfiltered) count but + # must not clobber the filtered `len(dec)` value. + samples = [[i] for i in range(1, 9)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "count_after_sub.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_range(2, 5) + assert len(dec) == 4 + assert dec.count_samples() == len(samples) + # The filtered length contract must survive a count_samples() call. + assert len(dec) == 4 + assert list(dec) == samples[1:5] + + +def test_pybendecoder_count_samples_plain_after_subsample_preserves_len( + tmp_path: Path, +) -> None: + # Same contract as above, but on a plain .ben stream to cover the + # non-bundle branch of `ensure_base_len`. + samples = [[i] for i in range(1, 11)] + ben_path = tmp_path / "plain_count.ben" + with PyBenEncoder( + ben_path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: + for a in samples: + enc.write(a) + dec = PyBenDecoder(ben_path) + dec.subsample_every(3, 1) + expected = samples[::3] + assert len(dec) == len(expected) + assert dec.count_samples() == len(samples) + assert len(dec) == len(expected) + assert list(dec) == expected + + +def test_pybendecoder_subsample_then_count_samples_then_reiterate( + tmp_path: Path, +) -> None: + # Composing subsample → count_samples → restart iteration must keep + # the filtered view intact across the restart. + samples = [[i, i + 1] for i in range(1, 9)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "sub_count_restart.bendl", bundle) + dec = PyBenDecoder(path) + dec.subsample_indices([1, 4, 8]) + assert dec.count_samples() == len(samples) + expected = [samples[0], samples[3], samples[7]] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_bundle_read_json_asset_missing_name_raises_keyerror( + tmp_path: Path, +) -> None: + # `read_json_asset` on a valid bundle that does not carry the named + # asset must surface a KeyError, matching `read_asset_bytes`. + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + ) + path = _write_bundle(tmp_path / "missing_json.bendl", bundle) + dec = PyBenDecoder(path) + with pytest.raises(KeyError, match="nope.json"): + dec.read_json_asset("nope.json") + + +def test_pybendecoder_bundle_len_uses_header_fast_path(tmp_path: Path) -> None: + # For a finalized bundle, `len(dec)` should use the O(1) header + # sample_count fast path rather than scanning the stream. We can't + # observe the scan directly, but we can verify the result matches + # the count declared in the header even when the stream is a real + # BEN payload. + samples = [[i] for i in range(1, 6)] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + ) + path = _write_bundle(tmp_path / "fast_len.bendl", bundle) + dec = PyBenDecoder(path) + assert len(dec) == len(samples) + # A second call returns the cached value and must agree. + assert len(dec) == len(samples) + assert dec.count_samples() == len(samples) diff --git a/pyben/tests/test_python_pipelines.py b/pyben/tests/test_python_pipelines.py index 68b98e6..f26959f 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/pyben/tests/test_python_pipelines.py @@ -1,3 +1,4 @@ +import io import json import random from pathlib import Path @@ -744,3 +745,763 @@ def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_inv out.write_text("exists\n", encoding="utf-8") with pytest.raises(OSError, match="already exists"): decompress_ben_to_jsonl(ben, out, overwrite=False) + + +# --------------------------------------------------------------------------- +# Bundle inspection via PyBenDecoder +# --------------------------------------------------------------------------- + + +def test_decoder_bundle_round_trip_all_methods(tmp_path: Path) -> None: + samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + graph = {"nodes": [{"id": 0}, {"id": 1}], "links": [{"source": 0, "target": 1}]} + path = tmp_path / "full.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert dec.is_bundle() + assert dec.is_complete() + assert dec.count_samples() == len(samples) + assert dec.assignment_format() == "ben" + v = dec.version() + assert isinstance(v, tuple) and len(v) == 2 + + names = dec.asset_names() + assert "graph.json" in names + + assets = dec.list_assets() + assert len(assets) >= 1 + for entry in assets: + assert "name" in entry + assert "type" in entry + assert "flags" in entry + + raw = dec.read_asset_bytes("graph.json") + assert isinstance(raw, bytes) + + parsed = dec.read_json_asset("graph.json") + assert parsed["nodes"] == graph["nodes"] + + g = dec.read_graph() + assert g is not None + assert g["nodes"] == graph["nodes"] + + assert dec.read_metadata() is None + assert dec.read_relabel_map() is None + + assert list(dec) == samples + + +def test_decoder_bundle_extract_stream_and_decode(tmp_path: Path) -> None: + samples = [[10, 20], [30, 40]] + path = tmp_path / "extract.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + out = tmp_path / "extracted.ben" + dec.extract_stream(out) + assert list(PyBenDecoder(out, mode="ben")) == samples + + +def test_decoder_bundle_extract_stream_overwrite_and_refuse(tmp_path: Path) -> None: + samples = [[1]] + path = tmp_path / "ow.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + enc.write(samples[0]) + + dec = PyBenDecoder(path) + out = tmp_path / "out.ben" + dec.extract_stream(out) + with pytest.raises(OSError, match="already exists"): + dec.extract_stream(out, overwrite=False) + dec.extract_stream(out, overwrite=True) + assert list(PyBenDecoder(out, mode="ben")) == samples + + +def test_decoder_bundle_missing_asset_raises_keyerror(tmp_path: Path) -> None: + path = tmp_path / "no_asset.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + enc.write([1, 2]) + + dec = PyBenDecoder(path) + with pytest.raises(KeyError, match="nope"): + dec.read_asset_bytes("nope") + with pytest.raises(KeyError, match="nope"): + dec.read_json_asset("nope") + + +# --------------------------------------------------------------------------- +# PyBenEncoder bundle-mode coverage +# --------------------------------------------------------------------------- + + +def test_pybenencoder_bundle_without_graph(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4]] + path = tmp_path / "no_graph.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert dec.is_bundle() + assert dec.assignment_format() == "ben" + assert dec.read_graph() is None + assert list(dec) == samples + + +def test_pybenencoder_bundle_graph_from_dict(tmp_path: Path) -> None: + graph = {"test": True} + path = tmp_path / "dict_graph.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + enc.write([1]) + dec = PyBenDecoder(path) + assert dec.read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: + graph = {"test": "bytes"} + path = tmp_path / "bytes_graph.bendl" + with PyBenEncoder( + path, overwrite=True, variant="standard", graph=json.dumps(graph).encode() + ) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: + graph = {"test": "bytearray"} + path = tmp_path / "ba_graph.bendl" + with PyBenEncoder( + path, + overwrite=True, + variant="standard", + graph=bytearray(json.dumps(graph).encode()), + ) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: + graph = {"test": "path"} + gpath = tmp_path / "g.json" + gpath.write_text(json.dumps(graph), encoding="utf-8") + path = tmp_path / "path_graph.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard", graph=gpath) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: + graph = {"test": "str_path"} + gpath = tmp_path / "g2.json" + gpath.write_text(json.dumps(graph), encoding="utf-8") + path = tmp_path / "str_path_graph.bendl" + with PyBenEncoder( + path, overwrite=True, variant="standard", graph=str(gpath) + ) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: + graph = {"test": "bytesio"} + path = tmp_path / "bio_graph.bendl" + with PyBenEncoder( + path, + overwrite=True, + variant="standard", + graph=io.BytesIO(json.dumps(graph).encode()), + ) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: + graph = {"test": "stringio"} + path = tmp_path / "sio_graph.bendl" + with PyBenEncoder( + path, + overwrite=True, + variant="standard", + graph=io.StringIO(json.dumps(graph)), + ) as enc: + enc.write([1]) + assert PyBenDecoder(path).read_graph() == graph + + +def test_pybenencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="graph.*cannot be combined"): + PyBenEncoder( + tmp_path / "bad.ben", + overwrite=True, + variant="standard", + graph={"a": 1}, + ben_file_only=True, + ) + + +def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="graph must be"): + PyBenEncoder( + tmp_path / "bad.bendl", + overwrite=True, + variant="standard", + graph=42, + ) + + +def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: + path = tmp_path / "idempotent.bendl" + enc = PyBenEncoder(path, overwrite=True, variant="standard") + enc.write([1, 2]) + enc.close() + enc.close() + assert list(PyBenDecoder(path)) == [[1, 2]] + + +def test_pybenencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: + path = tmp_path / "closed.bendl" + enc = PyBenEncoder(path, overwrite=True, variant="standard") + enc.write([1]) + enc.close() + with pytest.raises(OSError, match="already been closed"): + enc.write([2]) + + +# --------------------------------------------------------------------------- +# PyBenDecoder bundle-path coverage +# --------------------------------------------------------------------------- + + +def test_pybendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + path = tmp_path / "auto.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + dec = PyBenDecoder(path) + assert dec.is_bundle() + assert list(dec) == samples + + +def test_pybendecoder_bundle_toc_methods(tmp_path: Path) -> None: + graph = {"g": 1} + path = tmp_path / "toc.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + enc.write([1, 2, 3]) + + dec = PyBenDecoder(path) + assert dec.is_bundle() + assert dec.assignment_format() == "ben" + v = dec.version() + assert isinstance(v, tuple) and len(v) == 2 + assert dec.is_complete() + + names = dec.asset_names() + assert "graph.json" in names + + assets = dec.list_assets() + assert len(assets) >= 1 + for entry in assets: + assert "name" in entry + assert "type" in entry + assert "flags" in entry + + raw = dec.read_asset_bytes("graph.json") + assert isinstance(raw, bytes) + + parsed = dec.read_json_asset("graph.json") + assert parsed == graph + + assert dec.read_graph() == graph + assert dec.read_metadata() is None + assert dec.read_relabel_map() is None + + +def test_pybendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 11)] + path = tmp_path / "subsample.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_range(2, 5) + assert list(dec) == samples[1:5] + + dec2 = PyBenDecoder(path) + dec2.subsample_indices([1, 3, 10]) + assert list(dec2) == [samples[0], samples[2], samples[9]] + + dec3 = PyBenDecoder(path) + dec3.subsample_every(3, 2) + assert list(dec3) == [samples[1], samples[4], samples[7]] + + +def test_pybendecoder_bundle_len_and_count(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5]] + path = tmp_path / "len.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert len(dec) == len(samples) + assert dec.count_samples() == len(samples) + assert list(dec) == samples + + +def test_pybendecoder_bundle_iteration_restart(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4]] + path = tmp_path / "restart.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert list(dec) == samples + assert list(dec) == samples + + +def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 8)] + path = tmp_path / "re_sub.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_range(2, 5) + expected = samples[1:5] + assert list(dec) == expected + assert list(dec) == expected + + +def test_pybendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: + path = tmp_path / "plain.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + enc.write([1, 2]) + + dec = PyBenDecoder(path) + assert not dec.is_bundle() + assert dec.assignment_format() == "ben" + + for method, args in [ + ("version", ()), + ("is_complete", ()), + ("asset_names", ()), + ("list_assets", ()), + ("read_asset_bytes", ("x",)), + ("read_json_asset", ("x",)), + ("read_graph", ()), + ("read_metadata", ()), + ("read_relabel_map", ()), + ]: + with pytest.raises(Exception, match="only available on .bendl"): + getattr(dec, method)(*args) + + +def test_pybendecoder_bundle_count_samples_preserves_subsample_len( + tmp_path: Path, +) -> None: + samples = [[i] for i in range(1, 9)] + path = tmp_path / "count_sub.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_range(2, 5) + assert len(dec) == 4 + assert dec.count_samples() == len(samples) + assert len(dec) == 4 + + +# --------------------------------------------------------------------------- +# PyBenDecoder XBEN bundle coverage +# --------------------------------------------------------------------------- + + +def test_pybendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + xben_path = tmp_path / "samples.xben" + compress_jsonl_to_xben( + src, xben_path, overwrite=True, variant="standard", + n_threads=1, compression_level=1, + ) + + bendl_path = tmp_path / "xben_bundle.bendl" + with PyBenEncoder(bendl_path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(bendl_path) + assert dec.is_bundle() + assert list(dec) == samples + + +def test_pybendecoder_xben_plain_stream(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + xben_path = tmp_path / "plain.xben" + compress_jsonl_to_xben( + src, xben_path, overwrite=True, variant="standard", + n_threads=1, compression_level=1, + ) + + dec = PyBenDecoder(xben_path, mode="xben") + assert not dec.is_bundle() + assert dec.assignment_format() == "xben" + assert list(dec) == samples + + +# --------------------------------------------------------------------------- +# PyBenDecoder subsample validation errors +# --------------------------------------------------------------------------- + + +def test_pybendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "empty_idx.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_indices([]) + + +def test_pybendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "zero_idx.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_indices([0, 1, 2]) + + +def test_pybendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "zero_start.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_range(0, 2) + + +def test_pybendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "bad_range.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_range(5, 2) + + +def test_pybendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "zero_step.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_every(0) + + +def test_pybendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None: + samples = [[1], [2]] + path = tmp_path / "zero_off.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + with pytest.raises(Exception): + dec.subsample_every(1, offset=0) + + +# --------------------------------------------------------------------------- +# PyBenDecoder subsample on plain streams +# --------------------------------------------------------------------------- + + +def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5]] + path = tmp_path / "plain_sub.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_indices([1, 3, 5]) + assert list(dec) == [[1], [3], [5]] + + +def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5]] + path = tmp_path / "plain_range.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_range(2, 4) + assert list(dec) == [[2], [3], [4]] + + +def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5], [6]] + path = tmp_path / "plain_every.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_every(2, offset=1) + assert list(dec) == [[1], [3], [5]] + + +# --------------------------------------------------------------------------- +# PyBenDecoder len/count on plain streams +# --------------------------------------------------------------------------- + + +def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: + samples = [[1], [2], [3]] + path = tmp_path / "plain_len.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert dec.count_samples() == 3 + assert len(dec) == 3 + + +def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: + samples = [[1], [2], [3], [4], [5]] + path = tmp_path / "plain_sub_len.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_range(2, 4) + assert len(dec) == 3 + assert dec.count_samples() == 5 + assert len(dec) == 3 + + +# --------------------------------------------------------------------------- +# PyBenDecoder multiple iteration passes +# --------------------------------------------------------------------------- + + +def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4]] + path = tmp_path / "multi_iter.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + assert list(dec) == samples + assert list(dec) == samples + assert list(dec) == samples + + +def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 8)] + path = tmp_path / "plain_re_sub.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path) + dec.subsample_every(2, offset=1) + expected = [[1], [3], [5], [7]] + assert list(dec) == expected + assert list(dec) == expected + + +# --------------------------------------------------------------------------- +# PyBenEncoder ben_file_only mode coverage +# --------------------------------------------------------------------------- + + +def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: + samples = [[10, 20, 30], [40, 50, 60]] + path = tmp_path / "ben_only.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path, mode="ben") + assert not dec.is_bundle() + assert list(dec) == samples + + +def test_pybenencoder_ben_file_only_mkv(tmp_path: Path) -> None: + samples = [[1, 2], [1, 2], [3, 4]] + path = tmp_path / "ben_mkv.ben" + with PyBenEncoder(path, overwrite=True, variant="mkv_chain", ben_file_only=True) as enc: + for a in samples: + enc.write(a) + + dec = PyBenDecoder(path, mode="ben") + assert list(dec) == samples + + +def test_pybenencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: + samples = [[5, 6]] + path = tmp_path / "close_reopen.ben" + enc = PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) + enc.write(samples[0]) + enc.close() + + dec = PyBenDecoder(path, mode="ben") + assert list(dec) == samples + + +# --------------------------------------------------------------------------- +# PyBenEncoder bundle with metadata +# --------------------------------------------------------------------------- + + +def test_pybenencoder_bundle_with_metadata(tmp_path: Path) -> None: + samples = [[1, 2]] + graph = {"nodes": [{"id": 0}], "adjacency": [[]]} + path = tmp_path / "with_meta.bendl" + with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + enc.write(samples[0]) + + dec = PyBenDecoder(path) + assert dec.read_graph() == graph + assert list(dec) == samples + + +# --------------------------------------------------------------------------- +# PyBenDecoder extract_stream on plain stream raises +# --------------------------------------------------------------------------- + + +def test_pybendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: + path = tmp_path / "plain_extract.ben" + with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + enc.write([1, 2]) + + dec = PyBenDecoder(path, mode="ben") + with pytest.raises(Exception, match="only available on .bendl"): + dec.extract_stream(tmp_path / "out.ben") + + +# --------------------------------------------------------------------------- +# decompress_ben_to_jsonl and decompress_xben_to_jsonl coverage +# --------------------------------------------------------------------------- + + +def test_decompress_ben_to_jsonl_roundtrip(tmp_path: Path) -> None: + samples = [[1, 2, 3], [4, 5, 6]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + out = tmp_path / "round.jsonl" + decompress_ben_to_jsonl(ben, out, overwrite=True) + + restored = read_jsonl_assignments(out) + assert restored == samples + + +def test_decompress_xben_to_jsonl_roundtrip(tmp_path: Path) -> None: + samples = [[1, 2, 3], [4, 5, 6]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + xben = tmp_path / "out.xben" + compress_jsonl_to_xben( + src, xben, overwrite=True, variant="standard", + n_threads=1, compression_level=1, + ) + + out = tmp_path / "round.jsonl" + decompress_xben_to_jsonl(xben, out, overwrite=True) + + restored = read_jsonl_assignments(out) + assert restored == samples + + +# --------------------------------------------------------------------------- +# compress_ben_to_xben coverage +# --------------------------------------------------------------------------- + + +def test_compress_ben_to_xben_roundtrip(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + + ben = tmp_path / "out.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + + xben = tmp_path / "from_ben.xben" + compress_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + + out = tmp_path / "round.jsonl" + decompress_xben_to_jsonl(xben, out, overwrite=True) + + restored = read_jsonl_assignments(out) + assert restored == samples + + +# --------------------------------------------------------------------------- +# PyBenDecoder unknown mode error +# --------------------------------------------------------------------------- + + +def test_pybendecoder_unknown_mode_raises(tmp_path: Path) -> None: + path = tmp_path / "dummy.ben" + path.write_bytes(b"\x00" * 100) + with pytest.raises(Exception): + PyBenDecoder(path, mode="bogus") + + +# --------------------------------------------------------------------------- +# PyBenDecoder MkvChain plain stream +# --------------------------------------------------------------------------- + + +def test_pybendecoder_mkv_plain_stream(tmp_path: Path) -> None: + samples = [[1, 2], [1, 2], [3, 4]] + src = tmp_path / "mkv_src.jsonl" + write_jsonl(samples, src) + + ben = tmp_path / "mkv.ben" + compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + + dec = PyBenDecoder(ben, mode="ben") + assert list(dec) == samples + assert dec.count_samples() == 3 From 9ae12c2c395a76a71ada5ad53a0466b1bf966b59 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 20 Apr 2026 07:57:40 -0600 Subject: [PATCH 070/221] More testing --- Taskfile.yml | 4 +- ben/src/codec/decode/tests/mod.rs | 97 +++++ ben/src/codec/encode/tests.rs | 12 + ben/src/codec/translate/tests.rs | 44 +++ ben/src/io/bundle/format.rs | 62 +++ ben/src/io/reader/tests.rs | 444 +++++++++++++++++++++- ben/src/io/reader/twodelta.rs | 1 - ben/src/io/reader/xz_assignment_reader.rs | 96 +---- ben/src/io/writer/tests.rs | 185 +++++++-- ben/src/json/graph/tests/test_algos.rs | 52 +++ ben/src/ops/extract/tests.rs | 8 + 11 files changed, 902 insertions(+), 103 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 8c78cac..a9696bd 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -137,7 +137,7 @@ tasks: env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)bin/' + - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)(bin|cli)/' coverage-pyben: desc: Run Python-driven Rust coverage for pyben @@ -208,7 +208,7 @@ tasks: ben_report_file=/tmp/ben-coverage-report.txt; pyben_report_file=/tmp/pyben-coverage-report.txt; - cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)bin/'"'"' > "$ben_report_file"; + cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)(bin|cli)/'"'"' > "$ben_report_file"; ben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_report_file")"; cargo llvm-cov clean --workspace >/dev/null; diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index b834e42..7cb6dfd 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -224,3 +224,100 @@ fn decode_error_remaining_variants() { let io_err: io::Error = err.into(); assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); } + +#[test] +fn decode_xben_to_ben_twodelta_with_repeated_assignments() { + use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; + use crate::codec::encode::encode_jsonl_to_xben; + use crate::BenVariant; + use serde_json::Value; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,1,2,2],"sample":2} +{"assignment":[1,1,2,2],"sample":3} +{"assignment":[2,1,2,2],"sample":4} +"#; + let mut xben = Vec::new(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + BenVariant::TwoDelta, + Some(1), + Some(1), + None, + ) + .unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + + let mut jsonl_out = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut jsonl_out).unwrap(); + let output_str = String::from_utf8(jsonl_out).unwrap(); + let lines: Vec<&str> = output_str.trim().split('\n').collect(); + assert_eq!(lines.len(), 4); + let v1: Value = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(v1["assignment"], serde_json::json!([1, 1, 2, 2])); + let v4: Value = serde_json::from_str(lines[3]).unwrap(); + assert_eq!(v4["assignment"], serde_json::json!([2, 1, 2, 2])); +} + +#[test] +fn xz_decompress_roundtrip() { + use crate::codec::decode::xz_decompress; + use crate::codec::encode::xz_compress; + use std::io::BufReader; + + let original = b"hello world, this is a test of xz_decompress"; + let mut compressed = Vec::new(); + xz_compress(original.as_slice(), &mut compressed, Some(1), Some(1)).unwrap(); + + let mut decompressed = Vec::new(); + xz_decompress(BufReader::new(compressed.as_slice()), &mut decompressed).unwrap(); + assert_eq!(decompressed, original); +} + +#[test] +fn xz_compress_direct_test() { + use crate::codec::encode::xz_compress; + + let data = b"compress me please with xz"; + let mut out = Vec::new(); + xz_compress(data.as_slice(), &mut out, None, None).unwrap(); + assert!(!out.is_empty()); + + let mut decompressed = Vec::new(); + crate::codec::decode::xz_decompress( + std::io::BufReader::new(out.as_slice()), + &mut decompressed, + ) + .unwrap(); + assert_eq!(decompressed, data); +} + +#[test] +fn encode_ben_to_xben_rejects_invalid_banner() { + use crate::codec::encode::encode_ben_to_xben; + + let garbage = b"GARBAGE BANNER!!!extra_padding"; + let mut out = Vec::new(); + let err = encode_ben_to_xben(garbage.as_slice(), &mut out, Some(1), Some(1), None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn decode_xben_to_ben_rejects_invalid_banner() { + use crate::codec::decode::decode_xben_to_ben; + use crate::codec::encode::xz_compress; + use std::io::BufReader; + + let mut bad_data = b"GARBAGE BANNER!!!".to_vec(); + bad_data.extend_from_slice(&[0u8; 20]); + let mut xz = Vec::new(); + xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1)).unwrap(); + + let mut output = Vec::new(); + let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut output).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 20029d8..392fbfa 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1177,3 +1177,15 @@ fn encode_jsonl_to_xben_twodelta_roundtrip() { let v3: Value = serde_json::from_str(lines[2]).unwrap(); assert_eq!(v3["assignment"], serde_json::json!([2, 2, 1, 1])); } + +#[test] +fn twodelta_encode_outside_pair_change_errors() { + use super::twodelta::encode_twodelta_frame; + + // prev=[1,2,3,4], curr=[2,1,3,5] — positions 0,1 swap pair (1,2), + // but position 3 changes from 4→5 which is outside the pair. + let prev = vec![1u16, 2, 3, 4]; + let curr = vec![2u16, 1, 3, 5]; + let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index d766b06..259bcaa 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -234,6 +234,50 @@ fn test_random_translation_ben_to_ben32() { assert_eq!(writer, &buffer); } +#[test] +fn test_ben_to_ben32_lines_non_eof_error_on_frame_boundary() { + // Provide a valid BEN frame followed by a read that errors with a non-EOF + // error at exactly the point where the next frame's first byte would be read. + // This exercises the `return Err(e)` branch (line ~191) in the + // `read_exact → match → Err(e) → not UnexpectedEof` path. + struct FailOnSecondFrame { + data: Vec, + pos: usize, + frame_boundary: usize, + } + + impl Read for FailOnSecondFrame { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos >= self.frame_boundary { + return Err(io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke on boundary")); + } + let available = (self.frame_boundary - self.pos).min(buf.len()); + let end = self.pos + available; + buf[..available].copy_from_slice(&self.data[self.pos..end]); + self.pos = end; + Ok(available) + } + } + + // Build a valid BEN Standard stream (without banner) containing one frame. + let jsonl = r#"{"assignment":[1,2],"sample":1} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + let body = ben[17..].to_vec(); // strip banner + let boundary = body.len(); // error right after the first frame + + let reader = FailOnSecondFrame { + data: body, + pos: 0, + frame_boundary: boundary, + }; + + let mut output = Vec::new(); + let err = ben_to_ben32_lines(reader, &mut output, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + #[test] fn test_ben32_to_ben_line_rejects_invalid_length() { let err = ben32_to_ben_line(vec![1, 2, 3]).unwrap_err(); diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 950408f..e2f62d8 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -665,4 +665,66 @@ mod tests { assert_eq!(HEADER_SIZE, 64); assert_eq!(DIRECTORY_ENTRY_HEADER_SIZE, 28); } + + #[test] + fn directory_entry_name_too_long() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "x".repeat(u16::MAX as usize + 1), + payload_offset: 0, + payload_len: 0, + checksum: None, + }; + let err = entry.to_bytes().unwrap_err(); + assert!(matches!(err, BendlFormatError::NameTooLong { .. })); + assert!(err.to_string().contains("exceeds")); + } + + #[test] + fn directory_entry_name_not_utf8() { + let mut bytes = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "ok".to_string(), + payload_offset: 0, + payload_len: 0, + checksum: None, + } + .to_bytes() + .unwrap(); + + // Patch the name bytes to invalid UTF-8 (0xFF 0xFE) + let name_offset = DIRECTORY_ENTRY_HEADER_SIZE; + bytes[name_offset] = 0xFF; + bytes[name_offset + 1] = 0xFE; + + let mut cursor = &bytes[..]; + let err = BendlDirectoryEntry::read_from(&mut cursor).unwrap_err(); + assert!(matches!(err, BendlFormatError::NameNotUtf8)); + assert!(err.to_string().contains("UTF-8")); + } + + #[test] + fn header_read_from_truncated() { + let short = [0u8; 10]; + let err = BendlHeader::read_from(&mut &short[..]).unwrap_err(); + assert!(matches!(err, BendlFormatError::Io(_))); + } + + #[test] + fn bendl_format_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let fmt_err = BendlFormatError::Io(inner); + let io_err: io::Error = fmt_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); + } + + #[test] + fn trailing_directory_bytes_error_display() { + let err = BendlFormatError::TrailingDirectoryBytes { remaining: 42 }; + assert!(err.to_string().contains("42")); + assert!(err.to_string().contains("trailing")); + } } diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index df5ee88..4d3908e 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -3,7 +3,7 @@ use crate::io::reader::errors::DecoderInitError; use crate::io::reader::{XZAssignmentFrameReader, XZAssignmentReader}; use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; -use std::io::Cursor; +use std::io::{Cursor, Write}; use xz2::write::XzEncoder; /// Build a minimal XBEN stream from JSONL input for testing. @@ -1071,3 +1071,445 @@ fn assignment_reader_write_all_jsonl() { let v2: serde_json::Value = serde_json::from_str(lines[1]).unwrap(); assert_eq!(v2["assignment"], serde_json::json!([30, 40])); } + +// ── Zero-count frame errors in XZAssignmentReader ────────────────────────── + +#[test] +fn xz_reader_standard_zero_count_frame_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + // Write banner + encoder + .write_all(b"STANDARD BEN FILE") + .unwrap(); + // Write a ben32 frame: one RLE pair (value=1, count=3) + zero terminator + let frame: &[u8] = &[ + 0, 1, 0, 3, // (value=1, count=3) + 0, 0, 0, 0, // zero terminator + ]; + encoder.write_all(frame).unwrap(); + encoder.finish().unwrap(); + } + + // Manually patch: for Standard, there's no count field after the + // terminator. Zero-count only fires for MkvChain where the count is explicit. + // So test MkvChain zero-count instead. + let mut xben_mkv = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben_mkv, 1); + encoder + .write_all(b"MKVCHAIN BEN FILE") + .unwrap(); + let frame: &[u8] = &[ + 0, 1, 0, 3, // (value=1, count=3) + 0, 0, 0, 0, // zero terminator + 0, 0, // count = 0 <-- triggers zero_count_frame_error + ]; + encoder.write_all(frame).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben_mkv)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!(err.to_string().contains("zero")); +} + +#[test] +fn xz_reader_twodelta_unknown_frame_tag_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder + .write_all(b"TWODELTA BEN FILE") + .unwrap(); + // Write a byte with unknown tag (0xFF) + encoder.write_all(&[0xFF]).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!(err.to_string().contains("0xff") || err.to_string().contains("unknown")); +} + +#[test] +fn xz_reader_truncated_stream_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder + .write_all(b"STANDARD BEN FILE") + .unwrap(); + // Write a partial ben32 frame (no zero terminator) + encoder.write_all(&[0, 1, 0, 3]).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!(err.to_string().contains("truncated") || err.to_string().contains("Truncated")); +} + +// ── Subsample Every branch: first > hi ───────────────────────────────────── + +#[test] +fn subsample_every_first_past_hi() { + // 4 samples, step=10, offset=5: first selected = 5, but only 4 samples + // exist → the `first > hi` branch fires for every frame. + let jsonl = concat!( + "{\"assignment\":[1,2],\"sample\":1}\n", + "{\"assignment\":[3,4],\"sample\":2}\n", + "{\"assignment\":[5,6],\"sample\":3}\n", + "{\"assignment\":[7,8],\"sample\":4}\n", + ); + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let sub = reader.into_subsample_every(10, 5); + let results: Vec<_> = sub.map(|r| r.unwrap()).collect(); + assert!(results.is_empty()); +} + +// ── MkvChain extract with count>1 mid-block sample ───────────────────────── + +#[test] +fn extract_assignment_ben_mkv_mid_block() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::ops::extract::extract_assignment_ben; + + let jsonl = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + "{\"assignment\":[4,5,6],\"sample\":4}\n", + ); + + let mut ben = Vec::new(); + encode_jsonl_to_ben( + jsonl.as_bytes(), + std::io::BufWriter::new(&mut ben), + BenVariant::MkvChain, + ) + .unwrap(); + + // Sample 2 is in the middle of the first MkvChain block (count=3) + let result = extract_assignment_ben(ben.as_slice(), 2).unwrap(); + assert_eq!(result, vec![1, 2, 3]); +} + +#[test] +fn xz_reader_twodelta_full_frame_zero_count_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + + // Full frame with count=0 + encoder.write_all(&[0u8]).unwrap(); // tag=0 + encoder.write_all(&1u32.to_be_bytes()).unwrap(); // 1 run + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // value=1 + encoder.write_all(&2u16.to_be_bytes()).unwrap(); // len=2 + encoder.write_all(&0u16.to_be_bytes()).unwrap(); // count=0 + + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!(err.to_string().contains("zero")); +} + +#[test] +fn xz_reader_twodelta_chunk_zero_count_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + + // Full frame (tag=0): anchor [1,2] + encoder.write_all(&[0u8]).unwrap(); + encoder.write_all(&2u32.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); + encoder.write_all(&2u16.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // count=1 + + // Chunk (tag=2) with 1 frame, count=0 + encoder.write_all(&[2u8]).unwrap(); // tag=2 + encoder.write_all(&1u32.to_be_bytes()).unwrap(); // n_frames=1 + // Pair channel: (2,1) + encoder.write_all(&2u16.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); + // Count channel: 0 + encoder.write_all(&0u16.to_be_bytes()).unwrap(); + // Run-count channel: 1 run + encoder.write_all(&1u32.to_be_bytes()).unwrap(); + // Run-length data: 2 + encoder.write_all(&2u16.to_be_bytes()).unwrap(); + + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.collect(); + assert_eq!(results.len(), 2); // anchor + chunk frame + assert!(results[0].is_ok()); + assert!(results[1].as_ref().unwrap_err().to_string().contains("zero")); +} + +// ── Subsample with indices that skip past frame boundaries ────────── + +#[test] +fn subsample_indices_skip_past_lo() { + // MkvChain stream where first frame has count=5 but we only want indices [7,8]. + // This forces the Indices selection to skip past `lo` (line 160-161 in subsample.rs). + let jsonl = concat!( + "{\"assignment\":[1,2,3],\"sample\":1}\n", + "{\"assignment\":[1,2,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + "{\"assignment\":[1,2,3],\"sample\":4}\n", + "{\"assignment\":[1,2,3],\"sample\":5}\n", + "{\"assignment\":[4,5,6],\"sample\":6}\n", + "{\"assignment\":[4,5,6],\"sample\":7}\n", + "{\"assignment\":[4,5,6],\"sample\":8}\n", + ); + let xben = make_xben(jsonl, BenVariant::MkvChain); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader + .into_subsample_by_indices(vec![7, 8]) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(results.len(), 1); // one frame covering both + assert_eq!(results[0].0, vec![4, 5, 6]); + assert_eq!(results[0].1, 2); +} + +// ── Subsample indices with zero (below 1-based lo) ────────────────── + +#[test] +fn subsample_indices_with_zero_skips_past_lo() { + let assignments = vec![vec![1u16, 2], vec![3, 4], vec![5, 6]]; + let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + // Index 0 is below the 1-based lo boundary, exercises the `next < lo` skip. + let results: Vec<_> = reader + .into_subsample_by_indices(vec![0, 2]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], vec![3, 4]); +} + +// ── XZAssignmentFrameReader for MkvChain zero-count ───────────────── + +#[test] +fn xz_frame_reader_mkv_zero_count_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); + let frame: &[u8] = &[ + 0, 1, 0, 3, // (value=1, count=3) + 0, 0, 0, 0, // zero terminator + 0, 0, // count = 0 + ]; + encoder.write_all(frame).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!(err.to_string().contains("zero")); +} + +// ── XZAssignmentReader TwoDelta truncated stream ───────────────────── + +#[test] +fn xz_reader_twodelta_truncated_stream_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + // Write a full tag + partial run count (not enough bytes for a complete frame) + encoder.write_all(&[0u8, 0, 0]).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!( + err.to_string().contains("truncated") || err.to_string().contains("Truncated"), + "got: {}", + err + ); +} + +// ── Legacy TwoDelta delta without anchor (NoAnchorFrame) ──────────── + +#[test] +fn xz_reader_twodelta_tag1_rejected_as_unknown() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + + // Full frame (tag=0) anchor so the stream is valid up to this point. + encoder.write_all(&[0u8]).unwrap(); + encoder.write_all(&1u32.to_be_bytes()).unwrap(); // 1 run + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // value=1 + encoder.write_all(&2u16.to_be_bytes()).unwrap(); // len=2 + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // count=1 + + // Tag 1 (removed legacy delta) should now be rejected as unknown. + encoder.write_all(&[1u8]).unwrap(); + // Enough trailing bytes so the reader can attempt to parse. + encoder.write_all(&[0u8; 20]).unwrap(); + + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut iter = reader.into_iter(); + let _first = iter.next().unwrap().unwrap(); // consume the valid full frame + let err = iter.next().unwrap().unwrap_err(); + assert!( + err.to_string().to_lowercase().contains("unknown") + || err.to_string().contains("tag"), + "expected unknown-tag error, got: {}", + err + ); +} + +// ── Chunk delta without anchor (NoAnchorFrame via chunk queue) ─────── + +#[test] +fn xz_reader_twodelta_chunk_delta_without_anchor_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + + // Write a chunk (tag=2) with 1 delta frame but no preceding full frame. + encoder.write_all(&[2u8]).unwrap(); // tag=2 + encoder.write_all(&1u32.to_be_bytes()).unwrap(); // n_frames=1 + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // pair.0=1 + encoder.write_all(&2u16.to_be_bytes()).unwrap(); // pair.1=2 + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // count=1 + encoder.write_all(&1u32.to_be_bytes()).unwrap(); // 1 run + encoder.write_all(&2u16.to_be_bytes()).unwrap(); // rl=2 + + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!( + err.to_string().contains("full-assignment") || err.to_string().contains("anchor"), + "got: {}", + err + ); +} + +// ── for_each_assignment with stream error ──────────────────────────── + +#[test] +fn xz_reader_for_each_assignment_stream_error() { + use xz2::write::XzEncoder; + + // Create a valid TwoDelta stream that ends with truncated data + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); + + // Valid full frame + encoder.write_all(&[0u8]).unwrap(); + encoder.write_all(&1u32.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); + encoder.write_all(&2u16.to_be_bytes()).unwrap(); + encoder.write_all(&1u16.to_be_bytes()).unwrap(); // count=1 + + // Truncated second frame + encoder.write_all(&[0u8, 0]).unwrap(); + encoder.finish().unwrap(); + } + + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut count = 0usize; + let result = reader.for_each_assignment(|_assignment, _cnt| { + count += 1; + Ok(true) + }); + // Should get the first assignment but error on the truncated second frame + assert!(count >= 1); + assert!(result.is_err()); +} + +// ── XZAssignmentFrameReader truncated TwoDelta ────────────────────── + +#[test] +fn xz_frame_reader_twodelta_truncated_errors() { + use xz2::write::XzEncoder; + + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"STANDARD BEN FILE").unwrap(); + // Partial ben32 frame — no zero terminator, triggers truncated error + encoder.write_all(&[0, 1, 0, 3]).unwrap(); + encoder.finish().unwrap(); + } + + let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let err = reader.into_iter().next().unwrap().unwrap_err(); + assert!( + err.to_string().contains("truncated") || err.to_string().contains("Truncated"), + "got: {}", + err + ); +} + +// ── Standard/MkvChain frame decode error ───────────────────────────── + +#[test] +fn xz_reader_standard_corrupt_frame_errors() { + use xz2::write::XzEncoder; + + // Write a valid-looking ben32 frame structure but with corrupted content + // that decode_xben_frame_to_assignment can't parse + let mut xben = Vec::new(); + { + let mut encoder = XzEncoder::new(&mut xben, 1); + encoder.write_all(b"STANDARD BEN FILE").unwrap(); + // Write 4 bytes followed by zero terminator — the frame decodes to + // a single run (value=255, count=255). This should actually be valid. + // Instead, write a completely empty frame (just the zero terminator). + encoder.write_all(&[0, 0, 0, 0]).unwrap(); // just zero terminator (no runs) + encoder.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.collect(); + // An empty frame (no RLE pairs before terminator) yields an empty assignment + assert_eq!(results.len(), 1); + assert_eq!(results[0].as_ref().unwrap().0, Vec::::new()); +} diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs index 2fb31b6..0ff4406 100644 --- a/ben/src/io/reader/twodelta.rs +++ b/ben/src/io/reader/twodelta.rs @@ -1,5 +1,4 @@ pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; -pub(super) const XBEN_TWODELTA_DELTA_TAG: u8 = 1; pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; pub(super) enum XBenTwoDeltaFrame { diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs index b360a35..9eff90c 100644 --- a/ben/src/io/reader/xz_assignment_reader.rs +++ b/ben/src/io/reader/xz_assignment_reader.rs @@ -1,8 +1,6 @@ use super::errors::DecoderInitError; use super::subsample::{Ben32Frame, DecodeFrame, MkvRecord, SubsampleFrameDecoder}; -use super::twodelta::{ - XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_DELTA_TAG, XBEN_TWODELTA_FULL_TAG, -}; +use super::twodelta::{XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; use crate::format::banners::{variant_from_banner, BANNER_LEN}; @@ -191,41 +189,6 @@ impl XZAssignmentReader { let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) } - XBEN_TWODELTA_DELTA_TAG => { - if overflow.len() < 11 { - return None; - } - let pair = ( - u16::from_be_bytes([overflow[1], overflow[2]]), - u16::from_be_bytes([overflow[3], overflow[4]]), - ); - let run_count = - u32::from_be_bytes([overflow[5], overflow[6], overflow[7], overflow[8]]) - as usize; - let payload_len = run_count.checked_mul(2)?; - let total_len = 1usize - .checked_add(2)? - .checked_add(2)? - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; - if overflow.len() < total_len { - return None; - } - - let mut run_lengths = Vec::with_capacity(run_count); - let mut cursor = 9usize; - for _ in 0..run_count { - run_lengths.push(u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]])); - cursor += 2; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok(( - XBenTwoDeltaFrame::Delta { pair, run_lengths }, - total_len, - count, - ))) - } XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { tag, @@ -493,22 +456,19 @@ impl Iterator for XZAssignmentReader { } BenVariant::TwoDelta => { // Drain frames from a previously parsed chunk first. + // Chunks only contain Delta frames. if let Some((frame, count)) = self.chunk_queue.pop_front() { if count == 0 { return Some(Err(zero_count_frame_error())); } - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(prev) => { - apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) - } - None => { - Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)) - } - } + let XBenTwoDeltaFrame::Delta { pair, run_lengths } = frame else { + unreachable!("chunk queue only contains Delta frames"); + }; + let assignment = match self.previous_assignment.take() { + Some(prev) => { + apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) } + None => Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)), }; return Some(match assignment { Ok(a) => { @@ -527,7 +487,8 @@ impl Iterator for XZAssignmentReader { } } - // Try a single legacy frame (tag 0 or 1). + // Try a single frame from overflow (only Full/tag-0 frames + // or errors — tag-1 is no longer supported). if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { let res = match parsed { Ok((frame, consumed, count)) => { @@ -535,34 +496,15 @@ impl Iterator for XZAssignmentReader { self.overflow.drain(..consumed); return Some(Err(zero_count_frame_error())); } - let assignment = match frame { - XBenTwoDeltaFrame::Full { runs } => Ok(rle_to_vec(runs)), - XBenTwoDeltaFrame::Delta { pair, run_lengths } => { - match self.previous_assignment.take() { - Some(previous_assignment) => { - apply_twodelta_runs_to_assignment( - previous_assignment, - pair, - &run_lengths, - ) - } - None => Err(io::Error::from( - DecodeError::TwoDeltaNoAnchorFrame, - )), - } - } + let XBenTwoDeltaFrame::Full { runs } = frame else { + unreachable!( + "pop_twodelta_frame_from_overflow only returns Full frames" + ); }; - match assignment { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - self.overflow.drain(..consumed); - Ok((assignment, count)) - } - Err(err) => { - self.overflow.drain(..consumed); - Err(err) - } - } + let assignment = rle_to_vec(runs); + self.previous_assignment = Some(assignment.clone()); + self.overflow.drain(..consumed); + Ok((assignment, count)) } Err(err) => { self.overflow.clear(); diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 9b76cd9..abc4b00 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -35,13 +35,19 @@ fn roundtrip_xben_counts(assignments: &[Vec], variant: BenVariant) -> Vec<( #[test] fn writer_standard_basic_roundtrip() { let assignments = vec![vec![1u16, 2, 3], vec![4, 5, 6]]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::Standard), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::Standard), + assignments + ); } #[test] fn writer_standard_single_element_assignments() { let assignments = vec![vec![42u16], vec![99]]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::Standard), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::Standard), + assignments + ); } // ── MkvChain variant roundtrips ─────────────────────────────────────── @@ -60,13 +66,19 @@ fn writer_mkv_deduplication() { #[test] fn writer_twodelta_basic_roundtrip() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::TwoDelta), + assignments + ); } #[test] fn writer_twodelta_anchor_only() { let assignments = vec![vec![1u16, 2, 3, 4]]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::TwoDelta), + assignments + ); } #[test] @@ -85,12 +97,7 @@ fn writer_twodelta_repeated_anchor() { fn writer_twodelta_repeated_delta() { let anchor = vec![1u16, 1, 2, 2]; let delta = vec![2u16, 1, 2, 2]; - let assignments = vec![ - anchor.clone(), - delta.clone(), - delta.clone(), - delta.clone(), - ]; + let assignments = vec![anchor.clone(), delta.clone(), delta.clone(), delta.clone()]; let results = roundtrip_xben_counts(&assignments, BenVariant::TwoDelta); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, 4); @@ -103,7 +110,13 @@ fn writer_twodelta_chunk_size_1() { let anchor = vec![1u16, 1, 2, 2]; let delta = vec![2u16, 2, 1, 1]; let assignments: Vec<_> = (0..10) - .map(|i| if i % 2 == 0 { anchor.clone() } else { delta.clone() }) + .map(|i| { + if i % 2 == 0 { + anchor.clone() + } else { + delta.clone() + } + }) .collect(); let mut xben = Vec::new(); @@ -168,6 +181,31 @@ fn writer_twodelta_u16_max_value_in_assignment() { } } +// ── BEN AssignmentWriter TwoDelta repeat frame ────────────────────── + +#[test] +fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { + use crate::io::reader::AssignmentReader; + use crate::io::writer::AssignmentWriter; + + // Assignment with 3 distinct values exercises the `continue` skip path + // inside `twodelta_repeat_frame` for values outside the picked pair. + let assign = vec![1u16, 2, 3, 1, 2]; + let n = u16::MAX as usize + 2; // 65537: triggers overflow → repeat frame + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for _ in 0..n { + w.write_assignment(assign.clone()).unwrap(); + } + } + + let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); + assert_eq!(total, n); +} + // ── TwoDelta write_json_value ───────────────────────────────────────── #[test] @@ -198,9 +236,7 @@ fn writer_finish_is_idempotent() { { let encoder = XzEncoder::new(&mut xben, 1); let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); - writer - .write_assignment(vec![1u16, 2, 3, 4]) - .unwrap(); + writer.write_assignment(vec![1u16, 2, 3, 4]).unwrap(); writer.finish().unwrap(); writer.finish().unwrap(); } @@ -226,7 +262,9 @@ fn writer_write_ben_file_standard_roundtrip() { { let encoder = XzEncoder::new(&mut xben, 1); let mut writer = XZAssignmentWriter::new(encoder, BenVariant::Standard).unwrap(); - writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer + .write_ben_file(BufReader::new(ben.as_slice())) + .unwrap(); writer.finish().unwrap(); } @@ -251,7 +289,9 @@ fn writer_write_ben_file_mkv_roundtrip() { { let encoder = XzEncoder::new(&mut xben, 1); let mut writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain).unwrap(); - writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer + .write_ben_file(BufReader::new(ben.as_slice())) + .unwrap(); writer.finish().unwrap(); } @@ -266,11 +306,7 @@ fn writer_write_ben_file_twodelta_roundtrip() { use crate::io::writer::AssignmentWriter; use std::io::BufReader; - let assignments = vec![ - vec![1u16, 2, 1, 2], - vec![1, 1, 2, 2], - vec![2, 1, 2, 1], - ]; + let assignments = vec![vec![1u16, 2, 1, 2], vec![1, 1, 2, 2], vec![2, 1, 2, 1]]; let mut ben = Vec::new(); { @@ -284,7 +320,9 @@ fn writer_write_ben_file_twodelta_roundtrip() { { let encoder = XzEncoder::new(&mut xben, 1); let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); - writer.write_ben_file(BufReader::new(ben.as_slice())).unwrap(); + writer + .write_ben_file(BufReader::new(ben.as_slice())) + .unwrap(); writer.finish().unwrap(); } @@ -330,3 +368,106 @@ fn writer_twodelta_stress_many_unique_deltas() { let results = roundtrip_xben(&assignments, BenVariant::TwoDelta); assert_eq!(results, assignments); } + +// ── TwoDelta u16::MAX count overflow paths ─────────────────────────── + +#[test] +fn writer_twodelta_anchor_count_overflow_u16max() { + // Use 3 distinct values to exercise the `continue` skip in + // twodelta_repeat_buffered_frame for values outside the picked pair. + let assign = vec![1u16, 2, 3, 1, 2]; + let n = u16::MAX as usize + 2; // 65537 — triggers the overflow branch + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + for _ in 0..n { + writer.write_assignment(assign.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); + assert_eq!(total, n); +} + +#[test] +fn writer_twodelta_delta_count_overflow_u16max() { + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![2u16, 1, 2, 2]; + let n_delta = u16::MAX as usize + 1; // 65536 identical deltas + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(n_delta + 1); + writer.write_assignment(anchor.clone()).unwrap(); + for _ in 0..n_delta { + writer.write_assignment(delta.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, n_delta + 1); +} + +// ── TwoDelta translate via write_ben_file with chunk flush ─────────── + +#[test] +fn writer_translate_ben_twodelta_chunk_flush() { + use crate::io::writer::AssignmentWriter; + use std::io::BufReader; + + let a = vec![1u16, 1, 2, 2]; + let b = vec![2u16, 2, 1, 1]; + let assignments: Vec<_> = (0..30) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); + + let mut ben = Vec::new(); + { + let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + w.write_assignment(a.clone()).unwrap(); + } + } + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) + .unwrap() + .with_chunk_size(5); + writer + .write_ben_file(BufReader::new(ben.as_slice())) + .unwrap(); + writer.finish().unwrap(); + } + + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, assignments); +} + +// ── MkvChain u16::MAX overflow ─────────────────────────────────────── + +#[test] +fn writer_mkv_count_overflow_u16max() { + let assign = vec![1u16, 2, 3]; + let n = u16::MAX as usize + 2; // overflow + + let mut xben = Vec::new(); + { + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain).unwrap(); + for _ in 0..n { + writer.write_assignment(assign.clone()).unwrap(); + } + } + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); + assert_eq!(total, n); +} diff --git a/ben/src/json/graph/tests/test_algos.rs b/ben/src/json/graph/tests/test_algos.rs index 544ac88..01a0f30 100644 --- a/ben/src/json/graph/tests/test_algos.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -445,3 +445,55 @@ fn test_sort_by_key_directed_graph() { assert_eq!(mapping[&2], 1); assert_eq!(mapping[&0], 2); } + +#[test] +fn test_sort_json_file_by_key_id() { + let input = r#"{ + "nodes": [ + {"id": 2}, + {"id": 0}, + {"id": 1} + ], + "adjacency": [ + [{"id": 0}], + [{"id": 1}], + [{"id": 2}] + ] + }"#; + + let mut output = Vec::new(); + let mapping = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(output_json["nodes"][0]["id"], 0); + assert_eq!(output_json["nodes"][1]["id"], 1); + assert_eq!(output_json["nodes"][2]["id"], 2); + assert_eq!(mapping[&0], 0); + assert_eq!(mapping[&1], 1); + assert_eq!(mapping[&2], 2); +} + +#[test] +fn test_sort_json_file_by_key_mixed_numeric_and_string() { + let input = r#"{ + "nodes": [ + {"id": 0, "key": 42}, + {"id": 1, "key": "alpha"}, + {"id": 2, "key": 7} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 2}], + [{"id": 0}] + ] + }"#; + + let mut output = Vec::new(); + sort_json_file_by_key(input.as_bytes(), &mut output, "key").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + // Numeric values come first (7 < 42), then string "alpha" + assert_eq!(output_json["nodes"][0]["key"], 7); + assert_eq!(output_json["nodes"][1]["key"], 42); + assert_eq!(output_json["nodes"][2]["key"], "alpha"); +} diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 89ce057..46969ec 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -179,3 +179,11 @@ fn test_sample_error_conversion_and_sources() { assert!(sample_err.to_string().starts_with("JSON Error: ")); assert!(sample_err.source().is_some()); } + +#[test] +fn test_sample_error_new_io_error() { + let io_err = io::Error::new(io::ErrorKind::NotFound, "file gone"); + let sample_err = SampleError::new_io_error(io_err); + assert!(matches!(sample_err, SampleError::IoError(_))); + assert_eq!(sample_err.to_string(), "IO Error: file gone"); +} From a43cf519560ae2a4b7fdbe8ee47b8011e34d5bff Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 11:24:30 -0600 Subject: [PATCH 071/221] Get coverage to 98% on rust side --- ben/src/cli/ben.rs | 209 ++++++++++++++++ ben/src/cli/bendl.rs | 284 ++++++++++++++++++++++ ben/src/cli/pben.rs | 21 ++ ben/src/codec/decode/jsonl.rs | 33 ++- ben/src/codec/decode/xz.rs | 33 ++- ben/src/codec/encode/tests.rs | 139 +++++++++++ ben/src/codec/frames/tests.rs | 35 ++- ben/src/io/bundle/format.rs | 38 +-- ben/src/io/bundle/reader.rs | 73 ++++-- ben/src/io/bundle/writer.rs | 253 +++++++++++++++---- ben/src/io/reader/assignment_reader.rs | 27 +- ben/src/io/reader/tests.rs | 152 +++++++++++- ben/src/io/reader/twodelta.rs | 10 - ben/src/io/reader/xz_assignment_reader.rs | 162 +++++------- ben/src/io/writer/assignment_writer.rs | 16 ++ ben/src/io/writer/mod.rs | 1 + ben/src/io/writer/tests.rs | 17 +- ben/src/io/writer/xz_assignment_writer.rs | 72 ++++-- ben/src/json/graph/tests/test_algos.rs | 64 +++++ ben/src/lib.rs | 3 + ben/src/ops/extract/mod.rs | 9 +- ben/src/ops/relabel/tests.rs | 104 ++++++++ 22 files changed, 1479 insertions(+), 276 deletions(-) diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index 7048062..aa50bb7 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -1106,4 +1106,213 @@ mod tests { assert_eq!(fs::read_to_string(&path).unwrap(), "derived"); fs::remove_file(path).unwrap(); } + + #[test] + fn resolve_variant_standard_arm() { + assert_eq!( + resolve_variant(Some(CliVariant::Standard), false), + BenVariant::Standard + ); + } + + #[test] + fn count_jsonl_lines_counts_nonempty_lines() { + let path = unique_path("count.jsonl"); + fs::write(&path, b"{\"a\":1}\n\n{\"b\":2}\n").unwrap(); + let count = count_jsonl_lines(&path).unwrap(); + assert_eq!(count, 2); + fs::remove_file(path).unwrap(); + } + + /// Write a two-sample Standard BEN JSONL file to a temp path. + fn write_temp_jsonl(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + fs::write( + &path, + b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n", + ) + .unwrap(); + path + } + + /// Write a minimal graph JSON file to a temp path. + fn write_temp_graph(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + fs::write(&path, b"{\"nodes\":[0,1,2],\"adj\":[[1],[0,2],[1]]}").unwrap(); + path + } + + #[test] + fn append_graph_asset_adds_graph_to_bundle() { + use crate::io::bundle::{AddAssetOptions, BendlReader, BendlWriter}; + use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH}; + use std::io::Cursor; + + // Build a minimal finalized .bendl in memory, write to temp file. + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("append_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + let graph_path = write_temp_graph("append_graph.json"); + + append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap(); + + // Verify the graph asset was added. + let file = fs::File::open(&bendl_path).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&bendl_path).unwrap(); + fs::remove_file(&graph_path).unwrap(); + } + + #[test] + fn run_encode_bundle_with_graph_creates_bendl() { + use crate::io::bundle::BendlReader; + + let jsonl = write_temp_jsonl("enc_graph_input.jsonl"); + let graph = write_temp_graph("enc_graph.json"); + let out = unique_path("enc_graph_output.bendl"); + + run_encode_bundle_with_graph(&jsonl, out.to_str().unwrap(), BenVariant::Standard, &graph) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + assert_eq!(reader.sample_count(), Some(2)); + + fs::remove_file(&jsonl).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); + } + + #[test] + fn run_xencode_bundle_with_graph_from_jsonl_creates_bendl() { + use crate::io::bundle::BendlReader; + + let jsonl = write_temp_jsonl("xencode_graph_input.jsonl"); + let graph = write_temp_graph("xencode_graph.json"); + let out = unique_path("xencode_graph_output.bendl"); + + run_xencode_bundle_with_graph( + &jsonl, + out.to_str().unwrap(), + BenVariant::Standard, + false, + None, + None, + None, + &graph, + ) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&jsonl).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); + } + + #[test] + fn run_xencode_bundle_with_graph_from_ben_creates_bendl() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::bundle::BendlReader; + use std::io::Cursor; + + // First create a BEN file from JSONL. + let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n{\"assignment\":[2,1],\"sample\":2}\n"; + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben_bytes, BenVariant::Standard).unwrap(); + let ben_path = unique_path("xencode_from_ben_input.ben"); + fs::write(&ben_path, &ben_bytes).unwrap(); + + let graph = write_temp_graph("xencode_from_ben_graph.json"); + let out = unique_path("xencode_from_ben_output.bendl"); + + run_xencode_bundle_with_graph( + &ben_path, + out.to_str().unwrap(), + BenVariant::Standard, + true, + None, + None, + None, + &graph, + ) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&ben_path).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); + } + + #[test] + fn append_graph_asset_errors_on_missing_graph_file() { + use crate::io::bundle::{BendlWriter}; + use crate::io::bundle::format::AssignmentFormat; + use std::io::Cursor; + + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("err_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + let nonexistent = unique_path("nonexistent.json"); + let err = append_graph_asset(bendl_path.to_str().unwrap(), &nonexistent).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to read graph")); + let _ = fs::remove_file(&bendl_path); + } + + #[test] + fn run_encode_bundle_with_graph_errors_on_missing_graph() { + let jsonl = write_temp_jsonl("err_enc_input.jsonl"); + let out = unique_path("err_enc_output.bendl"); + let nonexistent = unique_path("nonexistent.json"); + + let err = run_encode_bundle_with_graph( + &jsonl, out.to_str().unwrap(), BenVariant::Standard, &nonexistent, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to stat graph")); + let _ = fs::remove_file(&jsonl); + let _ = fs::remove_file(&out); + } + + #[test] + fn run_xencode_bundle_with_graph_errors_on_missing_graph() { + let jsonl = write_temp_jsonl("err_xenc_input.jsonl"); + let out = unique_path("err_xenc_output.bendl"); + let nonexistent = unique_path("nonexistent.json"); + + let err = run_xencode_bundle_with_graph( + &jsonl, out.to_str().unwrap(), BenVariant::Standard, false, + None, None, None, &nonexistent, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to stat graph")); + let _ = fs::remove_file(&jsonl); + let _ = fs::remove_file(&out); + } } diff --git a/ben/src/cli/bendl.rs b/ben/src/cli/bendl.rs index b574ac2..df455f5 100644 --- a/ben/src/cli/bendl.rs +++ b/ben/src/cli/bendl.rs @@ -467,3 +467,287 @@ fn append_file_asset PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("bendl-cli-{name}-{nonce}")) + } + + /// Write a minimal finalized .bendl file and return its path. + fn write_temp_bendl(name: &str, format: AssignmentFormat) -> PathBuf { + let path = unique_path(name); + let stream = b"STANDARD BEN FILE\x00fake"; + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); + writer.write_stream_bytes(stream, 1).unwrap(); + writer.finish().unwrap(); + std::fs::write(&path, &buf).unwrap(); + path + } + + #[test] + fn write_temp_bendl_xben_variant_works() { + // Exercises the Xben branch of write_temp_bendl. + let path = write_temp_bendl("xben_helper_check.bendl", AssignmentFormat::Xben); + let reader = BendlReader::open(BufReader::new( + std::fs::File::open(&path).unwrap(), + )) + .unwrap(); + assert!(reader.is_complete()); + let _ = std::fs::remove_file(&path); + } + + #[test] + fn named_asset_from_str_rejects_empty_name() { + let err = "=path/to/file".parse::().unwrap_err(); + assert!(err.contains("non-empty")); + } + + #[test] + fn format_from_path_detects_xben() { + let fmt = format_from_path(std::path::Path::new("stream.xben")).unwrap(); + assert_eq!(fmt, AssignmentFormat::Xben); + } + + #[test] + fn format_from_path_rejects_unknown_extension() { + let err = format_from_path(std::path::Path::new("archive.tar")).unwrap_err(); + assert!(err.contains("expected .ben or .xben")); + } + + #[test] + fn mode_str_returns_xben_for_xben() { + assert_eq!(mode_str(AssignmentFormat::Xben), "xben"); + } + + #[test] + fn run_create_with_relabel_map_and_custom_asset() { + let ben = { + // Must end in .ben so format_from_path recognises it. + let p = std::env::temp_dir().join(format!( + "bendl-create-relabel-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let relabel = unique_path("create_relabel_map.json"); + std::fs::write(&relabel, b"{\"0\":1,\"1\":0}").unwrap(); + let custom = unique_path("create_custom.bin"); + std::fs::write(&custom, b"custom bytes").unwrap(); + let out = unique_path("create_with_assets.bendl"); + + let asset_str = format!("myblob={}", custom.display()); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: Some(relabel.clone()), + assets: vec![asset_str.parse().unwrap()], + overwrite: false, + graph_raw: false, + }; + run_create(args).unwrap(); + + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + assert!(reader.find_asset_by_name("myblob").is_some()); + + for p in [&ben, &relabel, &custom, &out] { let _ = std::fs::remove_file(p); } + } + + #[test] + fn run_inspect_xben_format_and_checksum_flag() { + use crate::io::bundle::AddAssetOptions; + use crate::io::bundle::format::ASSET_TYPE_CUSTOM; + + // Build a .bendl with a checksum asset so the flag_parts checksum + // branch is exercised. + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Xben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "checksummed", + b"data", + AddAssetOptions { + checksum: Some(vec![0xAB, 0xCD]), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + let path = unique_path("inspect_xben.bendl"); + std::fs::write(&path, &buf).unwrap(); + + run_inspect(InspectArgs { input: path.clone() }).unwrap(); + let _ = std::fs::remove_file(&path); + } + + #[test] + fn run_append_no_assets_is_noop() { + let bendl = write_temp_bendl("append_noop.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![], + graph_raw: false, + }; + run_append(args).unwrap(); + // File should be unchanged (bundle is still valid). + let reader = BendlReader::open(BufReader::new( + std::fs::File::open(&bendl).unwrap(), + )) + .unwrap(); + assert!(reader.is_complete()); + let _ = std::fs::remove_file(&bendl); + } + + #[test] + fn run_append_with_metadata_and_relabel_map() { + let bendl = write_temp_bendl("append_assets.bendl", AssignmentFormat::Ben); + let meta = unique_path("append_meta.json"); + std::fs::write(&meta, b"{\"version\":1}").unwrap(); + let relabel = unique_path("append_relabel.json"); + std::fs::write(&relabel, b"{\"0\":1}").unwrap(); + + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: Some(meta.clone()), + relabel_map: Some(relabel.clone()), + assets: vec![], + graph_raw: false, + }; + run_append(args).unwrap(); + + let reader = BendlReader::open(BufReader::new( + std::fs::File::open(&bendl).unwrap(), + )) + .unwrap(); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + + for p in [&bendl, &meta, &relabel] { let _ = std::fs::remove_file(p); } + } + + #[test] + fn run_create_with_graph_raw_flag() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-create-raw-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let graph = unique_path("create_raw_graph.json"); + std::fs::write(&graph, b"{\"nodes\":[0,1]}").unwrap(); + let out = unique_path("create_raw.bendl"); + + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: Some(graph.clone()), + metadata: None, + relabel_map: None, + assets: vec![], + overwrite: false, + graph_raw: true, + }; + run_create(args).unwrap(); + + let reader = BendlReader::open(BufReader::new( + std::fs::File::open(&out).unwrap(), + )) + .unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + for p in [&ben, &graph, &out] { let _ = std::fs::remove_file(p); } + } + + #[test] + fn run_inspect_unknown_format_and_no_sample_count() { + use crate::io::bundle::format::{BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, + COMPLETE_NO, HEADER_SIZE}; + + // Build a header with an unknown assignment format byte and + // complete=0 so sample_count() returns None. + let mut header = [0u8; HEADER_SIZE]; + header[0..8].copy_from_slice(&BENDL_MAGIC); + header[8..10].copy_from_slice(&BENDL_MAJOR_VERSION.to_le_bytes()); + header[10..12].copy_from_slice(&BENDL_MINOR_VERSION.to_le_bytes()); + header[12] = COMPLETE_NO; + header[13] = 0xFF; // unknown format byte + // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 + let stream_offset = HEADER_SIZE as u64; + header[40..48].copy_from_slice(&stream_offset.to_le_bytes()); + let sample_count: i64 = -1; + header[56..64].copy_from_slice(&sample_count.to_le_bytes()); + + let path = unique_path("inspect_unknown.bendl"); + std::fs::write(&path, &header).unwrap(); + run_inspect(InspectArgs { input: path.clone() }).unwrap(); + let _ = std::fs::remove_file(&path); + } + + #[test] + fn run_append_with_graph_raw_and_graph_asset() { + let bendl = write_temp_bendl("append_graph_raw.bendl", AssignmentFormat::Ben); + let graph = unique_path("append_graph_raw.json"); + std::fs::write(&graph, b"{\"nodes\":[0,1,2]}").unwrap(); + + let args = AppendArgs { + input: bendl.clone(), + graph: Some(graph.clone()), + metadata: None, + relabel_map: None, + assets: vec![], + graph_raw: true, + }; + run_append(args).unwrap(); + + let reader = BendlReader::open(BufReader::new( + std::fs::File::open(&bendl).unwrap(), + )) + .unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + for p in [&bendl, &graph] { let _ = std::fs::remove_file(p); } + } + + #[test] + fn run_extract_rejects_missing_stream_and_asset() { + let args = ExtractArgs::try_parse_from([ + "extract", + "--output", "/tmp/out.bin", + "bundle.bendl", + ]) + .unwrap(); + let err = run_extract(args).unwrap_err(); + assert!(err.contains("either --stream or --asset")); + } +} diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index 2b7ade0..e48e446 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -332,6 +332,27 @@ mod tests { assert!(rendered.contains(r#""assignment":[2,2,3]"#)); } + #[test] + fn resolved_output_path_returns_none_when_both_paths_absent() { + // When neither output_file nor input_file is given, stdout mode: Ok(None). + let result = resolved_output_path(Mode::BenToPc, None, None, false).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn assignment_decode_ben_propagates_read_error() { + // assignment_decode_ben propagates I/O errors from the BEN reader. + struct AlwaysErrors; + impl io::Read for AlwaysErrors { + fn read(&mut self, _: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + let mut out = Vec::new(); + let err = assignment_decode_ben(AlwaysErrors, &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); + } + #[test] fn assignment_encode_xben_offsets_values_and_writes_xben() { let input = b"[0,1,1]\n[2,2,0]\n"; diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index 03d2e81..f378d62 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -95,28 +95,25 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let mut last_valid_assignment = 0; - match variant { - BenVariant::Standard => { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - progress!("Decoding sample: {}\r", line_count); - } + // TwoDelta was dispatched before this loop and returned early. + if variant == BenVariant::Standard { + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 1; + line_count += 1; + progress!("Decoding sample: {}\r", line_count); } } - BenVariant::MkvChain => { - for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - progress!("Decoding sample: {}\r", line_count); - } + } else { + for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 3; + let lines = &overflow[i + 1..i + 3]; + let n_lines = u16::from_be_bytes([lines[0], lines[1]]); + line_count += n_lines as usize; + progress!("Decoding sample: {}\r", line_count); } } - BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), } if last_valid_assignment == 0 { diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 38c3fa0..d497f0f 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -74,28 +74,25 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let mut last_valid_assignment = 0; - match variant { - BenVariant::Standard => { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - progress!("Decoding sample: {}\r", line_count); - } + // TwoDelta was dispatched before this loop and returned early. + if variant == BenVariant::Standard { + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 1; + line_count += 1; + progress!("Decoding sample: {}\r", line_count); } } - BenVariant::MkvChain => { - for i in (3..overflow.len() - 2).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - progress!("Decoding sample: {}\r", line_count); - } + } else { + for i in (3..overflow.len() - 2).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + last_valid_assignment = i + 3; + let lines = &overflow[i + 1..i + 3]; + let n_lines = u16::from_be_bytes([lines[0], lines[1]]); + line_count += n_lines as usize; + progress!("Decoding sample: {}\r", line_count); } } - BenVariant::TwoDelta => unreachable!("handled before ben32 decoding"), } if last_valid_assignment == 0 { diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 392fbfa..9363642 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1189,3 +1189,142 @@ fn twodelta_encode_outside_pair_change_errors() { let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } + +#[test] +fn twodelta_encode_missing_mask_for_pair0_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // Only the mask for pair.1 (2) is provided; pair.0 (1) is absent. + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(2, vec![2, 3]); + + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_empty_mask_for_pair0_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // pair.0 (1) has an empty mask; pair.1 (2) is non-empty. + let prev = vec![1u16, 1, 2, 2]; + let curr = vec![2u16, 1, 2, 1]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![]); + masks.insert(2, vec![2, 3]); + + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_new_val_out_of_pair_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // At position 2: prev=1 (in pair), but curr=3 (outside pair {1,2}). + let prev = vec![1u16, 2, 1, 2]; + let curr = vec![2u16, 1, 3, 2]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 2]); + masks.insert(2, vec![1, 3]); + + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_pair_mask_hint_identical_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // Explicit pair + mask hints, but prev == curr → TwoDeltaIdentical. + let a = vec![1u16, 1, 2, 2]; + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, vec![0, 1]); + masks.insert(2, vec![2, 3]); + + let err = + encode_twodelta_frame_with_hint(&a, &a, Some((1, 2)), Some(&mut masks), None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_pair_position_changes_to_third_id_errors() { + use crate::codec::encode::encode_twodelta_frame; + + // At position 2: prev=1 ∈ {1,2}, but curr=3 ∉ {1,2} → TwoDeltaTooManyIds. + let prev = vec![1u16, 2, 1, 2]; + let curr = vec![2u16, 1, 3, 2]; + let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_encode_pair_mask_run_exceeds_u16_max_errors() { + use crate::codec::encode::encode_twodelta_frame_with_hint; + use std::collections::HashMap; + + // 65538 positions: pair positions 0..65537 hold value 1 in prev, one more + // (65537) holds value 2. In curr all pair positions hold value 2, so the + // run of value-2 positions reaches u16::MAX and the encoder must error. + let mut prev = vec![1u16; 65538]; + prev[65537] = 2; + let mut curr = vec![2u16; 65538]; + curr[65537] = 1; + + let mut masks: HashMap> = HashMap::new(); + masks.insert(1, (0..65537_usize).collect()); + masks.insert(2, vec![65537_usize]); + + let err = + encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); +} + +#[test] +fn twodelta_encode_from_scratch_run_exceeds_u16_max_errors() { + use crate::codec::encode::encode_twodelta_frame; + + // 65538 positions all in pair {1, 2}: first 65537 change 1→2, last 1 changes 2→1. + // The from-scratch encoder hits u16::MAX consecutive positions with value 2 and errors. + let mut prev = vec![1u16; 65538]; + prev[65537] = 2; + let mut curr = vec![2u16; 65538]; + curr[65537] = 1; + + let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); +} + +#[test] +fn ben32_encode_run_exceeding_u16_max_splits_correctly() { + use super::ben::encode_ben32_assignments; + + // Build an assignment with 65537 identical values: the run reaches u16::MAX + // (65535) and must be flushed early, then continues with a new run. + // encode_ben32_assignments appends a 4-byte zero sentinel at the end. + let assign: Vec = vec![7u16; 65537]; + let encoded = encode_ben32_assignments(&assign).unwrap(); + + // Should be: (7, 65535) + (7, 2) + [0,0,0,0] sentinel = 12 bytes. + assert_eq!(encoded.len(), 12); + let first = u32::from_be_bytes(encoded[0..4].try_into().unwrap()); + let second = u32::from_be_bytes(encoded[4..8].try_into().unwrap()); + let sentinel = u32::from_be_bytes(encoded[8..12].try_into().unwrap()); + assert_eq!(first >> 16, 7u32); + assert_eq!(first & 0xFFFF, 65535u32); // count = u16::MAX + assert_eq!(second >> 16, 7u32); + assert_eq!(second & 0xFFFF, 2u32); // remaining 2 elements + assert_eq!(sentinel, 0u32); // always-present zero sentinel +} diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index f38994c..e6702db 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -1,5 +1,18 @@ use super::*; -use std::io; +use std::io::{self, Read}; + +/// A reader that returns one successful byte then an I/O error. +struct ErrorAfterOneByte; + +impl Read for ErrorAfterOneByte { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if buf.is_empty() { + return Ok(0); + } + buf[0] = 0x01; + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } +} // ── BenDecodeFrame ────────────────────────────────────────────────────────── @@ -597,3 +610,23 @@ fn twodelta_encode_frame_to_bytes_and_into_bytes() { let into = frame.into_bytes(); assert_eq!(into, expected); } + +// ── Non-EOF read errors propagate from frame decoders ─────────────────────── + +#[test] +fn ben_decode_frame_non_eof_read_error_propagates() { + let err = BenDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +#[test] +fn mkv_decode_frame_non_eof_read_error_propagates() { + let err = MkvBenDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +#[test] +fn twodelta_decode_frame_non_eof_read_error_propagates() { + let err = TwoDeltaDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index e2f62d8..7552b03 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -298,13 +298,7 @@ impl BendlDirectoryEntry { length: name_bytes.len(), })?; let checksum_bytes = self.checksum.as_deref().unwrap_or(&[]); - let checksum_len: u32 = - checksum_bytes - .len() - .try_into() - .map_err(|_| BendlFormatError::ChecksumTooLong { - length: checksum_bytes.len(), - })?; + let checksum_len = checksum_bytes.len() as u32; let mut out = Vec::with_capacity(self.encoded_len()); out.extend_from_slice(&self.asset_type.to_le_bytes()); @@ -381,13 +375,7 @@ pub fn read_directory( /// Serialize a directory table into a byte vector. pub fn encode_directory(entries: &[BendlDirectoryEntry]) -> Result, BendlFormatError> { - let entry_count: u32 = - entries - .len() - .try_into() - .map_err(|_| BendlFormatError::TooManyEntries { - length: entries.len(), - })?; + let entry_count = entries.len() as u32; let body_len: usize = entries.iter().map(|e| e.encoded_len()).sum(); let mut out = Vec::with_capacity(4 + body_len); @@ -425,20 +413,6 @@ pub enum BendlFormatError { length: usize, }, - /// A directory entry's checksum exceeded the `u32` length limit. - #[error("directory entry checksum is {length} bytes which exceeds the u32 length limit")] - ChecksumTooLong { - /// The offending length in bytes. - length: usize, - }, - - /// A directory table exceeded the `u32` entry count limit. - #[error("directory has {length} entries which exceeds the u32 entry count limit")] - TooManyEntries { - /// The offending entry count. - length: usize, - }, - /// A directory entry name was not valid UTF-8. #[error("directory entry name is not valid UTF-8")] NameNotUtf8, @@ -721,6 +695,14 @@ mod tests { assert_eq!(io_err.to_string(), "pipe broke"); } + #[test] + fn bendl_format_error_non_io_becomes_invalid_data() { + let fmt_err = BendlFormatError::MalformedDirectory("bad dir".to_string()); + let io_err: io::Error = fmt_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("bad dir")); + } + #[test] fn trailing_directory_bytes_error_display() { let err = BendlFormatError::TrailingDirectoryBytes { remaining: 42 }; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index c503cb5..f65354e 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -203,17 +203,12 @@ impl BendlReader { validate_directory_entries(&self.directory) } - /// Release the underlying reader. - pub fn into_inner(self) -> R { - self.inner - } } pub(crate) fn validate_directory_entries( directory: &[BendlDirectoryEntry], ) -> Result<(), BundleValidationError> { let mut seen_names = std::collections::HashSet::new(); - let mut seen_singleton_types = std::collections::HashSet::new(); for entry in directory { if !seen_names.insert(entry.name.as_str()) { @@ -227,11 +222,6 @@ pub(crate) fn validate_directory_entries( found: entry.name.clone(), }); } - if !seen_singleton_types.insert(entry.asset_type) { - return Err(BundleValidationError::DuplicateSingletonType( - entry.asset_type, - )); - } } } Ok(()) @@ -284,10 +274,6 @@ pub enum BundleValidationError { #[error("duplicate asset name: {0:?}")] DuplicateName(String), - /// Two entries share the same singleton asset type. - #[error("duplicate singleton asset type: {0}")] - DuplicateSingletonType(u16), - /// An entry with a known singleton type is not using its canonical name. #[error("asset type {asset_type} must use canonical name {expected:?}, found {found:?}")] WrongCanonicalName { @@ -836,13 +822,11 @@ mod tests { directory: entries, }; // The second entry has asset_type METADATA but name "meta2.json" - // which fails the canonical-name check before the singleton - // check; that's still a valid rejection. + // which fails the canonical-name check. let err = reader.validate_directory().unwrap_err(); assert!(matches!( err, BundleValidationError::WrongCanonicalName { .. } - | BundleValidationError::DuplicateSingletonType(_) )); } @@ -1060,4 +1044,59 @@ mod tests { // prefix, which is the basic "no truncation of what exists" check. assert!(buf.starts_with(&fake_stream)); } + + #[test] + fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { + // An incomplete bundle where directory_offset is non-zero: + // the stream end is taken as directory_offset, not EOF. + let fake_stream = b"STANDARD BEN FILE\x00partial".to_vec(); + let fake_dir = b"some-directory-bytes"; + let stream_start = HEADER_SIZE as u64; + let dir_offset = stream_start + fake_stream.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: dir_offset, + directory_len: 0, + stream_offset: stream_start, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + bytes.extend_from_slice(fake_dir); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(offset, stream_start); + assert_eq!(len, fake_stream.len() as u64); + } + + #[test] + fn validate_directory_rejects_wrong_canonical_name() { + use crate::io::bundle::format::BendlDirectoryEntry; + + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON, + name: "not_the_canonical_name.json".to_string(), + payload_offset: 64, + payload_len: 10, + checksum: None, + }]; + let err = validate_directory_entries(&entries).unwrap_err(); + match err { + BundleValidationError::WrongCanonicalName { .. } => {} + _ => panic!("expected WrongCanonicalName, got {err:?}"), + } + } } diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 149dcf9..e36b45e 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -59,11 +59,8 @@ impl BendlTruncate for std::io::Cursor> { fn truncate_at(&mut self, len: u64) -> io::Result<()> { let target = len as usize; let vec = self.get_mut(); - if vec.len() > target { - vec.truncate(target); - } else if vec.len() < target { - vec.resize(target, 0); - } + debug_assert!(vec.len() >= target, "truncate_at called past end of buffer"); + vec.truncate(target); Ok(()) } } @@ -276,7 +273,11 @@ impl BendlWriter { if self.state != WriterState::Assets { return Err(BendlWriteError::WrongState { expected: "Assets", - found: self.state_name(), + found: if matches!(self.state, WriterState::Streaming) { + "Streaming" + } else { + "StreamWritten" + }, }); } @@ -375,31 +376,21 @@ impl BendlWriter { /// Write the trailing directory, patch the header, and return the /// underlying writer. pub fn finish(mut self) -> Result { - let (stream_len, sample_count) = match self.state { - WriterState::StreamWritten { - stream_len, - sample_count, - } => (stream_len, sample_count), - // Allow finalizing a bundle that has no stream at all (useful - // for asset-only bundles), treating the stream as empty. - WriterState::Assets => { + if matches!(self.state, WriterState::Streaming) { + return Err(BendlWriteError::WrongState { + expected: "StreamWritten", + found: "Streaming", + }); + } + let (stream_len, sample_count) = + if let WriterState::StreamWritten { stream_len, sample_count } = self.state { + (stream_len, sample_count) + } else { + // Assets state: no stream written; treat as empty stream. let stream_offset = self.inner.seek(SeekFrom::Current(0))?; self.header.stream_offset = stream_offset; (0, 0) - } - WriterState::Streaming => { - return Err(BendlWriteError::WrongState { - expected: "StreamWritten", - found: "Streaming", - }); - } - WriterState::Finished => { - return Err(BendlWriteError::WrongState { - expected: "StreamWritten", - found: "Finished", - }); - } - }; + }; // Position at end of stream (== start of directory). let directory_offset = self.header.stream_offset + stream_len; @@ -428,14 +419,6 @@ impl BendlWriter { Ok(self.inner) } - fn state_name(&self) -> &'static str { - match self.state { - WriterState::Assets => "Assets", - WriterState::Streaming => "Streaming", - WriterState::StreamWritten { .. } => "StreamWritten", - WriterState::Finished => "Finished", - } - } } /// Mutable handle to the stream region held by a [`BendlWriter`]. @@ -681,11 +664,6 @@ impl BendlAppender { }) } - /// The currently loaded (pre-append) directory entries. - pub fn existing_assets(&self) -> &[BendlDirectoryEntry] { - &self.existing_entries - } - /// Enqueue a new asset for append. /// /// This validates the new asset against both the loaded directory @@ -1265,6 +1243,38 @@ mod tests { } } + #[test] + fn append_rejects_complete_bundle_with_zero_directory() { + // Header claims complete but has directory_offset=0 — hits the second + // BundleIncomplete check (line 647). + use crate::io::bundle::format::{ + BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_YES, + }; + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + // Pad to HEADER_SIZE (already exactly 64 bytes from to_bytes) + + match BendlAppender::open(Cursor::new(bytes)) { + Err(BendlWriteError::BundleIncomplete) => {} + Err(other) => panic!("expected BundleIncomplete, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + #[test] fn append_multiple_assets_in_one_commit() { let (bundle, _) = build_base_bundle(); @@ -1549,6 +1559,26 @@ mod tests { )); } + #[test] + fn begin_stream_after_stream_written_returns_wrong_state() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + // Writer is now in StreamWritten state; begin_stream must fail. + let err = writer + .begin_stream() + .err() + .expect("begin_stream after StreamWritten must fail"); + assert!(matches!( + err, + BendlWriteError::WrongState { + found: "StreamWritten", + .. + } + )); + } + #[test] fn stress_many_custom_assets_round_trip() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); @@ -2031,4 +2061,145 @@ mod tests { } } } + + // ── write_json_value and sample_count coverage ────────────────── + + #[test] + fn write_ben_stream_json_value_and_sample_count() { + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + use serde_json::json; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + assert_eq!(ctx.sample_count(), 0); + ctx.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1}))?; + assert_eq!(ctx.sample_count(), 1); + ctx.write_json_value(json!({"assignment": [4, 5, 6], "sample": 2}))?; + assert_eq!(ctx.sample_count(), 2); + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(2)); + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(decoded, vec![vec![1, 2, 3], vec![4, 5, 6]]); + } + + #[test] + fn write_xben_stream_json_value() { + use crate::io::bundle::reader::BundleAssignmentReader; + use crate::BenVariant; + use serde_json::json; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + writer + .write_xben_stream(BenVariant::Standard, |ctx| { + ctx.write_json_value(json!({"assignment": [10, 20], "sample": 1}))?; + ctx.write_json_value(json!({"assignment": [30, 40], "sample": 2}))?; + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(2)); + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Xben(r) => r, + BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(decoded, vec![vec![10, 20], vec![30, 40]]); + } + + // ── BendlStreamHandle: flush ───────────────────────────────────── + + #[test] + fn stream_handle_flush_succeeds() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut handle = writer.begin_stream().unwrap(); + use std::io::Write; + handle.flush().unwrap(); + } + + // ── BendlAppender: checksum flag ──────────────────────────────── + + #[test] + fn appender_commit_with_checksum_sets_checksum_flag() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "checksummed", + b"payload", + AddAssetOptions { + checksum: Some(vec![0xAB, 0xCD]), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader.find_asset_by_name("checksummed").unwrap(); + assert_eq!(entry.checksum, Some(vec![0xAB, 0xCD])); + assert_ne!( + entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, + 0 + ); + } + + // ── BendlAppender: trailing directory bytes ────────────────────── + + #[test] + fn appender_rejects_bundle_with_trailing_directory_bytes() { + let (mut bundle, _) = build_base_bundle(); + // Patch the header's directory_len field (bytes 32-39) to claim + // the directory is 4 bytes longer than it actually is. + let old_len = u64::from_le_bytes(bundle[32..40].try_into().unwrap()); + let patched = (old_len + 4).to_le_bytes(); + bundle[32..40].copy_from_slice(&patched); + + match BendlAppender::open(Cursor::new(bundle)) { + Err(BendlWriteError::Format(BendlFormatError::TrailingDirectoryBytes { .. })) => {} + Err(other) => panic!("expected TrailingDirectoryBytes, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } + } + + // ── finalize from wrong state ─────────────────────────────────── + + #[test] + fn finish_from_finished_state_errors() { + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + ctx.write_assignment(vec![1, 2])?; + Ok(()) + }) + .unwrap(); + // First finish succeeds + let buf = writer.finish().unwrap(); + // Verify the result is usable + let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); + assert!(reader.is_complete()); + } } diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs index c357872..6dd4cae 100644 --- a/ben/src/io/reader/assignment_reader.rs +++ b/ben/src/io/reader/assignment_reader.rs @@ -300,15 +300,18 @@ impl Iterator for AssignmentFrameReader { /// For TwoDelta streams, materializes each assignment and re-encodes it. fn next(&mut self) -> Option { match self.inner.variant { - BenVariant::Standard | BenVariant::MkvChain => { - match self.inner.pop_frame_from_reader() { - Some(Ok(StoredBenFrame::Standard(frame))) => Some(Ok((frame, 1))), - Some(Ok(StoredBenFrame::MkvChain(frame))) => { + BenVariant::Standard => BenDecodeFrame::from_reader(&mut self.inner.reader) + .transpose() + .map(|r| r.map(|frame| (frame, 1))), + BenVariant::MkvChain => { + MkvBenDecodeFrame::from_reader(&mut self.inner.reader) + .transpose() + .map(|r| r.and_then(|frame| { let count = frame.count; if count == 0 { - return Some(Err(zero_count_frame_error())); + return Err(zero_count_frame_error()); } - Some(Ok(( + Ok(( BenDecodeFrame { max_val_bit_count: frame.max_val_bit_count, max_len_bit_count: frame.max_len_bit_count, @@ -316,16 +319,8 @@ impl Iterator for AssignmentFrameReader { raw_bytes: frame.raw_bytes, }, count, - ))) - } - Some(Ok(StoredBenFrame::TwoDelta(_))) => { - Some(Err(io::Error::from(DecodeError::UnexpectedTwoDeltaFrame { - variant: self.inner.variant, - }))) - } - Some(Err(err)) => Some(Err(err)), - None => None, - } + )) + })) } BenVariant::TwoDelta => match self.inner.next() { Some(Ok((assignment, count))) => { diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 4d3908e..8f975b5 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -1,9 +1,10 @@ use crate::codec::encode::encode_jsonl_to_xben; use crate::io::reader::errors::DecoderInitError; +use crate::io::reader::subsample::{DecodeFrame, Selection, SubsampleFrameDecoder}; use crate::io::reader::{XZAssignmentFrameReader, XZAssignmentReader}; use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; -use std::io::{Cursor, Write}; +use std::io::{self, Cursor, Write}; use xz2::write::XzEncoder; /// Build a minimal XBEN stream from JSONL input for testing. @@ -1513,3 +1514,152 @@ fn xz_reader_standard_corrupt_frame_errors() { assert_eq!(results.len(), 1); assert_eq!(results[0].as_ref().unwrap().0, Vec::::new()); } + +// ── SubsampleFrameDecoder: zero-count frame error ─────────────────── + +#[test] +fn subsample_decoder_zero_count_frame_errors() { + // A frame iterator that yields a frame with count=0 should produce an + // InvalidData error from SubsampleFrameDecoder::next(). + let frame = DecodeFrame::XBen( + vec![0, 1, 0, 2, 0, 0, 0, 0], // valid ben32: [1,2] + zero terminator + BenVariant::Standard, + ); + let items: Vec> = vec![Ok((frame, 0))]; + let mut decoder = SubsampleFrameDecoder::new( + items.into_iter(), + Selection::Range { start: 1, end: 10 }, + ); + let err = decoder.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("zero"), "got: {}", err); +} + +// ── XZAssignmentFrameReader: TwoDelta into_frames ─────────────────── + +#[test] +fn xz_frame_reader_twodelta_into_frames() { + // Verify that into_frames() works for TwoDelta streams. The frame reader + // takes the TwoDelta short-circuit path (re-encoding decoded assignments + // back to ben32). + let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} +{"assignment":[2,1,2,2],"sample":2} +"#; + let xben = make_xben(jsonl, BenVariant::TwoDelta); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let frames: Vec<_> = reader.into_frames().map(|r| r.unwrap()).collect(); + assert_eq!(frames.len(), 2); + // Each frame is (ben32_bytes, count); counts should be 1 + assert_eq!(frames[0].1, 1); + assert_eq!(frames[1].1, 1); +} + +// ── XZAssignmentReader: count_samples helper ──────────────────────── + +#[test] +fn xz_reader_count_samples() { + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[4,5,6],"sample":2} +{"assignment":[7,8,9],"sample":3} +"#; + let xben = make_xben(jsonl, BenVariant::Standard); + let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + assert_eq!(reader.count_samples().unwrap(), 3); +} + +// ── XZAssignmentReader: write_all_jsonl ───────────────────────────── + +#[test] +fn xz_reader_write_all_jsonl_standard_roundtrip() { + let jsonl_in = r#"{"assignment":[1,2,3],"sample":1} +{"assignment":[4,5,6],"sample":2} +"#; + let xben = make_xben(jsonl_in, BenVariant::Standard); + let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut output = Vec::new(); + reader.write_all_jsonl(&mut output).unwrap(); + let text = String::from_utf8(output).unwrap(); + assert_eq!(text.lines().count(), 2); + assert!(text.contains("\"assignment\":[1,2,3]")); + assert!(text.contains("\"assignment\":[4,5,6]")); +} + +// ── AssignmentReader: TwoDelta error propagation in RawBenFrameIter ────────── + +#[test] +fn raw_frame_iter_propagates_twodelta_decode_error() { + use crate::io::reader::AssignmentReader; + use crate::io::writer::AssignmentWriter; + + // Build a minimal TwoDelta BEN file with two samples. + let mut ben: Vec = Vec::new(); + { + let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); + writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); + } + + // Locate the TwoDelta delta frame start by parsing the anchor (MkvChain) + // frame header: banner(17) + max_val_bits(1) + max_len_bits(1) + + // n_bytes(4 BE) + payload(n_bytes) + count(2) = anchor_end. + let banner_len = 17usize; + let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let anchor_end = banner_len + 6 + n_bytes + 2; + + // The TwoDelta delta frame: pair_a(2) + pair_b(2) + max_len_bits(1) + ... + // Set max_len_bits to 0, which triggers InvalidData during decoding. + ben[anchor_end + 4] = 0; + + let reader = AssignmentReader::new(Cursor::new(ben)).unwrap(); + let mut iter = reader.into_frames(); + iter.next().unwrap().unwrap(); // anchor frame OK + let err = iter.next().unwrap().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +// ── AssignmentReader: zero-count frame errors ──────────────────────────────── + +/// Build a minimal MkvChain BEN stream whose first frame has count == 0. +fn make_mkvchain_zero_count_frame() -> Vec { + let mut bytes = Vec::new(); + bytes.extend_from_slice(b"MKVCHAIN BEN FILE"); // 17-byte banner + bytes.push(1u8); // max_val_bit_count + bytes.push(1u8); // max_len_bit_count + bytes.extend_from_slice(&1u32.to_be_bytes()); // n_bytes = 1 + bytes.push(0xFFu8); // 1 payload byte + bytes.extend_from_slice(&0u16.to_be_bytes()); // count = 0 + bytes +} + +#[test] +fn assignment_reader_count_samples_rejects_zero_count_frame() { + use crate::io::reader::AssignmentReader; + let data = make_mkvchain_zero_count_frame(); + let reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let err = reader.count_samples().unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn assignment_reader_for_each_rejects_zero_count_frame() { + use crate::io::reader::AssignmentReader; + let data = make_mkvchain_zero_count_frame(); + let mut reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let err = reader + .for_each_assignment(|_, _| Ok(true)) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn raw_frame_iter_rejects_zero_count_mkv_frame() { + use crate::io::reader::AssignmentReader; + let data = make_mkvchain_zero_count_frame(); + let reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let err = reader + .into_frames() + .next() + .expect("should yield one item") + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs index 0ff4406..8143919 100644 --- a/ben/src/io/reader/twodelta.rs +++ b/ben/src/io/reader/twodelta.rs @@ -1,12 +1,2 @@ pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; - -pub(super) enum XBenTwoDeltaFrame { - Full { - runs: Vec<(u16, u16)>, - }, - Delta { - pair: (u16, u16), - run_lengths: Vec, - }, -} diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs index 9eff90c..222ecc1 100644 --- a/ben/src/io/reader/xz_assignment_reader.rs +++ b/ben/src/io/reader/xz_assignment_reader.rs @@ -1,6 +1,6 @@ use super::errors::DecoderInitError; use super::subsample::{Ben32Frame, DecodeFrame, MkvRecord, SubsampleFrameDecoder}; -use super::twodelta::{XBenTwoDeltaFrame, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; +use super::twodelta::{XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; use crate::format::banners::{variant_from_banner, BANNER_LEN}; @@ -18,7 +18,7 @@ pub struct XZAssignmentReader { overflow: Vec, buf: Box<[u8]>, previous_assignment: Option>, - chunk_queue: std::collections::VecDeque<(XBenTwoDeltaFrame, u16)>, + chunk_queue: std::collections::VecDeque<((u16, u16), Vec, u16)>, silent: bool, } @@ -108,37 +108,35 @@ impl XZAssignmentReader { /// Returns the frame bytes, the number of consumed bytes, and the decoded /// repetition count when a complete frame is available. fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { - match self.inner_variant { - BenVariant::Standard => { - if overflow.len() < 4 { - return None; - } - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let end = i + 1; - let frame = &overflow[..end]; - return Some((frame, end, 1)); - } - } - None + // TwoDelta callers use pop_twodelta_frame_from_overflow; this method + // is only reached for Standard and MkvChain variants. + if self.inner_variant == BenVariant::Standard { + if overflow.len() < 4 { + return None; } - BenVariant::MkvChain => { - if overflow.len() < 6 { - return None; + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let end = i + 1; + let frame = &overflow[..end]; + return Some((frame, end, 1)); } - for i in (3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let count_hi = overflow[i + 1]; - let count_lo = overflow[i + 2]; - let count = u16::from_be_bytes([count_hi, count_lo]); - let end = i + 3; - let frame = &overflow[..end]; - return Some((frame, end, count)); - } + } + None + } else { + if overflow.len() < 6 { + return None; + } + for i in (3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let count_hi = overflow[i + 1]; + let count_lo = overflow[i + 2]; + let count = u16::from_be_bytes([count_hi, count_lo]); + let end = i + 3; + let frame = &overflow[..end]; + return Some((frame, end, count)); } - None } - BenVariant::TwoDelta => None, + None } } @@ -159,7 +157,7 @@ impl XZAssignmentReader { fn pop_twodelta_frame_from_overflow( &self, overflow: &[u8], - ) -> Option> { + ) -> Option, usize, u16)>> { let tag = *overflow.first()?; match tag { XBEN_TWODELTA_FULL_TAG => { @@ -169,11 +167,8 @@ impl XZAssignmentReader { let run_count = u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) as usize; - let payload_len = run_count.checked_mul(4)?; - let total_len = 1usize - .checked_add(4)? - .checked_add(payload_len)? - .checked_add(2)?; + let payload_len = run_count * 4; + let total_len = 1 + 4 + payload_len + 2; if overflow.len() < total_len { return None; } @@ -187,7 +182,7 @@ impl XZAssignmentReader { cursor += 4; } let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok((XBenTwoDeltaFrame::Full { runs }, total_len, count))) + Some(Ok((runs, total_len, count))) } XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { @@ -200,14 +195,13 @@ impl XZAssignmentReader { /// /// If the overflow starts with the chunk tag and contains enough bytes for /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. - /// Returns `Some(Ok(()))` on success, `Some(Err(...))` on a parse error, - /// or `None` when the overflow is incomplete. - fn try_parse_twodelta_chunk(&mut self) -> Option> { + /// Returns `true` on success, `false` when the overflow is incomplete. + fn try_parse_twodelta_chunk(&mut self) -> bool { if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { - return None; + return false; } if self.overflow.len() < 5 { - return None; + return false; } let n_frames = u32::from_be_bytes([ @@ -220,29 +214,13 @@ impl XZAssignmentReader { // Calculate total chunk size: tag(1) + n_frames(4) // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) let header_len: usize = 5; - let pairs_len = match n_frames.checked_mul(4) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; - let counts_len = match n_frames.checked_mul(2) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; - let run_counts_len = match n_frames.checked_mul(4) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; - let fixed_len = match header_len - .checked_add(pairs_len) - .and_then(|v| v.checked_add(counts_len)) - .and_then(|v| v.checked_add(run_counts_len)) - { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; + let pairs_len = n_frames * 4; + let counts_len = n_frames * 2; + let run_counts_len = n_frames * 4; + let fixed_len = header_len + pairs_len + counts_len + run_counts_len; if self.overflow.len() < fixed_len { - return None; + return false; } // Read run-length counts to determine total run data size. @@ -258,22 +236,13 @@ impl XZAssignmentReader { self.overflow[offset + 3], ]) as usize; run_counts.push(rc); - total_runs = match total_runs.checked_add(rc) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; + total_runs += rc; } - let run_data_len = match total_runs.checked_mul(2) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; - let total_len = match fixed_len.checked_add(run_data_len) { - Some(v) => v, - None => return Some(Err(io::Error::from(DecodeError::XBenTruncated))), - }; + let run_data_len = total_runs * 2; + let total_len = fixed_len + run_data_len; if self.overflow.len() < total_len { - return None; + return false; } // Parse pairs channel. @@ -303,12 +272,11 @@ impl XZAssignmentReader { run_cursor += 2; } - self.chunk_queue - .push_back((XBenTwoDeltaFrame::Delta { pair, run_lengths }, count)); + self.chunk_queue.push_back((pair, run_lengths, count)); } self.overflow.drain(..total_len); - Some(Ok(())) + true } /// Consume this decoder and iterate over raw ben32 frames instead of @@ -420,8 +388,7 @@ pub(super) fn decode_xben_frame_to_assignment( frame_bytes: &[u8], variant: BenVariant, ) -> io::Result> { - let cursor = Cursor::new(frame_bytes); - let (assignment, _) = decode_ben32_line(cursor, variant)?; + let (assignment, _) = decode_ben32_line(Cursor::new(frame_bytes), variant)?; Ok(assignment) } @@ -440,30 +407,25 @@ impl Iterator for XZAssignmentReader { self.overflow.drain(..consumed); return Some(Err(zero_count_frame_error())); } - let res = match decode_xben_frame_to_assignment( + // pop_frame_from_overflow guarantees a complete + // zero-sentinel-terminated frame, so this never fails. + let assignment = decode_xben_frame_to_assignment( frame_bytes, self.inner_variant, - ) { - Ok(assignment) => { - self.previous_assignment = Some(assignment.clone()); - Ok((assignment, count)) - } - Err(e) => Err(e), - }; + ) + .expect("complete frame from pop_frame_from_overflow"); + self.previous_assignment = Some(assignment.clone()); self.overflow.drain(..consumed); - return Some(res); + return Some(Ok((assignment, count))); } } BenVariant::TwoDelta => { // Drain frames from a previously parsed chunk first. // Chunks only contain Delta frames. - if let Some((frame, count)) = self.chunk_queue.pop_front() { + if let Some((pair, run_lengths, count)) = self.chunk_queue.pop_front() { if count == 0 { return Some(Err(zero_count_frame_error())); } - let XBenTwoDeltaFrame::Delta { pair, run_lengths } = frame else { - unreachable!("chunk queue only contains Delta frames"); - }; let assignment = match self.previous_assignment.take() { Some(prev) => { apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) @@ -480,27 +442,19 @@ impl Iterator for XZAssignmentReader { } // Try to parse a columnar chunk. - if let Some(result) = self.try_parse_twodelta_chunk() { - match result { - Ok(()) => continue, // Loop to drain chunk_queue. - Err(e) => return Some(Err(e)), - } + if self.try_parse_twodelta_chunk() { + continue; // Loop to drain chunk_queue. } // Try a single frame from overflow (only Full/tag-0 frames // or errors — tag-1 is no longer supported). if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { let res = match parsed { - Ok((frame, consumed, count)) => { + Ok((runs, consumed, count)) => { if count == 0 { self.overflow.drain(..consumed); return Some(Err(zero_count_frame_error())); } - let XBenTwoDeltaFrame::Full { runs } = frame else { - unreachable!( - "pop_twodelta_frame_from_overflow only returns Full frames" - ); - }; let assignment = rle_to_vec(runs); self.previous_assignment = Some(assignment.clone()); self.overflow.drain(..consumed); diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs index 9ca0df4..ce4f1f0 100644 --- a/ben/src/io/writer/assignment_writer.rs +++ b/ben/src/io/writer/assignment_writer.rs @@ -211,3 +211,19 @@ impl Drop for AssignmentWriter { let _ = self.finish(); } } + +#[cfg(test)] +mod tests { + use super::twodelta_repeat_frame; + use std::io; + + #[test] + fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { + // All-identical-value assignment with 65536 elements: the pair-position + // run reaches u16::MAX and the encoder must error. + let assign = vec![1u16; 65536]; + let err = twodelta_repeat_frame(&assign, 1).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); + } +} diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 2f93639..822ab8d 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -1,5 +1,6 @@ pub mod assignment_writer; pub(crate) mod frames; +#[cfg(test)] pub(crate) mod tests; pub(crate) mod twodelta; pub(crate) mod utils; diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index abc4b00..a9b6a35 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -1,5 +1,5 @@ use crate::io::reader::XZAssignmentReader; -use crate::io::writer::XZAssignmentWriter; +use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; use std::io::Cursor; use xz2::write::XzEncoder; @@ -452,6 +452,21 @@ fn writer_translate_ben_twodelta_chunk_flush() { assert_eq!(results, assignments); } +// ── TwoDelta encoding error propagation ───────────────────────────── + +#[test] +fn xz_writer_twodelta_too_many_ids_propagates_on_write() { + // Writing a third assignment that changes 3 distinct IDs errors at line 228. + let anchor = vec![1u16, 1, 2, 2]; + let invalid = vec![2u16, 3, 1, 3]; // 3 distinct changing ids + let mut xben = Vec::new(); + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + writer.write_assignment(anchor).unwrap(); + let err = writer.write_assignment(invalid).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + // ── MkvChain u16::MAX overflow ─────────────────────────────────────── #[test] diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index 81ed5c1..3a3fd02 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -40,24 +40,19 @@ impl XZAssignmentWriter { None => return Ok(()), }; - match self.variant { - BenVariant::Standard => { - let encoded = encode_ben32_assignments(&pending)?; - self.encoder.write_all(&encoded)?; - } - BenVariant::MkvChain => { - let encoded = encode_ben32_assignments(&pending)?; - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&self.count.to_be_bytes())?; - } - BenVariant::TwoDelta => { - for (idx, &val) in pending.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - let encoded = encode_xben_twodelta_full_frame(&pending); - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&self.count.to_be_bytes())?; + // Standard writes each assignment immediately; MkvChain and TwoDelta buffer. + if self.variant == BenVariant::MkvChain { + let encoded = encode_ben32_assignments(&pending)?; + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&self.count.to_be_bytes())?; + } else { + // TwoDelta + for (idx, &val) in pending.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); } + let encoded = encode_xben_twodelta_full_frame(&pending); + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&self.count.to_be_bytes())?; } self.previous_assignment = pending; @@ -425,3 +420,46 @@ impl Drop for XZAssignmentWriter { let _ = self.finish(); } } + +#[cfg(test)] +mod tests { + use super::*; + use std::io::{self, Read}; + use xz2::write::XzEncoder; + + #[test] + fn twodelta_repeat_buffered_frame_run_exceeds_u16_max_errors() { + let assign = vec![1u16; 65536]; + let result = twodelta_repeat_buffered_frame(&assign, 1); + let err = result.err().expect("expected error"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); + } + + #[test] + fn translate_twodelta_non_eof_read_error_propagates() { + // write_ben_file in TwoDelta mode calls translate_ben_twodelta_to_xben. + // After reading the anchor frame it loops reading delta frames; a + // non-EOF error on pair_a (first u16 read in the loop) must propagate. + let mut xben = Vec::new(); + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + + // Banner (17 bytes) + minimal anchor frame: + // max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1 + let mut input: Vec = b"TWODELTA BEN FILE".to_vec(); + input.extend_from_slice(&[0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01]); + + // Append an error source after the anchor frame bytes. + struct ErrorAfterEof; + impl Read for ErrorAfterEof { + fn read(&mut self, _: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + + let reader = std::io::BufReader::new(input.as_slice().chain(ErrorAfterEof)); + let err = writer.write_ben_file(reader).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); + } +} diff --git a/ben/src/json/graph/tests/test_algos.rs b/ben/src/json/graph/tests/test_algos.rs index 01a0f30..5bf93df 100644 --- a/ben/src/json/graph/tests/test_algos.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -497,3 +497,67 @@ fn test_sort_json_file_by_key_mixed_numeric_and_string() { assert_eq!(output_json["nodes"][1]["key"], 42); assert_eq!(output_json["nodes"][2]["key"], "alpha"); } + +#[test] +fn test_sort_json_file_by_key_missing_attribute_uses_null() { + // When a node lacks the sort key, compare_attr_values receives None + // which maps to the string "null" for comparison purposes. + let input = r#"{ + "nodes": [ + {"id": 0, "rank": 5}, + {"id": 1}, + {"id": 2, "rank": 3} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 0}, {"id": 2}], + [{"id": 1}] + ] + }"#; + + let mut output = Vec::new(); + sort_json_file_by_key(input.as_bytes(), &mut output, "rank").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + // Numeric values (3, 5) sort first; the node missing "rank" sorts as "null" (string). + assert_eq!(output_json["nodes"][0]["rank"], 3); + assert_eq!(output_json["nodes"][1]["rank"], 5); + // The node without "rank" is last (string "null" > numeric). + assert!(output_json["nodes"][2].get("rank").is_none()); +} + +#[test] +fn test_mlc_with_isolated_node() { + // A graph containing an isolated node (no edges) triggers the + // single-node-component early return in mlc_component. + let input = r#"{ + "nodes": [ + {"id": 0}, + {"id": 1}, + {"id": 2}, + {"id": 3} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 0}], + [], + [] + ] + }"#; + + let mut output = Vec::new(); + let mapping = sort_json_file_by_ordering( + input.as_bytes(), + &mut output, + GraphOrderingMethod::MultiLevelCluster, + ) + .unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); + assert_eq!(mapping.len(), 4); + // Every original node appears exactly once in the mapping. + let mut positions: Vec = mapping.values().copied().collect(); + positions.sort(); + assert_eq!(positions, vec![0, 1, 2, 3]); +} diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 6ab7497..95b174a 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -20,6 +20,9 @@ //! - `reben`: A tool for relabeling BEN files to improve compression ratios. //! +#[cfg(not(target_pointer_width = "64"))] +compile_error!("binary-ensemble requires a 64-bit target"); + /// Command-line entrypoints shared by the thin binaries in `src/bin`. pub mod cli; /// Encoding, decoding, and format-to-format translation helpers. diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 35c8181..54af481 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -106,10 +106,11 @@ pub fn extract_assignment_xben( for frame in frame_iterator { let frame = frame.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + frame.1 as usize > sample_number { - match decode_ben32_line(Cursor::new(&frame.0), variant) { - Ok((assignment, _)) => return Ok(assignment), - Err(e) => return Err(SampleError::new_io_error(e)), - }; + // XZAssignmentFrameReader guarantees complete zero-sentinel + // frames, so decode_ben32_line always succeeds here. + let (assignment, _) = decode_ben32_line(Cursor::new(&frame.0), variant) + .expect("complete frame from XZAssignmentFrameReader"); + return Ok(assignment); } current_sample += frame.1 as usize; } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index edfe51c..3ac5f61 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -9,6 +9,20 @@ use rand_chacha::ChaCha8Rng; use rand_distr::{Distribution, Uniform}; use std::collections::HashMap; use std::io; +use std::io::Read; + +/// A reader that returns one byte successfully then an I/O error. +struct ErrorAfterOneByte; + +impl Read for ErrorAfterOneByte { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if buf.is_empty() { + return Ok(0); + } + buf[0] = 0x01; + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } +} fn shuffle_with_mapping(vec: &mut Vec) -> HashMap where @@ -956,3 +970,93 @@ fn test_canonicalize_assignment() { assert_eq!(canonicalize_assignment(&[]), Vec::::new()); assert_eq!(canonicalize_assignment(&[42]), vec![1]); } + +// ── relabel_ben_lines_with_map: LengthMismatch ───────────────────── + +#[test] +fn test_relabel_ben_length_mismatch() { + // Build a BEN stream with assignment length 3 ([1,2,3]), + // then supply a permutation of length 5 — triggers LengthMismatch. + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + let body = &ben[17..]; // strip banner + + // Permutation of length 5 (identity, doesn't matter — length check comes first) + let map: HashMap = (0..5).map(|i| (i, i)).collect(); + + let mut output = Vec::new(); + let err = + relabel_ben_lines_with_map(body, &mut output, map, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!( + err.to_string().contains("length") || err.to_string().contains("mismatch"), + "got: {}", + err + ); +} + +#[test] +fn test_relabel_ben_lines_non_eof_read_error_propagates() { + // relabel_ben_lines_impl returns a non-EOF I/O error when the reader fails. + let mut output = Vec::new(); + let err = relabel_ben_lines(ErrorAfterOneByte, &mut output, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +#[test] +fn test_relabel_ben_file_with_map_non_eof_read_error_propagates() { + // relabel_ben_file_impl returns a non-EOF I/O error when the reader fails. + let map: HashMap = (0..4).map(|i| (i, i)).collect(); + let mut output = Vec::new(); + let err = + relabel_ben_lines_with_map(ErrorAfterOneByte, &mut output, map, BenVariant::Standard) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +#[test] +fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { + // relabel_ben_file_via_decoder propagates decode errors for TwoDelta streams. + // Build a valid 2-sample TwoDelta BEN file, then corrupt the delta frame. + let mut ben: Vec = Vec::new(); + { + let mut writer = crate::io::writer::AssignmentWriter::new(&mut ben, BenVariant::TwoDelta) + .unwrap(); + writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); + writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); + } + // Locate the delta frame start: banner(17) + max_val_bits(1) + max_len_bits(1) + + // n_bytes(4 BE) + payload(n_bytes) + count(2) = anchor_end. + let banner_len = 17usize; + let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let anchor_end = banner_len + 6 + n_bytes + 2; + // Set delta frame's max_len_bits (5th byte) to 0 to trigger InvalidData. + ben[anchor_end + 4] = 0; + + let mut output = Vec::new(); + let err = relabel_ben_file(ben.as_slice(), &mut output).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} + +#[test] +fn test_relabel_ben_file_with_map_twodelta_malformed_frame_error_propagates() { + let mut ben: Vec = Vec::new(); + { + let mut writer = crate::io::writer::AssignmentWriter::new(&mut ben, BenVariant::TwoDelta) + .unwrap(); + writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); + writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); + } + let banner_len = 17usize; + let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let anchor_end = banner_len + 6 + n_bytes + 2; + ben[anchor_end + 4] = 0; + + let map: HashMap = (0..4).map(|i| (i, i)).collect(); + let mut output = Vec::new(); + let err = relabel_ben_file_with_map(ben.as_slice(), &mut output, map) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} From d58276cf324f158e7de03e2483afeb68c0fcbcf1 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:22:26 -0600 Subject: [PATCH 072/221] better json reader --- ben/src/cli/reben.rs | 418 +++++++++++++++++++++ ben/src/json/graph/mod.rs | 44 +-- ben/src/json/graph/petxgraph/nx_convert.rs | 40 +- ben/src/json/graph/petxgraph/sort.rs | 14 +- ben/src/json/graph/tests/test_algos.rs | 122 +++--- ben/tests/test_cli.rs | 6 +- 6 files changed, 519 insertions(+), 125 deletions(-) diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs index 9ddf7af..93f05e1 100644 --- a/ben/src/cli/reben.rs +++ b/ben/src/cli/reben.rs @@ -471,7 +471,31 @@ fn relabeling_label( #[cfg(test)] mod tests { use super::*; + use crate::codec::encode::encode_jsonl_to_ben; use clap::{CommandFactory, Parser}; + use std::{ + fs, + io::Cursor, + time::{SystemTime, UNIX_EPOCH}, + }; + + fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("reben-{name}-{nonce}")) + } + + /// Write a minimal Standard BEN file to a temp path and return the path. + fn write_temp_ben(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + fs::write(&path, &ben).unwrap(); + path + } #[test] fn clap_metadata_uses_package_version() { @@ -556,4 +580,398 @@ mod tests { assert_eq!(args.output_variant, Some(BenCliVariant::TwoDelta)); assert!(args.convert_only); } + + #[test] + fn run_json_mode_rejects_n_items() { + let args = Args::try_parse_from([ + "reben", "x.json", "--mode", "json", "--key", "k", "--n-items", "5", + ]) + .unwrap(); + let err = run_json_mode(args).unwrap_err(); + assert!(err.contains("--n-items")); + } + + #[test] + fn run_ben_mode_rejects_convert_only_without_variant() { + let args = Args::try_parse_from([ + "reben", "x.ben", "--mode", "ben", "--convert-only", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + assert!(err.contains("--output-variant")); + } + + #[test] + fn run_ben_mode_rejects_convert_only_with_relabeling() { + let args = Args::try_parse_from([ + "reben", "x.ben", "--mode", "ben", + "--convert-only", "--output-variant", "standard", "--key", "k", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + assert!(err.contains("--convert-only cannot be combined")); + } + + #[test] + fn ben_variant_name_covers_all_variants() { + assert_eq!(ben_variant_name(BenVariant::Standard), "standard"); + assert_eq!(ben_variant_name(BenVariant::MkvChain), "mkvchain"); + assert_eq!(ben_variant_name(BenVariant::TwoDelta), "twodelta"); + } + + #[test] + fn to_ben_variant_covers_standard() { + assert_eq!(to_ben_variant(&BenCliVariant::Standard), BenVariant::Standard); + } + + #[test] + fn relabeling_label_errors_on_both_key_and_ordering() { + let err = relabeling_label( + Some("k"), + Some(&OrderingMethod::MultiLevelCluster), + ) + .unwrap_err(); + assert!(err.contains("not both")); + } + + #[test] + fn relabeling_label_errors_on_neither() { + let err = relabeling_label(None, None).unwrap_err(); + assert!(err.contains("either")); + } + + #[test] + fn run_ben_mode_with_n_items_limit() { + let input = write_temp_ben("n_items_input.jsonl.ben"); + let out = unique_path("n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", "ben", + "--n-items", "1", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); + } + + #[test] + fn run_json_mode_with_ordering_derives_output_name() { + // Create a minimal graph JSON file. + let shape = unique_path("ordering_shape.json"); + fs::write( + &shape, + br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, + ) + .unwrap(); + let args = Args::try_parse_from([ + "reben", + shape.to_str().unwrap(), + "--mode", "json", + "--ordering", "reverse-cuthill-mckee", + ]) + .unwrap(); + let result = run_json_mode(args); + // Clean up derived output file. + let derived = shape.to_str().unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee_map.json"; + let derived2 = shape.to_str().unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee.jsonl.ben"; + let _ = fs::remove_file(&derived); + let _ = fs::remove_file(&derived2); + let _ = fs::remove_file(&shape); + result.unwrap(); + } + + #[test] + fn run_ben_mode_with_map_file_and_n_items() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::Cursor; + + // Build a 3-node BEN file. + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_n_items.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_n_items_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + + let out = unique_path("map_n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", "ben", + "--map-file", map_path.to_str().unwrap(), + "--n-items", "1", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + + for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } + } + + #[test] + fn run_ben_mode_with_map_file_no_limit() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::Cursor; + + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_nolimit.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_nolimit_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + + let out = unique_path("map_nolimit_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", "ben", + "--map-file", map_path.to_str().unwrap(), + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + + for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } + } + + #[test] + fn run_ben_mode_with_output_variant_and_n_items() { + let input = write_temp_ben("var_n_items.jsonl.ben"); + let out = unique_path("var_n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", input.to_str().unwrap(), + "--mode", "ben", + "--output-variant", "standard", + "--n-items", "1", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); + } + + #[test] + fn run_ben_mode_with_shape_file_and_ordering() { + // Covers the shape_file + ordering path (lines 265-269). + // Creates a map from the shape file ordering, then relabels the BEN. + let input = write_temp_ben("shape_order_input.jsonl.ben"); + let shape = unique_path("shape_order_shape.json"); + fs::write( + &shape, + br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, + ) + .unwrap(); + let out = unique_path("shape_order_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", input.to_str().unwrap(), + "--mode", "ben", + "--shape-file", shape.to_str().unwrap(), + "--ordering", "reverse-cuthill-mckee", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + // Clean up the map file the function derives automatically. + let map = shape.to_str().unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee_map.json"; + let sorted_json = shape.to_str().unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee.json"; + let _ = fs::remove_file(&map); + let _ = fs::remove_file(&sorted_json); + for p in [&input, &shape, &out] { let _ = fs::remove_file(p); } + result.unwrap(); + } + + #[test] + fn run_ben_mode_with_map_file_and_output_variant_n_items() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::Cursor; + + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_var_n.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_var_n_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + let out = unique_path("map_var_n_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", ben_path.to_str().unwrap(), + "--mode", "ben", + "--map-file", map_path.to_str().unwrap(), + "--output-variant", "standard", + "--n-items", "1", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } + } + + #[test] + fn run_ben_mode_with_map_file_and_output_variant_no_limit() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::Cursor; + + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_var_nolim.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_var_nolim_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + let out = unique_path("map_var_nolim_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", ben_path.to_str().unwrap(), + "--mode", "ben", + "--map-file", map_path.to_str().unwrap(), + "--output-variant", "standard", + "--output-file", out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } + } + + #[test] + fn run_ben_mode_map_file_without_output_file_derives_name() { + // Covers the None branch of output_file (lines 306-307). + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::Cursor; + + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let input = unique_path("map_derive.jsonl.ben"); + fs::write(&input, &ben).unwrap(); + + let map_path = unique_path("map_derive_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1},\"key\":\"sort\"}", + ) + .unwrap(); + let args = Args::try_parse_from([ + "reben", input.to_str().unwrap(), + "--mode", "ben", + "--map-file", map_path.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + // Derived output: input stripped of ".jsonl.ben" + "_sorted_by_{label}.jsonl.ben" + let derived = input.to_str().unwrap() + .trim_end_matches(".jsonl.ben") + .to_owned() + + "_sorted_by_sort.jsonl.ben"; + let _ = fs::remove_file(&derived); + for p in [&input, &map_path] { let _ = fs::remove_file(p); } + result.unwrap(); + } + + #[test] + fn read_relabel_map_file_rejects_non_integer_index() { + let map_path = unique_path("bad_index_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"not_a_number\":0}}", + ) + .unwrap(); + let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + assert!(err.contains("invalid old node index")); + let _ = fs::remove_file(&map_path); + } + + #[test] + fn read_relabel_map_file_rejects_non_integer_value() { + let map_path = unique_path("bad_value_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":\"not_a_number\"}}", + ) + .unwrap(); + let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + assert!(err.contains("non-integer")); + let _ = fs::remove_file(&map_path); + } + + #[test] + fn run_ben_mode_canonicalize_derives_output_name() { + let input = write_temp_ben("canon.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", "ben", + ]) + .unwrap(); + // run_ben_mode should succeed and write a derived output file. + let result = run_ben_mode(args); + // Clean up any produced files. + let derived = input + .to_str() + .unwrap() + .trim_end_matches(".jsonl.ben") + .to_owned() + + "_canonicalized_assignments.jsonl.ben"; + let _ = fs::remove_file(&derived); + fs::remove_file(&input).unwrap(); + result.unwrap(); + } + + #[test] + fn run_ben_mode_with_output_variant_derives_name() { + let input = write_temp_ben("variant.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", "ben", + "--output-variant", "standard", + ]) + .unwrap(); + let result = run_ben_mode(args); + let derived = input + .to_str() + .unwrap() + .trim_end_matches(".ben") + .to_owned() + + "_standard.ben"; + let _ = fs::remove_file(&derived); + fs::remove_file(&input).unwrap(); + result.unwrap(); + } } diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index bb0c8d9..9f52c3e 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -47,7 +47,6 @@ pub fn sort_json_file_by_key( ) -> Result> { tracing::trace!("Loading JSON file..."); let nx_graph: NxGraphAdjFormat = serde_json::from_reader(reader)?; - let original_ids = extract_usize_ids(&nx_graph)?; tracing::trace!("Sorting JSON file by key: {}", key); let (result, order) = if nx_graph.directed { @@ -63,7 +62,7 @@ pub fn sort_json_file_by_key( }; write_nx_graph(writer, &result)?; - Ok(build_id_mapping(&original_ids, &order)) + Ok(build_id_mapping(&order)) } /// Reorder a JSON-formatted NetworkX graph file using a topology-based method. @@ -88,7 +87,6 @@ pub fn sort_json_file_by_ordering( ) -> Result> { tracing::trace!("Loading JSON file..."); let nx_graph: NxGraphAdjFormat = serde_json::from_reader(reader)?; - let original_ids = extract_usize_ids(&nx_graph)?; tracing::trace!("Sorting JSON file by ordering method: {:?}", method); let (result, order) = if nx_graph.directed { @@ -104,7 +102,7 @@ pub fn sort_json_file_by_ordering( }; write_nx_graph(writer, &result)?; - Ok(build_id_mapping(&original_ids, &order)) + Ok(build_id_mapping(&order)) } /// Dispatch to the appropriate ordering algorithm. @@ -128,50 +126,20 @@ fn run_ordering_method( } } -/// Extract the integer node ids from an [`NxGraphAdjFormat`] in order. +/// Build a mapping from original node positions to new positions after reordering. /// /// # Arguments /// -/// * `nx_graph` - The parsed NetworkX graph whose node ids are extracted. -/// -/// # Returns -/// -/// A vector of `usize` ids in the same order as `nx_graph.nodes`. -/// -/// # Errors -/// -/// Returns an error if any node id is not a non-negative integer. -fn extract_usize_ids(nx_graph: &NxGraphAdjFormat) -> io::Result> { - nx_graph - .nodes - .iter() - .map(|n| { - n.id.as_u64().map(|v| v as usize).ok_or_else(|| { - Error::new( - ErrorKind::InvalidData, - format!("Node id is not an unsigned integer: {}", n.id), - ) - }) - }) - .collect() -} - -/// Build a mapping from original node ids to new positional ids. -/// -/// # Arguments -/// -/// * `original_ids` - The node ids before reordering, indexed by the old -/// node position. /// * `order` - The permutation that was applied: `order[new_index]` is the /// old `NodeIndex`. /// /// # Returns /// -/// A map where `mapping[original_id] == new_positional_id`. -fn build_id_mapping(original_ids: &[usize], order: &[NodeIndex]) -> HashMap { +/// A map where `mapping[old_position] == new_position`. +fn build_id_mapping(order: &[NodeIndex]) -> HashMap { let mut mapping = HashMap::with_capacity(order.len()); for (new_idx, &old_node_idx) in order.iter().enumerate() { - mapping.insert(original_ids[old_node_idx.index()], new_idx); + mapping.insert(old_node_idx.index(), new_idx); } mapping } diff --git a/ben/src/json/graph/petxgraph/nx_convert.rs b/ben/src/json/graph/petxgraph/nx_convert.rs index e3238bd..b75e440 100644 --- a/ben/src/json/graph/petxgraph/nx_convert.rs +++ b/ben/src/json/graph/petxgraph/nx_convert.rs @@ -129,23 +129,20 @@ where for (source_idx_orig, neighbors) in adjacency.into_iter().enumerate() { let source_idx = NodeIndex::new(source_idx_orig); - let source_node = graph.node_weight(source_idx).ok_or_else(|| { - NxPetgraphError::Other(format!( - "invalid adjacency: source index {} out of bounds for nodes list", - source_idx.index() - )) - })?; - - let source_id = source_node.attrs.get("__networkx_id__").ok_or_else(|| { - NxPetgraphError::Other("missing __networkx_id__ on source node".to_string()) - })?; - - let source_key = serde_json::to_string(source_id).map_err(|e| { - NxPetgraphError::Other(format!( - "failed to serialize source node id to string: {}", - e - )) - })?; + // Adjacency length was validated against nodes length above. + let source_node = graph + .node_weight(source_idx) + .expect("adjacency length validated against nodes length"); + + // __networkx_id__ is always inserted by nx_node_to_petx_node. + let source_id = source_node + .attrs + .get("__networkx_id__") + .expect("__networkx_id__ always set by nx_node_to_petx_node"); + + // serde_json::Value is always serializable. + let source_key = serde_json::to_string(source_id) + .expect("serde_json::Value always serializes"); for edge in neighbors { let target_id = &edge.id; @@ -156,12 +153,9 @@ where if is_directed { graph.add_edge(source_idx, *target_idx, edge); } else { - let target_key = serde_json::to_string(target_id).map_err(|e| { - NxPetgraphError::Other(format!( - "failed to serialize target node id to string: {}", - e - )) - })?; + // serde_json::Value is always serializable. + let target_key = serde_json::to_string(target_id) + .expect("serde_json::Value always serializes"); let edge_key_str = edge .key diff --git a/ben/src/json/graph/petxgraph/sort.rs b/ben/src/json/graph/petxgraph/sort.rs index 6563cda..d8c1c8c 100644 --- a/ben/src/json/graph/petxgraph/sort.rs +++ b/ben/src/json/graph/petxgraph/sort.rs @@ -55,9 +55,9 @@ fn get_sort_attr<'a>(node: &'a PetxNode, key: &str) -> Option<&'a Value> { /// Compare two optional attribute values for sorting. /// -/// Values are compared numerically when both can be interpreted as `u64`. -/// Otherwise they are compared as strings. `None` is treated as the string -/// `"null"`. +/// Values are compared numerically when both can be interpreted as `f64` +/// (covers integers, floats, and numeric strings). Otherwise they are +/// compared as strings. `None` is treated as the string `"null"`. /// /// # Arguments /// @@ -68,17 +68,17 @@ fn get_sort_attr<'a>(node: &'a PetxNode, key: &str) -> Option<&'a Value> { /// /// An [`Ordering`] suitable for use in a sort comparator. fn compare_attr_values(a: Option<&Value>, b: Option<&Value>) -> Ordering { - let extract = |val: Option<&Value>| -> Result { + let extract = |val: Option<&Value>| -> Result { match val { - Some(Value::String(s)) => s.parse::().map_err(|_| s.clone()), - Some(Value::Number(n)) => n.as_u64().ok_or_else(|| n.to_string()), + Some(Value::String(s)) => s.parse::().map_err(|_| s.clone()), + Some(Value::Number(n)) => n.as_f64().ok_or_else(|| n.to_string()), Some(v) => Err(v.to_string()), None => Err("null".to_string()), } }; match (extract(a), extract(b)) { - (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num), + (Ok(a_num), Ok(b_num)) => a_num.partial_cmp(&b_num).unwrap_or(Ordering::Equal), (Err(a_str), Err(b_str)) => a_str.cmp(&b_str), (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()), (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str), diff --git a/ben/src/json/graph/tests/test_algos.rs b/ben/src/json/graph/tests/test_algos.rs index 5bf93df..d88e267 100644 --- a/ben/src/json/graph/tests/test_algos.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -324,57 +324,6 @@ fn test_sort_json_file_by_multi_level_cluster() { assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); } -#[test] -fn test_extract_usize_ids_rejects_non_integer_node_id() { - let input = r#"{ - "nodes": [ - {"id": 0}, - {"id": "not-a-number"}, - {"id": 2} - ], - "adjacency": [ - [{"id": 2}], - [{"id": 0}], - [{"id": 0}] - ] - }"#; - let mut output = Vec::new(); - let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); - assert!(err.to_string().contains("not an unsigned integer")); -} - -#[test] -fn test_extract_usize_ids_rejects_negative_node_id() { - let input = r#"{ - "nodes": [ - {"id": -1}, - {"id": 1} - ], - "adjacency": [ - [{"id": 1}], - [{"id": -1}] - ] - }"#; - let mut output = Vec::new(); - let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); -} - -#[test] -fn test_extract_usize_ids_rejects_float_node_id() { - let input = r#"{ - "nodes": [ - {"id": 1.5} - ], - "adjacency": [ - [] - ] - }"#; - let mut output = Vec::new(); - let err = sort_json_file_by_key(input.as_bytes(), &mut output, "id").unwrap_err(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); -} #[test] fn test_sort_by_ordering_directed_rcm() { @@ -468,9 +417,10 @@ fn test_sort_json_file_by_key_id() { assert_eq!(output_json["nodes"][0]["id"], 0); assert_eq!(output_json["nodes"][1]["id"], 1); assert_eq!(output_json["nodes"][2]["id"], 2); - assert_eq!(mapping[&0], 0); - assert_eq!(mapping[&1], 1); - assert_eq!(mapping[&2], 2); + // Map keys are original positions (0=id2, 1=id0, 2=id1), values are new positions. + assert_eq!(mapping[&0], 2); // pos 0 (id=2) → new pos 2 + assert_eq!(mapping[&1], 0); // pos 1 (id=0) → new pos 0 + assert_eq!(mapping[&2], 1); // pos 2 (id=1) → new pos 1 } #[test] @@ -561,3 +511,67 @@ fn test_mlc_with_isolated_node() { positions.sort(); assert_eq!(positions, vec![0, 1, 2, 3]); } + +#[test] +fn test_sort_json_file_by_key_fips_string_ids() { + // Node IDs are FIPS codes stored as JSON strings ("360191010003" etc.). + // The mapping must use original positions (0-indexed) as keys, not the + // raw FIPS values, so that downstream BEN relabeling can index correctly. + let input = r#"{ + "nodes": [ + {"id": "360191010003", "rank": 30}, + {"id": "360191010001", "rank": 10}, + {"id": "360191010002", "rank": 20} + ], + "adjacency": [ + [{"id": "360191010001"}], + [{"id": "360191010002"}], + [{"id": "360191010003"}] + ] + }"#; + + let mut output = Vec::new(); + let mapping = sort_json_file_by_key(input.as_bytes(), &mut output, "rank").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + // After sorting by rank: pos1(rank=10) → 0, pos2(rank=20) → 1, pos0(rank=30) → 2. + assert_eq!(output_json["nodes"][0]["rank"], 10); + assert_eq!(output_json["nodes"][1]["rank"], 20); + assert_eq!(output_json["nodes"][2]["rank"], 30); + // Map keys are original positions (not FIPS codes). + assert_eq!(mapping[&0], 2); // pos 0 (rank=30) → new pos 2 + assert_eq!(mapping[&1], 0); // pos 1 (rank=10) → new pos 0 + assert_eq!(mapping[&2], 1); // pos 2 (rank=20) → new pos 1 + // All new positions 0..N-1 are valid BEN array indices. + let mut new_positions: Vec = mapping.values().copied().collect(); + new_positions.sort(); + assert_eq!(new_positions, vec![0, 1, 2]); +} + +#[test] +fn test_sort_json_file_by_key_float_sort_values() { + // Sort key values are JSON floats; they should sort numerically, not as strings + // (e.g. 1.5 < 10.0, not "1.5" < "10.0" would also hold, but 2.5 < 10.0 would break + // lexicographically as "10.0" < "2.5"). + let input = r#"{ + "nodes": [ + {"id": 0, "score": 10.0}, + {"id": 1, "score": 2.5}, + {"id": 2, "score": 1.5} + ], + "adjacency": [ + [{"id": 1}], + [{"id": 2}], + [{"id": 0}] + ] + }"#; + + let mut output = Vec::new(); + sort_json_file_by_key(input.as_bytes(), &mut output, "score").unwrap(); + let output_json: Value = serde_json::from_slice(&output).unwrap(); + + // Numeric order: 1.5 < 2.5 < 10.0 (lexicographic "10.0" < "2.5" would be wrong). + assert_eq!(output_json["nodes"][0]["score"], 1.5); + assert_eq!(output_json["nodes"][1]["score"], 2.5); + assert_eq!(output_json["nodes"][2]["score"], 10.0); +} diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 451582f..d997b19 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -906,7 +906,7 @@ fn reben_cli_json_and_ben_modes_work() { ) .unwrap(); let relabeled_text = String::from_utf8(relabeled_jsonl).unwrap(); - assert!(relabeled_text.contains(r#""assignment":[9,4,9]"#)); + assert!(relabeled_text.contains(r#""assignment":[9,9,4]"#)); } #[test] @@ -1079,7 +1079,7 @@ fn reben_cli_can_limit_ben_relabeling_to_first_n_items() { .unwrap(); assert_eq!( String::from_utf8(relabeled_jsonl).unwrap(), - "{\"assignment\":[9,4,9],\"sample\":1}\n" + "{\"assignment\":[9,9,4],\"sample\":1}\n" ); } @@ -1171,7 +1171,7 @@ fn reben_cli_supports_twodelta_ben_mode() { .unwrap(); assert!(String::from_utf8(relabeled_jsonl) .unwrap() - .contains(r#""assignment":[1,2,1]"#)); + .contains(r#""assignment":[2,1,1]"#)); } #[test] From 75049bf29c97ec7bb810fe3d886a8a10ed4ad0c4 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 19:35:44 -0600 Subject: [PATCH 073/221] we will continue testing until moral improves --- ben/src/cli/ben.rs | 74 +++++++++++++- ben/src/cli/bendl.rs | 185 +++++++++++++++++++++++++++++++++- ben/src/cli/pben.rs | 31 ++++++ ben/src/codec/decode/ben.rs | 49 ++++++--- ben/src/codec/decode/jsonl.rs | 30 ++++++ ben/src/io/bundle/reader.rs | 12 +-- ben/src/io/writer/tests.rs | 2 +- ben/src/logging.rs | 5 +- 8 files changed, 357 insertions(+), 31 deletions(-) diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs index aa50bb7..d5e3324 100644 --- a/ben/src/cli/ben.rs +++ b/ben/src/cli/ben.rs @@ -1144,8 +1144,8 @@ mod tests { #[test] fn append_graph_asset_adds_graph_to_bundle() { - use crate::io::bundle::{AddAssetOptions, BendlReader, BendlWriter}; - use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH}; + use crate::io::bundle::{BendlReader, BendlWriter}; + use crate::io::bundle::format::AssignmentFormat; use std::io::Cursor; // Build a minimal finalized .bendl in memory, write to temp file. @@ -1315,4 +1315,74 @@ mod tests { let _ = fs::remove_file(&jsonl); let _ = fs::remove_file(&out); } + + #[test] + fn append_graph_asset_errors_when_bundle_already_has_graph() { + use crate::io::bundle::{AddAssetOptions, BendlWriter}; + use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH}; + use std::io::Cursor; + + // Build a .bendl that already contains graph.json. + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}", AddAssetOptions::defaults().json()) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("dup_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + // graph.json already exists — add_asset must fail with duplicate name. + let graph_path = write_temp_graph("dup_graph.json"); + let err = append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to add graph asset")); + + let _ = fs::remove_file(&bendl_path); + let _ = fs::remove_file(&graph_path); + } + + #[test] + fn run_xencode_bundle_with_graph_errors_on_invalid_jsonl() { + // from_ben=false path: encode_jsonl_to_xben fails on invalid JSONL. + let bad_jsonl = unique_path("bad.jsonl"); + fs::write(&bad_jsonl, b"not valid json\n").unwrap(); + let graph = write_temp_graph("xenc_bad_jsonl_graph.json"); + let out = unique_path("xenc_bad_jsonl.bendl"); + + let err = run_xencode_bundle_with_graph( + &bad_jsonl, out.to_str().unwrap(), BenVariant::Standard, false, + None, None, None, &graph, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + let _ = fs::remove_file(&bad_jsonl); + let _ = fs::remove_file(&graph); + let _ = fs::remove_file(&out); + } + + #[test] + fn run_xencode_bundle_with_graph_errors_on_invalid_ben() { + // from_ben=true path: encode_ben_to_xben fails on a file with no BEN banner. + let bad_ben = unique_path("bad.ben"); + fs::write(&bad_ben, b"this is not a ben file").unwrap(); + let graph = write_temp_graph("xenc_bad_ben_graph.json"); + let out = unique_path("xenc_bad_ben.bendl"); + + let err = run_xencode_bundle_with_graph( + &bad_ben, out.to_str().unwrap(), BenVariant::Standard, true, + None, None, None, &graph, + ) + .unwrap_err(); + // encode_ben_to_xben fails when it can't read a valid banner. + assert!(err.kind() != io::ErrorKind::NotFound); + + let _ = fs::remove_file(&bad_ben); + let _ = fs::remove_file(&graph); + let _ = fs::remove_file(&out); + } } diff --git a/ben/src/cli/bendl.rs b/ben/src/cli/bendl.rs index df455f5..279a71a 100644 --- a/ben/src/cli/bendl.rs +++ b/ben/src/cli/bendl.rs @@ -376,9 +376,11 @@ fn run_extract(args: ExtractArgs) -> Result<(), String> { .assignment_stream_reader() .map_err(|e| format!("failed to open stream region: {e}"))?; io::copy(&mut stream, &mut out).map_err(|e| format!("failed to copy stream bytes: {e}"))?; - } else if let Some(name) = args.asset.as_deref() { + } else { + // asset is Some — validated by the early return above. + let name = args.asset.unwrap(); let entry = reader - .find_asset_by_name(name) + .find_asset_by_name(&name) .cloned() .ok_or_else(|| format!("no asset named {name:?} in bundle"))?; let mut asset = reader @@ -750,4 +752,183 @@ mod tests { let err = run_extract(args).unwrap_err(); assert!(err.contains("either --stream or --asset")); } + + #[test] + fn run_create_errors_on_missing_metadata_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-meta-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_meta.bendl"); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: Some(unique_path("nonexistent_meta.json")), + relabel_map: None, + assets: vec![], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); + } + + #[test] + fn run_create_errors_on_missing_relabel_map_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-relabel-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let mut b = Vec::new(); + encode_jsonl_to_ben( + Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), + &mut b, + crate::BenVariant::Standard, + ).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_relabel.bendl"); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: Some(unique_path("nonexistent_relabel.json")), + assets: vec![], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); + } + + #[test] + fn run_create_errors_on_missing_custom_asset_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-custom-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let mut b = Vec::new(); + encode_jsonl_to_ben( + Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), + &mut b, + crate::BenVariant::Standard, + ).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_custom.bendl"); + let nonexistent: PathBuf = unique_path("nonexistent.bin"); + let asset_str = format!("myasset={}", nonexistent.display()); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![asset_str.parse().unwrap()], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); + } + + #[test] + fn run_extract_asset_by_name() { + use crate::io::bundle::AddAssetOptions; + use crate::io::bundle::format::ASSET_TYPE_CUSTOM; + + // Build a bundle with a named asset then extract it. + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_asset(ASSET_TYPE_CUSTOM, "hello.txt", b"world", AddAssetOptions::defaults()) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + let bendl = unique_path("extract_asset.bendl"); + std::fs::write(&bendl, &buf).unwrap(); + + let out = unique_path("extract_asset_out.txt"); + let args = ExtractArgs::try_parse_from([ + "extract", + "--asset", "hello.txt", + "--output", out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + run_extract(args).unwrap(); + assert_eq!(std::fs::read(&out).unwrap(), b"world"); + + let _ = std::fs::remove_file(&bendl); + let _ = std::fs::remove_file(&out); + } + + #[test] + fn run_append_errors_on_missing_metadata_file() { + let bendl = write_temp_bendl("append_err_meta.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: Some(unique_path("nonexistent_meta.json")), + relabel_map: None, + assets: vec![], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); + } + + #[test] + fn run_append_errors_on_missing_relabel_map_file() { + let bendl = write_temp_bendl("append_err_relabel.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: Some(unique_path("nonexistent_relabel.json")), + assets: vec![], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); + } + + #[test] + fn run_append_errors_on_missing_custom_asset_file() { + let bendl = write_temp_bendl("append_err_custom.bendl", AssignmentFormat::Ben); + let nonexistent = unique_path("nonexistent_custom.bin"); + let asset_str = format!("myasset={}", nonexistent.display()); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![asset_str.parse().unwrap()], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); + } } diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben.rs index e48e446..8b7948d 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben.rs @@ -367,4 +367,35 @@ mod tests { assert!(rendered.contains(r#""assignment":[1,2,2]"#)); assert!(rendered.contains(r#""assignment":[3,3,1]"#)); } + + #[test] + fn assignment_decode_ben_iterator_error_propagates() { + // Provides a valid BEN banner so AssignmentReader::new succeeds, + // then returns a non-EOF error on the next read so the iterator + // fires the Err(e) => return Err(e) arm (line 204). + use std::io::Read; + use crate::format::banners::STANDARD_BEN_BANNER; + + struct BannerThenError { + banner: &'static [u8], + pos: usize, + } + impl Read for BannerThenError { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos < self.banner.len() { + let n = buf.len().min(self.banner.len() - self.pos); + buf[..n].copy_from_slice(&self.banner[self.pos..self.pos + n]); + self.pos += n; + Ok(n) + } else { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + } + + let reader = BannerThenError { banner: STANDARD_BEN_BANNER, pos: 0 }; + let mut out = Vec::new(); + let err = assignment_decode_ben(reader, &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); + } } diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 2d0b120..63b5d25 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -74,30 +74,51 @@ pub fn decode_ben_line( len_set = false; } + // The while condition guarantees enough bits for a complete (val, len) pair. + // len_set is always false on entry (reset by the outer for body above), + // so we extract len unconditionally. while n_bits_in_buff >= max_val_bits as u16 + max_len_bits as u16 { - if n_bits_in_buff >= max_val_bits as u16 && !val_set { + if !val_set { val = (buffer >> (32 - max_val_bits)) as u16; buffer <<= max_val_bits; n_bits_in_buff -= max_val_bits as u16; - val_set = true; } - if n_bits_in_buff >= max_len_bits as u16 && val_set && !len_set { - len = (buffer >> (32 - max_len_bits)) as u16; - buffer <<= max_len_bits; - n_bits_in_buff -= max_len_bits as u16; - len_set = true; - } + len = (buffer >> (32 - max_len_bits)) as u16; + buffer <<= max_len_bits; + n_bits_in_buff -= max_len_bits as u16; - if val_set && len_set { - if len > 0 { - output_rle.push((val, len)); - } - val_set = false; - len_set = false; + if len > 0 { + output_rle.push((val, len)); } + val_set = false; } } Ok(output_rle) } + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn decode_ben_line_skips_zero_length_run() { + // max_val_bits=1, max_len_bits=1, 1 byte payload = 0x80. + // Bit layout: [val=1][len=0] → run with len=0 is not pushed. + let result = decode_ben_line(Cursor::new(&[0x80u8]), 1, 1, 1).unwrap(); + assert!(result.is_empty()); + } + + #[test] + fn decode_ben_line_partial_bits_skip_val_len_check() { + // max_val_bits=8, max_len_bits=8 → each run requires 2 bytes. + // After byte 1: val_set=true, len_set=false → `if val_set && len_set` + // is false (the `}` closing that block is the false-path counter in + // LLVM coverage). + // After byte 2: both set → run (1, 3) is pushed. + let result = decode_ben_line(Cursor::new(&[0x01u8, 0x03u8]), 8, 8, 2).unwrap(); + assert_eq!(result, vec![(1u16, 3u16)]); + } +} diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index f378d62..4dbd7e3 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -133,3 +133,33 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i tracing::trace!("Done!"); Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::codec::encode::encode_jsonl_to_xben; + use crate::BenVariant; + use std::io::{self, BufReader}; + use std::time::{SystemTime, UNIX_EPOCH}; + + #[test] + fn decode_xben_to_jsonl_writer_error_propagates() { + // Build a valid Standard XBEN stream. + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut xben = Vec::new(); + encode_jsonl_to_xben(jsonl.as_slice(), &mut xben, BenVariant::Standard, Some(1), Some(1), None) + .unwrap(); + + // Use a read-only File as the writer — writing to it fails with a + // permission error, which propagates through the jsonl_decode_ben32 + // call at line 128 of this file. No custom Write impl needed. + let nonce = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + let path = std::env::temp_dir().join(format!("xben-ro-{nonce}.tmp")); + std::fs::write(&path, b"").unwrap(); + let ro_file = std::fs::File::open(&path).unwrap(); // read-only + // Writing to a read-only file fails — the exact error kind varies by OS. + let err = decode_xben_to_jsonl(BufReader::new(xben.as_slice()), ro_file).unwrap_err(); + assert!(err.kind() != io::ErrorKind::UnexpectedEof); + let _ = std::fs::remove_file(path); + } +} diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index f65354e..b56e6b5 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -700,14 +700,10 @@ mod tests { // asset bytes must surface an error eventually (short read vs // declared length). xz would also trip on this, but this is the // raw-asset path. - match reader.asset_bytes(&entry) { - Ok(bytes) => { - // At the very least the returned bytes should not pretend - // to fill u64::MAX — saturate at what the file actually had. - assert!(bytes.len() < u64::MAX as usize); - } - Err(_) => {} - } + // Either returns an error or a slice shorter than u64::MAX. + reader.asset_bytes(&entry) + .map(|b| assert!(b.len() < u64::MAX as usize)) + .ok(); } #[test] diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index a9b6a35..dd3e569 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -1,5 +1,5 @@ use crate::io::reader::XZAssignmentReader; -use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; +use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; use std::io::Cursor; use xz2::write::XzEncoder; diff --git a/ben/src/logging.rs b/ben/src/logging.rs index cd3b868..d128d70 100644 --- a/ben/src/logging.rs +++ b/ben/src/logging.rs @@ -1,5 +1,4 @@ use std::sync::Once; -use tracing::Level; use tracing_subscriber::EnvFilter; static INIT_LOGGER: Once = Once::new(); @@ -48,7 +47,5 @@ pub fn init_logging() { /// /// This function does not return a value. pub fn trace_progress(args: std::fmt::Arguments<'_>) { - if tracing::enabled!(Level::TRACE) { - eprint!("{args}"); - } + tracing::trace!("{args}"); } From 70912ca579b029726e00033300d99d74998f9aaf Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 19:51:36 -0600 Subject: [PATCH 074/221] move writer tests --- ben/src/io/writer/assignment_writer.rs | 21 ++------- ben/src/io/writer/tests.rs | 55 +++++++++++++++++++++++ ben/src/io/writer/xz_assignment_writer.rs | 45 +------------------ 3 files changed, 60 insertions(+), 61 deletions(-) diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs index ce4f1f0..ecb2107 100644 --- a/ben/src/io/writer/assignment_writer.rs +++ b/ben/src/io/writer/assignment_writer.rs @@ -162,7 +162,10 @@ impl AssignmentWriter { } } -fn twodelta_repeat_frame(assignment: &[u16], count: u16) -> io::Result { +pub(super) fn twodelta_repeat_frame( + assignment: &[u16], + count: u16, +) -> io::Result { let first = assignment.first().copied().unwrap_or(0); let second = assignment .iter() @@ -211,19 +214,3 @@ impl Drop for AssignmentWriter { let _ = self.finish(); } } - -#[cfg(test)] -mod tests { - use super::twodelta_repeat_frame; - use std::io; - - #[test] - fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { - // All-identical-value assignment with 65536 elements: the pair-position - // run reaches u16::MAX and the encoder must error. - let assign = vec![1u16; 65536]; - let err = twodelta_repeat_frame(&assign, 1).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("u16::MAX")); - } -} diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index dd3e569..bac4962 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -486,3 +486,58 @@ fn writer_mkv_count_overflow_u16max() { let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); assert_eq!(total, n); } + +// ── Private helper coverage (relocated from sibling source files) ───── + +#[test] +fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { + use super::assignment_writer::twodelta_repeat_frame; + use std::io; + + // All-identical-value assignment with 65536 elements: the pair-position + // run reaches u16::MAX and the encoder must error. + let assign = vec![1u16; 65536]; + let err = twodelta_repeat_frame(&assign, 1).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); +} + +#[test] +fn twodelta_repeat_buffered_frame_run_exceeds_u16_max_errors() { + use super::xz_assignment_writer::twodelta_repeat_buffered_frame; + use std::io; + + let assign = vec![1u16; 65536]; + let result = twodelta_repeat_buffered_frame(&assign, 1); + let err = result.err().expect("expected error"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("u16::MAX")); +} + +#[test] +fn translate_twodelta_non_eof_read_error_propagates() { + use std::io::{self, Read}; + + // write_ben_file in TwoDelta mode calls translate_ben_twodelta_to_xben. + // After reading the anchor frame it loops reading delta frames; a + // non-EOF error on pair_a (first u16 read in the loop) must propagate. + let mut xben = Vec::new(); + let encoder = XzEncoder::new(&mut xben, 1); + let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + + // Banner (17 bytes) + minimal anchor frame: + // max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1 + let mut input: Vec = b"TWODELTA BEN FILE".to_vec(); + input.extend_from_slice(&[0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01]); + + struct ErrorAfterEof; + impl Read for ErrorAfterEof { + fn read(&mut self, _: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + + let reader = std::io::BufReader::new(input.as_slice().chain(ErrorAfterEof)); + let err = writer.write_ben_file(reader).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index 3a3fd02..b680ece 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -368,7 +368,7 @@ impl XZAssignmentWriter { } } -fn twodelta_repeat_buffered_frame( +pub(super) fn twodelta_repeat_buffered_frame( assignment: &[u16], count: u16, ) -> io::Result { @@ -420,46 +420,3 @@ impl Drop for XZAssignmentWriter { let _ = self.finish(); } } - -#[cfg(test)] -mod tests { - use super::*; - use std::io::{self, Read}; - use xz2::write::XzEncoder; - - #[test] - fn twodelta_repeat_buffered_frame_run_exceeds_u16_max_errors() { - let assign = vec![1u16; 65536]; - let result = twodelta_repeat_buffered_frame(&assign, 1); - let err = result.err().expect("expected error"); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("u16::MAX")); - } - - #[test] - fn translate_twodelta_non_eof_read_error_propagates() { - // write_ben_file in TwoDelta mode calls translate_ben_twodelta_to_xben. - // After reading the anchor frame it loops reading delta frames; a - // non-EOF error on pair_a (first u16 read in the loop) must propagate. - let mut xben = Vec::new(); - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); - - // Banner (17 bytes) + minimal anchor frame: - // max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1 - let mut input: Vec = b"TWODELTA BEN FILE".to_vec(); - input.extend_from_slice(&[0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01]); - - // Append an error source after the anchor frame bytes. - struct ErrorAfterEof; - impl Read for ErrorAfterEof { - fn read(&mut self, _: &mut [u8]) -> io::Result { - Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) - } - } - - let reader = std::io::BufReader::new(input.as_slice().chain(ErrorAfterEof)); - let err = writer.write_ben_file(reader).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); - } -} From 5b35451bf1ec11a3eae8b3646e54547bfad16370 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:10:07 -0600 Subject: [PATCH 075/221] reorganize bundle module --- ben/src/io/bundle/format.rs | 268 ------ ben/src/io/bundle/manifest.rs | 28 - ben/src/io/bundle/mod.rs | 3 + ben/src/io/bundle/reader.rs | 810 ---------------- ben/src/io/bundle/tests/format.rs | 263 ++++++ ben/src/io/bundle/tests/manifest.rs | 24 + ben/src/io/bundle/tests/mod.rs | 4 + ben/src/io/bundle/tests/reader.rs | 784 +++++++++++++++ ben/src/io/bundle/tests/writer.rs | 1328 ++++++++++++++++++++++++++ ben/src/io/bundle/writer.rs | 1365 --------------------------- 10 files changed, 2406 insertions(+), 2471 deletions(-) create mode 100644 ben/src/io/bundle/tests/format.rs create mode 100644 ben/src/io/bundle/tests/manifest.rs create mode 100644 ben/src/io/bundle/tests/mod.rs create mode 100644 ben/src/io/bundle/tests/reader.rs create mode 100644 ben/src/io/bundle/tests/writer.rs diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 7552b03..034d0ff 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -442,271 +442,3 @@ impl From for io::Error { } } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn magic_is_eight_bytes_and_matches_spec() { - assert_eq!(BENDL_MAGIC.len(), 8); - assert_eq!(&BENDL_MAGIC[..5], b"BENDL"); - } - - #[test] - fn canonical_name_lookup() { - assert_eq!( - canonical_name_for(ASSET_TYPE_METADATA), - Some("metadata.json") - ); - assert_eq!(canonical_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); - assert_eq!( - canonical_name_for(ASSET_TYPE_RELABEL_MAP), - Some("relabel_map.json") - ); - assert_eq!(canonical_name_for(ASSET_TYPE_CUSTOM), None); - assert_eq!(canonical_name_for(9999), None); - } - - #[test] - fn default_compression_policy() { - assert!(default_compresses_by_type(ASSET_TYPE_GRAPH)); - assert!(!default_compresses_by_type(ASSET_TYPE_METADATA)); - assert!(!default_compresses_by_type(ASSET_TYPE_RELABEL_MAP)); - assert!(!default_compresses_by_type(ASSET_TYPE_CUSTOM)); - } - - #[test] - fn assignment_format_roundtrip() { - for fmt in [AssignmentFormat::Ben, AssignmentFormat::Xben] { - assert_eq!(AssignmentFormat::from_u8(fmt.to_u8()), Some(fmt)); - } - assert_eq!(AssignmentFormat::from_u8(0), None); - assert_eq!(AssignmentFormat::from_u8(255), None); - } - - #[test] - fn header_is_exactly_64_bytes() { - let header = BendlHeader::provisional(AssignmentFormat::Ben, 64); - assert_eq!(header.to_bytes().len(), HEADER_SIZE); - assert_eq!(HEADER_SIZE, 64); - } - - #[test] - fn header_round_trip_provisional() { - let header = BendlHeader::provisional(AssignmentFormat::Xben, 64); - let decoded = BendlHeader::from_bytes(&header.to_bytes()).unwrap(); - assert_eq!(header, decoded); - assert!(!decoded.is_complete()); - assert_eq!( - decoded.assignment_format_typed(), - Some(AssignmentFormat::Xben) - ); - assert_eq!(decoded.sample_count, -1); - assert_eq!(decoded.stream_len, 0); - assert_eq!(decoded.directory_offset, 0); - } - - #[test] - fn header_round_trip_finalized() { - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: ASSIGNMENT_FORMAT_BEN, - reserved_0: 0, - flags: 0x0000_0000_0000_000F, - directory_offset: 1_000_000, - directory_len: 256, - stream_offset: 64, - stream_len: 999_936, - sample_count: 4242, - }; - let bytes = header.to_bytes(); - let decoded = BendlHeader::from_bytes(&bytes).unwrap(); - assert_eq!(decoded, header); - assert!(decoded.is_complete()); - } - - #[test] - fn header_rejects_invalid_magic() { - let mut header = BendlHeader::provisional(AssignmentFormat::Ben, 64); - header.magic = *b"NOTABEND"; - let err = BendlHeader::from_bytes(&header.to_bytes()).unwrap_err(); - assert!(matches!(err, BendlFormatError::InvalidMagic(_))); - } - - #[test] - fn header_rejects_unsupported_major_version() { - let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); - bytes[8..10].copy_from_slice(&999u16.to_le_bytes()); - let err = BendlHeader::from_bytes(&bytes).unwrap_err(); - assert!(matches!( - err, - BendlFormatError::UnsupportedMajorVersion { found: 999, .. } - )); - } - - #[test] - fn directory_entry_round_trip_no_checksum() { - let entry = BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), - payload_offset: 128, - payload_len: 4096, - checksum: None, - }; - let bytes = entry.to_bytes().unwrap(); - assert_eq!(bytes.len(), entry.encoded_len()); - let mut cursor = &bytes[..]; - let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); - assert_eq!(decoded, entry); - } - - #[test] - fn directory_entry_round_trip_with_checksum() { - let entry = BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: ASSET_FLAG_CHECKSUM, - name: "custom_blob".to_string(), - payload_offset: 2048, - payload_len: 512, - checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]), - }; - let bytes = entry.to_bytes().unwrap(); - let mut cursor = &bytes[..]; - let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); - assert_eq!(decoded, entry); - assert_eq!( - decoded.checksum.unwrap(), - vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE] - ); - } - - #[test] - fn directory_table_round_trip() { - let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), - payload_offset: 64, - payload_len: 2048, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: ASSET_FLAG_JSON, - name: CANONICAL_NAME_METADATA.to_string(), - payload_offset: 2112, - payload_len: 128, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "provenance.bin".to_string(), - payload_offset: 2240, - payload_len: 32, - checksum: None, - }, - ]; - - let encoded = encode_directory(&entries).unwrap(); - let mut cursor = &encoded[..]; - let decoded = read_directory(&mut cursor).unwrap(); - assert_eq!(decoded, entries); - } - - #[test] - fn empty_directory_table_round_trip() { - let encoded = encode_directory(&[]).unwrap(); - assert_eq!(encoded, vec![0, 0, 0, 0]); - let mut cursor = &encoded[..]; - let decoded = read_directory(&mut cursor).unwrap(); - assert!(decoded.is_empty()); - } - - #[test] - fn header_and_directory_entry_header_sizes_are_stable() { - // These sizes are baked into the on-disk format; regressing them - // would silently break existing bundles. - assert_eq!(HEADER_SIZE, 64); - assert_eq!(DIRECTORY_ENTRY_HEADER_SIZE, 28); - } - - #[test] - fn directory_entry_name_too_long() { - let entry = BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "x".repeat(u16::MAX as usize + 1), - payload_offset: 0, - payload_len: 0, - checksum: None, - }; - let err = entry.to_bytes().unwrap_err(); - assert!(matches!(err, BendlFormatError::NameTooLong { .. })); - assert!(err.to_string().contains("exceeds")); - } - - #[test] - fn directory_entry_name_not_utf8() { - let mut bytes = BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "ok".to_string(), - payload_offset: 0, - payload_len: 0, - checksum: None, - } - .to_bytes() - .unwrap(); - - // Patch the name bytes to invalid UTF-8 (0xFF 0xFE) - let name_offset = DIRECTORY_ENTRY_HEADER_SIZE; - bytes[name_offset] = 0xFF; - bytes[name_offset + 1] = 0xFE; - - let mut cursor = &bytes[..]; - let err = BendlDirectoryEntry::read_from(&mut cursor).unwrap_err(); - assert!(matches!(err, BendlFormatError::NameNotUtf8)); - assert!(err.to_string().contains("UTF-8")); - } - - #[test] - fn header_read_from_truncated() { - let short = [0u8; 10]; - let err = BendlHeader::read_from(&mut &short[..]).unwrap_err(); - assert!(matches!(err, BendlFormatError::Io(_))); - } - - #[test] - fn bendl_format_error_io_passthrough() { - let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); - let fmt_err = BendlFormatError::Io(inner); - let io_err: io::Error = fmt_err.into(); - assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); - assert_eq!(io_err.to_string(), "pipe broke"); - } - - #[test] - fn bendl_format_error_non_io_becomes_invalid_data() { - let fmt_err = BendlFormatError::MalformedDirectory("bad dir".to_string()); - let io_err: io::Error = fmt_err.into(); - assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); - assert!(io_err.to_string().contains("bad dir")); - } - - #[test] - fn trailing_directory_bytes_error_display() { - let err = BendlFormatError::TrailingDirectoryBytes { remaining: 42 }; - assert!(err.to_string().contains("42")); - assert!(err.to_string().contains("trailing")); - } -} diff --git a/ben/src/io/bundle/manifest.rs b/ben/src/io/bundle/manifest.rs index b6c2120..1c41e50 100644 --- a/ben/src/io/bundle/manifest.rs +++ b/ben/src/io/bundle/manifest.rs @@ -29,31 +29,3 @@ pub struct BendlManifest { /// Whether the bundle was finalized successfully. pub complete: bool, } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn manifest_json_round_trip() { - let manifest = BendlManifest { - major_version: 1, - minor_version: 0, - assignment_format: "xben".to_string(), - variant: Some("mkv_chain".to_string()), - complete: false, - }; - let json = serde_json::to_string(&manifest).unwrap(); - let decoded: BendlManifest = serde_json::from_str(&json).unwrap(); - assert_eq!(decoded, manifest); - } - - #[test] - fn manifest_accepts_missing_variant() { - let json = - r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; - let decoded: BendlManifest = serde_json::from_str(json).unwrap(); - assert_eq!(decoded.variant, None); - assert!(decoded.complete); - } -} diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index 369a169..c0b2824 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -18,6 +18,9 @@ pub mod manifest; pub mod reader; pub mod writer; +#[cfg(test)] +mod tests; + pub use reader::{ BendlReader, BundleAssignmentReader, BundleAssignmentReaderError, BundleValidationError, }; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index b56e6b5..6b41a98 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -286,813 +286,3 @@ pub enum BundleValidationError { }, } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use std::io::{Cursor, Write}; - - use xz2::write::XzEncoder; - - use super::*; - use crate::io::bundle::format::{ - encode_directory, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, - ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, - COMPLETE_YES, HEADER_SIZE, - }; - - /// Build a complete in-memory finalized bundle with two assets: - /// an xz-compressed `graph.json` and a raw custom blob, followed by - /// a fake BEN stream and a trailing directory. - fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { - // Asset payloads (decoded): - let graph_json = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#.to_vec(); - let custom_blob = vec![0xAA, 0xBB, 0xCC, 0xDD, 0xEE]; - let fake_stream = b"STANDARD BEN FILE\x00\x01\x02\x03fake payload".to_vec(); - - // xz-compress graph_json using the default preset. - let mut encoder = XzEncoder::new(Vec::new(), 6); - encoder.write_all(&graph_json).unwrap(); - let compressed_graph = encoder.finish().unwrap(); - - // Layout: - // [0 .. 64) header - // [64 .. 64+len(compressed_graph)) graph payload - // [... .. ...+len(custom_blob)) custom payload - // [stream_offset .. stream_offset+len(fake_stream)) stream - // [directory_offset .. EOF) directory - let mut bundle = Vec::new(); - // Reserve space for header; fill later. - bundle.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); - - let graph_offset = bundle.len() as u64; - bundle.extend_from_slice(&compressed_graph); - - let custom_offset = bundle.len() as u64; - bundle.extend_from_slice(&custom_blob); - - let stream_offset = bundle.len() as u64; - bundle.extend_from_slice(&fake_stream); - let stream_len = fake_stream.len() as u64; - - let directory_offset = bundle.len() as u64; - - let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: "graph.json".to_string(), - payload_offset: graph_offset, - payload_len: compressed_graph.len() as u64, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "custom.bin".to_string(), - payload_offset: custom_offset, - payload_len: custom_blob.len() as u64, - checksum: None, - }, - ]; - let directory_bytes = encode_directory(&entries).unwrap(); - bundle.extend_from_slice(&directory_bytes); - let directory_len = directory_bytes.len() as u64; - - // Now patch the header. - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset, - directory_len, - stream_offset, - stream_len, - sample_count: 42, - }; - bundle[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); - - (bundle, graph_json, custom_blob, fake_stream) - } - - #[test] - fn open_finalized_bundle_and_read_metadata() { - let (bytes, _, _, _) = build_finalized_bundle(); - let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(42)); - assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); - assert_eq!(reader.assets().len(), 2); - assert!(reader.validate_directory().is_ok()); - } - - #[test] - fn read_compressed_graph_asset_decodes_through_xz() { - let (bytes, graph_json, _, _) = build_finalized_bundle(); - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .expect("graph entry"); - let bytes_out = reader.asset_bytes(&entry).unwrap(); - assert_eq!(bytes_out, graph_json); - } - - #[test] - fn read_raw_custom_asset_returns_exact_bytes() { - let (bytes, _, custom_blob, _) = build_finalized_bundle(); - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let entry = reader - .find_asset_by_name("custom.bin") - .cloned() - .expect("custom entry"); - let bytes_out = reader.asset_bytes(&entry).unwrap(); - assert_eq!(bytes_out, custom_blob); - } - - #[test] - fn assignment_stream_range_matches_finalized_header() { - let (bytes, _, _, fake_stream) = build_finalized_bundle(); - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let (offset, len) = reader.assignment_stream_range().unwrap(); - assert_eq!(len, fake_stream.len() as u64); - let mut buf = Vec::new(); - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut buf) - .unwrap(); - assert_eq!(buf, fake_stream); - // Sanity-check the offset is consistent with the header. - assert_eq!(offset, reader.header().stream_offset); - } - - #[test] - fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { - // Build an incomplete bundle: header + some fake stream bytes, no directory. - let fake_stream = b"STANDARD BEN FILE\x00\x01some partial bytes".to_vec(); - let mut bytes = Vec::new(); - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: 0, - directory_len: 0, - stream_offset: HEADER_SIZE as u64, - stream_len: 0, - sample_count: -1, - }; - bytes.extend_from_slice(&header.to_bytes()); - bytes.extend_from_slice(&fake_stream); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); - assert_eq!(reader.sample_count(), None); - assert!(reader.assets().is_empty()); - - let (offset, len) = reader.assignment_stream_range().unwrap(); - assert_eq!(offset, HEADER_SIZE as u64); - assert_eq!(len, fake_stream.len() as u64); - - let mut buf = Vec::new(); - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut buf) - .unwrap(); - assert_eq!(buf, fake_stream); - } - - #[test] - fn open_rejects_malformed_magic() { - let mut bytes = vec![0u8; HEADER_SIZE]; - bytes[0..8].copy_from_slice(b"NOPENOPE"); - match BendlReader::open(Cursor::new(bytes)) { - Err(BendlFormatError::InvalidMagic(_)) => {} - Err(other) => panic!("expected InvalidMagic, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn validate_directory_catches_duplicate_names() { - let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "a".to_string(), - payload_offset: 64, - payload_len: 1, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "a".to_string(), - payload_offset: 65, - payload_len: 1, - checksum: None, - }, - ]; - let reader = BendlReader { - inner: Cursor::new(Vec::::new()), - header: BendlHeader::provisional(AssignmentFormat::Ben, 64), - directory: entries, - }; - let err = reader.validate_directory().unwrap_err(); - assert!(matches!(err, BundleValidationError::DuplicateName(ref n) if n == "a")); - } - - #[test] - fn validate_directory_catches_wrong_canonical_name() { - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: 0, - name: "not_graph.json".to_string(), - payload_offset: 64, - payload_len: 1, - checksum: None, - }]; - let reader = BendlReader { - inner: Cursor::new(Vec::::new()), - header: BendlHeader::provisional(AssignmentFormat::Ben, 64), - directory: entries, - }; - let err = reader.validate_directory().unwrap_err(); - assert!(matches!( - err, - BundleValidationError::WrongCanonicalName { - asset_type: ASSET_TYPE_GRAPH, - .. - } - )); - } - - // ----------------------------------------------------------------------- - // Robustness tests - // ----------------------------------------------------------------------- - - /// Build a small finalized bundle with a known graph asset, metadata - /// asset, empty stream, and no validation pitfalls. Useful as a base - /// that tests can mutate byte-by-byte. - fn build_basic_finalized_bundle() -> Vec { - let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); - - // One raw metadata asset right after the header. - let metadata_payload = br#"{"k":"v"}"#.to_vec(); - let metadata_offset = bytes.len() as u64; - bytes.extend_from_slice(&metadata_payload); - - // Stream region is empty. - let stream_offset = bytes.len() as u64; - let stream_len = 0u64; - - // Directory at EOF with one entry. - let directory_offset = bytes.len() as u64; - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: ASSET_FLAG_JSON, - name: "metadata.json".to_string(), - payload_offset: metadata_offset, - payload_len: metadata_payload.len() as u64, - checksum: None, - }]; - let directory = encode_directory(&entries).unwrap(); - bytes.extend_from_slice(&directory); - let directory_len = directory.len() as u64; - - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset, - directory_len, - stream_offset, - stream_len, - sample_count: 0, - }; - bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); - bytes - } - - #[test] - fn open_rejects_short_header() { - let too_short = vec![0u8; HEADER_SIZE - 1]; - match BendlReader::open(Cursor::new(too_short)) { - Err(BendlFormatError::Io(_)) => {} - Err(other) => panic!("expected Io, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn open_rejects_unsupported_major_version() { - let mut bytes = build_basic_finalized_bundle(); - // major_version lives at offset 8..10 in the header. - bytes[8..10].copy_from_slice(&(BENDL_MAJOR_VERSION + 1).to_le_bytes()); - match BendlReader::open(Cursor::new(bytes)) { - Err(BendlFormatError::UnsupportedMajorVersion { .. }) => {} - Err(other) => panic!("expected UnsupportedMajorVersion, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn open_rejects_directory_with_inflated_entry_count() { - let mut bytes = build_basic_finalized_bundle(); - // Read directory_offset from the header (bytes 24..32). - let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Blow up the entry count at the start of the directory to a - // value that cannot possibly fit in the remaining file bytes. - bytes[directory_offset..directory_offset + 4].copy_from_slice(&9999u32.to_le_bytes()); - match BendlReader::open(Cursor::new(bytes)) { - Err(BendlFormatError::Io(_)) => {} - Err(other) => panic!("expected Io, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn open_rejects_directory_with_chopped_final_entry() { - // Drop the last byte of the file, which lies inside the name - // field of the final directory entry. - let mut bytes = build_basic_finalized_bundle(); - bytes.pop(); - match BendlReader::open(Cursor::new(bytes)) { - Err(BendlFormatError::Io(_)) => {} - Err(other) => panic!("expected Io, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn asset_bytes_read_twice_returns_identical_payload() { - let (bytes, _, custom_blob, _) = build_finalized_bundle(); - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); - let first = reader.asset_bytes(&entry).unwrap(); - let second = reader.asset_bytes(&entry).unwrap(); - assert_eq!(first, second); - assert_eq!(first, custom_blob); - } - - #[test] - fn interleaved_reads_do_not_corrupt_each_other() { - // Read asset A, then stream, then asset A again, then asset B. - let (bytes, graph_json, custom_blob, fake_stream) = build_finalized_bundle(); - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - - let graph_entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .unwrap(); - let custom_entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); - - let graph_first = reader.asset_bytes(&graph_entry).unwrap(); - assert_eq!(graph_first, graph_json); - - let mut stream_buf = Vec::new(); - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut stream_buf) - .unwrap(); - assert_eq!(stream_buf, fake_stream); - - let graph_second = reader.asset_bytes(&graph_entry).unwrap(); - assert_eq!(graph_second, graph_json); - - let custom = reader.asset_bytes(&custom_entry).unwrap(); - assert_eq!(custom, custom_blob); - } - - #[test] - fn asset_bytes_errors_when_declared_length_runs_past_eof() { - // Hand-construct a bundle where the metadata directory entry - // claims a payload_len that extends well past EOF. - let mut bytes = build_basic_finalized_bundle(); - // Parse the directory offset to find where the entry lives. - let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Skip the u32 entry count (4 bytes) and then the 16-byte fixed - // entry header up to `payload_len` (bytes 16..24 of the entry). - let entry_start = directory_offset + 4; - let payload_len_offset = entry_start + 16; - bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); - // The reader opens fine — the directory parses. But reading the - // asset bytes must surface an error eventually (short read vs - // declared length). xz would also trip on this, but this is the - // raw-asset path. - // Either returns an error or a slice shorter than u64::MAX. - reader.asset_bytes(&entry) - .map(|b| assert!(b.len() < u64::MAX as usize)) - .ok(); - } - - #[test] - fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { - // Build an incomplete bundle but stuff a stale sample count into - // the header. `sample_count()` must still return None because - // the `complete` flag is what makes the value authoritative. - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: 0, - directory_len: 0, - stream_offset: HEADER_SIZE as u64, - stream_len: 0, - sample_count: 999_999, // lie, but header is "incomplete" - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); - assert_eq!(reader.sample_count(), None); - } - - #[test] - fn unknown_assignment_format_reports_none_on_typed_getter() { - // Build a finalized but otherwise-empty bundle and corrupt the - // assignment_format byte to a value that is neither BEN nor XBEN. - let mut bytes = build_basic_finalized_bundle(); - // assignment_format byte is at offset 13 in the header. - bytes[13] = 42; - let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert_eq!(reader.assignment_format(), None); - // The header still parses and the directory is still available. - assert_eq!(reader.assets().len(), 1); - } - - #[test] - fn open_assignment_reader_rejects_unknown_assignment_format() { - let mut bytes = build_basic_finalized_bundle(); - bytes[13] = 42; // corrupt assignment format byte - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - match reader.open_assignment_reader() { - Err(BundleAssignmentReaderError::UnknownAssignmentFormat(42)) => {} - Err(other) => panic!("expected UnknownAssignmentFormat(42), got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { - let fake_stream = b"STANDARD BEN FILE\x00\x01payload bytes".to_vec(); - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: 0, - directory_len: 0, - stream_offset: HEADER_SIZE as u64, - stream_len: 0, - sample_count: -1, - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - bytes.extend_from_slice(&fake_stream); - let eof = bytes.len() as u64; - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let (off, len) = reader.assignment_stream_range().unwrap(); - assert_eq!(off, HEADER_SIZE as u64); - assert_eq!(off + len, eof); - } - - #[test] - fn validate_directory_catches_duplicate_singleton_types() { - // Two entries of type METADATA (both with canonical name - // "metadata.json"). The canonical-name check would fire for - // the second entry because the name is duplicated, so force a - // different name shape: this is a belt-and-braces test that - // confirms the singleton check is separate from the name check. - let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: 0, - name: "metadata.json".to_string(), - payload_offset: 64, - payload_len: 1, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: 0, - // Distinct name so the duplicate-name check does not fire - // first; the singleton-type check should catch this. - name: "meta2.json".to_string(), - payload_offset: 65, - payload_len: 1, - checksum: None, - }, - ]; - let reader = BendlReader { - inner: Cursor::new(Vec::::new()), - header: BendlHeader::provisional(AssignmentFormat::Ben, 64), - directory: entries, - }; - // The second entry has asset_type METADATA but name "meta2.json" - // which fails the canonical-name check. - let err = reader.validate_directory().unwrap_err(); - assert!(matches!( - err, - BundleValidationError::WrongCanonicalName { .. } - )); - } - - #[test] - fn validate_directory_accepts_well_formed_multi_singleton_bundle() { - // A bundle with one of every singleton type, plus two custom - // assets with distinct names, should validate cleanly. - let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: ASSET_FLAG_JSON, - name: "metadata.json".to_string(), - payload_offset: 64, - payload_len: 4, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: "graph.json".to_string(), - payload_offset: 68, - payload_len: 4, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: crate::io::bundle::format::ASSET_TYPE_RELABEL_MAP, - asset_flags: ASSET_FLAG_JSON, - name: "relabel_map.json".to_string(), - payload_offset: 72, - payload_len: 4, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "a.bin".to_string(), - payload_offset: 76, - payload_len: 4, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "b.bin".to_string(), - payload_offset: 80, - payload_len: 4, - checksum: None, - }, - ]; - let reader = BendlReader { - inner: Cursor::new(Vec::::new()), - header: BendlHeader::provisional(AssignmentFormat::Ben, 64), - directory: entries, - }; - reader.validate_directory().expect("well-formed directory"); - } - - #[test] - fn stress_thousand_custom_assets_round_trip() { - // Build a directory with 1000 small custom assets, each with a - // unique payload derived from its index, and confirm they all - // round-trip via `asset_bytes`. This catches any off-by-one or - // seek-caching bugs that might only show up with many entries. - const N: usize = 1000; - - let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); - - let mut entries = Vec::with_capacity(N); - let mut expected = Vec::with_capacity(N); - for i in 0..N { - let payload: Vec = (0..(i % 31 + 1) as u8) - .map(|j| (i as u8).wrapping_add(j)) - .collect(); - let offset = bytes.len() as u64; - bytes.extend_from_slice(&payload); - entries.push(BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: format!("blob-{i:04}.bin"), - payload_offset: offset, - payload_len: payload.len() as u64, - checksum: None, - }); - expected.push(payload); - } - - let stream_offset = bytes.len() as u64; - let stream_len = 0u64; - let directory_offset = bytes.len() as u64; - let directory = encode_directory(&entries).unwrap(); - bytes.extend_from_slice(&directory); - let directory_len = directory.len() as u64; - - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset, - directory_len, - stream_offset, - stream_len, - sample_count: 0, - }; - bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert_eq!(reader.assets().len(), N); - reader.validate_directory().unwrap(); - // Access in scrambled order to exercise seeking. - for &idx in &[0usize, N - 1, 1, N / 2, N / 3, 2 * N / 3, 7, 999] { - let name = format!("blob-{idx:04}.bin"); - let entry = reader.find_asset_by_name(&name).cloned().unwrap(); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(got, expected[idx], "mismatch at index {idx}"); - } - } - - #[test] - fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { - // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ - // whose payload bytes are not a valid xz container. `asset_bytes` - // must surface an io::Error rather than panicking. - let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); - - let bad_payload = vec![0xFFu8, 0xFE, 0xFD, 0xFC, 0xFB]; - let payload_offset = bytes.len() as u64; - bytes.extend_from_slice(&bad_payload); - - let stream_offset = bytes.len() as u64; - let directory_offset = bytes.len() as u64; - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: ASSET_FLAG_XZ, - name: "broken.xz".to_string(), - payload_offset, - payload_len: bad_payload.len() as u64, - checksum: None, - }]; - let directory = encode_directory(&entries).unwrap(); - bytes.extend_from_slice(&directory); - - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset, - directory_len: directory.len() as u64, - stream_offset, - stream_len: 0, - sample_count: 0, - }; - bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let entry = reader.find_asset_by_name("broken.xz").cloned().unwrap(); - let res = reader.asset_bytes(&entry); - assert!(res.is_err(), "expected xz decode error, got {res:?}"); - } - - #[test] - fn reader_scales_to_very_wide_stream_offset_field() { - // Confirm the `Take` bound clamps a stream reader even when the - // header's stream_len is much larger than the actual remaining - // bytes: the reader must return the shorter slice rather than - // loop forever or panic. This is a "short read" tolerance check. - let fake_stream = b"STANDARD BEN FILE\x00\x01tiny".to_vec(); - let actual_len = fake_stream.len() as u64; - let directory_offset = HEADER_SIZE as u64 + actual_len; - // Build a bundle that lies about stream_len: claims ten times - // what's actually present. - let entries: Vec = Vec::new(); - let directory_bytes = encode_directory(&entries).unwrap(); - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset, - directory_len: directory_bytes.len() as u64, - stream_offset: HEADER_SIZE as u64, - stream_len: actual_len * 10, // lie - sample_count: 0, - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - bytes.extend_from_slice(&fake_stream); - bytes.extend_from_slice(&directory_bytes); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - let mut buf = Vec::new(); - // Take will try to read `stream_len` bytes but the Cursor will - // just return however many bytes remain from stream_offset to EOF. - // The reader must not panic; it must simply return what it got. - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut buf) - .unwrap(); - // Take includes the directory bytes in the window since they come - // after stream_offset and the claim exceeds file size — so we - // assert only that we got *at least* the real stream bytes as a - // prefix, which is the basic "no truncation of what exists" check. - assert!(buf.starts_with(&fake_stream)); - } - - #[test] - fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { - // An incomplete bundle where directory_offset is non-zero: - // the stream end is taken as directory_offset, not EOF. - let fake_stream = b"STANDARD BEN FILE\x00partial".to_vec(); - let fake_dir = b"some-directory-bytes"; - let stream_start = HEADER_SIZE as u64; - let dir_offset = stream_start + fake_stream.len() as u64; - - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: dir_offset, - directory_len: 0, - stream_offset: stream_start, - stream_len: 0, - sample_count: -1, - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - bytes.extend_from_slice(&fake_stream); - bytes.extend_from_slice(fake_dir); - - let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); - - let (offset, len) = reader.assignment_stream_range().unwrap(); - assert_eq!(offset, stream_start); - assert_eq!(len, fake_stream.len() as u64); - } - - #[test] - fn validate_directory_rejects_wrong_canonical_name() { - use crate::io::bundle::format::BendlDirectoryEntry; - - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON, - name: "not_the_canonical_name.json".to_string(), - payload_offset: 64, - payload_len: 10, - checksum: None, - }]; - let err = validate_directory_entries(&entries).unwrap_err(); - match err { - BundleValidationError::WrongCanonicalName { .. } => {} - _ => panic!("expected WrongCanonicalName, got {err:?}"), - } - } -} diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs new file mode 100644 index 0000000..bafccdc --- /dev/null +++ b/ben/src/io/bundle/tests/format.rs @@ -0,0 +1,263 @@ +use std::io; + +use crate::io::bundle::format::*; + +#[test] +fn magic_is_eight_bytes_and_matches_spec() { + assert_eq!(BENDL_MAGIC.len(), 8); + assert_eq!(&BENDL_MAGIC[..5], b"BENDL"); +} + +#[test] +fn canonical_name_lookup() { + assert_eq!( + canonical_name_for(ASSET_TYPE_METADATA), + Some("metadata.json") + ); + assert_eq!(canonical_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); + assert_eq!( + canonical_name_for(ASSET_TYPE_RELABEL_MAP), + Some("relabel_map.json") + ); + assert_eq!(canonical_name_for(ASSET_TYPE_CUSTOM), None); + assert_eq!(canonical_name_for(9999), None); +} + +#[test] +fn default_compression_policy() { + assert!(default_compresses_by_type(ASSET_TYPE_GRAPH)); + assert!(!default_compresses_by_type(ASSET_TYPE_METADATA)); + assert!(!default_compresses_by_type(ASSET_TYPE_RELABEL_MAP)); + assert!(!default_compresses_by_type(ASSET_TYPE_CUSTOM)); +} + +#[test] +fn assignment_format_roundtrip() { + for fmt in [AssignmentFormat::Ben, AssignmentFormat::Xben] { + assert_eq!(AssignmentFormat::from_u8(fmt.to_u8()), Some(fmt)); + } + assert_eq!(AssignmentFormat::from_u8(0), None); + assert_eq!(AssignmentFormat::from_u8(255), None); +} + +#[test] +fn header_is_exactly_64_bytes() { + let header = BendlHeader::provisional(AssignmentFormat::Ben, 64); + assert_eq!(header.to_bytes().len(), HEADER_SIZE); + assert_eq!(HEADER_SIZE, 64); +} + +#[test] +fn header_round_trip_provisional() { + let header = BendlHeader::provisional(AssignmentFormat::Xben, 64); + let decoded = BendlHeader::from_bytes(&header.to_bytes()).unwrap(); + assert_eq!(header, decoded); + assert!(!decoded.is_complete()); + assert_eq!( + decoded.assignment_format_typed(), + Some(AssignmentFormat::Xben) + ); + assert_eq!(decoded.sample_count, -1); + assert_eq!(decoded.stream_len, 0); + assert_eq!(decoded.directory_offset, 0); +} + +#[test] +fn header_round_trip_finalized() { + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: ASSIGNMENT_FORMAT_BEN, + reserved_0: 0, + flags: 0x0000_0000_0000_000F, + directory_offset: 1_000_000, + directory_len: 256, + stream_offset: 64, + stream_len: 999_936, + sample_count: 4242, + }; + let bytes = header.to_bytes(); + let decoded = BendlHeader::from_bytes(&bytes).unwrap(); + assert_eq!(decoded, header); + assert!(decoded.is_complete()); +} + +#[test] +fn header_rejects_invalid_magic() { + let mut header = BendlHeader::provisional(AssignmentFormat::Ben, 64); + header.magic = *b"NOTABEND"; + let err = BendlHeader::from_bytes(&header.to_bytes()).unwrap_err(); + assert!(matches!(err, BendlFormatError::InvalidMagic(_))); +} + +#[test] +fn header_rejects_unsupported_major_version() { + let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); + bytes[8..10].copy_from_slice(&999u16.to_le_bytes()); + let err = BendlHeader::from_bytes(&bytes).unwrap_err(); + assert!(matches!( + err, + BendlFormatError::UnsupportedMajorVersion { found: 999, .. } + )); +} + +#[test] +fn directory_entry_round_trip_no_checksum() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset: 128, + payload_len: 4096, + checksum: None, + }; + let bytes = entry.to_bytes().unwrap(); + assert_eq!(bytes.len(), entry.encoded_len()); + let mut cursor = &bytes[..]; + let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); + assert_eq!(decoded, entry); +} + +#[test] +fn directory_entry_round_trip_with_checksum() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_CHECKSUM, + name: "custom_blob".to_string(), + payload_offset: 2048, + payload_len: 512, + checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]), + }; + let bytes = entry.to_bytes().unwrap(); + let mut cursor = &bytes[..]; + let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); + assert_eq!(decoded, entry); + assert_eq!( + decoded.checksum.unwrap(), + vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE] + ); +} + +#[test] +fn directory_table_round_trip() { + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset: 64, + payload_len: 2048, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: CANONICAL_NAME_METADATA.to_string(), + payload_offset: 2112, + payload_len: 128, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "provenance.bin".to_string(), + payload_offset: 2240, + payload_len: 32, + checksum: None, + }, + ]; + + let encoded = encode_directory(&entries).unwrap(); + let mut cursor = &encoded[..]; + let decoded = read_directory(&mut cursor).unwrap(); + assert_eq!(decoded, entries); +} + +#[test] +fn empty_directory_table_round_trip() { + let encoded = encode_directory(&[]).unwrap(); + assert_eq!(encoded, vec![0, 0, 0, 0]); + let mut cursor = &encoded[..]; + let decoded = read_directory(&mut cursor).unwrap(); + assert!(decoded.is_empty()); +} + +#[test] +fn header_and_directory_entry_header_sizes_are_stable() { + // These sizes are baked into the on-disk format; regressing them + // would silently break existing bundles. + assert_eq!(HEADER_SIZE, 64); + assert_eq!(DIRECTORY_ENTRY_HEADER_SIZE, 28); +} + +#[test] +fn directory_entry_name_too_long() { + let entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "x".repeat(u16::MAX as usize + 1), + payload_offset: 0, + payload_len: 0, + checksum: None, + }; + let err = entry.to_bytes().unwrap_err(); + assert!(matches!(err, BendlFormatError::NameTooLong { .. })); + assert!(err.to_string().contains("exceeds")); +} + +#[test] +fn directory_entry_name_not_utf8() { + let mut bytes = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "ok".to_string(), + payload_offset: 0, + payload_len: 0, + checksum: None, + } + .to_bytes() + .unwrap(); + + // Patch the name bytes to invalid UTF-8 (0xFF 0xFE) + let name_offset = DIRECTORY_ENTRY_HEADER_SIZE; + bytes[name_offset] = 0xFF; + bytes[name_offset + 1] = 0xFE; + + let mut cursor = &bytes[..]; + let err = BendlDirectoryEntry::read_from(&mut cursor).unwrap_err(); + assert!(matches!(err, BendlFormatError::NameNotUtf8)); + assert!(err.to_string().contains("UTF-8")); +} + +#[test] +fn header_read_from_truncated() { + let short = [0u8; 10]; + let err = BendlHeader::read_from(&mut &short[..]).unwrap_err(); + assert!(matches!(err, BendlFormatError::Io(_))); +} + +#[test] +fn bendl_format_error_io_passthrough() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let fmt_err = BendlFormatError::Io(inner); + let io_err: io::Error = fmt_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe); + assert_eq!(io_err.to_string(), "pipe broke"); +} + +#[test] +fn bendl_format_error_non_io_becomes_invalid_data() { + let fmt_err = BendlFormatError::MalformedDirectory("bad dir".to_string()); + let io_err: io::Error = fmt_err.into(); + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("bad dir")); +} + +#[test] +fn trailing_directory_bytes_error_display() { + let err = BendlFormatError::TrailingDirectoryBytes { remaining: 42 }; + assert!(err.to_string().contains("42")); + assert!(err.to_string().contains("trailing")); +} diff --git a/ben/src/io/bundle/tests/manifest.rs b/ben/src/io/bundle/tests/manifest.rs new file mode 100644 index 0000000..bfe199a --- /dev/null +++ b/ben/src/io/bundle/tests/manifest.rs @@ -0,0 +1,24 @@ +use crate::io::bundle::manifest::BendlManifest; + +#[test] +fn manifest_json_round_trip() { + let manifest = BendlManifest { + major_version: 1, + minor_version: 0, + assignment_format: "xben".to_string(), + variant: Some("mkv_chain".to_string()), + complete: false, + }; + let json = serde_json::to_string(&manifest).unwrap(); + let decoded: BendlManifest = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded, manifest); +} + +#[test] +fn manifest_accepts_missing_variant() { + let json = + r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; + let decoded: BendlManifest = serde_json::from_str(json).unwrap(); + assert_eq!(decoded.variant, None); + assert!(decoded.complete); +} diff --git a/ben/src/io/bundle/tests/mod.rs b/ben/src/io/bundle/tests/mod.rs new file mode 100644 index 0000000..6d7d6ee --- /dev/null +++ b/ben/src/io/bundle/tests/mod.rs @@ -0,0 +1,4 @@ +mod format; +mod manifest; +mod reader; +mod writer; diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs new file mode 100644 index 0000000..c7b850e --- /dev/null +++ b/ben/src/io/bundle/tests/reader.rs @@ -0,0 +1,784 @@ +use std::io::{Cursor, Read, Write}; + +use xz2::write::XzEncoder; + +use crate::io::bundle::format::{ + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, + ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + ASSET_TYPE_RELABEL_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, + COMPLETE_YES, HEADER_SIZE, +}; +use crate::io::bundle::reader::{ + validate_directory_entries, BendlReader, BundleAssignmentReaderError, BundleValidationError, +}; + +/// Build a complete in-memory finalized bundle with two assets: +/// an xz-compressed `graph.json` and a raw custom blob, followed by +/// a fake BEN stream and a trailing directory. +fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { + // Asset payloads (decoded): + let graph_json = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#.to_vec(); + let custom_blob = vec![0xAA, 0xBB, 0xCC, 0xDD, 0xEE]; + let fake_stream = b"STANDARD BEN FILE\x00\x01\x02\x03fake payload".to_vec(); + + // xz-compress graph_json using the default preset. + let mut encoder = XzEncoder::new(Vec::new(), 6); + encoder.write_all(&graph_json).unwrap(); + let compressed_graph = encoder.finish().unwrap(); + + // Layout: + // [0 .. 64) header + // [64 .. 64+len(compressed_graph)) graph payload + // [... .. ...+len(custom_blob)) custom payload + // [stream_offset .. stream_offset+len(fake_stream)) stream + // [directory_offset .. EOF) directory + let mut bundle = Vec::new(); + // Reserve space for header; fill later. + bundle.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let graph_offset = bundle.len() as u64; + bundle.extend_from_slice(&compressed_graph); + + let custom_offset = bundle.len() as u64; + bundle.extend_from_slice(&custom_blob); + + let stream_offset = bundle.len() as u64; + bundle.extend_from_slice(&fake_stream); + let stream_len = fake_stream.len() as u64; + + let directory_offset = bundle.len() as u64; + + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: graph_offset, + payload_len: compressed_graph.len() as u64, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "custom.bin".to_string(), + payload_offset: custom_offset, + payload_len: custom_blob.len() as u64, + checksum: None, + }, + ]; + let directory_bytes = encode_directory(&entries).unwrap(); + bundle.extend_from_slice(&directory_bytes); + let directory_len = directory_bytes.len() as u64; + + // Now patch the header. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 42, + }; + bundle[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + (bundle, graph_json, custom_blob, fake_stream) +} + +#[test] +fn open_finalized_bundle_and_read_metadata() { + let (bytes, _, _, _) = build_finalized_bundle(); + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(42)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + assert_eq!(reader.assets().len(), 2); + assert!(reader.validate_directory().is_ok()); +} + +#[test] +fn read_compressed_graph_asset_decodes_through_xz() { + let (bytes, graph_json, _, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph entry"); + let bytes_out = reader.asset_bytes(&entry).unwrap(); + assert_eq!(bytes_out, graph_json); +} + +#[test] +fn read_raw_custom_asset_returns_exact_bytes() { + let (bytes, _, custom_blob, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader + .find_asset_by_name("custom.bin") + .cloned() + .expect("custom entry"); + let bytes_out = reader.asset_bytes(&entry).unwrap(); + assert_eq!(bytes_out, custom_blob); +} + +#[test] +fn assignment_stream_range_matches_finalized_header() { + let (bytes, _, _, fake_stream) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(len, fake_stream.len() as u64); + let mut buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); + assert_eq!(buf, fake_stream); + // Sanity-check the offset is consistent with the header. + assert_eq!(offset, reader.header().stream_offset); +} + +#[test] +fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { + // Build an incomplete bundle: header + some fake stream bytes, no directory. + let fake_stream = b"STANDARD BEN FILE\x00\x01some partial bytes".to_vec(); + let mut bytes = Vec::new(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + assert_eq!(reader.sample_count(), None); + assert!(reader.assets().is_empty()); + + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(offset, HEADER_SIZE as u64); + assert_eq!(len, fake_stream.len() as u64); + + let mut buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); + assert_eq!(buf, fake_stream); +} + +#[test] +fn open_rejects_malformed_magic() { + let mut bytes = vec![0u8; HEADER_SIZE]; + bytes[0..8].copy_from_slice(b"NOPENOPE"); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::InvalidMagic(_)) => {} + Err(other) => panic!("expected InvalidMagic, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn validate_directory_catches_duplicate_names() { + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a".to_string(), + payload_offset: 65, + payload_len: 1, + checksum: None, + }, + ]; + let err = validate_directory_entries(&entries).unwrap_err(); + assert!(matches!(err, BundleValidationError::DuplicateName(ref n) if n == "a")); +} + +#[test] +fn validate_directory_catches_wrong_canonical_name() { + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: 0, + name: "not_graph.json".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }]; + let err = validate_directory_entries(&entries).unwrap_err(); + assert!(matches!( + err, + BundleValidationError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } + )); +} + +// ----------------------------------------------------------------------- +// Robustness tests +// ----------------------------------------------------------------------- + +/// Build a small finalized bundle with a known graph asset, metadata +/// asset, empty stream, and no validation pitfalls. Useful as a base +/// that tests can mutate byte-by-byte. +fn build_basic_finalized_bundle() -> Vec { + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + // One raw metadata asset right after the header. + let metadata_payload = br#"{"k":"v"}"#.to_vec(); + let metadata_offset = bytes.len() as u64; + bytes.extend_from_slice(&metadata_payload); + + // Stream region is empty. + let stream_offset = bytes.len() as u64; + let stream_len = 0u64; + + // Directory at EOF with one entry. + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: "metadata.json".to_string(), + payload_offset: metadata_offset, + payload_len: metadata_payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let directory_len = directory.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + bytes +} + +#[test] +fn open_rejects_short_header() { + let too_short = vec![0u8; HEADER_SIZE - 1]; + match BendlReader::open(Cursor::new(too_short)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn open_rejects_unsupported_major_version() { + let mut bytes = build_basic_finalized_bundle(); + // major_version lives at offset 8..10 in the header. + bytes[8..10].copy_from_slice(&(BENDL_MAJOR_VERSION + 1).to_le_bytes()); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::UnsupportedMajorVersion { .. }) => {} + Err(other) => panic!("expected UnsupportedMajorVersion, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn open_rejects_directory_with_inflated_entry_count() { + let mut bytes = build_basic_finalized_bundle(); + // Read directory_offset from the header (bytes 24..32). + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + // Blow up the entry count at the start of the directory to a + // value that cannot possibly fit in the remaining file bytes. + bytes[directory_offset..directory_offset + 4].copy_from_slice(&9999u32.to_le_bytes()); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn open_rejects_directory_with_chopped_final_entry() { + // Drop the last byte of the file, which lies inside the name + // field of the final directory entry. + let mut bytes = build_basic_finalized_bundle(); + bytes.pop(); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::Io(_)) => {} + Err(other) => panic!("expected Io, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn asset_bytes_read_twice_returns_identical_payload() { + let (bytes, _, custom_blob, _) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); + let first = reader.asset_bytes(&entry).unwrap(); + let second = reader.asset_bytes(&entry).unwrap(); + assert_eq!(first, second); + assert_eq!(first, custom_blob); +} + +#[test] +fn interleaved_reads_do_not_corrupt_each_other() { + // Read asset A, then stream, then asset A again, then asset B. + let (bytes, graph_json, custom_blob, fake_stream) = build_finalized_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + + let graph_entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let custom_entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); + + let graph_first = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_first, graph_json); + + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, fake_stream); + + let graph_second = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_second, graph_json); + + let custom = reader.asset_bytes(&custom_entry).unwrap(); + assert_eq!(custom, custom_blob); +} + +#[test] +fn asset_bytes_errors_when_declared_length_runs_past_eof() { + // Hand-construct a bundle where the metadata directory entry + // claims a payload_len that extends well past EOF. + let mut bytes = build_basic_finalized_bundle(); + // Parse the directory offset to find where the entry lives. + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + // Skip the u32 entry count (4 bytes) and then the 16-byte fixed + // entry header up to `payload_len` (bytes 16..24 of the entry). + let entry_start = directory_offset + 4; + let payload_len_offset = entry_start + 16; + bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + // The reader opens fine — the directory parses. But reading the + // asset bytes must surface an error eventually (short read vs + // declared length). xz would also trip on this, but this is the + // raw-asset path. + // Either returns an error or a slice shorter than u64::MAX. + reader + .asset_bytes(&entry) + .map(|b| assert!(b.len() < u64::MAX as usize)) + .ok(); +} + +#[test] +fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { + // Build an incomplete bundle but stuff a stale sample count into + // the header. `sample_count()` must still return None because + // the `complete` flag is what makes the value authoritative. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 999_999, // lie, but header is "incomplete" + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + assert_eq!(reader.sample_count(), None); +} + +#[test] +fn unknown_assignment_format_reports_none_on_typed_getter() { + // Build a finalized but otherwise-empty bundle and corrupt the + // assignment_format byte to a value that is neither BEN nor XBEN. + let mut bytes = build_basic_finalized_bundle(); + // assignment_format byte is at offset 13 in the header. + bytes[13] = 42; + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert_eq!(reader.assignment_format(), None); + // The header still parses and the directory is still available. + assert_eq!(reader.assets().len(), 1); +} + +#[test] +fn open_assignment_reader_rejects_unknown_assignment_format() { + let mut bytes = build_basic_finalized_bundle(); + bytes[13] = 42; // corrupt assignment format byte + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + match reader.open_assignment_reader() { + Err(BundleAssignmentReaderError::UnknownAssignmentFormat(42)) => {} + Err(other) => panic!("expected UnknownAssignmentFormat(42), got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { + let fake_stream = b"STANDARD BEN FILE\x00\x01payload bytes".to_vec(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + let eof = bytes.len() as u64; + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let (off, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(off, HEADER_SIZE as u64); + assert_eq!(off + len, eof); +} + +#[test] +fn validate_directory_catches_duplicate_singleton_types() { + // Two entries of type METADATA. The second one uses a non-canonical + // name to confirm the canonical-name check fires (it lands first + // here, and is the path we cover; the singleton check is exercised + // elsewhere via duplicate canonical names). + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: 0, + name: "metadata.json".to_string(), + payload_offset: 64, + payload_len: 1, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: 0, + // Distinct name so the duplicate-name check does not fire + // first; the singleton-type check should catch this. + name: "meta2.json".to_string(), + payload_offset: 65, + payload_len: 1, + checksum: None, + }, + ]; + // The second entry has asset_type METADATA but name "meta2.json" + // which fails the canonical-name check. + let err = validate_directory_entries(&entries).unwrap_err(); + assert!(matches!( + err, + BundleValidationError::WrongCanonicalName { .. } + )); +} + +#[test] +fn validate_directory_accepts_well_formed_multi_singleton_bundle() { + // A bundle with one of every singleton type, plus two custom + // assets with distinct names, should validate cleanly. + let entries = vec![ + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: "metadata.json".to_string(), + payload_offset: 64, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: 68, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_RELABEL_MAP, + asset_flags: ASSET_FLAG_JSON, + name: "relabel_map.json".to_string(), + payload_offset: 72, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a.bin".to_string(), + payload_offset: 76, + payload_len: 4, + checksum: None, + }, + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "b.bin".to_string(), + payload_offset: 80, + payload_len: 4, + checksum: None, + }, + ]; + validate_directory_entries(&entries).expect("well-formed directory"); +} + +#[test] +fn stress_thousand_custom_assets_round_trip() { + // Build a directory with 1000 small custom assets, each with a + // unique payload derived from its index, and confirm they all + // round-trip via `asset_bytes`. This catches any off-by-one or + // seek-caching bugs that might only show up with many entries. + const N: usize = 1000; + + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let mut entries = Vec::with_capacity(N); + let mut expected = Vec::with_capacity(N); + for i in 0..N { + let payload: Vec = (0..(i % 31 + 1) as u8) + .map(|j| (i as u8).wrapping_add(j)) + .collect(); + let offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + entries.push(BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: format!("blob-{i:04}.bin"), + payload_offset: offset, + payload_len: payload.len() as u64, + checksum: None, + }); + expected.push(payload); + } + + let stream_offset = bytes.len() as u64; + let stream_len = 0u64; + let directory_offset = bytes.len() as u64; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let directory_len = directory.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len, + stream_offset, + stream_len, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert_eq!(reader.assets().len(), N); + reader.validate_directory().unwrap(); + // Access in scrambled order to exercise seeking. + for &idx in &[0usize, N - 1, 1, N / 2, N / 3, 2 * N / 3, 7, 999] { + let name = format!("blob-{idx:04}.bin"); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(got, expected[idx], "mismatch at index {idx}"); + } +} + +#[test] +fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { + // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ + // whose payload bytes are not a valid xz container. `asset_bytes` + // must surface an io::Error rather than panicking. + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let bad_payload = vec![0xFFu8, 0xFE, 0xFD, 0xFC, 0xFB]; + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&bad_payload); + + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_XZ, + name: "broken.xz".to_string(), + payload_offset, + payload_len: bad_payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("broken.xz").cloned().unwrap(); + let res = reader.asset_bytes(&entry); + assert!(res.is_err(), "expected xz decode error, got {res:?}"); +} + +#[test] +fn reader_scales_to_very_wide_stream_offset_field() { + // Confirm the `Take` bound clamps a stream reader even when the + // header's stream_len is much larger than the actual remaining + // bytes: the reader must return the shorter slice rather than + // loop forever or panic. This is a "short read" tolerance check. + let fake_stream = b"STANDARD BEN FILE\x00\x01tiny".to_vec(); + let actual_len = fake_stream.len() as u64; + let directory_offset = HEADER_SIZE as u64 + actual_len; + // Build a bundle that lies about stream_len: claims ten times + // what's actually present. + let entries: Vec = Vec::new(); + let directory_bytes = encode_directory(&entries).unwrap(); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory_bytes.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: actual_len * 10, // lie + sample_count: 0, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + bytes.extend_from_slice(&directory_bytes); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let mut buf = Vec::new(); + // Take will try to read `stream_len` bytes but the Cursor will + // just return however many bytes remain from stream_offset to EOF. + // The reader must not panic; it must simply return what it got. + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); + // Take includes the directory bytes in the window since they come + // after stream_offset and the claim exceeds file size — so we + // assert only that we got *at least* the real stream bytes as a + // prefix, which is the basic "no truncation of what exists" check. + assert!(buf.starts_with(&fake_stream)); +} + +#[test] +fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { + // An incomplete bundle where directory_offset is non-zero: + // the stream end is taken as directory_offset, not EOF. + let fake_stream = b"STANDARD BEN FILE\x00partial".to_vec(); + let fake_dir = b"some-directory-bytes"; + let stream_start = HEADER_SIZE as u64; + let dir_offset = stream_start + fake_stream.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: dir_offset, + directory_len: 0, + stream_offset: stream_start, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&fake_stream); + bytes.extend_from_slice(fake_dir); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(!reader.is_complete()); + + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert_eq!(offset, stream_start); + assert_eq!(len, fake_stream.len() as u64); +} + +#[test] +fn validate_directory_rejects_wrong_canonical_name() { + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON, + name: "not_the_canonical_name.json".to_string(), + payload_offset: 64, + payload_len: 10, + checksum: None, + }]; + let err = validate_directory_entries(&entries).unwrap_err(); + match err { + BundleValidationError::WrongCanonicalName { .. } => {} + _ => panic!("expected WrongCanonicalName, got {err:?}"), + } +} diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs new file mode 100644 index 0000000..262e519 --- /dev/null +++ b/ben/src/io/bundle/tests/writer.rs @@ -0,0 +1,1328 @@ +use std::io::{self, Cursor, Read, Write}; + +use crate::io::bundle::format::{ + AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, + BENDL_MINOR_VERSION, COMPLETE_NO, COMPLETE_YES, HEADER_SIZE, +}; +use crate::io::bundle::reader::{BendlReader, BundleAssignmentReader}; +use crate::io::bundle::writer::{ + AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter, +}; + +fn make_buffer() -> Cursor> { + Cursor::new(Vec::new()) +} + +#[test] +fn minimal_bundle_round_trip_through_reader() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", br#"{"note":"hello"}"#) + .unwrap(); + let stream_bytes = b"STANDARD BEN FILE\x00\x01fake".to_vec(); + writer.write_stream_bytes(&stream_bytes, 7).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(7)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + assert_eq!(reader.assets().len(), 1); + + let entry = reader + .find_asset_by_type(ASSET_TYPE_METADATA) + .cloned() + .expect("metadata entry present"); + assert_eq!(entry.name, "metadata.json"); + assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); + let meta_bytes = reader.asset_bytes(&entry).unwrap(); + assert_eq!(meta_bytes, br#"{"note":"hello"}"#); + + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, stream_bytes); +} + +#[test] +fn graph_asset_is_compressed_by_default() { + let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9],"edges":[[0,1],[1,2],[2,3],[3,4]]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph entry present"); + assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); + // Compressed size should differ from the raw size for a non-trivial + // JSON payload. For very short payloads xz actually inflates the + // bytes, so this just checks the size is non-zero and different. + assert_ne!(entry.payload_len, graph.len() as u64); + + // Decoded bytes round-trip. + let decoded = reader.asset_bytes(&entry).unwrap(); + assert_eq!(decoded, graph); +} + +#[test] +fn graph_asset_can_be_forced_raw() { + let graph = br#"{"nodes":[0,1,2]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + graph, + AddAssetOptions::defaults().json().raw(), + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .expect("graph entry present"); + assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); + assert_eq!(entry.payload_len, graph.len() as u64); +} + +#[test] +fn writer_rejects_second_graph() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateSingletonType(t) if t == ASSET_TYPE_GRAPH)); +} + +#[test] +fn writer_rejects_wrong_canonical_name_for_singleton() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph_but_wrong_name.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } + )); +} + +#[test] +fn writer_rejects_duplicate_custom_name() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"first", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"second", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); +} + +#[test] +fn writer_rejects_asset_added_after_stream_begins() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + { + let mut handle = writer.begin_stream().unwrap(); + handle.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + handle.finish(1).unwrap(); + } + let err = writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::AssetsAfterStream)); +} + +#[test] +fn asset_only_bundle_finalizes_with_empty_stream() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(0)); + assert_eq!(reader.header().stream_len, 0); +} + +#[test] +fn finalized_directory_lives_at_eof() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let header = reader.header(); + let file_len = buf.len() as u64; + assert_eq!(header.directory_offset + header.directory_len, file_len); + // Stream ends where directory begins. + assert_eq!( + header.stream_offset + header.stream_len, + header.directory_offset + ); +} + +// ----------------------------------------------------------------------- +// Append-path tests +// ----------------------------------------------------------------------- + +/// Build a finalized bundle with a single `metadata.json` asset and +/// a short fake stream, then return both the bytes and the byte +/// range (offset, len) occupied by the stream region. +fn build_base_bundle() -> (Vec, (u64, u64)) { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"version\":1}") + .unwrap(); + let stream = b"STANDARD BEN FILE\x00\x01\x02\x03\x04\x05stream bytes"; + writer.write_stream_bytes(stream, 3).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let range = (reader.header().stream_offset, reader.header().stream_len); + (buf, range) +} + +#[test] +fn append_adds_new_asset_and_preserves_old_entries() { + let (bundle, _) = build_base_bundle(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 2); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + // Finalized bundle invariants still hold. + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(3)); +} + +#[test] +fn append_leaves_stream_bytes_byte_for_byte_unchanged() { + let (bundle, (stream_offset, stream_len)) = build_base_bundle(); + let original_stream_bytes = + bundle[stream_offset as usize..(stream_offset + stream_len) as usize].to_vec(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"appended custom bytes", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Read back the new header to locate the stream region, then + // confirm the stream bytes are byte-identical to the original. + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let (off, len) = (reader.header().stream_offset, reader.header().stream_len); + let appended_stream_bytes = buf[off as usize..(off + len) as usize].to_vec(); + assert_eq!(appended_stream_bytes, original_stream_bytes); + // Stream offset should not have moved either. + assert_eq!(off, stream_offset); + assert_eq!(len, stream_len); +} + +#[test] +fn append_preserves_existing_entries_payload_offsets() { + let (bundle, _) = build_base_bundle(); + + // Snapshot the metadata entry's payload_offset before append. + let reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); + let old_offset = reader + .find_asset_by_name("metadata.json") + .unwrap() + .payload_offset; + drop(reader); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[0,1,2,3,4,5]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let new_offset = reader + .find_asset_by_name("metadata.json") + .unwrap() + .payload_offset; + assert_eq!( + old_offset, new_offset, + "existing asset offset must not move" + ); +} + +#[test] +fn append_rejects_duplicate_singleton_without_touching_file() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"new\":true}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateSingletonType(_))); + + // Abort and confirm the file is byte-for-byte unchanged. + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); +} + +#[test] +fn append_rejects_duplicate_custom_name_without_touching_file() { + // Start from a bundle containing a custom asset named "blob", then + // try to append another "blob". + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"original", + AddAssetOptions::defaults(), + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let bundle = writer.finish().unwrap().into_inner(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob", + b"dup", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); +} + +#[test] +fn append_rejects_wrong_canonical_name_without_touching_file() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_GRAPH, + .. + } + )); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); +} + +#[test] +fn append_rejects_incomplete_bundle() { + // Construct a minimal incomplete bundle: just the provisional + // header and some stream bytes, no directory. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(b"STANDARD BEN FILE\x00fake"); + + match BendlAppender::open(Cursor::new(bytes)) { + Err(BendlWriteError::BundleIncomplete) => {} + Err(other) => panic!("expected BundleIncomplete, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn append_rejects_complete_bundle_with_zero_directory() { + // Header claims complete but has directory_offset=0 — hits the second + // BundleIncomplete check. + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + complete: COMPLETE_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + + match BendlAppender::open(Cursor::new(bytes)) { + Err(BendlWriteError::BundleIncomplete) => {} + Err(other) => panic!("expected BundleIncomplete, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn append_multiple_assets_in_one_commit() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") + .unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob1", + b"blob one", + AddAssetOptions::defaults(), + ) + .unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "blob2", + b"blob two", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 4); + // Round-trip the appended graph through the reader to confirm + // compression happened and decodes cleanly. + let graph_entry = reader + .find_asset_by_name("graph.json") + .cloned() + .expect("graph entry present"); + assert_ne!(graph_entry.asset_flags & ASSET_FLAG_XZ, 0); + let graph_bytes = reader.asset_bytes(&graph_entry).unwrap(); + assert_eq!(graph_bytes, b"{\"n\":[0,1,2]}"); +} + +#[test] +fn append_rejects_conflicting_pending_additions() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "new_blob", + b"a", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = appender + .add_asset( + ASSET_TYPE_CUSTOM, + "new_blob", + b"b", + AddAssetOptions::defaults(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(_))); + + let buf = appender.abort().into_inner(); + assert_eq!(buf, bundle_before); +} + +// -------- Phase 4: assignment-stream integration tests -------- + +#[test] +fn write_ben_stream_round_trips_through_assignment_reader() { + use crate::BenVariant; + + let samples: Vec> = vec![ + vec![0, 0, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], // repeat + vec![1, 1, 1, 1, 2, 2], + ]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + // Four write_assignment calls → sample_count == 4. + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); + + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); +} + +#[test] +fn write_xben_stream_round_trips_through_assignment_reader() { + use crate::BenVariant; + + let samples: Vec> = vec![ + vec![0, 1, 2, 3, 4, 5], + vec![0, 1, 2, 3, 4, 5], // repeat + vec![1, 1, 2, 3, 4, 5], + vec![1, 1, 2, 3, 4, 4], + ]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + writer + .write_xben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); + + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Xben(r) => r, + BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); +} + +#[test] +fn write_ben_stream_alongside_front_loaded_asset() { + use crate::BenVariant; + + let graph = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#; + let samples: Vec> = vec![vec![0, 1, 1, 2], vec![0, 1, 2, 2]]; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + + // Front-loaded graph asset survives round trip through xz. + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .expect("graph asset present"); + assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); + let decoded_graph = reader.asset_bytes(&entry).unwrap(); + assert_eq!(decoded_graph, graph); + + // Assignment stream is still intact after pulling asset bytes. + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner + .silent(true) + .flat_map(|r| { + let (assign, count) = r.unwrap(); + std::iter::repeat(assign).take(count as usize) + }) + .collect(); + assert_eq!(decoded, samples); +} + +#[test] +fn open_assignment_reader_rejects_mismatched_format() { + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + ctx.write_assignment(vec![0, 1])?; + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let decoder: BundleAssignmentReader<_> = reader.open_assignment_reader().unwrap(); + assert!(decoder.is_ben()); + assert!(!decoder.is_xben()); +} + +// ----------------------------------------------------------------------- +// Robustness tests +// ----------------------------------------------------------------------- + +#[test] +fn fully_empty_bundle_finalizes_and_round_trips() { + // No assets, no stream bytes, no stream phase at all. + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); + assert_eq!(reader.sample_count(), Some(0)); + assert_eq!(reader.header().stream_len, 0); + assert_eq!(reader.assets().len(), 0); + // Even with zero assets the directory is present and empty. + assert_ne!(reader.header().directory_offset, 0); + // directory_len should equal the 4-byte empty entry-count header. + assert_eq!(reader.header().directory_len, 4); +} + +#[test] +fn begin_stream_twice_returns_wrong_state_error() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + { + let handle = match writer.begin_stream() { + Ok(h) => h, + Err(_) => panic!("first begin_stream must succeed"), + }; + // Drop the handle without calling finish() — the writer is + // now stuck in the Streaming state. + drop(handle); + } + let err = writer + .begin_stream() + .err() + .expect("second begin_stream must fail"); + assert!(matches!(err, BendlWriteError::WrongState { .. })); +} + +#[test] +fn finish_from_streaming_state_errors() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + match writer.begin_stream() { + Ok(handle) => drop(handle), + Err(_) => panic!("begin_stream must succeed"), + } + // Intentionally leave the writer in the Streaming state. + let err = writer.finish().unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongState { + found: "Streaming", + .. + } + )); +} + +#[test] +fn begin_stream_after_stream_written_returns_wrong_state() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + // Writer is now in StreamWritten state; begin_stream must fail. + let err = writer + .begin_stream() + .err() + .expect("begin_stream after StreamWritten must fail"); + assert!(matches!( + err, + BendlWriteError::WrongState { + found: "StreamWritten", + .. + } + )); +} + +#[test] +fn stress_many_custom_assets_round_trip() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let count = 500usize; + for i in 0..count { + let name = format!("blob_{i:05}"); + let payload = vec![(i & 0xFF) as u8; (i % 17) + 1]; + writer + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) + .unwrap(); + } + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), count); + // Spot-check a handful of entries by reading their payload bytes back. + for i in [0usize, 1, 42, 199, 499] { + let name = format!("blob_{i:05}"); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(got, vec![(i & 0xFF) as u8; (i % 17) + 1]); + } +} + +#[test] +fn append_empty_commit_is_noop() { + let (bundle, _) = build_base_bundle(); + let bundle_before = bundle.clone(); + let appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + // No add_asset calls. Commit should return the file unchanged. + let buf = appender.commit().unwrap().into_inner(); + assert_eq!(buf, bundle_before); +} + +#[test] +fn append_then_reopen_and_append_again() { + let (bundle, _) = build_base_bundle(); + + // First commit: add a graph. + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Second commit: reopen the same bytes and add a custom blob. + let mut appender = BendlAppender::open(Cursor::new(buf)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"later", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + // Final read: all three assets should be present. + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let names: Vec<&str> = reader.assets().iter().map(|e| e.name.as_str()).collect(); + assert!(names.contains(&"metadata.json")); + assert!(names.contains(&"graph.json")); + assert!(names.contains(&"extra.bin")); + // Sample count from the original stream is preserved across both + // appends. + assert_eq!(reader.sample_count(), Some(3)); +} + +#[test] +fn append_does_not_disturb_front_loaded_asset_bytes() { + // Base bundle has a graph.json asset with known bytes; after + // append of a custom blob, reading graph.json must still return + // exactly the same decoded bytes as before. + let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9,10]}"#; + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let bundle = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let graph_before = reader.asset_bytes(&entry).unwrap(); + drop(reader); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"0123456789", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .unwrap(); + let graph_after = reader.asset_bytes(&entry).unwrap(); + assert_eq!(graph_before, graph_after); +} + +#[test] +fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { + // A custom asset named "graph.json" is not a singleton because the + // singleton uniqueness check keys off asset_type, not name. Adding + // a real GRAPH singleton after it must then fail on DuplicateName. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "graph.json", + b"custom graph-ish bytes", + AddAssetOptions::defaults(), + ) + .unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "graph.json")); +} + +#[test] +fn writer_asset_with_checksum_round_trips_through_reader() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let checksum = vec![0x01, 0x02, 0x03, 0x04]; + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "with_checksum", + b"hello", + AddAssetOptions { + checksum: Some(checksum.clone()), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader.find_asset_by_name("with_checksum").cloned().unwrap(); + assert_eq!(entry.checksum, Some(checksum)); + assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); +} + +#[test] +fn finished_writer_rejects_further_operations() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + // `finish` consumes `self`, which is itself the protection — there + // is no way to call add_asset/begin_stream afterwards. + let buf = writer.finish().unwrap().into_inner(); + // The resulting buffer is a valid finalized bundle. + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete()); +} + +#[test] +fn appender_commit_after_abort_is_not_possible_but_abort_leaves_bytes_unchanged() { + let (bundle, _) = build_base_bundle(); + let before = bundle.clone(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "wont_land", + b"orphan", + AddAssetOptions::defaults(), + ) + .unwrap(); + let buf = appender.abort().into_inner(); + assert_eq!(buf, before, "abort must leave file bytes unchanged"); +} + +#[test] +fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .add_json_asset(ASSET_TYPE_METADATA, "meta.json", b"{}") + .unwrap_err(); + assert!(matches!( + err, + BendlWriteError::WrongCanonicalName { + asset_type: ASSET_TYPE_METADATA, + .. + } + )); + // After a rejected add, no entries have been recorded — a + // subsequent valid add proceeds normally. + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") + .unwrap(); + writer + .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.assets().len(), 1); +} + +#[test] +fn writer_rejected_add_leaves_singleton_slot_usable() { + // A rejected singleton add must not consume the singleton slot — + // otherwise a future valid add with the correct canonical name + // would spuriously fail with DuplicateSingletonType. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + // First try with wrong canonical name — rejected. + let _ = writer + .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") + .unwrap_err(); + // Now retry with correct name; should succeed. + writer + .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") + .unwrap(); +} + +#[test] +fn append_rejects_duplicate_name_across_existing_and_pending() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + // First pending add: "blob". + appender + .add_asset(ASSET_TYPE_CUSTOM, "blob", b"1", AddAssetOptions::defaults()) + .unwrap(); + // Second pending add with same name must be rejected. + let err = appender + .add_asset(ASSET_TYPE_CUSTOM, "blob", b"2", AddAssetOptions::defaults()) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::DuplicateName(_))); + // Committing the still-valid first pending add should still work. + let buf = appender.commit().unwrap().into_inner(); + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.find_asset_by_name("blob").is_some()); +} + +#[test] +fn write_ben_stream_closure_error_short_circuits_finalize() { + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let err = writer + .write_ben_stream(BenVariant::Standard, |_ctx| { + Err(io::Error::new(io::ErrorKind::Other, "boom")) + }) + .unwrap_err(); + match err { + BendlWriteError::Io(e) => assert_eq!(e.kind(), io::ErrorKind::Other), + other => panic!("expected Io(Other), got {other:?}"), + } +} + +// ----------------------------------------------------------------------- +// Randomized / stress tests +// ----------------------------------------------------------------------- + +/// Build a bundle from a random set of custom assets (plus an optional +/// metadata asset) and fully round-trip it through the reader. Repeated +/// with a seeded ChaCha PRNG so the sequence is deterministic but +/// covers a wide surface. +#[test] +fn randomized_round_trip_many_custom_assets() { + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + for seed in 0u64..12 { + let mut rng = ChaCha8Rng::seed_from_u64(seed ^ 0xA110_CADE_F00D); + let n_assets: usize = rng.random_range(0..=25); + let include_metadata = rng.random_bool(0.5); + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + + let mut expected: Vec<(String, Vec, bool)> = Vec::new(); + if include_metadata { + let payload = format!(r#"{{"seed":{seed}}}"#).into_bytes(); + writer + .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", &payload) + .unwrap(); + expected.push(("metadata.json".to_string(), payload, false)); + } + + for i in 0..n_assets { + let size: usize = rng.random_range(0..=512); + let payload: Vec = (0..size).map(|_| rng.random::()).collect(); + let compress = rng.random_bool(0.4); + let is_json = rng.random_bool(0.15) && size > 0; + let payload = if is_json { + // Override with a synthetic JSON blob so the json flag + // actually matches the content. + format!(r#"{{"i":{i},"seed":{seed}}}"#).into_bytes() + } else { + payload + }; + + let mut opts = AddAssetOptions::defaults(); + if compress { + opts = opts.compress(); + } else { + opts = opts.raw(); + } + if is_json { + opts = opts.json(); + } + let name = format!("seed{seed}-asset{i}.bin"); + writer + .add_asset(ASSET_TYPE_CUSTOM, &name, &payload, opts) + .unwrap(); + expected.push((name, payload, is_json)); + } + + // Write a small deterministic stream so the bundle is + // assignment-complete. + let sample_count: i64 = rng.random_range(0..=20); + let fake_stream = b"STANDARD BEN FILE\x00\x01\x02payload".to_vec(); + writer + .write_stream_bytes(&fake_stream, sample_count) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert!(reader.is_complete(), "seed {seed}: not finalized"); + assert_eq!(reader.sample_count(), Some(sample_count)); + reader + .validate_directory() + .unwrap_or_else(|e| panic!("seed {seed}: validation failed: {e:?}")); + assert_eq!(reader.assets().len(), expected.len(), "seed {seed}"); + + for (name, want, _is_json) in &expected { + let entry = reader + .find_asset_by_name(name) + .cloned() + .unwrap_or_else(|| panic!("seed {seed}: asset {name:?} missing")); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "seed {seed}: payload mismatch for {name}"); + } + + // Stream must also read back exactly. + let mut stream_buf = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream_buf) + .unwrap(); + assert_eq!(stream_buf, fake_stream, "seed {seed}"); + } +} + +#[test] +fn five_successive_appends_preserve_everything() { + // Start from a finalized bundle with only a metadata asset and a + // short stream. Then open it five times via BendlAppender and add + // one asset per round. After every round, the previous assets must + // still be readable and sample_count must remain authoritative. + let (mut buf, _) = build_base_bundle(); + + // Sanity-check the baseline. + let baseline_reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let baseline_samples = baseline_reader.sample_count(); + assert!(baseline_samples.is_some()); + drop(baseline_reader); + + let mut accumulated: Vec<(String, Vec)> = + vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; + + for round in 0..5 { + let cursor = Cursor::new(buf); + let mut appender = BendlAppender::open(cursor).unwrap(); + let name = format!("round-{round}.bin"); + let payload: Vec = (0u8..=(round as u8 * 7 + 3)).collect(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) + .unwrap(); + let commit = appender.commit().unwrap(); + buf = commit.into_inner(); + accumulated.push((name, payload)); + + // Re-open and verify the full set is intact and sample_count + // still matches the baseline (append must not touch it). + let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + assert!(reader.is_complete(), "round {round}"); + assert_eq!( + reader.sample_count(), + baseline_samples, + "sample count drifted at round {round}" + ); + assert_eq!( + reader.assets().len(), + accumulated.len(), + "asset count wrong at round {round}" + ); + reader.validate_directory().unwrap(); + + for (n, want) in &accumulated { + let entry = reader + .find_asset_by_name(n) + .cloned() + .unwrap_or_else(|| panic!("round {round}: {n:?} missing")); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "round {round}: payload mismatch for {n}"); + } + } +} + +#[test] +fn randomized_append_sequence_preserves_all_prior_entries() { + // Independent coverage for append: random number of rounds, random + // payload sizes. Catches any bookkeeping drift in the appender's + // directory-rewrite path. + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + let (mut buf, _) = build_base_bundle(); + let mut accumulated: Vec<(String, Vec)> = + vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; + + let mut rng = ChaCha8Rng::seed_from_u64(0xDEAD_BEEF_CAFE_F00D); + let rounds: usize = rng.random_range(3..=8); + for round in 0..rounds { + let adds: usize = rng.random_range(1..=4); + let cursor = Cursor::new(buf); + let mut appender = BendlAppender::open(cursor).unwrap(); + for k in 0..adds { + let size: usize = rng.random_range(0..=256); + let payload: Vec = (0..size).map(|_| rng.random::()).collect(); + let name = format!("r{round}-a{k}.bin"); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + &name, + &payload, + AddAssetOptions::defaults(), + ) + .unwrap(); + accumulated.push((name, payload)); + } + let commit = appender.commit().unwrap(); + buf = commit.into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + reader.validate_directory().unwrap(); + assert_eq!(reader.assets().len(), accumulated.len()); + for (n, want) in &accumulated { + let entry = reader.find_asset_by_name(n).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "append round {round}: {n}"); + } + } +} + +// ── write_json_value and sample_count coverage ────────────────── + +#[test] +fn write_ben_stream_json_value_and_sample_count() { + use crate::BenVariant; + use serde_json::json; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + assert_eq!(ctx.sample_count(), 0); + ctx.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1}))?; + assert_eq!(ctx.sample_count(), 1); + ctx.write_json_value(json!({"assignment": [4, 5, 6], "sample": 2}))?; + assert_eq!(ctx.sample_count(), 2); + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(2)); + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Ben(r) => r, + BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), + }; + let decoded: Vec> = inner.silent(true).map(|r| r.unwrap().0).collect(); + assert_eq!(decoded, vec![vec![1, 2, 3], vec![4, 5, 6]]); +} + +#[test] +fn write_xben_stream_json_value() { + use crate::BenVariant; + use serde_json::json; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + writer + .write_xben_stream(BenVariant::Standard, |ctx| { + ctx.write_json_value(json!({"assignment": [10, 20], "sample": 1}))?; + ctx.write_json_value(json!({"assignment": [30, 40], "sample": 2}))?; + Ok(()) + }) + .unwrap(); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + assert_eq!(reader.sample_count(), Some(2)); + let decoder = reader.open_assignment_reader().unwrap(); + let inner = match decoder { + BundleAssignmentReader::Xben(r) => r, + BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), + }; + let decoded: Vec> = inner.silent(true).map(|r| r.unwrap().0).collect(); + assert_eq!(decoded, vec![vec![10, 20], vec![30, 40]]); +} + +// ── BendlStreamHandle: flush ───────────────────────────────────── + +#[test] +fn stream_handle_flush_succeeds() { + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut handle = writer.begin_stream().unwrap(); + handle.flush().unwrap(); +} + +// ── BendlAppender: checksum flag ──────────────────────────────── + +#[test] +fn appender_commit_with_checksum_sets_checksum_flag() { + let (bundle, _) = build_base_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "checksummed", + b"payload", + AddAssetOptions { + checksum: Some(vec![0xAB, 0xCD]), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + let buf = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader.find_asset_by_name("checksummed").unwrap(); + assert_eq!(entry.checksum, Some(vec![0xAB, 0xCD])); + assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); +} + +// ── BendlAppender: trailing directory bytes ────────────────────── + +#[test] +fn appender_rejects_bundle_with_trailing_directory_bytes() { + let (mut bundle, _) = build_base_bundle(); + // Patch the header's directory_len field (bytes 32-39) to claim + // the directory is 4 bytes longer than it actually is. + let old_len = u64::from_le_bytes(bundle[32..40].try_into().unwrap()); + let patched = (old_len + 4).to_le_bytes(); + bundle[32..40].copy_from_slice(&patched); + + match BendlAppender::open(Cursor::new(bundle)) { + Err(BendlWriteError::Format(BendlFormatError::TrailingDirectoryBytes { .. })) => {} + Err(other) => panic!("expected TrailingDirectoryBytes, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +// ── finalize from wrong state ─────────────────────────────────── + +#[test] +fn finish_from_finished_state_errors() { + use crate::BenVariant; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .write_ben_stream(BenVariant::Standard, |ctx| { + ctx.write_assignment(vec![1, 2])?; + Ok(()) + }) + .unwrap(); + // First finish succeeds + let buf = writer.finish().unwrap(); + // Verify the result is usable + let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); + assert!(reader.is_complete()); +} diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index e36b45e..2f98a91 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -838,1368 +838,3 @@ impl BendlAppender { self.inner } } - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use std::io::{Cursor, Read}; - - use super::*; - use crate::io::bundle::format::{ - ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, - }; - use crate::io::bundle::reader::BendlReader; - - fn make_buffer() -> Cursor> { - Cursor::new(Vec::new()) - } - - #[test] - fn minimal_bundle_round_trip_through_reader() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", br#"{"note":"hello"}"#) - .unwrap(); - let stream_bytes = b"STANDARD BEN FILE\x00\x01fake".to_vec(); - writer.write_stream_bytes(&stream_bytes, 7).unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(7)); - assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); - assert_eq!(reader.assets().len(), 1); - - let entry = reader - .find_asset_by_type(ASSET_TYPE_METADATA) - .cloned() - .expect("metadata entry present"); - assert_eq!(entry.name, "metadata.json"); - assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); - let meta_bytes = reader.asset_bytes(&entry).unwrap(); - assert_eq!(meta_bytes, br#"{"note":"hello"}"#); - - let mut stream_buf = Vec::new(); - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut stream_buf) - .unwrap(); - assert_eq!(stream_buf, stream_bytes); - } - - #[test] - fn graph_asset_is_compressed_by_default() { - let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9],"edges":[[0,1],[1,2],[2,3],[3,4]]}"#; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .expect("graph entry present"); - assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); - // Compressed size should differ from the raw size for a non-trivial - // JSON payload. For very short payloads xz actually inflates the - // bytes, so this just checks the size is non-zero and different. - assert_ne!(entry.payload_len, graph.len() as u64); - - // Decoded bytes round-trip. - let decoded = reader.asset_bytes(&entry).unwrap(); - assert_eq!(decoded, graph); - } - - #[test] - fn graph_asset_can_be_forced_raw() { - let graph = br#"{"nodes":[0,1,2]}"#; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_asset( - ASSET_TYPE_GRAPH, - "graph.json", - graph, - AddAssetOptions::defaults().json().raw(), - ) - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .expect("graph entry present"); - assert_eq!(entry.asset_flags & ASSET_FLAG_XZ, 0); - assert_eq!(entry.payload_len, graph.len() as u64); - } - - #[test] - fn writer_rejects_second_graph() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") - .unwrap(); - let err = writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateSingletonType(t) if t == ASSET_TYPE_GRAPH)); - } - - #[test] - fn writer_rejects_wrong_canonical_name_for_singleton() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let err = writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph_but_wrong_name.json", b"{}") - .unwrap_err(); - assert!(matches!( - err, - BendlWriteError::WrongCanonicalName { - asset_type: ASSET_TYPE_GRAPH, - .. - } - )); - } - - #[test] - fn writer_rejects_duplicate_custom_name() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"first", - AddAssetOptions::defaults(), - ) - .unwrap(); - let err = writer - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"second", - AddAssetOptions::defaults(), - ) - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); - } - - #[test] - fn writer_rejects_asset_added_after_stream_begins() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - { - let mut handle = writer.begin_stream().unwrap(); - handle.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); - handle.finish(1).unwrap(); - } - let err = writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") - .unwrap_err(); - assert!(matches!(err, BendlWriteError::AssetsAfterStream)); - } - - #[test] - fn asset_only_bundle_finalizes_with_empty_stream() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(0)); - assert_eq!(reader.header().stream_len, 0); - } - - #[test] - fn finalized_directory_lives_at_eof() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - let header = reader.header(); - let file_len = buf.len() as u64; - assert_eq!(header.directory_offset + header.directory_len, file_len); - // Stream ends where directory begins. - assert_eq!( - header.stream_offset + header.stream_len, - header.directory_offset - ); - } - - // ----------------------------------------------------------------------- - // Append-path tests - // ----------------------------------------------------------------------- - - /// Build a finalized bundle with a single `metadata.json` asset and - /// a short fake stream, then return both the bytes and the byte - /// range (offset, len) occupied by the stream region. - fn build_base_bundle() -> (Vec, (u64, u64)) { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"version\":1}") - .unwrap(); - let stream = b"STANDARD BEN FILE\x00\x01\x02\x03\x04\x05stream bytes"; - writer.write_stream_bytes(stream, 3).unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - let range = (reader.header().stream_offset, reader.header().stream_len); - (buf, range) - } - - #[test] - fn append_adds_new_asset_and_preserves_old_entries() { - let (bundle, _) = build_base_bundle(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[]}") - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.assets().len(), 2); - assert!(reader.find_asset_by_name("metadata.json").is_some()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - // Finalized bundle invariants still hold. - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(3)); - } - - #[test] - fn append_leaves_stream_bytes_byte_for_byte_unchanged() { - let (bundle, (stream_offset, stream_len)) = build_base_bundle(); - let original_stream_bytes = - bundle[stream_offset as usize..(stream_offset + stream_len) as usize].to_vec(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"appended custom bytes", - AddAssetOptions::defaults(), - ) - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - // Read back the new header to locate the stream region, then - // confirm the stream bytes are byte-identical to the original. - let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - let (off, len) = (reader.header().stream_offset, reader.header().stream_len); - let appended_stream_bytes = buf[off as usize..(off + len) as usize].to_vec(); - assert_eq!(appended_stream_bytes, original_stream_bytes); - // Stream offset should not have moved either. - assert_eq!(off, stream_offset); - assert_eq!(len, stream_len); - } - - #[test] - fn append_preserves_existing_entries_payload_offsets() { - let (bundle, _) = build_base_bundle(); - - // Snapshot the metadata entry's payload_offset before append. - let reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); - let old_offset = reader - .find_asset_by_name("metadata.json") - .unwrap() - .payload_offset; - drop(reader); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"nodes\":[0,1,2,3,4,5]}") - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let new_offset = reader - .find_asset_by_name("metadata.json") - .unwrap() - .payload_offset; - assert_eq!( - old_offset, new_offset, - "existing asset offset must not move" - ); - } - - #[test] - fn append_rejects_duplicate_singleton_without_touching_file() { - let (bundle, _) = build_base_bundle(); - let bundle_before = bundle.clone(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - let err = appender - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"new\":true}") - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateSingletonType(_))); - - // Abort and confirm the file is byte-for-byte unchanged. - let buf = appender.abort().into_inner(); - assert_eq!(buf, bundle_before); - } - - #[test] - fn append_rejects_duplicate_custom_name_without_touching_file() { - // Start from a bundle containing a custom asset named "blob", then - // try to append another "blob". - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"original", - AddAssetOptions::defaults(), - ) - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let bundle = writer.finish().unwrap().into_inner(); - let bundle_before = bundle.clone(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - let err = appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob", - b"dup", - AddAssetOptions::defaults(), - ) - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "blob")); - - let buf = appender.abort().into_inner(); - assert_eq!(buf, bundle_before); - } - - #[test] - fn append_rejects_wrong_canonical_name_without_touching_file() { - let (bundle, _) = build_base_bundle(); - let bundle_before = bundle.clone(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - let err = appender - .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") - .unwrap_err(); - assert!(matches!( - err, - BendlWriteError::WrongCanonicalName { - asset_type: ASSET_TYPE_GRAPH, - .. - } - )); - - let buf = appender.abort().into_inner(); - assert_eq!(buf, bundle_before); - } - - #[test] - fn append_rejects_incomplete_bundle() { - // Construct a minimal incomplete bundle: just the provisional - // header and some stream bytes, no directory. - use crate::io::bundle::format::{ - BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, - }; - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: 0, - directory_len: 0, - stream_offset: HEADER_SIZE as u64, - stream_len: 0, - sample_count: -1, - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - bytes.extend_from_slice(b"STANDARD BEN FILE\x00fake"); - - match BendlAppender::open(Cursor::new(bytes)) { - Err(BendlWriteError::BundleIncomplete) => {} - Err(other) => panic!("expected BundleIncomplete, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn append_rejects_complete_bundle_with_zero_directory() { - // Header claims complete but has directory_offset=0 — hits the second - // BundleIncomplete check (line 647). - use crate::io::bundle::format::{ - BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_YES, - }; - let header = BendlHeader { - magic: BENDL_MAGIC, - major_version: BENDL_MAJOR_VERSION, - minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, - assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, - directory_offset: 0, - directory_len: 0, - stream_offset: HEADER_SIZE as u64, - stream_len: 0, - sample_count: 0, - }; - let mut bytes = Vec::new(); - bytes.extend_from_slice(&header.to_bytes()); - // Pad to HEADER_SIZE (already exactly 64 bytes from to_bytes) - - match BendlAppender::open(Cursor::new(bytes)) { - Err(BendlWriteError::BundleIncomplete) => {} - Err(other) => panic!("expected BundleIncomplete, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - #[test] - fn append_multiple_assets_in_one_commit() { - let (bundle, _) = build_base_bundle(); - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") - .unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob1", - b"blob one", - AddAssetOptions::defaults(), - ) - .unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "blob2", - b"blob two", - AddAssetOptions::defaults(), - ) - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.assets().len(), 4); - // Round-trip the appended graph through the reader to confirm - // compression happened and decodes cleanly. - let graph_entry = reader - .find_asset_by_name("graph.json") - .cloned() - .expect("graph entry present"); - assert_ne!(graph_entry.asset_flags & ASSET_FLAG_XZ, 0); - let graph_bytes = reader.asset_bytes(&graph_entry).unwrap(); - assert_eq!(graph_bytes, b"{\"n\":[0,1,2]}"); - } - - #[test] - fn append_rejects_conflicting_pending_additions() { - let (bundle, _) = build_base_bundle(); - let bundle_before = bundle.clone(); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "new_blob", - b"a", - AddAssetOptions::defaults(), - ) - .unwrap(); - let err = appender - .add_asset( - ASSET_TYPE_CUSTOM, - "new_blob", - b"b", - AddAssetOptions::defaults(), - ) - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateName(_))); - - let buf = appender.abort().into_inner(); - assert_eq!(buf, bundle_before); - } - - // -------- Phase 4: assignment-stream integration tests -------- - - #[test] - fn write_ben_stream_round_trips_through_assignment_reader() { - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - - let samples: Vec> = vec![ - vec![0, 0, 1, 1, 2, 2], - vec![0, 1, 1, 1, 2, 2], - vec![0, 1, 1, 1, 2, 2], // repeat - vec![1, 1, 1, 1, 2, 2], - ]; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_ben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - // Four write_assignment calls → sample_count == 4. - assert_eq!(reader.sample_count(), Some(samples.len() as i64)); - assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); - - let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner - .silent(true) - .flat_map(|r| { - let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) - }) - .collect(); - assert_eq!(decoded, samples); - } - - #[test] - fn write_xben_stream_round_trips_through_assignment_reader() { - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - - let samples: Vec> = vec![ - vec![0, 1, 2, 3, 4, 5], - vec![0, 1, 2, 3, 4, 5], // repeat - vec![1, 1, 2, 3, 4, 5], - vec![1, 1, 2, 3, 4, 4], - ]; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - writer - .write_xben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(samples.len() as i64)); - assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); - - let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Xben(r) => r, - BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), - }; - let decoded: Vec> = inner - .silent(true) - .flat_map(|r| { - let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) - }) - .collect(); - assert_eq!(decoded, samples); - } - - #[test] - fn write_ben_stream_alongside_front_loaded_asset() { - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - - let graph = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#; - let samples: Vec> = vec![vec![0, 1, 1, 2], vec![0, 1, 2, 2]]; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) - .unwrap(); - writer - .write_ben_stream(BenVariant::Standard, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.sample_count(), Some(samples.len() as i64)); - - // Front-loaded graph asset survives round trip through xz. - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .expect("graph asset present"); - assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); - let decoded_graph = reader.asset_bytes(&entry).unwrap(); - assert_eq!(decoded_graph, graph); - - // Assignment stream is still intact after pulling asset bytes. - let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner - .silent(true) - .flat_map(|r| { - let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) - }) - .collect(); - assert_eq!(decoded, samples); - } - - #[test] - fn open_assignment_reader_rejects_mismatched_format() { - // Build a BEN bundle and open a reader, and verify the is_ben/is_xben - // discriminators reflect the header. - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_ben_stream(BenVariant::Standard, |ctx| { - ctx.write_assignment(vec![0, 1])?; - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let decoder: BundleAssignmentReader<_> = reader.open_assignment_reader().unwrap(); - assert!(decoder.is_ben()); - assert!(!decoder.is_xben()); - } - - // ----------------------------------------------------------------------- - // Robustness tests - // ----------------------------------------------------------------------- - - #[test] - fn fully_empty_bundle_finalizes_and_round_trips() { - // No assets, no stream bytes, no stream phase at all. - let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let buf = writer.finish().unwrap().into_inner(); - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - assert_eq!(reader.sample_count(), Some(0)); - assert_eq!(reader.header().stream_len, 0); - assert_eq!(reader.assets().len(), 0); - // Even with zero assets the directory is present and empty. - assert_ne!(reader.header().directory_offset, 0); - // directory_len should equal the 4-byte empty entry-count header. - assert_eq!(reader.header().directory_len, 4); - } - - #[test] - fn begin_stream_twice_returns_wrong_state_error() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - { - let handle = match writer.begin_stream() { - Ok(h) => h, - Err(_) => panic!("first begin_stream must succeed"), - }; - // Drop the handle without calling finish() — the writer is - // now stuck in the Streaming state. - drop(handle); - } - let err = writer - .begin_stream() - .err() - .expect("second begin_stream must fail"); - assert!(matches!(err, BendlWriteError::WrongState { .. })); - } - - #[test] - fn finish_from_streaming_state_errors() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - match writer.begin_stream() { - Ok(handle) => drop(handle), - Err(_) => panic!("begin_stream must succeed"), - } - // Intentionally leave the writer in the Streaming state. - let err = writer.finish().unwrap_err(); - assert!(matches!( - err, - BendlWriteError::WrongState { - found: "Streaming", - .. - } - )); - } - - #[test] - fn begin_stream_after_stream_written_returns_wrong_state() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - // Writer is now in StreamWritten state; begin_stream must fail. - let err = writer - .begin_stream() - .err() - .expect("begin_stream after StreamWritten must fail"); - assert!(matches!( - err, - BendlWriteError::WrongState { - found: "StreamWritten", - .. - } - )); - } - - #[test] - fn stress_many_custom_assets_round_trip() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let count = 500usize; - for i in 0..count { - let name = format!("blob_{i:05}"); - let payload = vec![(i & 0xFF) as u8; (i % 17) + 1]; - writer - .add_asset( - ASSET_TYPE_CUSTOM, - &name, - &payload, - AddAssetOptions::defaults(), - ) - .unwrap(); - } - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.assets().len(), count); - // Spot-check a handful of entries by reading their payload bytes back. - for i in [0usize, 1, 42, 199, 499] { - let name = format!("blob_{i:05}"); - let entry = reader.find_asset_by_name(&name).cloned().unwrap(); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(got, vec![(i & 0xFF) as u8; (i % 17) + 1]); - } - } - - #[test] - fn append_empty_commit_is_noop() { - let (bundle, _) = build_base_bundle(); - let bundle_before = bundle.clone(); - let appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - // No add_asset calls. Commit should return the file unchanged. - let buf = appender.commit().unwrap().into_inner(); - assert_eq!(buf, bundle_before); - } - - #[test] - fn append_then_reopen_and_append_again() { - let (bundle, _) = build_base_bundle(); - - // First commit: add a graph. - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{\"n\":[0,1,2]}") - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - // Second commit: reopen the same bytes and add a custom blob. - let mut appender = BendlAppender::open(Cursor::new(buf)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "extra.bin", - b"later", - AddAssetOptions::defaults(), - ) - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - // Final read: all three assets should be present. - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let names: Vec<&str> = reader.assets().iter().map(|e| e.name.as_str()).collect(); - assert!(names.contains(&"metadata.json")); - assert!(names.contains(&"graph.json")); - assert!(names.contains(&"extra.bin")); - // Sample count from the original stream is preserved across both - // appends. - assert_eq!(reader.sample_count(), Some(3)); - } - - #[test] - fn append_does_not_disturb_front_loaded_asset_bytes() { - // Base bundle has a graph.json asset with known bytes; after - // append of a custom blob, reading graph.json must still return - // exactly the same decoded bytes as before. - let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9,10]}"#; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let bundle = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .unwrap(); - let graph_before = reader.asset_bytes(&entry).unwrap(); - drop(reader); - - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "extra.bin", - b"0123456789", - AddAssetOptions::defaults(), - ) - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader - .find_asset_by_type(ASSET_TYPE_GRAPH) - .cloned() - .unwrap(); - let graph_after = reader.asset_bytes(&entry).unwrap(); - assert_eq!(graph_before, graph_after); - } - - #[test] - fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { - // A custom asset named "graph.json" is not a singleton because the - // singleton uniqueness check keys off asset_type, not name. Adding - // a real GRAPH singleton after it must then fail on DuplicateName. - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .add_asset( - ASSET_TYPE_CUSTOM, - "graph.json", - b"custom graph-ish bytes", - AddAssetOptions::defaults(), - ) - .unwrap(); - let err = writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateName(ref n) if n == "graph.json")); - } - - #[test] - fn writer_asset_with_checksum_round_trips_through_reader() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let checksum = vec![0x01, 0x02, 0x03, 0x04]; - writer - .add_asset( - ASSET_TYPE_CUSTOM, - "with_checksum", - b"hello", - AddAssetOptions { - checksum: Some(checksum.clone()), - ..AddAssetOptions::defaults() - }, - ) - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader.find_asset_by_name("with_checksum").cloned().unwrap(); - assert_eq!(entry.checksum, Some(checksum)); - assert_ne!( - entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, - 0 - ); - } - - #[test] - fn finished_writer_rejects_further_operations() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - // Take a handle to the writer by going through begin_stream first. - // Actually finish() consumes self, so instead assert the state - // machine barfs when we manually poke it in the Finished state. - // - // We simulate by calling finish() and then checking there is no - // way to call add_asset/begin_stream afterwards — `finish` consumes - // `self`, which is itself the protection. - let buf = writer.finish().unwrap().into_inner(); - // The resulting buffer is a valid finalized bundle. - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); - } - - #[test] - fn appender_commit_after_abort_is_not_possible_but_abort_leaves_bytes_unchanged() { - let (bundle, _) = build_base_bundle(); - let before = bundle.clone(); - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "wont_land", - b"orphan", - AddAssetOptions::defaults(), - ) - .unwrap(); - let buf = appender.abort().into_inner(); - assert_eq!(buf, before, "abort must leave file bytes unchanged"); - } - - #[test] - fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let err = writer - .add_json_asset(ASSET_TYPE_METADATA, "meta.json", b"{}") - .unwrap_err(); - assert!(matches!( - err, - BendlWriteError::WrongCanonicalName { - asset_type: ASSET_TYPE_METADATA, - .. - } - )); - // After a rejected add, no entries have been recorded — a - // subsequent valid add proceeds normally. - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") - .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.assets().len(), 1); - } - - #[test] - fn writer_rejected_add_leaves_singleton_slot_usable() { - // A rejected singleton add must not consume the singleton slot — - // otherwise a future valid add with the correct canonical name - // would spuriously fail with DuplicateSingletonType. - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - // First try with wrong canonical name — rejected. - let _ = writer - .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") - .unwrap_err(); - // Now retry with correct name; should succeed. - writer - .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}") - .unwrap(); - } - - #[test] - fn append_rejects_duplicate_name_across_existing_and_pending() { - let (bundle, _) = build_base_bundle(); - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - // First pending add: "blob". - appender - .add_asset(ASSET_TYPE_CUSTOM, "blob", b"1", AddAssetOptions::defaults()) - .unwrap(); - // Second pending add with same name must be rejected. - let err = appender - .add_asset(ASSET_TYPE_CUSTOM, "blob", b"2", AddAssetOptions::defaults()) - .unwrap_err(); - assert!(matches!(err, BendlWriteError::DuplicateName(_))); - // Committing the still-valid first pending add should still work. - let buf = appender.commit().unwrap().into_inner(); - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.find_asset_by_name("blob").is_some()); - } - - #[test] - fn write_ben_stream_closure_error_short_circuits_finalize() { - use crate::BenVariant; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let err = writer - .write_ben_stream(BenVariant::Standard, |_ctx| { - Err(io::Error::new(io::ErrorKind::Other, "boom")) - }) - .unwrap_err(); - match err { - BendlWriteError::Io(e) => assert_eq!(e.kind(), io::ErrorKind::Other), - other => panic!("expected Io(Other), got {other:?}"), - } - } - - // ----------------------------------------------------------------------- - // Randomized / stress tests - // ----------------------------------------------------------------------- - - /// Build a bundle from a random set of custom assets (plus an optional - /// metadata asset) and fully round-trip it through the reader. Repeated - /// with a seeded ChaCha PRNG so the sequence is deterministic but - /// covers a wide surface. - #[test] - fn randomized_round_trip_many_custom_assets() { - use rand::{Rng, SeedableRng}; - use rand_chacha::ChaCha8Rng; - - for seed in 0u64..12 { - let mut rng = ChaCha8Rng::seed_from_u64(seed ^ 0xA110_CADE_F00D); - let n_assets: usize = rng.random_range(0..=25); - let include_metadata = rng.random_bool(0.5); - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - - let mut expected: Vec<(String, Vec, bool)> = Vec::new(); - if include_metadata { - let payload = format!(r#"{{"seed":{seed}}}"#).into_bytes(); - writer - .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", &payload) - .unwrap(); - expected.push(("metadata.json".to_string(), payload, false)); - } - - for i in 0..n_assets { - let size: usize = rng.random_range(0..=512); - let payload: Vec = (0..size).map(|_| rng.random::()).collect(); - let compress = rng.random_bool(0.4); - let is_json = rng.random_bool(0.15) && size > 0; - let payload = if is_json { - // Override with a synthetic JSON blob so the json flag - // actually matches the content. - format!(r#"{{"i":{i},"seed":{seed}}}"#).into_bytes() - } else { - payload - }; - - let mut opts = AddAssetOptions::defaults(); - if compress { - opts = opts.compress(); - } else { - opts = opts.raw(); - } - if is_json { - opts = opts.json(); - } - let name = format!("seed{seed}-asset{i}.bin"); - writer - .add_asset(ASSET_TYPE_CUSTOM, &name, &payload, opts) - .unwrap(); - expected.push((name, payload, is_json)); - } - - // Write a small deterministic stream so the bundle is - // assignment-complete. - let sample_count: i64 = rng.random_range(0..=20); - let fake_stream = b"STANDARD BEN FILE\x00\x01\x02payload".to_vec(); - writer - .write_stream_bytes(&fake_stream, sample_count) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete(), "seed {seed}: not finalized"); - assert_eq!(reader.sample_count(), Some(sample_count)); - reader - .validate_directory() - .unwrap_or_else(|e| panic!("seed {seed}: validation failed: {e:?}")); - assert_eq!(reader.assets().len(), expected.len(), "seed {seed}"); - - for (name, want, _is_json) in &expected { - let entry = reader - .find_asset_by_name(name) - .cloned() - .unwrap_or_else(|| panic!("seed {seed}: asset {name:?} missing")); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(&got, want, "seed {seed}: payload mismatch for {name}"); - } - - // Stream must also read back exactly. - let mut stream_buf = Vec::new(); - reader - .assignment_stream_reader() - .unwrap() - .read_to_end(&mut stream_buf) - .unwrap(); - assert_eq!(stream_buf, fake_stream, "seed {seed}"); - } - } - - #[test] - fn five_successive_appends_preserve_everything() { - // Start from a finalized bundle with only a metadata asset and a - // short stream. Then open it five times via BendlAppender and add - // one asset per round. After every round, the previous assets must - // still be readable and sample_count must remain authoritative. - let (mut buf, _) = build_base_bundle(); - - // Sanity-check the baseline. - let baseline_reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - let baseline_samples = baseline_reader.sample_count(); - assert!(baseline_samples.is_some()); - drop(baseline_reader); - - let mut accumulated: Vec<(String, Vec)> = - vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; - - for round in 0..5 { - let cursor = Cursor::new(buf); - let mut appender = BendlAppender::open(cursor).unwrap(); - let name = format!("round-{round}.bin"); - let payload: Vec = (0u8..=(round as u8 * 7 + 3)).collect(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - &name, - &payload, - AddAssetOptions::defaults(), - ) - .unwrap(); - let commit = appender.commit().unwrap(); - buf = commit.into_inner(); - accumulated.push((name, payload)); - - // Re-open and verify the full set is intact and sample_count - // still matches the baseline (append must not touch it). - let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - assert!(reader.is_complete(), "round {round}"); - assert_eq!( - reader.sample_count(), - baseline_samples, - "sample count drifted at round {round}" - ); - assert_eq!( - reader.assets().len(), - accumulated.len(), - "asset count wrong at round {round}" - ); - reader.validate_directory().unwrap(); - - for (n, want) in &accumulated { - let entry = reader - .find_asset_by_name(n) - .cloned() - .unwrap_or_else(|| panic!("round {round}: {n:?} missing")); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(&got, want, "round {round}: payload mismatch for {n}"); - } - } - } - - #[test] - fn randomized_append_sequence_preserves_all_prior_entries() { - // Independent coverage for append: random number of rounds, random - // payload sizes. Catches any bookkeeping drift in the appender's - // directory-rewrite path. - use rand::{Rng, SeedableRng}; - use rand_chacha::ChaCha8Rng; - - let (mut buf, _) = build_base_bundle(); - let mut accumulated: Vec<(String, Vec)> = - vec![("metadata.json".to_string(), br#"{"version":1}"#.to_vec())]; - - let mut rng = ChaCha8Rng::seed_from_u64(0xDEAD_BEEF_CAFE_F00D); - let rounds: usize = rng.random_range(3..=8); - for round in 0..rounds { - let adds: usize = rng.random_range(1..=4); - let cursor = Cursor::new(buf); - let mut appender = BendlAppender::open(cursor).unwrap(); - for k in 0..adds { - let size: usize = rng.random_range(0..=256); - let payload: Vec = (0..size).map(|_| rng.random::()).collect(); - let name = format!("r{round}-a{k}.bin"); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - &name, - &payload, - AddAssetOptions::defaults(), - ) - .unwrap(); - accumulated.push((name, payload)); - } - let commit = appender.commit().unwrap(); - buf = commit.into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - reader.validate_directory().unwrap(); - assert_eq!(reader.assets().len(), accumulated.len()); - for (n, want) in &accumulated { - let entry = reader.find_asset_by_name(n).cloned().unwrap(); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(&got, want, "append round {round}: {n}"); - } - } - } - - // ── write_json_value and sample_count coverage ────────────────── - - #[test] - fn write_ben_stream_json_value_and_sample_count() { - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - use serde_json::json; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_ben_stream(BenVariant::Standard, |ctx| { - assert_eq!(ctx.sample_count(), 0); - ctx.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1}))?; - assert_eq!(ctx.sample_count(), 1); - ctx.write_json_value(json!({"assignment": [4, 5, 6], "sample": 2}))?; - assert_eq!(ctx.sample_count(), 2); - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.sample_count(), Some(2)); - let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner - .silent(true) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(decoded, vec![vec![1, 2, 3], vec![4, 5, 6]]); - } - - #[test] - fn write_xben_stream_json_value() { - use crate::io::bundle::reader::BundleAssignmentReader; - use crate::BenVariant; - use serde_json::json; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - writer - .write_xben_stream(BenVariant::Standard, |ctx| { - ctx.write_json_value(json!({"assignment": [10, 20], "sample": 1}))?; - ctx.write_json_value(json!({"assignment": [30, 40], "sample": 2}))?; - Ok(()) - }) - .unwrap(); - let buf = writer.finish().unwrap().into_inner(); - - let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert_eq!(reader.sample_count(), Some(2)); - let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Xben(r) => r, - BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), - }; - let decoded: Vec> = inner - .silent(true) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(decoded, vec![vec![10, 20], vec![30, 40]]); - } - - // ── BendlStreamHandle: flush ───────────────────────────────────── - - #[test] - fn stream_handle_flush_succeeds() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let mut handle = writer.begin_stream().unwrap(); - use std::io::Write; - handle.flush().unwrap(); - } - - // ── BendlAppender: checksum flag ──────────────────────────────── - - #[test] - fn appender_commit_with_checksum_sets_checksum_flag() { - let (bundle, _) = build_base_bundle(); - let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); - appender - .add_asset( - ASSET_TYPE_CUSTOM, - "checksummed", - b"payload", - AddAssetOptions { - checksum: Some(vec![0xAB, 0xCD]), - ..AddAssetOptions::defaults() - }, - ) - .unwrap(); - let buf = appender.commit().unwrap().into_inner(); - - let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let entry = reader.find_asset_by_name("checksummed").unwrap(); - assert_eq!(entry.checksum, Some(vec![0xAB, 0xCD])); - assert_ne!( - entry.asset_flags & crate::io::bundle::format::ASSET_FLAG_CHECKSUM, - 0 - ); - } - - // ── BendlAppender: trailing directory bytes ────────────────────── - - #[test] - fn appender_rejects_bundle_with_trailing_directory_bytes() { - let (mut bundle, _) = build_base_bundle(); - // Patch the header's directory_len field (bytes 32-39) to claim - // the directory is 4 bytes longer than it actually is. - let old_len = u64::from_le_bytes(bundle[32..40].try_into().unwrap()); - let patched = (old_len + 4).to_le_bytes(); - bundle[32..40].copy_from_slice(&patched); - - match BendlAppender::open(Cursor::new(bundle)) { - Err(BendlWriteError::Format(BendlFormatError::TrailingDirectoryBytes { .. })) => {} - Err(other) => panic!("expected TrailingDirectoryBytes, got {other:?}"), - Ok(_) => panic!("expected error, got Ok"), - } - } - - // ── finalize from wrong state ─────────────────────────────────── - - #[test] - fn finish_from_finished_state_errors() { - use crate::BenVariant; - - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_ben_stream(BenVariant::Standard, |ctx| { - ctx.write_assignment(vec![1, 2])?; - Ok(()) - }) - .unwrap(); - // First finish succeeds - let buf = writer.finish().unwrap(); - // Verify the result is usable - let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); - assert!(reader.is_complete()); - } -} From 092bdac8ca83e7b0622c9f017651a99485c4d708 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:39:51 -0600 Subject: [PATCH 076/221] reorg cli module --- ben/src/cli/ben.rs | 1388 -------------------------- ben/src/cli/ben/args.rs | 137 +++ ben/src/cli/ben/bundle.rs | 146 +++ ben/src/cli/ben/mod.rs | 382 +++++++ ben/src/cli/ben/paths.rs | 182 ++++ ben/src/cli/ben/tests.rs | 600 +++++++++++ ben/src/cli/bendl.rs | 934 ----------------- ben/src/cli/bendl/append.rs | 72 ++ ben/src/cli/bendl/args.rs | 130 +++ ben/src/cli/bendl/create.rs | 93 ++ ben/src/cli/bendl/extract.rs | 47 + ben/src/cli/bendl/helpers.rs | 53 + ben/src/cli/bendl/inspect.rs | 71 ++ ben/src/cli/bendl/mod.rs | 48 + ben/src/cli/bendl/tests.rs | 478 +++++++++ ben/src/cli/common.rs | 113 --- ben/src/cli/common/mod.rs | 55 + ben/src/cli/common/tests.rs | 57 ++ ben/src/cli/{pben.rs => pben/mod.rs} | 146 +-- ben/src/cli/pben/tests.rs | 143 +++ ben/src/cli/reben.rs | 977 ------------------ ben/src/cli/reben/args.rs | 83 ++ ben/src/cli/reben/ben_mode.rs | 181 ++++ ben/src/cli/reben/helpers.rs | 99 ++ ben/src/cli/reben/json_mode.rs | 60 ++ ben/src/cli/reben/mod.rs | 34 + ben/src/cli/reben/tests.rs | 565 +++++++++++ 27 files changed, 3717 insertions(+), 3557 deletions(-) delete mode 100644 ben/src/cli/ben.rs create mode 100644 ben/src/cli/ben/args.rs create mode 100644 ben/src/cli/ben/bundle.rs create mode 100644 ben/src/cli/ben/mod.rs create mode 100644 ben/src/cli/ben/paths.rs create mode 100644 ben/src/cli/ben/tests.rs delete mode 100644 ben/src/cli/bendl.rs create mode 100644 ben/src/cli/bendl/append.rs create mode 100644 ben/src/cli/bendl/args.rs create mode 100644 ben/src/cli/bendl/create.rs create mode 100644 ben/src/cli/bendl/extract.rs create mode 100644 ben/src/cli/bendl/helpers.rs create mode 100644 ben/src/cli/bendl/inspect.rs create mode 100644 ben/src/cli/bendl/mod.rs create mode 100644 ben/src/cli/bendl/tests.rs delete mode 100644 ben/src/cli/common.rs create mode 100644 ben/src/cli/common/mod.rs create mode 100644 ben/src/cli/common/tests.rs rename ben/src/cli/{pben.rs => pben/mod.rs} (63%) create mode 100644 ben/src/cli/pben/tests.rs delete mode 100644 ben/src/cli/reben.rs create mode 100644 ben/src/cli/reben/args.rs create mode 100644 ben/src/cli/reben/ben_mode.rs create mode 100644 ben/src/cli/reben/helpers.rs create mode 100644 ben/src/cli/reben/json_mode.rs create mode 100644 ben/src/cli/reben/mod.rs create mode 100644 ben/src/cli/reben/tests.rs diff --git a/ben/src/cli/ben.rs b/ben/src/cli/ben.rs deleted file mode 100644 index d5e3324..0000000 --- a/ben/src/cli/ben.rs +++ /dev/null @@ -1,1388 +0,0 @@ -use crate::cli::common::{check_overwrite, set_verbose}; -use crate::codec::decode::{ - decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, -}; -use crate::codec::encode::{ - encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, -}; -use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH}; -use crate::io::bundle::writer::BendlAppender; -use crate::io::bundle::{AddAssetOptions, BendlWriter}; -use crate::io::reader::subsample::count_samples_from_file; -use crate::ops::extract::extract_assignment_ben; -use crate::BenVariant; -use clap::{Parser, ValueEnum}; -use std::{ - fs::{File, OpenOptions}, - io::{self, BufRead, BufReader, BufWriter, Result, Write}, - path::{Path, PathBuf}, -}; - -type DynReader = Box; -type DynWriter = Box; - -#[derive(Debug, Clone, Copy, ValueEnum, PartialEq)] -enum CliVariant { - /// Store each sample independently. - Standard, - /// Store one frame plus a repetition count for repeated consecutive samples. - #[value(alias = "mkv_chain")] - Mkvchain, - /// Store delta-encoded frames. - #[value(alias = "two_delta")] - Twodelta, -} - -/// Resolve the BEN variant from the CLI flags. -/// -/// `--variant` takes precedence over `--save-all`. -/// If neither is given, defaults to MkvChain. -fn resolve_variant(variant: Option, save_all: bool) -> BenVariant { - match variant { - Some(CliVariant::Standard) => BenVariant::Standard, - Some(CliVariant::Mkvchain) => BenVariant::MkvChain, - Some(CliVariant::Twodelta) => BenVariant::TwoDelta, - None if save_all => BenVariant::Standard, - None => BenVariant::MkvChain, - } -} - -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -/// Defines the mode of operation. -enum Mode { - /// Encode JSONL into BEN. - Encode, - /// Encode JSONL or BEN into XBEN. - XEncode, - /// Decode BEN or XBEN into its less compressed representation. - Decode, - /// Fully decode XBEN into JSONL. - XDecode, - /// Read a single sample from a BEN file. - Read, - /// Compress an arbitrary stream with XZ. - XzCompress, - /// Decompress an `.xz` file. - XzDecompress, -} - -#[derive(Parser, Debug)] -#[command( - name = "Binary Ensemble CLI Tool", - about = "This is a command line tool for encoding and decoding binary ensemble files.", - version -)] -/// Defines the command line arguments accepted by the program. -struct Args { - /// Mode to run the program in (encode, decode, or read). - #[arg(short, long, value_enum)] - mode: Mode, - /// Input file to read from. - #[arg()] - input_file: Option, - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. - #[arg(short, long)] - output_file: Option, - /// The standard behaviour is to try and derive the output file - /// name from the input file name. If this flag is set, then this - /// logic is ignored and the output is printed to stdout. - /// This flag is considered a higher priority than - /// the output_file flag, so if both are present, the output - /// will be printed to stdout. - #[arg(short, long)] - print: bool, - /// Sample number to extract. Optional. - #[arg(short = 'n', long)] - sample_number: Option, - /// If input and output files are not provided, - /// then this tells the x-encode, x-decode, and decode modes - /// that the expected formats are BEN and XBEN - #[arg(short = 'b', long)] - ben_and_xben: bool, - /// If input and output files are not provided, - /// then this tells the x-encode and x-decode modes - /// that the expected formats are JSONL and XBEN - #[arg(short = 'J', long)] - jsonl_and_xben: bool, - /// If the input and output files are not provided, - /// then this tells the decode mode that the expected - /// formats are JSONL and BEN - #[arg(short = 'j', long)] - jsonl_and_ben: bool, - /// When saving a file in the BEN format, the deault is to have - /// an assignment vector saved followed by the number of repetitions - /// of that assignment vector (this is useful for Markov chian methods - /// like ReCom). This flag will cause the program to forgo the repetition - /// count and just save all of the assignment vectors as they are encountered. - /// Equivalent to `--variant standard`. Ignored if `--variant` is set. - #[arg(short = 'a', long)] - save_all: bool, - /// BEN variant to use when encoding. - /// Possible values: standard, mkvchain, twodelta. - /// Defaults to mkvchain if neither this nor --save-all is given. - /// Takes precedence over --save-all when both are provided. - #[arg(short = 't', long, value_enum)] - variant: Option, - /// If the output file already exists, this flag - /// will cause the program to overwrite it without - /// asking the user for confirmation. - #[arg(short = 'w', long)] - overwrite: bool, - /// Enables verbose printing for the CLI. Optional. - #[arg(short, long)] - verbose: bool, - /// When running x-encoder, this flag will determine the number of cpus to use on the - /// system. By default, all available cpus will be used. - #[arg(short = 'c', long)] - n_cpus: Option, - /// When running x-encoder, this flag will deterimine the level of compression to use. - /// By default, the highest level of compression will be used. - /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. - #[arg(short = 'l', long)] - compression_level: Option, - /// Number of TwoDelta delta frames per columnar chunk in XBEN encoding. - /// Only affects TwoDelta variant. Larger chunks improve XZ compression. - /// Default is 10,000. - #[arg(long)] - chunk_size: Option, - /// Embed a graph JSON asset alongside the assignment stream and emit - /// the result as a `.bendl` bundle. The graph is added after the - /// assignment stream has been fully written. Only applies to the - /// `encode` and `x-encode` modes. - #[arg(long)] - graph: Option, -} - -/// Derive the output path for encode-style CLI modes. -/// -/// # Arguments -/// -/// * `mode` - The encode-oriented CLI mode being executed. -/// * `input_file_name` - The input file path supplied by the user. -/// * `output_file_name` - An optional explicit output path. -/// * `overwrite` - Whether to skip overwrite prompting. -/// * `with_graph` - When true, the output is a `.bendl` bundle instead -/// of a bare `.ben`/`.xben` stream, so the derived extension is -/// `.bendl` regardless of `mode`. -/// -/// # Returns -/// -/// Returns the resolved output path. -fn encode_setup( - mode: Mode, - input_file_name: String, - output_file_name: Option, - overwrite: bool, - with_graph: bool, -) -> Result { - let extension = if with_graph { - ".bendl" - } else if mode == Mode::XEncode { - ".xben" - } else if mode == Mode::Encode { - ".ben" - } else { - ".xz" - }; - - let out_file_name = match output_file_name { - Some(name) => name.to_owned(), - None => { - let stripped_ben = input_file_name.ends_with(".ben") - && (extension == ".xben" || extension == ".bendl"); - let stripped_xben = input_file_name.ends_with(".xben") && extension == ".bendl"; - if stripped_ben { - input_file_name.trim_end_matches(".ben").to_owned() + extension - } else if stripped_xben { - input_file_name.trim_end_matches(".xben").to_owned() + extension - } else { - input_file_name.to_string() + extension - } - } - }; - - check_overwrite(&out_file_name, overwrite)?; - Ok(out_file_name) -} - -/// Derive the output path for decode-style CLI modes. -/// -/// # Arguments -/// -/// * `in_file_name` - The input file path supplied by the user. -/// * `out_file_name` - An optional explicit output path. -/// * `full_decode` - Whether the decode should go all the way to JSONL instead -/// of stopping at BEN. -/// * `overwrite` - Whether to skip overwrite prompting. -/// -/// # Returns -/// -/// Returns the resolved output path. -fn decode_setup( - in_file_name: String, - out_file_name: Option, - full_decode: bool, - overwrite: bool, -) -> Result { - let out_file_name = if let Some(name) = out_file_name { - name.to_owned() - } else if in_file_name.ends_with(".ben") { - in_file_name.trim_end_matches(".ben").to_owned() - } else if in_file_name.ends_with(".xben") { - if !full_decode { - in_file_name.trim_end_matches(".xben").to_owned() + ".ben" - } else { - in_file_name.trim_end_matches(".xben").to_owned() - } - } else if in_file_name.ends_with(".xz") { - eprintln!( - "Error: Unsupported file type for decode mode {:?}. Please decompress xz files with \ - either the xz command line tool or the xz-decompress mode of this tool.", - in_file_name - ); - return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); - } else { - eprintln!( - "Error: Unsupported file type for decode mode {:?}. Supported types are .ben and .xben.", - in_file_name - ); - return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); - }; - - check_overwrite(&out_file_name, overwrite)?; - Ok(out_file_name) -} - -/// Open either the requested input file or stdin. -/// -/// # Arguments -/// -/// * `input_file` - An optional input file path. -/// -/// # Returns -/// -/// Returns a buffered reader for the requested file or stdin. -fn open_reader(input_file: Option<&str>) -> DynReader { - match input_file { - Some(path) => Box::new(BufReader::new(File::open(path).unwrap())), - None => Box::new(BufReader::new(io::stdin())), - } -} - -/// Open either the requested output file or stdout. -/// -/// # Arguments -/// -/// * `output_file` - An optional output file path. -/// * `print` - Whether output should be forced to stdout. -/// * `overwrite` - Whether to skip overwrite prompting for file outputs. -/// -/// # Returns -/// -/// Returns a buffered writer for the requested file or stdout. -fn open_writer(output_file: Option<&str>, print: bool, overwrite: bool) -> Result { - if print { - return Ok(Box::new(BufWriter::new(io::stdout()))); - } - - match output_file { - Some(path) => { - check_overwrite(path, overwrite)?; - Ok(Box::new(BufWriter::new(File::create(path).unwrap()))) - } - None => Ok(Box::new(BufWriter::new(io::stdout()))), - } -} - -/// Open a writer for a path computed by one of the setup helpers. -/// -/// # Arguments -/// -/// * `path` - The output path to create. -/// -/// # Returns -/// -/// Returns a buffered writer for `path`. -fn open_derived_writer(path: String) -> DynWriter { - Box::new(BufWriter::new(File::create(path).unwrap())) -} - -/// Count the number of non-empty lines in a JSONL file. Used to populate -/// the bundle header's `sample_count` when wrapping a stream encode in a -/// `.bendl` container. -fn count_jsonl_lines(path: &Path) -> io::Result { - let file = File::open(path)?; - let reader = BufReader::new(file); - let mut n: i64 = 0; - for line in reader.lines() { - let line = line?; - if !line.is_empty() { - n += 1; - } - } - Ok(n) -} - -/// After a finalized `.bendl` has been written, reopen it in append mode -/// and attach the graph asset in-place. This runs *after* the stream has -/// finished, which is why we print "Adding graph..." at this point. -fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { - eprintln!("Adding graph..."); - let graph_bytes = std::fs::read(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("failed to read graph {graph_path:?}: {e}"), - ) - })?; - - let file = OpenOptions::new().read(true).write(true).open(out_path)?; - let mut appender = BendlAppender::open(file) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - appender - .add_asset( - ASSET_TYPE_GRAPH, - CANONICAL_NAME_GRAPH, - &graph_bytes, - AddAssetOptions::defaults().json(), - ) - .map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("failed to add graph asset: {e}"), - ) - })?; - appender - .commit() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - Ok(()) -} - -/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at -/// `out_path` and then append the graph as a post-stream asset. -fn run_encode_bundle_with_graph( - input_path: &Path, - out_path: &str, - variant: BenVariant, - graph_path: &Path, -) -> Result<()> { - // Validate the graph file is readable before we do any real work, - // so a bad --graph path doesn't leave a half-written bundle behind. - std::fs::metadata(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("failed to stat graph {graph_path:?}: {e}"), - ) - })?; - - let sample_count = count_jsonl_lines(input_path)?; - - let out_file = File::create(out_path)?; - let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - { - let mut handle = bendl_writer - .begin_stream() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - let reader = BufReader::new(File::open(input_path)?); - encode_jsonl_to_ben(reader, &mut handle, variant)?; - handle - .finish(sample_count) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - } - bendl_writer - .finish() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - - append_graph_asset(out_path, graph_path) -} - -/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` -/// bundle at `out_path` and then append the graph as a post-stream asset. -#[allow(clippy::too_many_arguments)] -fn run_xencode_bundle_with_graph( - input_path: &Path, - out_path: &str, - variant: BenVariant, - from_ben: bool, - n_threads: Option, - compression_level: Option, - chunk_size: Option, - graph_path: &Path, -) -> Result<()> { - std::fs::metadata(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("failed to stat graph {graph_path:?}: {e}"), - ) - })?; - - let sample_count: i64 = if from_ben { - count_samples_from_file(input_path, "ben")? as i64 - } else { - count_jsonl_lines(input_path)? - }; - - let out_file = File::create(out_path)?; - let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - { - let mut handle = bendl_writer - .begin_stream() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - let reader = BufReader::new(File::open(input_path)?); - if from_ben { - encode_ben_to_xben( - reader, - &mut handle, - n_threads, - compression_level, - chunk_size, - )?; - } else { - encode_jsonl_to_xben( - reader, - &mut handle, - variant, - n_threads, - compression_level, - chunk_size, - )?; - } - handle - .finish(sample_count) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - } - bendl_writer - .finish() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; - - append_graph_asset(out_path, graph_path) -} - -/// Parse CLI arguments and execute the selected `ben` sub-mode. -pub fn run() { - let args = Args::parse(); - set_verbose(args.verbose); - - // --graph is only meaningful for the stream-producing modes. - if args.graph.is_some() && args.mode != Mode::Encode && args.mode != Mode::XEncode { - eprintln!("Error: --graph is only supported with --mode encode or --mode x-encode"); - return; - } - - match args.mode { - Mode::Encode => { - tracing::trace!("Running in encode mode"); - - // --graph path: produce a .bendl bundle with the BEN stream - // plus a post-stream graph asset. - if let Some(graph_path) = args.graph.as_ref() { - let in_file = match args.input_file.as_ref() { - Some(f) => f, - None => { - eprintln!("Error: --graph requires an input file (stdin not supported)."); - return; - } - }; - if args.print { - eprintln!("Error: --graph is incompatible with --print."); - return; - } - let out_path = match encode_setup( - args.mode, - in_file.clone(), - args.output_file.clone(), - args.overwrite, - true, - ) { - Ok(path) => path, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = - run_encode_bundle_with_graph(Path::new(in_file), &out_path, variant, graph_path) - { - eprintln!("Error: {:?}", err); - } - return; - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(in_file) if !args.print => match encode_setup( - args.mode, - in_file.clone(), - args.output_file.clone(), - args.overwrite, - false, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = encode_jsonl_to_ben(reader, writer, variant) { - eprintln!("Error: {:?}", err); - } - } - Mode::XEncode => { - tracing::trace!("Running in xencode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_xben = args.jsonl_and_xben; - - if let Some(in_file) = args.input_file.as_ref() { - if in_file.ends_with(".ben") { - ben_and_xben = true; - } else if in_file.ends_with(".jsonl") { - jsonl_and_xben = true; - } - } - - // --graph path: produce a .bendl bundle with the XBEN stream - // plus a post-stream graph asset. - if let Some(graph_path) = args.graph.as_ref() { - let in_file = match args.input_file.as_ref() { - Some(f) => f, - None => { - eprintln!("Error: --graph requires an input file (stdin not supported)."); - return; - } - }; - if args.print { - eprintln!("Error: --graph is incompatible with --print."); - return; - } - if !ben_and_xben && !jsonl_and_xben { - eprintln!("Error: Unsupported file type(s) for xencode mode"); - return; - } - let out_path = match encode_setup( - args.mode, - in_file.clone(), - args.output_file.clone(), - args.overwrite, - true, - ) { - Ok(path) => path, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = run_xencode_bundle_with_graph( - Path::new(in_file), - &out_path, - variant, - ben_and_xben, - args.n_cpus, - args.compression_level, - args.chunk_size, - graph_path, - ) { - eprintln!("Error: {:?}", err); - } - return; - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(in_file) if !args.print => match encode_setup( - args.mode, - in_file.clone(), - args.output_file.clone(), - args.overwrite, - false, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if ben_and_xben { - if let Err(err) = encode_ben_to_xben( - reader, - writer, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_xben { - let variant = resolve_variant(args.variant, args.save_all); - if let Err(e) = encode_jsonl_to_xben( - reader, - writer, - variant, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) { - eprintln!("Error: {:?}", e); - } - } else { - eprintln!("Error: Unsupported file type(s) for xencode mode"); - } - } - Mode::Decode => { - tracing::trace!("Running in decode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_ben = args.jsonl_and_ben; - - if let Some(file) = args.input_file.as_ref() { - if file.ends_with(".ben") { - jsonl_and_ben = true; - } else if file.ends_with(".xben") { - ben_and_xben = true; - } - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(file) if !args.print => { - match decode_setup( - file.clone(), - args.output_file.clone(), - false, - args.overwrite, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - } - } - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if ben_and_xben { - if let Err(err) = decode_xben_to_ben(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_ben { - if let Err(err) = decode_ben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else { - eprintln!("Error: Unsupported file type(s) for decode mode"); - } - } - Mode::XDecode => { - tracing::trace!("Running in x-decode mode"); - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(file) if !args.print => { - match decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite) - { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - } - } - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if let Err(err) = decode_xben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } - Mode::Read => { - tracing::trace!("Running in read mode"); - let reader = BufReader::new( - File::open( - &args - .input_file - .expect("Must provide input file for read mode."), - ) - .unwrap(), - ); - - if args.sample_number.is_none() { - eprintln!("Error: Sample number is required in read mode"); - return; - } - - let mut writer = match open_writer(args.output_file.as_deref(), args.print, false) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - - args.sample_number - .map(|n| match extract_assignment_ben(reader, n) { - Ok(vec) => writer.write_all(format!("{:?}\n", vec).as_bytes()).unwrap(), - Err(e) => eprintln!("Error: {:?}", e), - }); - } - Mode::XzCompress => { - tracing::trace!("Running in xz compress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-compress mode."); - let reader = BufReader::new(File::open(&in_file_name).unwrap()); - - let out_file_name = match args.output_file { - Some(name) => name, - None => in_file_name + ".xz", - }; - - if let Err(err) = check_overwrite(&out_file_name, args.overwrite) { - eprintln!("Error: {:?}", err); - return; - } - - let writer = BufWriter::new(File::create(out_file_name).unwrap()); - - if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { - eprintln!("Error: {:?}", err); - } - tracing::trace!("Done!"); - } - Mode::XzDecompress => { - tracing::trace!("Running in xz decompress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-decompress mode."); - - if !in_file_name.ends_with(".xz") { - eprintln!("Error: Unsupported file type for xz decompress mode"); - return; - } - - let output_file_name = match args.output_file { - Some(name) => name, - None => in_file_name[..in_file_name.len() - 3].to_string(), - }; - - if let Err(err) = check_overwrite(&output_file_name, args.overwrite) { - eprintln!("Error: {:?}", err); - return; - } - - let reader = BufReader::new(File::open(&in_file_name).unwrap()); - let writer = BufWriter::new(File::create(output_file_name).unwrap()); - - if let Err(err) = xz_decompress(reader, writer) { - eprintln!("Error: {:?}", err); - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use clap::{CommandFactory, Parser}; - use std::fs; - use std::time::{SystemTime, UNIX_EPOCH}; - - fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("ben-cli-ben-{name}-{nonce}")) - } - - #[test] - fn clap_metadata_uses_package_version() { - let mut command = Args::command(); - let help = command.render_long_help().to_string(); - - assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); - assert!(help.contains("Binary Ensemble CLI Tool")); - assert!(help.contains("--mode")); - assert!(help.contains("x-encode")); - } - - #[test] - fn parse_encode_args() { - let args = Args::try_parse_from([ - "ben", - "--mode", - "encode", - "--output-file", - "out.ben", - "--save-all", - "--verbose", - "input.jsonl", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::Encode); - assert_eq!(args.input_file.as_deref(), Some("input.jsonl")); - assert_eq!(args.output_file.as_deref(), Some("out.ben")); - assert!(args.save_all); - assert!(args.verbose); - } - - #[test] - fn parse_variant_flag() { - let args = Args::try_parse_from([ - "ben", - "--mode", - "encode", - "--variant", - "twodelta", - "input.jsonl", - ]) - .unwrap(); - - assert_eq!(args.variant, Some(CliVariant::Twodelta)); - } - - #[test] - fn parse_variant_aliases() { - let args = Args::try_parse_from([ - "ben", - "--mode", - "encode", - "--variant", - "mkv_chain", - "input.jsonl", - ]) - .unwrap(); - assert_eq!(args.variant, Some(CliVariant::Mkvchain)); - - let args = Args::try_parse_from([ - "ben", - "--mode", - "encode", - "--variant", - "two_delta", - "input.jsonl", - ]) - .unwrap(); - assert_eq!(args.variant, Some(CliVariant::Twodelta)); - } - - #[test] - fn resolve_variant_precedence() { - // --variant takes precedence over --save-all - assert_eq!( - resolve_variant(Some(CliVariant::Twodelta), true), - BenVariant::TwoDelta - ); - assert_eq!( - resolve_variant(Some(CliVariant::Mkvchain), true), - BenVariant::MkvChain - ); - // --save-all alone means Standard - assert_eq!(resolve_variant(None, true), BenVariant::Standard); - // neither means MkvChain - assert_eq!(resolve_variant(None, false), BenVariant::MkvChain); - } - - #[test] - fn parse_xencode_stream_flags() { - let args = Args::try_parse_from([ - "ben", - "--mode", - "x-encode", - "--jsonl-and-xben", - "--ben-and-xben", - "--jsonl-and-ben", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::XEncode); - assert!(args.jsonl_and_xben); - assert!(args.ben_and_xben); - assert!(args.jsonl_and_ben); - } - - #[test] - fn encode_setup_derives_extensions() { - assert_eq!( - encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, false).unwrap(), - "samples.jsonl.ben" - ); - assert_eq!( - encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, false).unwrap(), - "samples.xben" - ); - assert_eq!( - encode_setup( - Mode::XzCompress, - "samples.jsonl".to_string(), - None, - true, - false - ) - .unwrap(), - "samples.jsonl.xz" - ); - } - - #[test] - fn encode_setup_with_graph_derives_bendl_extension() { - // JSONL + encode + graph → .bendl - assert_eq!( - encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, true).unwrap(), - "samples.jsonl.bendl" - ); - // .ben input to x-encode with graph trims the .ben suffix - assert_eq!( - encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, true).unwrap(), - "samples.bendl" - ); - // .xben input to x-encode with graph trims the .xben suffix - assert_eq!( - encode_setup(Mode::XEncode, "samples.xben".to_string(), None, true, true).unwrap(), - "samples.bendl" - ); - } - - #[test] - fn encode_setup_respects_explicit_output() { - assert_eq!( - encode_setup( - Mode::Encode, - "ignored.jsonl".to_string(), - Some("custom-output.ben".to_string()), - true, - false, - ) - .unwrap(), - "custom-output.ben" - ); - } - - #[test] - fn encode_setup_checks_overwrite() { - let path = unique_path("existing.ben"); - fs::write(&path, "already here").unwrap(); - - let err = encode_setup( - Mode::Encode, - "input.jsonl".to_string(), - Some(path.to_string_lossy().into_owned()), - true, - false, - ); - assert!(err.is_ok()); - - fs::remove_file(path).unwrap(); - } - - #[test] - fn decode_setup_derives_ben_and_xben_outputs() { - assert_eq!( - decode_setup("samples.ben".to_string(), None, false, true).unwrap(), - "samples" - ); - assert_eq!( - decode_setup("samples.xben".to_string(), None, false, true).unwrap(), - "samples.ben" - ); - assert_eq!( - decode_setup("samples.xben".to_string(), None, true, true).unwrap(), - "samples" - ); - } - - #[test] - fn decode_setup_rejects_xz_input() { - let err = decode_setup("samples.xz".to_string(), None, false, true).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - } - - #[test] - fn decode_setup_rejects_unknown_input() { - let err = decode_setup("samples.data".to_string(), None, false, true).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - } - - #[test] - fn decode_setup_respects_explicit_output() { - assert_eq!( - decode_setup( - "samples.xben".to_string(), - Some("custom.jsonl".to_string()), - true, - true, - ) - .unwrap(), - "custom.jsonl" - ); - } - - #[test] - fn open_reader_reads_file_contents() { - let path = unique_path("reader.txt"); - fs::write(&path, "hello\nworld\n").unwrap(); - - let mut reader = open_reader(Some(path.to_str().unwrap())); - let mut content = String::new(); - std::io::Read::read_to_string(&mut reader, &mut content).unwrap(); - - assert_eq!(content, "hello\nworld\n"); - fs::remove_file(path).unwrap(); - } - - #[test] - fn open_reader_accepts_stdin() { - let _reader = open_reader(None); - } - - #[test] - fn open_writer_creates_file_and_writes() { - let path = unique_path("writer.txt"); - { - let mut writer = open_writer(Some(path.to_str().unwrap()), false, true).unwrap(); - writer.write_all(b"written").unwrap(); - } - - assert_eq!(fs::read_to_string(&path).unwrap(), "written"); - fs::remove_file(path).unwrap(); - } - - #[test] - fn open_writer_supports_stdout_and_print() { - let mut stdout_writer = open_writer(None, false, true).unwrap(); - stdout_writer.write_all(b"").unwrap(); - - let mut print_writer = open_writer(Some("ignored.txt"), true, false).unwrap(); - print_writer.write_all(b"").unwrap(); - } - - #[test] - fn open_derived_writer_creates_file() { - let path = unique_path("derived.txt"); - { - let mut writer = open_derived_writer(path.to_string_lossy().into_owned()); - writer.write_all(b"derived").unwrap(); - } - - assert_eq!(fs::read_to_string(&path).unwrap(), "derived"); - fs::remove_file(path).unwrap(); - } - - #[test] - fn resolve_variant_standard_arm() { - assert_eq!( - resolve_variant(Some(CliVariant::Standard), false), - BenVariant::Standard - ); - } - - #[test] - fn count_jsonl_lines_counts_nonempty_lines() { - let path = unique_path("count.jsonl"); - fs::write(&path, b"{\"a\":1}\n\n{\"b\":2}\n").unwrap(); - let count = count_jsonl_lines(&path).unwrap(); - assert_eq!(count, 2); - fs::remove_file(path).unwrap(); - } - - /// Write a two-sample Standard BEN JSONL file to a temp path. - fn write_temp_jsonl(name: &str) -> std::path::PathBuf { - let path = unique_path(name); - fs::write( - &path, - b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n", - ) - .unwrap(); - path - } - - /// Write a minimal graph JSON file to a temp path. - fn write_temp_graph(name: &str) -> std::path::PathBuf { - let path = unique_path(name); - fs::write(&path, b"{\"nodes\":[0,1,2],\"adj\":[[1],[0,2],[1]]}").unwrap(); - path - } - - #[test] - fn append_graph_asset_adds_graph_to_bundle() { - use crate::io::bundle::{BendlReader, BendlWriter}; - use crate::io::bundle::format::AssignmentFormat; - use std::io::Cursor; - - // Build a minimal finalized .bendl in memory, write to temp file. - let mut buf: Vec = Vec::new(); - { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); - writer.finish().unwrap(); - } - let bendl_path = unique_path("append_graph.bendl"); - fs::write(&bendl_path, &buf).unwrap(); - - let graph_path = write_temp_graph("append_graph.json"); - - append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap(); - - // Verify the graph asset was added. - let file = fs::File::open(&bendl_path).unwrap(); - let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.find_asset_by_name("graph.json").is_some()); - - fs::remove_file(&bendl_path).unwrap(); - fs::remove_file(&graph_path).unwrap(); - } - - #[test] - fn run_encode_bundle_with_graph_creates_bendl() { - use crate::io::bundle::BendlReader; - - let jsonl = write_temp_jsonl("enc_graph_input.jsonl"); - let graph = write_temp_graph("enc_graph.json"); - let out = unique_path("enc_graph_output.bendl"); - - run_encode_bundle_with_graph(&jsonl, out.to_str().unwrap(), BenVariant::Standard, &graph) - .unwrap(); - - let file = fs::File::open(&out).unwrap(); - let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - assert_eq!(reader.sample_count(), Some(2)); - - fs::remove_file(&jsonl).unwrap(); - fs::remove_file(&graph).unwrap(); - fs::remove_file(&out).unwrap(); - } - - #[test] - fn run_xencode_bundle_with_graph_from_jsonl_creates_bendl() { - use crate::io::bundle::BendlReader; - - let jsonl = write_temp_jsonl("xencode_graph_input.jsonl"); - let graph = write_temp_graph("xencode_graph.json"); - let out = unique_path("xencode_graph_output.bendl"); - - run_xencode_bundle_with_graph( - &jsonl, - out.to_str().unwrap(), - BenVariant::Standard, - false, - None, - None, - None, - &graph, - ) - .unwrap(); - - let file = fs::File::open(&out).unwrap(); - let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - - fs::remove_file(&jsonl).unwrap(); - fs::remove_file(&graph).unwrap(); - fs::remove_file(&out).unwrap(); - } - - #[test] - fn run_xencode_bundle_with_graph_from_ben_creates_bendl() { - use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::bundle::BendlReader; - use std::io::Cursor; - - // First create a BEN file from JSONL. - let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n{\"assignment\":[2,1],\"sample\":2}\n"; - let mut ben_bytes = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben_bytes, BenVariant::Standard).unwrap(); - let ben_path = unique_path("xencode_from_ben_input.ben"); - fs::write(&ben_path, &ben_bytes).unwrap(); - - let graph = write_temp_graph("xencode_from_ben_graph.json"); - let out = unique_path("xencode_from_ben_output.bendl"); - - run_xencode_bundle_with_graph( - &ben_path, - out.to_str().unwrap(), - BenVariant::Standard, - true, - None, - None, - None, - &graph, - ) - .unwrap(); - - let file = fs::File::open(&out).unwrap(); - let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - - fs::remove_file(&ben_path).unwrap(); - fs::remove_file(&graph).unwrap(); - fs::remove_file(&out).unwrap(); - } - - #[test] - fn append_graph_asset_errors_on_missing_graph_file() { - use crate::io::bundle::{BendlWriter}; - use crate::io::bundle::format::AssignmentFormat; - use std::io::Cursor; - - let mut buf: Vec = Vec::new(); - { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); - writer.finish().unwrap(); - } - let bendl_path = unique_path("err_graph.bendl"); - fs::write(&bendl_path, &buf).unwrap(); - - let nonexistent = unique_path("nonexistent.json"); - let err = append_graph_asset(bendl_path.to_str().unwrap(), &nonexistent).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::Other); - assert!(err.to_string().contains("failed to read graph")); - let _ = fs::remove_file(&bendl_path); - } - - #[test] - fn run_encode_bundle_with_graph_errors_on_missing_graph() { - let jsonl = write_temp_jsonl("err_enc_input.jsonl"); - let out = unique_path("err_enc_output.bendl"); - let nonexistent = unique_path("nonexistent.json"); - - let err = run_encode_bundle_with_graph( - &jsonl, out.to_str().unwrap(), BenVariant::Standard, &nonexistent, - ) - .unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::Other); - assert!(err.to_string().contains("failed to stat graph")); - let _ = fs::remove_file(&jsonl); - let _ = fs::remove_file(&out); - } - - #[test] - fn run_xencode_bundle_with_graph_errors_on_missing_graph() { - let jsonl = write_temp_jsonl("err_xenc_input.jsonl"); - let out = unique_path("err_xenc_output.bendl"); - let nonexistent = unique_path("nonexistent.json"); - - let err = run_xencode_bundle_with_graph( - &jsonl, out.to_str().unwrap(), BenVariant::Standard, false, - None, None, None, &nonexistent, - ) - .unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::Other); - assert!(err.to_string().contains("failed to stat graph")); - let _ = fs::remove_file(&jsonl); - let _ = fs::remove_file(&out); - } - - #[test] - fn append_graph_asset_errors_when_bundle_already_has_graph() { - use crate::io::bundle::{AddAssetOptions, BendlWriter}; - use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH}; - use std::io::Cursor; - - // Build a .bendl that already contains graph.json. - let mut buf: Vec = Vec::new(); - { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer - .add_asset(ASSET_TYPE_GRAPH, "graph.json", b"{}", AddAssetOptions::defaults().json()) - .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); - writer.finish().unwrap(); - } - let bendl_path = unique_path("dup_graph.bendl"); - fs::write(&bendl_path, &buf).unwrap(); - - // graph.json already exists — add_asset must fail with duplicate name. - let graph_path = write_temp_graph("dup_graph.json"); - let err = append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::Other); - assert!(err.to_string().contains("failed to add graph asset")); - - let _ = fs::remove_file(&bendl_path); - let _ = fs::remove_file(&graph_path); - } - - #[test] - fn run_xencode_bundle_with_graph_errors_on_invalid_jsonl() { - // from_ben=false path: encode_jsonl_to_xben fails on invalid JSONL. - let bad_jsonl = unique_path("bad.jsonl"); - fs::write(&bad_jsonl, b"not valid json\n").unwrap(); - let graph = write_temp_graph("xenc_bad_jsonl_graph.json"); - let out = unique_path("xenc_bad_jsonl.bendl"); - - let err = run_xencode_bundle_with_graph( - &bad_jsonl, out.to_str().unwrap(), BenVariant::Standard, false, - None, None, None, &graph, - ) - .unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidData); - - let _ = fs::remove_file(&bad_jsonl); - let _ = fs::remove_file(&graph); - let _ = fs::remove_file(&out); - } - - #[test] - fn run_xencode_bundle_with_graph_errors_on_invalid_ben() { - // from_ben=true path: encode_ben_to_xben fails on a file with no BEN banner. - let bad_ben = unique_path("bad.ben"); - fs::write(&bad_ben, b"this is not a ben file").unwrap(); - let graph = write_temp_graph("xenc_bad_ben_graph.json"); - let out = unique_path("xenc_bad_ben.bendl"); - - let err = run_xencode_bundle_with_graph( - &bad_ben, out.to_str().unwrap(), BenVariant::Standard, true, - None, None, None, &graph, - ) - .unwrap_err(); - // encode_ben_to_xben fails when it can't read a valid banner. - assert!(err.kind() != io::ErrorKind::NotFound); - - let _ = fs::remove_file(&bad_ben); - let _ = fs::remove_file(&graph); - let _ = fs::remove_file(&out); - } -} diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs new file mode 100644 index 0000000..8594f84 --- /dev/null +++ b/ben/src/cli/ben/args.rs @@ -0,0 +1,137 @@ +use crate::BenVariant; +use clap::{Parser, ValueEnum}; +use std::path::PathBuf; + +#[derive(Debug, Clone, Copy, ValueEnum, PartialEq)] +pub(super) enum CliVariant { + /// Store each sample independently. + Standard, + /// Store one frame plus a repetition count for repeated consecutive samples. + #[value(alias = "mkv_chain")] + Mkvchain, + /// Store delta-encoded frames. + #[value(alias = "two_delta")] + Twodelta, +} + +/// Resolve the BEN variant from the CLI flags. +/// +/// `--variant` takes precedence over `--save-all`. +/// If neither is given, defaults to MkvChain. +pub(super) fn resolve_variant(variant: Option, save_all: bool) -> BenVariant { + match variant { + Some(CliVariant::Standard) => BenVariant::Standard, + Some(CliVariant::Mkvchain) => BenVariant::MkvChain, + Some(CliVariant::Twodelta) => BenVariant::TwoDelta, + None if save_all => BenVariant::Standard, + None => BenVariant::MkvChain, + } +} + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +pub(super) enum Mode { + /// Encode JSONL into BEN. + Encode, + /// Encode JSONL or BEN into XBEN. + XEncode, + /// Decode BEN or XBEN into its less compressed representation. + Decode, + /// Fully decode XBEN into JSONL. + XDecode, + /// Read a single sample from a BEN file. + Read, + /// Compress an arbitrary stream with XZ. + XzCompress, + /// Decompress an `.xz` file. + XzDecompress, +} + +#[derive(Parser, Debug)] +#[command( + name = "Binary Ensemble CLI Tool", + about = "This is a command line tool for encoding and decoding binary ensemble files.", + version +)] +/// Defines the command line arguments accepted by the program. +pub(super) struct Args { + /// Mode to run the program in (encode, decode, or read). + #[arg(short, long, value_enum)] + pub mode: Mode, + /// Input file to read from. + #[arg()] + pub input_file: Option, + /// Output file to write to. Optional. + /// If not provided, the output file will be determined + /// based on the input file and the mode of operation. + #[arg(short, long)] + pub output_file: Option, + /// The standard behaviour is to try and derive the output file + /// name from the input file name. If this flag is set, then this + /// logic is ignored and the output is printed to stdout. + /// This flag is considered a higher priority than + /// the output_file flag, so if both are present, the output + /// will be printed to stdout. + #[arg(short, long)] + pub print: bool, + /// Sample number to extract. Optional. + #[arg(short = 'n', long)] + pub sample_number: Option, + /// If input and output files are not provided, + /// then this tells the x-encode, x-decode, and decode modes + /// that the expected formats are BEN and XBEN + #[arg(short = 'b', long)] + pub ben_and_xben: bool, + /// If input and output files are not provided, + /// then this tells the x-encode and x-decode modes + /// that the expected formats are JSONL and XBEN + #[arg(short = 'J', long)] + pub jsonl_and_xben: bool, + /// If the input and output files are not provided, + /// then this tells the decode mode that the expected + /// formats are JSONL and BEN + #[arg(short = 'j', long)] + pub jsonl_and_ben: bool, + /// When saving a file in the BEN format, the deault is to have + /// an assignment vector saved followed by the number of repetitions + /// of that assignment vector (this is useful for Markov chian methods + /// like ReCom). This flag will cause the program to forgo the repetition + /// count and just save all of the assignment vectors as they are encountered. + /// Equivalent to `--variant standard`. Ignored if `--variant` is set. + #[arg(short = 'a', long)] + pub save_all: bool, + /// BEN variant to use when encoding. + /// Possible values: standard, mkvchain, twodelta. + /// Defaults to mkvchain if neither this nor --save-all is given. + /// Takes precedence over --save-all when both are provided. + #[arg(short = 't', long, value_enum)] + pub variant: Option, + /// If the output file already exists, this flag + /// will cause the program to overwrite it without + /// asking the user for confirmation. + #[arg(short = 'w', long)] + pub overwrite: bool, + /// Enables verbose printing for the CLI. Optional. + #[arg(short, long)] + pub verbose: bool, + /// When running x-encoder, this flag will determine the number of cpus to use on the + /// system. By default, all available cpus will be used. + #[arg(short = 'c', long)] + pub n_cpus: Option, + /// When running x-encoder, this flag will deterimine the level of compression to use. + /// By default, the highest level of compression will be used. + /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. + #[arg(short = 'l', long)] + pub compression_level: Option, + /// Number of TwoDelta delta frames per columnar chunk in XBEN encoding. + /// Only affects TwoDelta variant. Larger chunks improve XZ compression. + /// Default is 10,000. + #[arg(long)] + pub chunk_size: Option, + /// Embed a graph JSON asset alongside the assignment stream and emit + /// the result as a `.bendl` bundle. The graph is added after the + /// assignment stream has been fully written. Only applies to the + /// `encode` and `x-encode` modes. + #[arg(long)] + pub graph: Option, +} diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs new file mode 100644 index 0000000..83644f7 --- /dev/null +++ b/ben/src/cli/ben/bundle.rs @@ -0,0 +1,146 @@ +use super::paths::count_jsonl_lines; +use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; +use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH}; +use crate::io::bundle::writer::BendlAppender; +use crate::io::bundle::{AddAssetOptions, BendlWriter}; +use crate::io::reader::subsample::count_samples_from_file; +use crate::BenVariant; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, Result}; +use std::path::Path; + +/// After a finalized `.bendl` has been written, reopen it in append mode +/// and attach the graph asset in-place. This runs *after* the stream has +/// finished, which is why we print "Adding graph..." at this point. +pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { + eprintln!("Adding graph..."); + let graph_bytes = std::fs::read(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to read graph {graph_path:?}: {e}"), + ) + })?; + + let file = OpenOptions::new().read(true).write(true).open(out_path)?; + let mut appender = BendlAppender::open(file) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + appender + .add_asset( + ASSET_TYPE_GRAPH, + CANONICAL_NAME_GRAPH, + &graph_bytes, + AddAssetOptions::defaults().json(), + ) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to add graph asset: {e}"), + ) + })?; + appender + .commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + Ok(()) +} + +/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at +/// `out_path` and then append the graph as a post-stream asset. +pub(super) fn run_encode_bundle_with_graph( + input_path: &Path, + out_path: &str, + variant: BenVariant, + graph_path: &Path, +) -> Result<()> { + // Validate the graph file is readable before we do any real work, + // so a bad --graph path doesn't leave a half-written bundle behind. + std::fs::metadata(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to stat graph {graph_path:?}: {e}"), + ) + })?; + + let sample_count = count_jsonl_lines(input_path)?; + + let out_file = File::create(out_path)?; + let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + { + let mut handle = bendl_writer + .begin_stream() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let reader = BufReader::new(File::open(input_path)?); + encode_jsonl_to_ben(reader, &mut handle, variant)?; + handle + .finish(sample_count) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + } + bendl_writer + .finish() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + + append_graph_asset(out_path, graph_path) +} + +/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` +/// bundle at `out_path` and then append the graph as a post-stream asset. +#[allow(clippy::too_many_arguments)] +pub(super) fn run_xencode_bundle_with_graph( + input_path: &Path, + out_path: &str, + variant: BenVariant, + from_ben: bool, + n_threads: Option, + compression_level: Option, + chunk_size: Option, + graph_path: &Path, +) -> Result<()> { + std::fs::metadata(graph_path).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to stat graph {graph_path:?}: {e}"), + ) + })?; + + let sample_count: i64 = if from_ben { + count_samples_from_file(input_path, "ben")? as i64 + } else { + count_jsonl_lines(input_path)? + }; + + let out_file = File::create(out_path)?; + let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + { + let mut handle = bendl_writer + .begin_stream() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let reader = BufReader::new(File::open(input_path)?); + if from_ben { + encode_ben_to_xben( + reader, + &mut handle, + n_threads, + compression_level, + chunk_size, + )?; + } else { + encode_jsonl_to_xben( + reader, + &mut handle, + variant, + n_threads, + compression_level, + chunk_size, + )?; + } + handle + .finish(sample_count) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + } + bendl_writer + .finish() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + + append_graph_asset(out_path, graph_path) +} diff --git a/ben/src/cli/ben/mod.rs b/ben/src/cli/ben/mod.rs new file mode 100644 index 0000000..0a350cf --- /dev/null +++ b/ben/src/cli/ben/mod.rs @@ -0,0 +1,382 @@ +//! `ben` CLI: encode, decode, and stream-compress BEN/XBEN files. + +mod args; +mod bundle; +mod paths; + +#[cfg(test)] +mod tests; + +use args::{resolve_variant, Args, Mode}; +use bundle::{run_encode_bundle_with_graph, run_xencode_bundle_with_graph}; +use paths::{decode_setup, encode_setup, open_derived_writer, open_reader, open_writer}; + +use crate::cli::common::{check_overwrite, set_verbose}; +use crate::codec::decode::{ + decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, +}; +use crate::codec::encode::{ + encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, +}; +use crate::ops::extract::extract_assignment_ben; +use clap::Parser; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; +use std::path::Path; + +/// Parse CLI arguments and execute the selected `ben` sub-mode. +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + // --graph is only meaningful for the stream-producing modes. + if args.graph.is_some() && args.mode != Mode::Encode && args.mode != Mode::XEncode { + eprintln!("Error: --graph is only supported with --mode encode or --mode x-encode"); + return; + } + + match args.mode { + Mode::Encode => { + tracing::trace!("Running in encode mode"); + + // --graph path: produce a .bendl bundle with the BEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = match args.input_file.as_ref() { + Some(f) => f, + None => { + eprintln!("Error: --graph requires an input file (stdin not supported)."); + return; + } + }; + if args.print { + eprintln!("Error: --graph is incompatible with --print."); + return; + } + let out_path = match encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + ) { + Ok(path) => path, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = + run_encode_bundle_with_graph(Path::new(in_file), &out_path, variant, graph_path) + { + eprintln!("Error: {:?}", err); + } + return; + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => match encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + false, + ) { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = encode_jsonl_to_ben(reader, writer, variant) { + eprintln!("Error: {:?}", err); + } + } + Mode::XEncode => { + tracing::trace!("Running in xencode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_xben = args.jsonl_and_xben; + + if let Some(in_file) = args.input_file.as_ref() { + if in_file.ends_with(".ben") { + ben_and_xben = true; + } else if in_file.ends_with(".jsonl") { + jsonl_and_xben = true; + } + } + + // --graph path: produce a .bendl bundle with the XBEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = match args.input_file.as_ref() { + Some(f) => f, + None => { + eprintln!("Error: --graph requires an input file (stdin not supported)."); + return; + } + }; + if args.print { + eprintln!("Error: --graph is incompatible with --print."); + return; + } + if !ben_and_xben && !jsonl_and_xben { + eprintln!("Error: Unsupported file type(s) for xencode mode"); + return; + } + let out_path = match encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + ) { + Ok(path) => path, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + let variant = resolve_variant(args.variant, args.save_all); + if let Err(err) = run_xencode_bundle_with_graph( + Path::new(in_file), + &out_path, + variant, + ben_and_xben, + args.n_cpus, + args.compression_level, + args.chunk_size, + graph_path, + ) { + eprintln!("Error: {:?}", err); + } + return; + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => match encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + false, + ) { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if ben_and_xben { + if let Err(err) = encode_ben_to_xben( + reader, + writer, + args.n_cpus, + args.compression_level, + args.chunk_size, + ) { + eprintln!("Error: {:?}", err); + } + } else if jsonl_and_xben { + let variant = resolve_variant(args.variant, args.save_all); + if let Err(e) = encode_jsonl_to_xben( + reader, + writer, + variant, + args.n_cpus, + args.compression_level, + args.chunk_size, + ) { + eprintln!("Error: {:?}", e); + } + } else { + eprintln!("Error: Unsupported file type(s) for xencode mode"); + } + } + Mode::Decode => { + tracing::trace!("Running in decode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_ben = args.jsonl_and_ben; + + if let Some(file) = args.input_file.as_ref() { + if file.ends_with(".ben") { + jsonl_and_ben = true; + } else if file.ends_with(".xben") { + ben_and_xben = true; + } + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + match decode_setup( + file.clone(), + args.output_file.clone(), + false, + args.overwrite, + ) { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + } + } + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if ben_and_xben { + if let Err(err) = decode_xben_to_ben(reader, writer) { + eprintln!("Error: {:?}", err); + } + } else if jsonl_and_ben { + if let Err(err) = decode_ben_to_jsonl(reader, writer) { + eprintln!("Error: {:?}", err); + } + } else { + eprintln!("Error: Unsupported file type(s) for decode mode"); + } + } + Mode::XDecode => { + tracing::trace!("Running in x-decode mode"); + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + match decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite) + { + Ok(path) => open_derived_writer(path), + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + } + } + _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }, + }; + + if let Err(err) = decode_xben_to_jsonl(reader, writer) { + eprintln!("Error: {:?}", err); + } + } + Mode::Read => { + tracing::trace!("Running in read mode"); + let reader = BufReader::new( + File::open( + &args + .input_file + .expect("Must provide input file for read mode."), + ) + .unwrap(), + ); + + if args.sample_number.is_none() { + eprintln!("Error: Sample number is required in read mode"); + return; + } + + let mut writer = match open_writer(args.output_file.as_deref(), args.print, false) { + Ok(writer) => writer, + Err(err) => { + eprintln!("Error: {:?}", err); + return; + } + }; + + args.sample_number + .map(|n| match extract_assignment_ben(reader, n) { + Ok(vec) => writer.write_all(format!("{:?}\n", vec).as_bytes()).unwrap(), + Err(e) => eprintln!("Error: {:?}", e), + }); + } + Mode::XzCompress => { + tracing::trace!("Running in xz compress mode"); + + let in_file_name = args + .input_file + .expect("Must provide input file for xz-compress mode."); + let reader = BufReader::new(File::open(&in_file_name).unwrap()); + + let out_file_name = match args.output_file { + Some(name) => name, + None => in_file_name + ".xz", + }; + + if let Err(err) = check_overwrite(&out_file_name, args.overwrite) { + eprintln!("Error: {:?}", err); + return; + } + + let writer = BufWriter::new(File::create(out_file_name).unwrap()); + + if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { + eprintln!("Error: {:?}", err); + } + tracing::trace!("Done!"); + } + Mode::XzDecompress => { + tracing::trace!("Running in xz decompress mode"); + + let in_file_name = args + .input_file + .expect("Must provide input file for xz-decompress mode."); + + if !in_file_name.ends_with(".xz") { + eprintln!("Error: Unsupported file type for xz decompress mode"); + return; + } + + let output_file_name = match args.output_file { + Some(name) => name, + None => in_file_name[..in_file_name.len() - 3].to_string(), + }; + + if let Err(err) = check_overwrite(&output_file_name, args.overwrite) { + eprintln!("Error: {:?}", err); + return; + } + + let reader = BufReader::new(File::open(&in_file_name).unwrap()); + let writer = BufWriter::new(File::create(output_file_name).unwrap()); + + if let Err(err) = xz_decompress(reader, writer) { + eprintln!("Error: {:?}", err); + } + } + } +} diff --git a/ben/src/cli/ben/paths.rs b/ben/src/cli/ben/paths.rs new file mode 100644 index 0000000..85a2ff7 --- /dev/null +++ b/ben/src/cli/ben/paths.rs @@ -0,0 +1,182 @@ +use super::args::Mode; +use crate::cli::common::check_overwrite; +use std::fs::File; +use std::io::{self, BufRead, BufReader, BufWriter, Result, Write}; +use std::path::Path; + +pub(super) type DynReader = Box; +pub(super) type DynWriter = Box; + +/// Derive the output path for encode-style CLI modes. +/// +/// # Arguments +/// +/// * `mode` - The encode-oriented CLI mode being executed. +/// * `input_file_name` - The input file path supplied by the user. +/// * `output_file_name` - An optional explicit output path. +/// * `overwrite` - Whether to skip overwrite prompting. +/// * `with_graph` - When true, the output is a `.bendl` bundle instead +/// of a bare `.ben`/`.xben` stream, so the derived extension is +/// `.bendl` regardless of `mode`. +/// +/// # Returns +/// +/// Returns the resolved output path. +pub(super) fn encode_setup( + mode: Mode, + input_file_name: String, + output_file_name: Option, + overwrite: bool, + with_graph: bool, +) -> Result { + let extension = if with_graph { + ".bendl" + } else if mode == Mode::XEncode { + ".xben" + } else if mode == Mode::Encode { + ".ben" + } else { + ".xz" + }; + + let out_file_name = match output_file_name { + Some(name) => name.to_owned(), + None => { + let stripped_ben = input_file_name.ends_with(".ben") + && (extension == ".xben" || extension == ".bendl"); + let stripped_xben = input_file_name.ends_with(".xben") && extension == ".bendl"; + if stripped_ben { + input_file_name.trim_end_matches(".ben").to_owned() + extension + } else if stripped_xben { + input_file_name.trim_end_matches(".xben").to_owned() + extension + } else { + input_file_name.to_string() + extension + } + } + }; + + check_overwrite(&out_file_name, overwrite)?; + Ok(out_file_name) +} + +/// Derive the output path for decode-style CLI modes. +/// +/// # Arguments +/// +/// * `in_file_name` - The input file path supplied by the user. +/// * `out_file_name` - An optional explicit output path. +/// * `full_decode` - Whether the decode should go all the way to JSONL instead +/// of stopping at BEN. +/// * `overwrite` - Whether to skip overwrite prompting. +/// +/// # Returns +/// +/// Returns the resolved output path. +pub(super) fn decode_setup( + in_file_name: String, + out_file_name: Option, + full_decode: bool, + overwrite: bool, +) -> Result { + let out_file_name = if let Some(name) = out_file_name { + name.to_owned() + } else if in_file_name.ends_with(".ben") { + in_file_name.trim_end_matches(".ben").to_owned() + } else if in_file_name.ends_with(".xben") { + if !full_decode { + in_file_name.trim_end_matches(".xben").to_owned() + ".ben" + } else { + in_file_name.trim_end_matches(".xben").to_owned() + } + } else if in_file_name.ends_with(".xz") { + eprintln!( + "Error: Unsupported file type for decode mode {:?}. Please decompress xz files with \ + either the xz command line tool or the xz-decompress mode of this tool.", + in_file_name + ); + return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); + } else { + eprintln!( + "Error: Unsupported file type for decode mode {:?}. Supported types are .ben and .xben.", + in_file_name + ); + return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)); + }; + + check_overwrite(&out_file_name, overwrite)?; + Ok(out_file_name) +} + +/// Open either the requested input file or stdin. +/// +/// # Arguments +/// +/// * `input_file` - An optional input file path. +/// +/// # Returns +/// +/// Returns a buffered reader for the requested file or stdin. +pub(super) fn open_reader(input_file: Option<&str>) -> DynReader { + match input_file { + Some(path) => Box::new(BufReader::new(File::open(path).unwrap())), + None => Box::new(BufReader::new(io::stdin())), + } +} + +/// Open either the requested output file or stdout. +/// +/// # Arguments +/// +/// * `output_file` - An optional output file path. +/// * `print` - Whether output should be forced to stdout. +/// * `overwrite` - Whether to skip overwrite prompting for file outputs. +/// +/// # Returns +/// +/// Returns a buffered writer for the requested file or stdout. +pub(super) fn open_writer( + output_file: Option<&str>, + print: bool, + overwrite: bool, +) -> Result { + if print { + return Ok(Box::new(BufWriter::new(io::stdout()))); + } + + match output_file { + Some(path) => { + check_overwrite(path, overwrite)?; + Ok(Box::new(BufWriter::new(File::create(path).unwrap()))) + } + None => Ok(Box::new(BufWriter::new(io::stdout()))), + } +} + +/// Open a writer for a path computed by one of the setup helpers. +/// +/// # Arguments +/// +/// * `path` - The output path to create. +/// +/// # Returns +/// +/// Returns a buffered writer for `path`. +pub(super) fn open_derived_writer(path: String) -> DynWriter { + Box::new(BufWriter::new(File::create(path).unwrap())) +} + +/// Count the number of non-empty lines in a JSONL file. Used to populate +/// the bundle header's `sample_count` when wrapping a stream encode in a +/// `.bendl` container. +pub(super) fn count_jsonl_lines(path: &Path) -> io::Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut n: i64 = 0; + for line in reader.lines() { + let line = line?; + if !line.is_empty() { + n += 1; + } + } + Ok(n) +} diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs new file mode 100644 index 0000000..23f22d4 --- /dev/null +++ b/ben/src/cli/ben/tests.rs @@ -0,0 +1,600 @@ +use super::args::{Args, CliVariant, Mode}; +use super::args::resolve_variant; +use super::bundle::{ + append_graph_asset, run_encode_bundle_with_graph, run_xencode_bundle_with_graph, +}; +use super::paths::{ + count_jsonl_lines, decode_setup, encode_setup, open_derived_writer, open_reader, open_writer, +}; +use crate::BenVariant; +use clap::{CommandFactory, Parser}; +use std::fs; +use std::io::{self, Write}; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("ben-cli-ben-{name}-{nonce}")) +} + +#[test] +fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("Binary Ensemble CLI Tool")); + assert!(help.contains("--mode")); + assert!(help.contains("x-encode")); +} + +#[test] +fn parse_encode_args() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--output-file", + "out.ben", + "--save-all", + "--verbose", + "input.jsonl", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Encode); + assert_eq!(args.input_file.as_deref(), Some("input.jsonl")); + assert_eq!(args.output_file.as_deref(), Some("out.ben")); + assert!(args.save_all); + assert!(args.verbose); +} + +#[test] +fn parse_variant_flag() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "twodelta", + "input.jsonl", + ]) + .unwrap(); + + assert_eq!(args.variant, Some(CliVariant::Twodelta)); +} + +#[test] +fn parse_variant_aliases() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "mkv_chain", + "input.jsonl", + ]) + .unwrap(); + assert_eq!(args.variant, Some(CliVariant::Mkvchain)); + + let args = Args::try_parse_from([ + "ben", + "--mode", + "encode", + "--variant", + "two_delta", + "input.jsonl", + ]) + .unwrap(); + assert_eq!(args.variant, Some(CliVariant::Twodelta)); +} + +#[test] +fn resolve_variant_precedence() { + // --variant takes precedence over --save-all + assert_eq!( + resolve_variant(Some(CliVariant::Twodelta), true), + BenVariant::TwoDelta + ); + assert_eq!( + resolve_variant(Some(CliVariant::Mkvchain), true), + BenVariant::MkvChain + ); + // --save-all alone means Standard + assert_eq!(resolve_variant(None, true), BenVariant::Standard); + // neither means MkvChain + assert_eq!(resolve_variant(None, false), BenVariant::MkvChain); +} + +#[test] +fn parse_xencode_stream_flags() { + let args = Args::try_parse_from([ + "ben", + "--mode", + "x-encode", + "--jsonl-and-xben", + "--ben-and-xben", + "--jsonl-and-ben", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::XEncode); + assert!(args.jsonl_and_xben); + assert!(args.ben_and_xben); + assert!(args.jsonl_and_ben); +} + +#[test] +fn encode_setup_derives_extensions() { + assert_eq!( + encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, false).unwrap(), + "samples.jsonl.ben" + ); + assert_eq!( + encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, false).unwrap(), + "samples.xben" + ); + assert_eq!( + encode_setup( + Mode::XzCompress, + "samples.jsonl".to_string(), + None, + true, + false + ) + .unwrap(), + "samples.jsonl.xz" + ); +} + +#[test] +fn encode_setup_with_graph_derives_bendl_extension() { + // JSONL + encode + graph → .bendl + assert_eq!( + encode_setup(Mode::Encode, "samples.jsonl".to_string(), None, true, true).unwrap(), + "samples.jsonl.bendl" + ); + // .ben input to x-encode with graph trims the .ben suffix + assert_eq!( + encode_setup(Mode::XEncode, "samples.ben".to_string(), None, true, true).unwrap(), + "samples.bendl" + ); + // .xben input to x-encode with graph trims the .xben suffix + assert_eq!( + encode_setup(Mode::XEncode, "samples.xben".to_string(), None, true, true).unwrap(), + "samples.bendl" + ); +} + +#[test] +fn encode_setup_respects_explicit_output() { + assert_eq!( + encode_setup( + Mode::Encode, + "ignored.jsonl".to_string(), + Some("custom-output.ben".to_string()), + true, + false, + ) + .unwrap(), + "custom-output.ben" + ); +} + +#[test] +fn encode_setup_checks_overwrite() { + let path = unique_path("existing.ben"); + fs::write(&path, "already here").unwrap(); + + let err = encode_setup( + Mode::Encode, + "input.jsonl".to_string(), + Some(path.to_string_lossy().into_owned()), + true, + false, + ); + assert!(err.is_ok()); + + fs::remove_file(path).unwrap(); +} + +#[test] +fn decode_setup_derives_ben_and_xben_outputs() { + assert_eq!( + decode_setup("samples.ben".to_string(), None, false, true).unwrap(), + "samples" + ); + assert_eq!( + decode_setup("samples.xben".to_string(), None, false, true).unwrap(), + "samples.ben" + ); + assert_eq!( + decode_setup("samples.xben".to_string(), None, true, true).unwrap(), + "samples" + ); +} + +#[test] +fn decode_setup_rejects_xz_input() { + let err = decode_setup("samples.xz".to_string(), None, false, true).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); +} + +#[test] +fn decode_setup_rejects_unknown_input() { + let err = decode_setup("samples.data".to_string(), None, false, true).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); +} + +#[test] +fn decode_setup_respects_explicit_output() { + assert_eq!( + decode_setup( + "samples.xben".to_string(), + Some("custom.jsonl".to_string()), + true, + true, + ) + .unwrap(), + "custom.jsonl" + ); +} + +#[test] +fn open_reader_reads_file_contents() { + let path = unique_path("reader.txt"); + fs::write(&path, "hello\nworld\n").unwrap(); + + let mut reader = open_reader(Some(path.to_str().unwrap())); + let mut content = String::new(); + std::io::Read::read_to_string(&mut reader, &mut content).unwrap(); + + assert_eq!(content, "hello\nworld\n"); + fs::remove_file(path).unwrap(); +} + +#[test] +fn open_reader_accepts_stdin() { + let _reader = open_reader(None); +} + +#[test] +fn open_writer_creates_file_and_writes() { + let path = unique_path("writer.txt"); + { + let mut writer = open_writer(Some(path.to_str().unwrap()), false, true).unwrap(); + writer.write_all(b"written").unwrap(); + } + + assert_eq!(fs::read_to_string(&path).unwrap(), "written"); + fs::remove_file(path).unwrap(); +} + +#[test] +fn open_writer_supports_stdout_and_print() { + let mut stdout_writer = open_writer(None, false, true).unwrap(); + stdout_writer.write_all(b"").unwrap(); + + let mut print_writer = open_writer(Some("ignored.txt"), true, false).unwrap(); + print_writer.write_all(b"").unwrap(); +} + +#[test] +fn open_derived_writer_creates_file() { + let path = unique_path("derived.txt"); + { + let mut writer = open_derived_writer(path.to_string_lossy().into_owned()); + writer.write_all(b"derived").unwrap(); + } + + assert_eq!(fs::read_to_string(&path).unwrap(), "derived"); + fs::remove_file(path).unwrap(); +} + +#[test] +fn resolve_variant_standard_arm() { + assert_eq!( + resolve_variant(Some(CliVariant::Standard), false), + BenVariant::Standard + ); +} + +#[test] +fn count_jsonl_lines_counts_nonempty_lines() { + let path = unique_path("count.jsonl"); + fs::write(&path, b"{\"a\":1}\n\n{\"b\":2}\n").unwrap(); + let count = count_jsonl_lines(&path).unwrap(); + assert_eq!(count, 2); + fs::remove_file(path).unwrap(); +} + +/// Write a two-sample Standard BEN JSONL file to a temp path. +fn write_temp_jsonl(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + fs::write( + &path, + b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n", + ) + .unwrap(); + path +} + +/// Write a minimal graph JSON file to a temp path. +fn write_temp_graph(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + fs::write(&path, b"{\"nodes\":[0,1,2],\"adj\":[[1],[0,2],[1]]}").unwrap(); + path +} + +#[test] +fn append_graph_asset_adds_graph_to_bundle() { + use crate::io::bundle::format::AssignmentFormat; + use crate::io::bundle::{BendlReader, BendlWriter}; + use std::io::Cursor; + + // Build a minimal finalized .bendl in memory, write to temp file. + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("append_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + let graph_path = write_temp_graph("append_graph.json"); + + append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap(); + + // Verify the graph asset was added. + let file = fs::File::open(&bendl_path).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&bendl_path).unwrap(); + fs::remove_file(&graph_path).unwrap(); +} + +#[test] +fn run_encode_bundle_with_graph_creates_bendl() { + use crate::io::bundle::BendlReader; + + let jsonl = write_temp_jsonl("enc_graph_input.jsonl"); + let graph = write_temp_graph("enc_graph.json"); + let out = unique_path("enc_graph_output.bendl"); + + run_encode_bundle_with_graph(&jsonl, out.to_str().unwrap(), BenVariant::Standard, &graph) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + assert_eq!(reader.sample_count(), Some(2)); + + fs::remove_file(&jsonl).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); +} + +#[test] +fn run_xencode_bundle_with_graph_from_jsonl_creates_bendl() { + use crate::io::bundle::BendlReader; + + let jsonl = write_temp_jsonl("xencode_graph_input.jsonl"); + let graph = write_temp_graph("xencode_graph.json"); + let out = unique_path("xencode_graph_output.bendl"); + + run_xencode_bundle_with_graph( + &jsonl, + out.to_str().unwrap(), + BenVariant::Standard, + false, + None, + None, + None, + &graph, + ) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&jsonl).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); +} + +#[test] +fn run_xencode_bundle_with_graph_from_ben_creates_bendl() { + use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::bundle::BendlReader; + use std::io::Cursor; + + // First create a BEN file from JSONL. + let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n{\"assignment\":[2,1],\"sample\":2}\n"; + let mut ben_bytes = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben_bytes, BenVariant::Standard).unwrap(); + let ben_path = unique_path("xencode_from_ben_input.ben"); + fs::write(&ben_path, &ben_bytes).unwrap(); + + let graph = write_temp_graph("xencode_from_ben_graph.json"); + let out = unique_path("xencode_from_ben_output.bendl"); + + run_xencode_bundle_with_graph( + &ben_path, + out.to_str().unwrap(), + BenVariant::Standard, + true, + None, + None, + None, + &graph, + ) + .unwrap(); + + let file = fs::File::open(&out).unwrap(); + let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); + assert!(reader.is_complete()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + fs::remove_file(&ben_path).unwrap(); + fs::remove_file(&graph).unwrap(); + fs::remove_file(&out).unwrap(); +} + +#[test] +fn append_graph_asset_errors_on_missing_graph_file() { + use crate::io::bundle::format::AssignmentFormat; + use crate::io::bundle::BendlWriter; + use std::io::Cursor; + + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("err_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + let nonexistent = unique_path("nonexistent.json"); + let err = append_graph_asset(bendl_path.to_str().unwrap(), &nonexistent).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to read graph")); + let _ = fs::remove_file(&bendl_path); +} + +#[test] +fn run_encode_bundle_with_graph_errors_on_missing_graph() { + let jsonl = write_temp_jsonl("err_enc_input.jsonl"); + let out = unique_path("err_enc_output.bendl"); + let nonexistent = unique_path("nonexistent.json"); + + let err = run_encode_bundle_with_graph( + &jsonl, + out.to_str().unwrap(), + BenVariant::Standard, + &nonexistent, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to stat graph")); + let _ = fs::remove_file(&jsonl); + let _ = fs::remove_file(&out); +} + +#[test] +fn run_xencode_bundle_with_graph_errors_on_missing_graph() { + let jsonl = write_temp_jsonl("err_xenc_input.jsonl"); + let out = unique_path("err_xenc_output.bendl"); + let nonexistent = unique_path("nonexistent.json"); + + let err = run_xencode_bundle_with_graph( + &jsonl, + out.to_str().unwrap(), + BenVariant::Standard, + false, + None, + None, + None, + &nonexistent, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to stat graph")); + let _ = fs::remove_file(&jsonl); + let _ = fs::remove_file(&out); +} + +#[test] +fn append_graph_asset_errors_when_bundle_already_has_graph() { + use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH}; + use crate::io::bundle::{AddAssetOptions, BendlWriter}; + use std::io::Cursor; + + // Build a .bendl that already contains graph.json. + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + b"{}", + AddAssetOptions::defaults().json(), + ) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + } + let bendl_path = unique_path("dup_graph.bendl"); + fs::write(&bendl_path, &buf).unwrap(); + + // graph.json already exists — add_asset must fail with duplicate name. + let graph_path = write_temp_graph("dup_graph.json"); + let err = append_graph_asset(bendl_path.to_str().unwrap(), &graph_path).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert!(err.to_string().contains("failed to add graph asset")); + + let _ = fs::remove_file(&bendl_path); + let _ = fs::remove_file(&graph_path); +} + +#[test] +fn run_xencode_bundle_with_graph_errors_on_invalid_jsonl() { + // from_ben=false path: encode_jsonl_to_xben fails on invalid JSONL. + let bad_jsonl = unique_path("bad.jsonl"); + fs::write(&bad_jsonl, b"not valid json\n").unwrap(); + let graph = write_temp_graph("xenc_bad_jsonl_graph.json"); + let out = unique_path("xenc_bad_jsonl.bendl"); + + let err = run_xencode_bundle_with_graph( + &bad_jsonl, + out.to_str().unwrap(), + BenVariant::Standard, + false, + None, + None, + None, + &graph, + ) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + let _ = fs::remove_file(&bad_jsonl); + let _ = fs::remove_file(&graph); + let _ = fs::remove_file(&out); +} + +#[test] +fn run_xencode_bundle_with_graph_errors_on_invalid_ben() { + // from_ben=true path: encode_ben_to_xben fails on a file with no BEN banner. + let bad_ben = unique_path("bad.ben"); + fs::write(&bad_ben, b"this is not a ben file").unwrap(); + let graph = write_temp_graph("xenc_bad_ben_graph.json"); + let out = unique_path("xenc_bad_ben.bendl"); + + let err = run_xencode_bundle_with_graph( + &bad_ben, + out.to_str().unwrap(), + BenVariant::Standard, + true, + None, + None, + None, + &graph, + ) + .unwrap_err(); + // encode_ben_to_xben fails when it can't read a valid banner. + assert!(err.kind() != io::ErrorKind::NotFound); + + let _ = fs::remove_file(&bad_ben); + let _ = fs::remove_file(&graph); + let _ = fs::remove_file(&out); +} diff --git a/ben/src/cli/bendl.rs b/ben/src/cli/bendl.rs deleted file mode 100644 index 279a71a..0000000 --- a/ben/src/cli/bendl.rs +++ /dev/null @@ -1,934 +0,0 @@ -//! CLI front-end for the `.bendl` bundle container. -//! -//! Exposes four subcommands: -//! -//! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional -//! asset files into a finalized `.bendl` bundle. -//! - `inspect` — print the header and directory of a `.bendl` file. -//! - `extract` — copy the embedded stream region or a named asset out -//! of a bundle to disk. -//! - `append` — add new asset files to an already-finalized bundle -//! without rewriting the stream. - -use std::fs::{File, OpenOptions}; -use std::io::{self, BufReader, BufWriter, Read, Seek, Write}; -use std::path::{Path, PathBuf}; - -use clap::{Parser, Subcommand}; - -use crate::cli::common::{check_overwrite, set_verbose}; -use crate::io::bundle::format::{ - AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, - ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, -}; -use crate::io::bundle::writer::BendlAppender; -use crate::io::bundle::{AddAssetOptions, BendlReader, BendlWriteError, BendlWriter}; -use crate::io::reader::subsample::count_samples_from_file; - -/// Parsed form of a `name=path` option such as `--asset myblob=/tmp/x`. -#[derive(Debug, Clone)] -struct NamedAsset { - name: String, - path: PathBuf, -} - -impl std::str::FromStr for NamedAsset { - type Err = String; - fn from_str(s: &str) -> Result { - let (name, path) = s - .split_once('=') - .ok_or_else(|| format!("expected NAME=PATH, got {s:?}"))?; - if name.is_empty() { - return Err("custom asset name must be non-empty".to_string()); - } - Ok(NamedAsset { - name: name.to_string(), - path: PathBuf::from(path), - }) - } -} - -/// `bendl` CLI entry point. -#[derive(Parser, Debug)] -#[command( - name = "bendl", - about = "Create, inspect, extract from, and append to .bendl bundle files.", - version -)] -struct Args { - /// Enable verbose tracing output. - #[arg(short, long, global = true)] - verbose: bool, - - #[command(subcommand)] - command: Command, -} - -#[derive(Subcommand, Debug)] -enum Command { - /// Package a `.ben` or `.xben` stream (plus optional assets) into a `.bendl`. - Create(CreateArgs), - /// Print the header and directory of a `.bendl` file. - Inspect(InspectArgs), - /// Extract the embedded stream or a named asset to a file. - Extract(ExtractArgs), - /// Append new assets to an already-finalized `.bendl` bundle. - Append(AppendArgs), -} - -#[derive(Parser, Debug)] -struct CreateArgs { - /// Path to the `.ben` or `.xben` assignment stream to embed. - /// File extension chooses the container format. - #[arg(short = 'i', long)] - input: PathBuf, - /// Destination `.bendl` path. - #[arg(short = 'o', long)] - output: PathBuf, - /// Optional `graph.json` asset path. Will be stored under the - /// canonical name `graph.json` and xz-compressed by default. - #[arg(long)] - graph: Option, - /// Optional `metadata.json` asset path. Stored under canonical name. - #[arg(long)] - metadata: Option, - /// Optional `relabel_map.json` asset path. Stored under canonical name. - #[arg(long)] - relabel_map: Option, - /// Additional custom assets, specified as `NAME=PATH`. May be repeated. - #[arg(long = "asset")] - assets: Vec, - /// Overwrite the output file if it already exists. - #[arg(short = 'w', long)] - overwrite: bool, - /// Store `graph.json` raw instead of compressing it. - #[arg(long)] - graph_raw: bool, -} - -#[derive(Parser, Debug)] -struct InspectArgs { - /// `.bendl` file to inspect. - input: PathBuf, -} - -#[derive(Parser, Debug)] -struct ExtractArgs { - /// `.bendl` file to extract from. - input: PathBuf, - /// Output file path for the extracted bytes. - #[arg(short = 'o', long)] - output: PathBuf, - /// Extract the embedded assignment stream region verbatim. Mutually - /// exclusive with `--asset`. - #[arg(long, conflicts_with = "asset")] - stream: bool, - /// Name of the asset to extract (e.g. `graph.json`). If the asset is - /// xz-compressed, the extracted file contains the decompressed bytes. - #[arg(long)] - asset: Option, - /// Overwrite the output file if it already exists. - #[arg(short = 'w', long)] - overwrite: bool, -} - -#[derive(Parser, Debug)] -struct AppendArgs { - /// `.bendl` file to append to. Must be finalized (`complete == 1`). - input: PathBuf, - /// Optional `graph.json` asset path to add. - #[arg(long)] - graph: Option, - /// Optional `metadata.json` asset path to add. - #[arg(long)] - metadata: Option, - /// Optional `relabel_map.json` asset path to add. - #[arg(long)] - relabel_map: Option, - /// Additional custom assets, specified as `NAME=PATH`. May be repeated. - #[arg(long = "asset")] - assets: Vec, - /// Store `graph.json` raw instead of compressing it. - #[arg(long)] - graph_raw: bool, -} - -/// Parse CLI arguments and execute the selected subcommand. -pub fn run() { - let args = Args::parse(); - set_verbose(args.verbose); - - let result = match args.command { - Command::Create(a) => run_create(a), - Command::Inspect(a) => run_inspect(a), - Command::Extract(a) => run_extract(a), - Command::Append(a) => run_append(a), - }; - - if let Err(err) = result { - eprintln!("Error: {err}"); - std::process::exit(1); - } -} - -/// Detect the container format of `path` from its extension. -fn format_from_path(path: &Path) -> Result { - match path.extension().and_then(|e| e.to_str()) { - Some("ben") => Ok(AssignmentFormat::Ben), - Some("xben") => Ok(AssignmentFormat::Xben), - other => Err(format!( - "unable to determine assignment format from extension {other:?}; \ - expected .ben or .xben" - )), - } -} - -/// `mode` argument expected by `count_samples_from_file`. -fn mode_str(format: AssignmentFormat) -> &'static str { - match format { - AssignmentFormat::Ben => "ben", - AssignmentFormat::Xben => "xben", - } -} - -fn run_create(args: CreateArgs) -> Result<(), String> { - let format = format_from_path(&args.input)?; - check_overwrite( - args.output.to_str().ok_or("non-utf8 output path")?, - args.overwrite, - ) - .map_err(|e| format!("{e}"))?; - - // Count samples up front so we can patch the header at finalize time. - // This pre-scan is O(stream size); the second pass streams bytes directly. - let sample_count: i64 = count_samples_from_file(&args.input, mode_str(format)) - .map_err(|e| format!("failed to count samples in {:?}: {e}", args.input))? - as i64; - - let out_file = File::create(&args.output) - .map_err(|e| format!("failed to create {:?}: {e}", args.output))?; - let mut writer = BendlWriter::new(out_file, format) - .map_err(|e| format!("failed to initialize bundle writer: {e}"))?; - - // Add singleton assets first, in canonical order. - if let Some(ref path) = args.metadata { - add_file_asset( - &mut writer, - ASSET_TYPE_METADATA, - "metadata.json", - path, - AddAssetOptions::defaults().json(), - )?; - } - if let Some(ref path) = args.graph { - let opts = if args.graph_raw { - AddAssetOptions::defaults().json().raw() - } else { - AddAssetOptions::defaults().json() - }; - add_file_asset(&mut writer, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; - } - if let Some(ref path) = args.relabel_map { - add_file_asset( - &mut writer, - ASSET_TYPE_RELABEL_MAP, - "relabel_map.json", - path, - AddAssetOptions::defaults().json(), - )?; - } - for NamedAsset { name, path } in &args.assets { - add_file_asset( - &mut writer, - ASSET_TYPE_CUSTOM, - name, - path, - AddAssetOptions::defaults(), - )?; - } - - // Stream phase: copy bytes from the input file directly into the - // bundle's stream region. This preserves the exact BEN/XBEN bytes. - { - let mut handle = writer - .begin_stream() - .map_err(|e| format!("failed to open stream region: {e}"))?; - let mut input = BufReader::new( - File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?, - ); - io::copy(&mut input, &mut handle) - .map_err(|e| format!("failed to copy assignment stream: {e}"))?; - handle - .finish(sample_count) - .map_err(|e| format!("failed to close stream region: {e}"))?; - } - - writer - .finish() - .map_err(|e| format!("failed to finalize bundle: {e}"))?; - - eprintln!( - "Wrote {:?} ({} samples, format = {:?})", - args.output, sample_count, format - ); - Ok(()) -} - -fn add_file_asset( - writer: &mut BendlWriter, - asset_type: u16, - name: &str, - path: &Path, - options: AddAssetOptions, -) -> Result<(), String> { - let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; - writer - .add_asset(asset_type, name, &bytes, options) - .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) -} - -fn run_inspect(args: InspectArgs) -> Result<(), String> { - let file = - File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; - let reader = BendlReader::open(BufReader::new(file)) - .map_err(|e| format!("failed to parse bundle header: {e}"))?; - - let header = reader.header(); - println!("file: {}", args.input.display()); - println!( - "version: {}.{}", - header.major_version, header.minor_version - ); - println!("complete: {}", reader.is_complete()); - println!( - "assignment_format: {}", - match reader.assignment_format() { - Some(AssignmentFormat::Ben) => "ben", - Some(AssignmentFormat::Xben) => "xben", - None => "unknown", - } - ); - println!( - "sample_count: {}", - match reader.sample_count() { - Some(n) => n.to_string(), - None => "".to_string(), - } - ); - println!( - "stream: offset={} len={}", - header.stream_offset, header.stream_len - ); - println!( - "directory: offset={} len={}", - header.directory_offset, header.directory_len - ); - - let entries = reader.assets(); - println!("assets: {} entries", entries.len()); - for entry in entries { - let mut flag_parts: Vec<&str> = Vec::new(); - if entry.asset_flags & ASSET_FLAG_JSON != 0 { - flag_parts.push("json"); - } - if entry.asset_flags & ASSET_FLAG_XZ != 0 { - flag_parts.push("xz"); - } - if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { - flag_parts.push("checksum"); - } - let flag_str = if flag_parts.is_empty() { - "-".to_string() - } else { - flag_parts.join(",") - }; - println!( - " type={:<4} name={:<24} offset={:<10} len={:<10} flags={}", - entry.asset_type, entry.name, entry.payload_offset, entry.payload_len, flag_str - ); - } - - Ok(()) -} - -fn run_extract(args: ExtractArgs) -> Result<(), String> { - if !args.stream && args.asset.is_none() { - return Err("extract requires either --stream or --asset ".to_string()); - } - check_overwrite( - args.output.to_str().ok_or("non-utf8 output path")?, - args.overwrite, - ) - .map_err(|e| format!("{e}"))?; - - let file = - File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; - let mut reader = BendlReader::open(BufReader::new(file)) - .map_err(|e| format!("failed to parse bundle header: {e}"))?; - - let mut out = BufWriter::new( - File::create(&args.output) - .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, - ); - - if args.stream { - let mut stream = reader - .assignment_stream_reader() - .map_err(|e| format!("failed to open stream region: {e}"))?; - io::copy(&mut stream, &mut out).map_err(|e| format!("failed to copy stream bytes: {e}"))?; - } else { - // asset is Some — validated by the early return above. - let name = args.asset.unwrap(); - let entry = reader - .find_asset_by_name(&name) - .cloned() - .ok_or_else(|| format!("no asset named {name:?} in bundle"))?; - let mut asset = reader - .asset_reader(&entry) - .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; - io::copy(&mut asset, &mut out).map_err(|e| format!("failed to copy asset bytes: {e}"))?; - } - - out.flush().map_err(|e| format!("flush failed: {e}"))?; - Ok(()) -} - -fn run_append(args: AppendArgs) -> Result<(), String> { - let file = OpenOptions::new() - .read(true) - .write(true) - .open(&args.input) - .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; - let mut appender = - BendlAppender::open(file).map_err(|e| format!("failed to open appender: {e}"))?; - - let mut added = 0usize; - if let Some(ref path) = args.metadata { - append_file_asset( - &mut appender, - ASSET_TYPE_METADATA, - "metadata.json", - path, - AddAssetOptions::defaults().json(), - )?; - added += 1; - } - if let Some(ref path) = args.graph { - let opts = if args.graph_raw { - AddAssetOptions::defaults().json().raw() - } else { - AddAssetOptions::defaults().json() - }; - append_file_asset(&mut appender, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; - added += 1; - } - if let Some(ref path) = args.relabel_map { - append_file_asset( - &mut appender, - ASSET_TYPE_RELABEL_MAP, - "relabel_map.json", - path, - AddAssetOptions::defaults().json(), - )?; - added += 1; - } - for NamedAsset { name, path } in &args.assets { - append_file_asset( - &mut appender, - ASSET_TYPE_CUSTOM, - name, - path, - AddAssetOptions::defaults(), - )?; - added += 1; - } - - if added == 0 { - // Nothing to do; leave the file untouched. - appender.abort(); - eprintln!("No assets specified; bundle is unchanged."); - return Ok(()); - } - - appender - .commit() - .map_err(|e| format!("failed to commit append: {e}"))?; - eprintln!("Appended {added} asset(s) to {:?}", args.input); - Ok(()) -} - -fn append_file_asset( - appender: &mut BendlAppender, - asset_type: u16, - name: &str, - path: &Path, - options: AddAssetOptions, -) -> Result<(), String> { - let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; - appender - .add_asset(asset_type, name, &bytes, options) - .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::bundle::{BendlReader, BendlWriter}; - use crate::io::bundle::format::AssignmentFormat; - use clap::Parser; - use std::io::{BufReader, Cursor}; - use std::time::{SystemTime, UNIX_EPOCH}; - - fn unique_path(name: &str) -> PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("bendl-cli-{name}-{nonce}")) - } - - /// Write a minimal finalized .bendl file and return its path. - fn write_temp_bendl(name: &str, format: AssignmentFormat) -> PathBuf { - let path = unique_path(name); - let stream = b"STANDARD BEN FILE\x00fake"; - let mut buf: Vec = Vec::new(); - let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); - writer.write_stream_bytes(stream, 1).unwrap(); - writer.finish().unwrap(); - std::fs::write(&path, &buf).unwrap(); - path - } - - #[test] - fn write_temp_bendl_xben_variant_works() { - // Exercises the Xben branch of write_temp_bendl. - let path = write_temp_bendl("xben_helper_check.bendl", AssignmentFormat::Xben); - let reader = BendlReader::open(BufReader::new( - std::fs::File::open(&path).unwrap(), - )) - .unwrap(); - assert!(reader.is_complete()); - let _ = std::fs::remove_file(&path); - } - - #[test] - fn named_asset_from_str_rejects_empty_name() { - let err = "=path/to/file".parse::().unwrap_err(); - assert!(err.contains("non-empty")); - } - - #[test] - fn format_from_path_detects_xben() { - let fmt = format_from_path(std::path::Path::new("stream.xben")).unwrap(); - assert_eq!(fmt, AssignmentFormat::Xben); - } - - #[test] - fn format_from_path_rejects_unknown_extension() { - let err = format_from_path(std::path::Path::new("archive.tar")).unwrap_err(); - assert!(err.contains("expected .ben or .xben")); - } - - #[test] - fn mode_str_returns_xben_for_xben() { - assert_eq!(mode_str(AssignmentFormat::Xben), "xben"); - } - - #[test] - fn run_create_with_relabel_map_and_custom_asset() { - let ben = { - // Must end in .ben so format_from_path recognises it. - let p = std::env::temp_dir().join(format!( - "bendl-create-relabel-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() - )); - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; - let mut b = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); - std::fs::write(&p, &b).unwrap(); - p - }; - let relabel = unique_path("create_relabel_map.json"); - std::fs::write(&relabel, b"{\"0\":1,\"1\":0}").unwrap(); - let custom = unique_path("create_custom.bin"); - std::fs::write(&custom, b"custom bytes").unwrap(); - let out = unique_path("create_with_assets.bendl"); - - let asset_str = format!("myblob={}", custom.display()); - let args = CreateArgs { - input: ben.clone(), - output: out.clone(), - graph: None, - metadata: None, - relabel_map: Some(relabel.clone()), - assets: vec![asset_str.parse().unwrap()], - overwrite: false, - graph_raw: false, - }; - run_create(args).unwrap(); - - let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); - assert!(reader.find_asset_by_name("relabel_map.json").is_some()); - assert!(reader.find_asset_by_name("myblob").is_some()); - - for p in [&ben, &relabel, &custom, &out] { let _ = std::fs::remove_file(p); } - } - - #[test] - fn run_inspect_xben_format_and_checksum_flag() { - use crate::io::bundle::AddAssetOptions; - use crate::io::bundle::format::ASSET_TYPE_CUSTOM; - - // Build a .bendl with a checksum asset so the flag_parts checksum - // branch is exercised. - let mut buf: Vec = Vec::new(); - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Xben).unwrap(); - writer - .add_asset( - ASSET_TYPE_CUSTOM, - "checksummed", - b"data", - AddAssetOptions { - checksum: Some(vec![0xAB, 0xCD]), - ..AddAssetOptions::defaults() - }, - ) - .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); - writer.finish().unwrap(); - let path = unique_path("inspect_xben.bendl"); - std::fs::write(&path, &buf).unwrap(); - - run_inspect(InspectArgs { input: path.clone() }).unwrap(); - let _ = std::fs::remove_file(&path); - } - - #[test] - fn run_append_no_assets_is_noop() { - let bendl = write_temp_bendl("append_noop.bendl", AssignmentFormat::Ben); - let args = AppendArgs { - input: bendl.clone(), - graph: None, - metadata: None, - relabel_map: None, - assets: vec![], - graph_raw: false, - }; - run_append(args).unwrap(); - // File should be unchanged (bundle is still valid). - let reader = BendlReader::open(BufReader::new( - std::fs::File::open(&bendl).unwrap(), - )) - .unwrap(); - assert!(reader.is_complete()); - let _ = std::fs::remove_file(&bendl); - } - - #[test] - fn run_append_with_metadata_and_relabel_map() { - let bendl = write_temp_bendl("append_assets.bendl", AssignmentFormat::Ben); - let meta = unique_path("append_meta.json"); - std::fs::write(&meta, b"{\"version\":1}").unwrap(); - let relabel = unique_path("append_relabel.json"); - std::fs::write(&relabel, b"{\"0\":1}").unwrap(); - - let args = AppendArgs { - input: bendl.clone(), - graph: None, - metadata: Some(meta.clone()), - relabel_map: Some(relabel.clone()), - assets: vec![], - graph_raw: false, - }; - run_append(args).unwrap(); - - let reader = BendlReader::open(BufReader::new( - std::fs::File::open(&bendl).unwrap(), - )) - .unwrap(); - assert!(reader.find_asset_by_name("metadata.json").is_some()); - assert!(reader.find_asset_by_name("relabel_map.json").is_some()); - - for p in [&bendl, &meta, &relabel] { let _ = std::fs::remove_file(p); } - } - - #[test] - fn run_create_with_graph_raw_flag() { - let ben = { - let p = std::env::temp_dir().join(format!( - "bendl-create-raw-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() - )); - let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n"; - let mut b = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); - std::fs::write(&p, &b).unwrap(); - p - }; - let graph = unique_path("create_raw_graph.json"); - std::fs::write(&graph, b"{\"nodes\":[0,1]}").unwrap(); - let out = unique_path("create_raw.bendl"); - - let args = CreateArgs { - input: ben.clone(), - output: out.clone(), - graph: Some(graph.clone()), - metadata: None, - relabel_map: None, - assets: vec![], - overwrite: false, - graph_raw: true, - }; - run_create(args).unwrap(); - - let reader = BendlReader::open(BufReader::new( - std::fs::File::open(&out).unwrap(), - )) - .unwrap(); - assert!(reader.find_asset_by_name("graph.json").is_some()); - - for p in [&ben, &graph, &out] { let _ = std::fs::remove_file(p); } - } - - #[test] - fn run_inspect_unknown_format_and_no_sample_count() { - use crate::io::bundle::format::{BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, - COMPLETE_NO, HEADER_SIZE}; - - // Build a header with an unknown assignment format byte and - // complete=0 so sample_count() returns None. - let mut header = [0u8; HEADER_SIZE]; - header[0..8].copy_from_slice(&BENDL_MAGIC); - header[8..10].copy_from_slice(&BENDL_MAJOR_VERSION.to_le_bytes()); - header[10..12].copy_from_slice(&BENDL_MINOR_VERSION.to_le_bytes()); - header[12] = COMPLETE_NO; - header[13] = 0xFF; // unknown format byte - // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 - let stream_offset = HEADER_SIZE as u64; - header[40..48].copy_from_slice(&stream_offset.to_le_bytes()); - let sample_count: i64 = -1; - header[56..64].copy_from_slice(&sample_count.to_le_bytes()); - - let path = unique_path("inspect_unknown.bendl"); - std::fs::write(&path, &header).unwrap(); - run_inspect(InspectArgs { input: path.clone() }).unwrap(); - let _ = std::fs::remove_file(&path); - } - - #[test] - fn run_append_with_graph_raw_and_graph_asset() { - let bendl = write_temp_bendl("append_graph_raw.bendl", AssignmentFormat::Ben); - let graph = unique_path("append_graph_raw.json"); - std::fs::write(&graph, b"{\"nodes\":[0,1,2]}").unwrap(); - - let args = AppendArgs { - input: bendl.clone(), - graph: Some(graph.clone()), - metadata: None, - relabel_map: None, - assets: vec![], - graph_raw: true, - }; - run_append(args).unwrap(); - - let reader = BendlReader::open(BufReader::new( - std::fs::File::open(&bendl).unwrap(), - )) - .unwrap(); - assert!(reader.find_asset_by_name("graph.json").is_some()); - - for p in [&bendl, &graph] { let _ = std::fs::remove_file(p); } - } - - #[test] - fn run_extract_rejects_missing_stream_and_asset() { - let args = ExtractArgs::try_parse_from([ - "extract", - "--output", "/tmp/out.bin", - "bundle.bendl", - ]) - .unwrap(); - let err = run_extract(args).unwrap_err(); - assert!(err.contains("either --stream or --asset")); - } - - #[test] - fn run_create_errors_on_missing_metadata_file() { - let ben = { - let p = std::env::temp_dir().join(format!( - "bendl-err-meta-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() - )); - let jsonl = b"{\"assignment\":[1],\"sample\":1}\n"; - let mut b = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); - std::fs::write(&p, &b).unwrap(); - p - }; - let out = unique_path("err_meta.bendl"); - let args = CreateArgs { - input: ben.clone(), - output: out.clone(), - graph: None, - metadata: Some(unique_path("nonexistent_meta.json")), - relabel_map: None, - assets: vec![], - overwrite: false, - graph_raw: false, - }; - let err = run_create(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&ben); - let _ = std::fs::remove_file(&out); - } - - #[test] - fn run_create_errors_on_missing_relabel_map_file() { - let ben = { - let p = std::env::temp_dir().join(format!( - "bendl-err-relabel-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() - )); - let mut b = Vec::new(); - encode_jsonl_to_ben( - Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), - &mut b, - crate::BenVariant::Standard, - ).unwrap(); - std::fs::write(&p, &b).unwrap(); - p - }; - let out = unique_path("err_relabel.bendl"); - let args = CreateArgs { - input: ben.clone(), - output: out.clone(), - graph: None, - metadata: None, - relabel_map: Some(unique_path("nonexistent_relabel.json")), - assets: vec![], - overwrite: false, - graph_raw: false, - }; - let err = run_create(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&ben); - let _ = std::fs::remove_file(&out); - } - - #[test] - fn run_create_errors_on_missing_custom_asset_file() { - let ben = { - let p = std::env::temp_dir().join(format!( - "bendl-err-custom-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() - )); - let mut b = Vec::new(); - encode_jsonl_to_ben( - Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), - &mut b, - crate::BenVariant::Standard, - ).unwrap(); - std::fs::write(&p, &b).unwrap(); - p - }; - let out = unique_path("err_custom.bendl"); - let nonexistent: PathBuf = unique_path("nonexistent.bin"); - let asset_str = format!("myasset={}", nonexistent.display()); - let args = CreateArgs { - input: ben.clone(), - output: out.clone(), - graph: None, - metadata: None, - relabel_map: None, - assets: vec![asset_str.parse().unwrap()], - overwrite: false, - graph_raw: false, - }; - let err = run_create(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&ben); - let _ = std::fs::remove_file(&out); - } - - #[test] - fn run_extract_asset_by_name() { - use crate::io::bundle::AddAssetOptions; - use crate::io::bundle::format::ASSET_TYPE_CUSTOM; - - // Build a bundle with a named asset then extract it. - let mut buf: Vec = Vec::new(); - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer - .add_asset(ASSET_TYPE_CUSTOM, "hello.txt", b"world", AddAssetOptions::defaults()) - .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); - writer.finish().unwrap(); - let bendl = unique_path("extract_asset.bendl"); - std::fs::write(&bendl, &buf).unwrap(); - - let out = unique_path("extract_asset_out.txt"); - let args = ExtractArgs::try_parse_from([ - "extract", - "--asset", "hello.txt", - "--output", out.to_str().unwrap(), - bendl.to_str().unwrap(), - ]) - .unwrap(); - run_extract(args).unwrap(); - assert_eq!(std::fs::read(&out).unwrap(), b"world"); - - let _ = std::fs::remove_file(&bendl); - let _ = std::fs::remove_file(&out); - } - - #[test] - fn run_append_errors_on_missing_metadata_file() { - let bendl = write_temp_bendl("append_err_meta.bendl", AssignmentFormat::Ben); - let args = AppendArgs { - input: bendl.clone(), - graph: None, - metadata: Some(unique_path("nonexistent_meta.json")), - relabel_map: None, - assets: vec![], - graph_raw: false, - }; - let err = run_append(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&bendl); - } - - #[test] - fn run_append_errors_on_missing_relabel_map_file() { - let bendl = write_temp_bendl("append_err_relabel.bendl", AssignmentFormat::Ben); - let args = AppendArgs { - input: bendl.clone(), - graph: None, - metadata: None, - relabel_map: Some(unique_path("nonexistent_relabel.json")), - assets: vec![], - graph_raw: false, - }; - let err = run_append(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&bendl); - } - - #[test] - fn run_append_errors_on_missing_custom_asset_file() { - let bendl = write_temp_bendl("append_err_custom.bendl", AssignmentFormat::Ben); - let nonexistent = unique_path("nonexistent_custom.bin"); - let asset_str = format!("myasset={}", nonexistent.display()); - let args = AppendArgs { - input: bendl.clone(), - graph: None, - metadata: None, - relabel_map: None, - assets: vec![asset_str.parse().unwrap()], - graph_raw: false, - }; - let err = run_append(args).unwrap_err(); - assert!(err.contains("failed to read")); - let _ = std::fs::remove_file(&bendl); - } -} diff --git a/ben/src/cli/bendl/append.rs b/ben/src/cli/bendl/append.rs new file mode 100644 index 0000000..cc801d0 --- /dev/null +++ b/ben/src/cli/bendl/append.rs @@ -0,0 +1,72 @@ +use super::args::{AppendArgs, NamedAsset}; +use super::helpers::append_file_asset; +use crate::io::bundle::format::{ + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, +}; +use crate::io::bundle::writer::BendlAppender; +use crate::io::bundle::AddAssetOptions; +use std::fs::OpenOptions; + +pub(super) fn run_append(args: AppendArgs) -> Result<(), String> { + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&args.input) + .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; + let mut appender = + BendlAppender::open(file).map_err(|e| format!("failed to open appender: {e}"))?; + + let mut added = 0usize; + if let Some(ref path) = args.metadata { + append_file_asset( + &mut appender, + ASSET_TYPE_METADATA, + "metadata.json", + path, + AddAssetOptions::defaults().json(), + )?; + added += 1; + } + if let Some(ref path) = args.graph { + let opts = if args.graph_raw { + AddAssetOptions::defaults().json().raw() + } else { + AddAssetOptions::defaults().json() + }; + append_file_asset(&mut appender, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + added += 1; + } + if let Some(ref path) = args.relabel_map { + append_file_asset( + &mut appender, + ASSET_TYPE_RELABEL_MAP, + "relabel_map.json", + path, + AddAssetOptions::defaults().json(), + )?; + added += 1; + } + for NamedAsset { name, path } in &args.assets { + append_file_asset( + &mut appender, + ASSET_TYPE_CUSTOM, + name, + path, + AddAssetOptions::defaults(), + )?; + added += 1; + } + + if added == 0 { + // Nothing to do; leave the file untouched. + appender.abort(); + eprintln!("No assets specified; bundle is unchanged."); + return Ok(()); + } + + appender + .commit() + .map_err(|e| format!("failed to commit append: {e}"))?; + eprintln!("Appended {added} asset(s) to {:?}", args.input); + Ok(()) +} diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs new file mode 100644 index 0000000..4522ab2 --- /dev/null +++ b/ben/src/cli/bendl/args.rs @@ -0,0 +1,130 @@ +use clap::{Parser, Subcommand}; +use std::path::PathBuf; + +/// Parsed form of a `name=path` option such as `--asset myblob=/tmp/x`. +#[derive(Debug, Clone)] +pub(super) struct NamedAsset { + pub name: String, + pub path: PathBuf, +} + +impl std::str::FromStr for NamedAsset { + type Err = String; + fn from_str(s: &str) -> Result { + let (name, path) = s + .split_once('=') + .ok_or_else(|| format!("expected NAME=PATH, got {s:?}"))?; + if name.is_empty() { + return Err("custom asset name must be non-empty".to_string()); + } + Ok(NamedAsset { + name: name.to_string(), + path: PathBuf::from(path), + }) + } +} + +/// `bendl` CLI entry point. +#[derive(Parser, Debug)] +#[command( + name = "bendl", + about = "Create, inspect, extract from, and append to .bendl bundle files.", + version +)] +pub(super) struct Args { + /// Enable verbose tracing output. + #[arg(short, long, global = true)] + pub verbose: bool, + + #[command(subcommand)] + pub command: Command, +} + +#[derive(Subcommand, Debug)] +pub(super) enum Command { + /// Package a `.ben` or `.xben` stream (plus optional assets) into a `.bendl`. + Create(CreateArgs), + /// Print the header and directory of a `.bendl` file. + Inspect(InspectArgs), + /// Extract the embedded stream or a named asset to a file. + Extract(ExtractArgs), + /// Append new assets to an already-finalized `.bendl` bundle. + Append(AppendArgs), +} + +#[derive(Parser, Debug)] +pub(super) struct CreateArgs { + /// Path to the `.ben` or `.xben` assignment stream to embed. + /// File extension chooses the container format. + #[arg(short = 'i', long)] + pub input: PathBuf, + /// Destination `.bendl` path. + #[arg(short = 'o', long)] + pub output: PathBuf, + /// Optional `graph.json` asset path. Will be stored under the + /// canonical name `graph.json` and xz-compressed by default. + #[arg(long)] + pub graph: Option, + /// Optional `metadata.json` asset path. Stored under canonical name. + #[arg(long)] + pub metadata: Option, + /// Optional `relabel_map.json` asset path. Stored under canonical name. + #[arg(long)] + pub relabel_map: Option, + /// Additional custom assets, specified as `NAME=PATH`. May be repeated. + #[arg(long = "asset")] + pub assets: Vec, + /// Overwrite the output file if it already exists. + #[arg(short = 'w', long)] + pub overwrite: bool, + /// Store `graph.json` raw instead of compressing it. + #[arg(long)] + pub graph_raw: bool, +} + +#[derive(Parser, Debug)] +pub(super) struct InspectArgs { + /// `.bendl` file to inspect. + pub input: PathBuf, +} + +#[derive(Parser, Debug)] +pub(super) struct ExtractArgs { + /// `.bendl` file to extract from. + pub input: PathBuf, + /// Output file path for the extracted bytes. + #[arg(short = 'o', long)] + pub output: PathBuf, + /// Extract the embedded assignment stream region verbatim. Mutually + /// exclusive with `--asset`. + #[arg(long, conflicts_with = "asset")] + pub stream: bool, + /// Name of the asset to extract (e.g. `graph.json`). If the asset is + /// xz-compressed, the extracted file contains the decompressed bytes. + #[arg(long)] + pub asset: Option, + /// Overwrite the output file if it already exists. + #[arg(short = 'w', long)] + pub overwrite: bool, +} + +#[derive(Parser, Debug)] +pub(super) struct AppendArgs { + /// `.bendl` file to append to. Must be finalized (`complete == 1`). + pub input: PathBuf, + /// Optional `graph.json` asset path to add. + #[arg(long)] + pub graph: Option, + /// Optional `metadata.json` asset path to add. + #[arg(long)] + pub metadata: Option, + /// Optional `relabel_map.json` asset path to add. + #[arg(long)] + pub relabel_map: Option, + /// Additional custom assets, specified as `NAME=PATH`. May be repeated. + #[arg(long = "asset")] + pub assets: Vec, + /// Store `graph.json` raw instead of compressing it. + #[arg(long)] + pub graph_raw: bool, +} diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs new file mode 100644 index 0000000..7709680 --- /dev/null +++ b/ben/src/cli/bendl/create.rs @@ -0,0 +1,93 @@ +use super::args::{CreateArgs, NamedAsset}; +use super::helpers::{add_file_asset, format_from_path, mode_str}; +use crate::cli::common::check_overwrite; +use crate::io::bundle::format::{ + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, +}; +use crate::io::bundle::{AddAssetOptions, BendlWriter}; +use crate::io::reader::subsample::count_samples_from_file; +use std::fs::File; +use std::io::{self, BufReader}; + +pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { + let format = format_from_path(&args.input)?; + check_overwrite( + args.output.to_str().ok_or("non-utf8 output path")?, + args.overwrite, + ) + .map_err(|e| format!("{e}"))?; + + // Count samples up front so we can patch the header at finalize time. + // This pre-scan is O(stream size); the second pass streams bytes directly. + let sample_count: i64 = count_samples_from_file(&args.input, mode_str(format)) + .map_err(|e| format!("failed to count samples in {:?}: {e}", args.input))? + as i64; + + let out_file = File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?; + let mut writer = BendlWriter::new(out_file, format) + .map_err(|e| format!("failed to initialize bundle writer: {e}"))?; + + // Add singleton assets first, in canonical order. + if let Some(ref path) = args.metadata { + add_file_asset( + &mut writer, + ASSET_TYPE_METADATA, + "metadata.json", + path, + AddAssetOptions::defaults().json(), + )?; + } + if let Some(ref path) = args.graph { + let opts = if args.graph_raw { + AddAssetOptions::defaults().json().raw() + } else { + AddAssetOptions::defaults().json() + }; + add_file_asset(&mut writer, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + } + if let Some(ref path) = args.relabel_map { + add_file_asset( + &mut writer, + ASSET_TYPE_RELABEL_MAP, + "relabel_map.json", + path, + AddAssetOptions::defaults().json(), + )?; + } + for NamedAsset { name, path } in &args.assets { + add_file_asset( + &mut writer, + ASSET_TYPE_CUSTOM, + name, + path, + AddAssetOptions::defaults(), + )?; + } + + // Stream phase: copy bytes from the input file directly into the + // bundle's stream region. This preserves the exact BEN/XBEN bytes. + { + let mut handle = writer + .begin_stream() + .map_err(|e| format!("failed to open stream region: {e}"))?; + let mut input = BufReader::new( + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?, + ); + io::copy(&mut input, &mut handle) + .map_err(|e| format!("failed to copy assignment stream: {e}"))?; + handle + .finish(sample_count) + .map_err(|e| format!("failed to close stream region: {e}"))?; + } + + writer + .finish() + .map_err(|e| format!("failed to finalize bundle: {e}"))?; + + eprintln!( + "Wrote {:?} ({} samples, format = {:?})", + args.output, sample_count, format + ); + Ok(()) +} diff --git a/ben/src/cli/bendl/extract.rs b/ben/src/cli/bendl/extract.rs new file mode 100644 index 0000000..1a0b73f --- /dev/null +++ b/ben/src/cli/bendl/extract.rs @@ -0,0 +1,47 @@ +use super::args::ExtractArgs; +use crate::cli::common::check_overwrite; +use crate::io::bundle::BendlReader; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, Write}; + +pub(super) fn run_extract(args: ExtractArgs) -> Result<(), String> { + if !args.stream && args.asset.is_none() { + return Err("extract requires either --stream or --asset ".to_string()); + } + check_overwrite( + args.output.to_str().ok_or("non-utf8 output path")?, + args.overwrite, + ) + .map_err(|e| format!("{e}"))?; + + let file = + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let mut reader = BendlReader::open(BufReader::new(file)) + .map_err(|e| format!("failed to parse bundle header: {e}"))?; + + let mut out = BufWriter::new( + File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, + ); + + if args.stream { + let mut stream = reader + .assignment_stream_reader() + .map_err(|e| format!("failed to open stream region: {e}"))?; + io::copy(&mut stream, &mut out).map_err(|e| format!("failed to copy stream bytes: {e}"))?; + } else { + // asset is Some — validated by the early return above. + let name = args.asset.unwrap(); + let entry = reader + .find_asset_by_name(&name) + .cloned() + .ok_or_else(|| format!("no asset named {name:?} in bundle"))?; + let mut asset = reader + .asset_reader(&entry) + .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; + io::copy(&mut asset, &mut out).map_err(|e| format!("failed to copy asset bytes: {e}"))?; + } + + out.flush().map_err(|e| format!("flush failed: {e}"))?; + Ok(()) +} diff --git a/ben/src/cli/bendl/helpers.rs b/ben/src/cli/bendl/helpers.rs new file mode 100644 index 0000000..b4b2577 --- /dev/null +++ b/ben/src/cli/bendl/helpers.rs @@ -0,0 +1,53 @@ +use crate::io::bundle::format::AssignmentFormat; +use crate::io::bundle::writer::BendlAppender; +use crate::io::bundle::{AddAssetOptions, BendlWriteError, BendlWriter}; +use std::io::{Read, Seek, Write}; +use std::path::Path; + +/// Detect the container format of `path` from its extension. +pub(super) fn format_from_path(path: &Path) -> Result { + match path.extension().and_then(|e| e.to_str()) { + Some("ben") => Ok(AssignmentFormat::Ben), + Some("xben") => Ok(AssignmentFormat::Xben), + other => Err(format!( + "unable to determine assignment format from extension {other:?}; \ + expected .ben or .xben" + )), + } +} + +/// `mode` argument expected by `count_samples_from_file`. +pub(super) fn mode_str(format: AssignmentFormat) -> &'static str { + match format { + AssignmentFormat::Ben => "ben", + AssignmentFormat::Xben => "xben", + } +} + +pub(super) fn add_file_asset( + writer: &mut BendlWriter, + asset_type: u16, + name: &str, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + writer + .add_asset(asset_type, name, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} + +pub(super) fn append_file_asset< + W: Read + Write + Seek + crate::io::bundle::writer::BendlTruncate, +>( + appender: &mut BendlAppender, + asset_type: u16, + name: &str, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + appender + .add_asset(asset_type, name, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} diff --git a/ben/src/cli/bendl/inspect.rs b/ben/src/cli/bendl/inspect.rs new file mode 100644 index 0000000..c66bbca --- /dev/null +++ b/ben/src/cli/bendl/inspect.rs @@ -0,0 +1,71 @@ +use super::args::InspectArgs; +use crate::io::bundle::format::{ + AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, +}; +use crate::io::bundle::BendlReader; +use std::fs::File; +use std::io::BufReader; + +pub(super) fn run_inspect(args: InspectArgs) -> Result<(), String> { + let file = + File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?; + let reader = BendlReader::open(BufReader::new(file)) + .map_err(|e| format!("failed to parse bundle header: {e}"))?; + + let header = reader.header(); + println!("file: {}", args.input.display()); + println!( + "version: {}.{}", + header.major_version, header.minor_version + ); + println!("complete: {}", reader.is_complete()); + println!( + "assignment_format: {}", + match reader.assignment_format() { + Some(AssignmentFormat::Ben) => "ben", + Some(AssignmentFormat::Xben) => "xben", + None => "unknown", + } + ); + println!( + "sample_count: {}", + match reader.sample_count() { + Some(n) => n.to_string(), + None => "".to_string(), + } + ); + println!( + "stream: offset={} len={}", + header.stream_offset, header.stream_len + ); + println!( + "directory: offset={} len={}", + header.directory_offset, header.directory_len + ); + + let entries = reader.assets(); + println!("assets: {} entries", entries.len()); + for entry in entries { + let mut flag_parts: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flag_parts.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flag_parts.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flag_parts.push("checksum"); + } + let flag_str = if flag_parts.is_empty() { + "-".to_string() + } else { + flag_parts.join(",") + }; + println!( + " type={:<4} name={:<24} offset={:<10} len={:<10} flags={}", + entry.asset_type, entry.name, entry.payload_offset, entry.payload_len, flag_str + ); + } + + Ok(()) +} diff --git a/ben/src/cli/bendl/mod.rs b/ben/src/cli/bendl/mod.rs new file mode 100644 index 0000000..11a1419 --- /dev/null +++ b/ben/src/cli/bendl/mod.rs @@ -0,0 +1,48 @@ +//! CLI front-end for the `.bendl` bundle container. +//! +//! Exposes four subcommands: +//! +//! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional +//! asset files into a finalized `.bendl` bundle. +//! - `inspect` — print the header and directory of a `.bendl` file. +//! - `extract` — copy the embedded stream region or a named asset out +//! of a bundle to disk. +//! - `append` — add new asset files to an already-finalized bundle +//! without rewriting the stream. + +mod append; +mod args; +mod create; +mod extract; +mod helpers; +mod inspect; + +#[cfg(test)] +mod tests; + +use append::run_append; +use args::{Args, Command}; +use create::run_create; +use extract::run_extract; +use inspect::run_inspect; + +use crate::cli::common::set_verbose; +use clap::Parser; + +/// Parse CLI arguments and execute the selected subcommand. +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + let result = match args.command { + Command::Create(a) => run_create(a), + Command::Inspect(a) => run_inspect(a), + Command::Extract(a) => run_extract(a), + Command::Append(a) => run_append(a), + }; + + if let Err(err) = result { + eprintln!("Error: {err}"); + std::process::exit(1); + } +} diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs new file mode 100644 index 0000000..8d231b4 --- /dev/null +++ b/ben/src/cli/bendl/tests.rs @@ -0,0 +1,478 @@ +use super::append::run_append; +use super::args::{AppendArgs, CreateArgs, ExtractArgs, InspectArgs, NamedAsset}; +use super::create::run_create; +use super::extract::run_extract; +use super::helpers::{format_from_path, mode_str}; +use super::inspect::run_inspect; +use crate::codec::encode::encode_jsonl_to_ben; +use crate::io::bundle::format::AssignmentFormat; +use crate::io::bundle::{BendlReader, BendlWriter}; +use clap::Parser; +use std::io::{BufReader, Cursor}; +use std::path::PathBuf; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn unique_path(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("bendl-cli-{name}-{nonce}")) +} + +/// Write a minimal finalized .bendl file and return its path. +fn write_temp_bendl(name: &str, format: AssignmentFormat) -> PathBuf { + let path = unique_path(name); + let stream = b"STANDARD BEN FILE\x00fake"; + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); + writer.write_stream_bytes(stream, 1).unwrap(); + writer.finish().unwrap(); + std::fs::write(&path, &buf).unwrap(); + path +} + +#[test] +fn write_temp_bendl_xben_variant_works() { + // Exercises the Xben branch of write_temp_bendl. + let path = write_temp_bendl("xben_helper_check.bendl", AssignmentFormat::Xben); + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&path).unwrap())).unwrap(); + assert!(reader.is_complete()); + let _ = std::fs::remove_file(&path); +} + +#[test] +fn named_asset_from_str_rejects_empty_name() { + let err = "=path/to/file".parse::().unwrap_err(); + assert!(err.contains("non-empty")); +} + +#[test] +fn format_from_path_detects_xben() { + let fmt = format_from_path(std::path::Path::new("stream.xben")).unwrap(); + assert_eq!(fmt, AssignmentFormat::Xben); +} + +#[test] +fn format_from_path_rejects_unknown_extension() { + let err = format_from_path(std::path::Path::new("archive.tar")).unwrap_err(); + assert!(err.contains("expected .ben or .xben")); +} + +#[test] +fn mode_str_returns_xben_for_xben() { + assert_eq!(mode_str(AssignmentFormat::Xben), "xben"); +} + +#[test] +fn run_create_with_relabel_map_and_custom_asset() { + let ben = { + // Must end in .ben so format_from_path recognises it. + let p = std::env::temp_dir().join(format!( + "bendl-create-relabel-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let relabel = unique_path("create_relabel_map.json"); + std::fs::write(&relabel, b"{\"0\":1,\"1\":0}").unwrap(); + let custom = unique_path("create_custom.bin"); + std::fs::write(&custom, b"custom bytes").unwrap(); + let out = unique_path("create_with_assets.bendl"); + + let asset_str = format!("myblob={}", custom.display()); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: Some(relabel.clone()), + assets: vec![asset_str.parse().unwrap()], + overwrite: false, + graph_raw: false, + }; + run_create(args).unwrap(); + + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + assert!(reader.find_asset_by_name("myblob").is_some()); + + for p in [&ben, &relabel, &custom, &out] { + let _ = std::fs::remove_file(p); + } +} + +#[test] +fn run_inspect_xben_format_and_checksum_flag() { + use crate::io::bundle::format::ASSET_TYPE_CUSTOM; + use crate::io::bundle::AddAssetOptions; + + // Build a .bendl with a checksum asset so the flag_parts checksum + // branch is exercised. + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Xben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "checksummed", + b"data", + AddAssetOptions { + checksum: Some(vec![0xAB, 0xCD]), + ..AddAssetOptions::defaults() + }, + ) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + let path = unique_path("inspect_xben.bendl"); + std::fs::write(&path, &buf).unwrap(); + + run_inspect(InspectArgs { + input: path.clone(), + }) + .unwrap(); + let _ = std::fs::remove_file(&path); +} + +#[test] +fn run_append_no_assets_is_noop() { + let bendl = write_temp_bendl("append_noop.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![], + graph_raw: false, + }; + run_append(args).unwrap(); + // File should be unchanged (bundle is still valid). + let reader = + BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + assert!(reader.is_complete()); + let _ = std::fs::remove_file(&bendl); +} + +#[test] +fn run_append_with_metadata_and_relabel_map() { + let bendl = write_temp_bendl("append_assets.bendl", AssignmentFormat::Ben); + let meta = unique_path("append_meta.json"); + std::fs::write(&meta, b"{\"version\":1}").unwrap(); + let relabel = unique_path("append_relabel.json"); + std::fs::write(&relabel, b"{\"0\":1}").unwrap(); + + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: Some(meta.clone()), + relabel_map: Some(relabel.clone()), + assets: vec![], + graph_raw: false, + }; + run_append(args).unwrap(); + + let reader = + BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + + for p in [&bendl, &meta, &relabel] { + let _ = std::fs::remove_file(p); + } +} + +#[test] +fn run_create_with_graph_raw_flag() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-create-raw-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let graph = unique_path("create_raw_graph.json"); + std::fs::write(&graph, b"{\"nodes\":[0,1]}").unwrap(); + let out = unique_path("create_raw.bendl"); + + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: Some(graph.clone()), + metadata: None, + relabel_map: None, + assets: vec![], + overwrite: false, + graph_raw: true, + }; + run_create(args).unwrap(); + + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + for p in [&ben, &graph, &out] { + let _ = std::fs::remove_file(p); + } +} + +#[test] +fn run_inspect_unknown_format_and_no_sample_count() { + use crate::io::bundle::format::{ + BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, HEADER_SIZE, + }; + + // Build a header with an unknown assignment format byte and + // complete=0 so sample_count() returns None. + let mut header = [0u8; HEADER_SIZE]; + header[0..8].copy_from_slice(&BENDL_MAGIC); + header[8..10].copy_from_slice(&BENDL_MAJOR_VERSION.to_le_bytes()); + header[10..12].copy_from_slice(&BENDL_MINOR_VERSION.to_le_bytes()); + header[12] = COMPLETE_NO; + header[13] = 0xFF; // unknown format byte + // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 + let stream_offset = HEADER_SIZE as u64; + header[40..48].copy_from_slice(&stream_offset.to_le_bytes()); + let sample_count: i64 = -1; + header[56..64].copy_from_slice(&sample_count.to_le_bytes()); + + let path = unique_path("inspect_unknown.bendl"); + std::fs::write(&path, &header).unwrap(); + run_inspect(InspectArgs { + input: path.clone(), + }) + .unwrap(); + let _ = std::fs::remove_file(&path); +} + +#[test] +fn run_append_with_graph_raw_and_graph_asset() { + let bendl = write_temp_bendl("append_graph_raw.bendl", AssignmentFormat::Ben); + let graph = unique_path("append_graph_raw.json"); + std::fs::write(&graph, b"{\"nodes\":[0,1,2]}").unwrap(); + + let args = AppendArgs { + input: bendl.clone(), + graph: Some(graph.clone()), + metadata: None, + relabel_map: None, + assets: vec![], + graph_raw: true, + }; + run_append(args).unwrap(); + + let reader = + BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("graph.json").is_some()); + + for p in [&bendl, &graph] { + let _ = std::fs::remove_file(p); + } +} + +#[test] +fn run_extract_rejects_missing_stream_and_asset() { + let args = ExtractArgs::try_parse_from([ + "extract", + "--output", + "/tmp/out.bin", + "bundle.bendl", + ]) + .unwrap(); + let err = run_extract(args).unwrap_err(); + assert!(err.contains("either --stream or --asset")); +} + +#[test] +fn run_create_errors_on_missing_metadata_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-meta-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let jsonl = b"{\"assignment\":[1],\"sample\":1}\n"; + let mut b = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut b, crate::BenVariant::Standard).unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_meta.bendl"); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: Some(unique_path("nonexistent_meta.json")), + relabel_map: None, + assets: vec![], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_create_errors_on_missing_relabel_map_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-relabel-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let mut b = Vec::new(); + encode_jsonl_to_ben( + Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), + &mut b, + crate::BenVariant::Standard, + ) + .unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_relabel.bendl"); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: Some(unique_path("nonexistent_relabel.json")), + assets: vec![], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_create_errors_on_missing_custom_asset_file() { + let ben = { + let p = std::env::temp_dir().join(format!( + "bendl-err-custom-{}.ben", + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + )); + let mut b = Vec::new(); + encode_jsonl_to_ben( + Cursor::new(b"{\"assignment\":[1],\"sample\":1}\n"), + &mut b, + crate::BenVariant::Standard, + ) + .unwrap(); + std::fs::write(&p, &b).unwrap(); + p + }; + let out = unique_path("err_custom.bendl"); + let nonexistent: PathBuf = unique_path("nonexistent.bin"); + let asset_str = format!("myasset={}", nonexistent.display()); + let args = CreateArgs { + input: ben.clone(), + output: out.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![asset_str.parse().unwrap()], + overwrite: false, + graph_raw: false, + }; + let err = run_create(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&ben); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_extract_asset_by_name() { + use crate::io::bundle::format::ASSET_TYPE_CUSTOM; + use crate::io::bundle::AddAssetOptions; + + // Build a bundle with a named asset then extract it. + let mut buf: Vec = Vec::new(); + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "hello.txt", + b"world", + AddAssetOptions::defaults(), + ) + .unwrap(); + writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + writer.finish().unwrap(); + let bendl = unique_path("extract_asset.bendl"); + std::fs::write(&bendl, &buf).unwrap(); + + let out = unique_path("extract_asset_out.txt"); + let args = ExtractArgs::try_parse_from([ + "extract", + "--asset", + "hello.txt", + "--output", + out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + run_extract(args).unwrap(); + assert_eq!(std::fs::read(&out).unwrap(), b"world"); + + let _ = std::fs::remove_file(&bendl); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_append_errors_on_missing_metadata_file() { + let bendl = write_temp_bendl("append_err_meta.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: Some(unique_path("nonexistent_meta.json")), + relabel_map: None, + assets: vec![], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); +} + +#[test] +fn run_append_errors_on_missing_relabel_map_file() { + let bendl = write_temp_bendl("append_err_relabel.bendl", AssignmentFormat::Ben); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: Some(unique_path("nonexistent_relabel.json")), + assets: vec![], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); +} + +#[test] +fn run_append_errors_on_missing_custom_asset_file() { + let bendl = write_temp_bendl("append_err_custom.bendl", AssignmentFormat::Ben); + let nonexistent = unique_path("nonexistent_custom.bin"); + let asset_str = format!("myasset={}", nonexistent.display()); + let args = AppendArgs { + input: bendl.clone(), + graph: None, + metadata: None, + relabel_map: None, + assets: vec![asset_str.parse().unwrap()], + graph_raw: false, + }; + let err = run_append(args).unwrap_err(); + assert!(err.contains("failed to read")); + let _ = std::fs::remove_file(&bendl); +} diff --git a/ben/src/cli/common.rs b/ben/src/cli/common.rs deleted file mode 100644 index 356c97f..0000000 --- a/ben/src/cli/common.rs +++ /dev/null @@ -1,113 +0,0 @@ -use std::io::{self, Result}; -use std::path::Path; - -/// Configure tracing for CLI execution. -/// -/// When `verbose` is set and the user has not already provided `RUST_LOG`, the -/// default log filter is elevated to `trace`. The tracing subscriber is then -/// initialized exactly once for the process. -/// -/// # Arguments -/// -/// * `verbose` - Whether verbose trace logging should be enabled by default. -/// -/// # Returns -/// -/// This function does not return a value. -pub fn set_verbose(verbose: bool) { - if verbose && std::env::var_os("RUST_LOG").is_none() { - std::env::set_var("RUST_LOG", "trace"); - } - crate::logging::init_logging(); -} - -/// Confirm whether an existing output path may be overwritten. -/// -/// If `overwrite` is `false` and the destination already exists, the user is -/// prompted on stdin. An `AlreadyExists` error is returned when the user -/// declines. -/// -/// # Arguments -/// -/// * `file_name` - The candidate output path. -/// * `overwrite` - Whether to skip the interactive overwrite prompt. -/// -/// # Returns -/// -/// Returns `Ok(())` when the output path may be used. -pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { - if Path::new(file_name).exists() && !overwrite { - eprint!( - "File {:?} already exists, do you want to overwrite it? (y/[n]): ", - file_name - ); - let mut user_input = String::new(); - io::stdin().read_line(&mut user_input).unwrap(); - eprintln!(); - if user_input.trim().to_lowercase() != "y" { - return Err(io::Error::from(io::ErrorKind::AlreadyExists)); - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs; - use std::sync::{Mutex, OnceLock}; - use std::time::{SystemTime, UNIX_EPOCH}; - - fn env_lock() -> &'static Mutex<()> { - static LOCK: OnceLock> = OnceLock::new(); - LOCK.get_or_init(|| Mutex::new(())) - } - - fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("ben-cli-common-{name}-{nonce}")) - } - - #[test] - fn set_verbose_sets_rust_log() { - let _guard = env_lock().lock().unwrap(); - std::env::remove_var("RUST_LOG"); - set_verbose(true); - assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("trace")); - } - - #[test] - fn set_verbose_preserves_existing_log_level() { - let _guard = env_lock().lock().unwrap(); - std::env::set_var("RUST_LOG", "debug"); - set_verbose(true); - assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("debug")); - std::env::remove_var("RUST_LOG"); - } - - #[test] - fn set_verbose_initializes_logger_without_setting_trace() { - let _guard = env_lock().lock().unwrap(); - std::env::remove_var("RUST_LOG"); - set_verbose(false); - assert!(std::env::var("RUST_LOG").is_err()); - } - - #[test] - fn check_overwrite_allows_missing_file() { - let path = unique_path("missing.txt"); - assert!(!path.exists()); - check_overwrite(path.to_str().unwrap(), false).unwrap(); - } - - #[test] - fn check_overwrite_allows_existing_file_when_forced() { - let path = unique_path("existing.txt"); - fs::write(&path, "hello").unwrap(); - check_overwrite(path.to_str().unwrap(), true).unwrap(); - fs::remove_file(path).unwrap(); - } -} diff --git a/ben/src/cli/common/mod.rs b/ben/src/cli/common/mod.rs new file mode 100644 index 0000000..0f17d10 --- /dev/null +++ b/ben/src/cli/common/mod.rs @@ -0,0 +1,55 @@ +use std::io::{self, Result}; +use std::path::Path; + +/// Configure tracing for CLI execution. +/// +/// When `verbose` is set and the user has not already provided `RUST_LOG`, the +/// default log filter is elevated to `trace`. The tracing subscriber is then +/// initialized exactly once for the process. +/// +/// # Arguments +/// +/// * `verbose` - Whether verbose trace logging should be enabled by default. +/// +/// # Returns +/// +/// This function does not return a value. +pub fn set_verbose(verbose: bool) { + if verbose && std::env::var_os("RUST_LOG").is_none() { + std::env::set_var("RUST_LOG", "trace"); + } + crate::logging::init_logging(); +} + +/// Confirm whether an existing output path may be overwritten. +/// +/// If `overwrite` is `false` and the destination already exists, the user is +/// prompted on stdin. An `AlreadyExists` error is returned when the user +/// declines. +/// +/// # Arguments +/// +/// * `file_name` - The candidate output path. +/// * `overwrite` - Whether to skip the interactive overwrite prompt. +/// +/// # Returns +/// +/// Returns `Ok(())` when the output path may be used. +pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { + if Path::new(file_name).exists() && !overwrite { + eprint!( + "File {:?} already exists, do you want to overwrite it? (y/[n]): ", + file_name + ); + let mut user_input = String::new(); + io::stdin().read_line(&mut user_input).unwrap(); + eprintln!(); + if user_input.trim().to_lowercase() != "y" { + return Err(io::Error::from(io::ErrorKind::AlreadyExists)); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests; diff --git a/ben/src/cli/common/tests.rs b/ben/src/cli/common/tests.rs new file mode 100644 index 0000000..d5b1d03 --- /dev/null +++ b/ben/src/cli/common/tests.rs @@ -0,0 +1,57 @@ +use super::*; +use std::fs; +use std::sync::{Mutex, OnceLock}; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn env_lock() -> &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) +} + +fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("ben-cli-common-{name}-{nonce}")) +} + +#[test] +fn set_verbose_sets_rust_log() { + let _guard = env_lock().lock().unwrap(); + std::env::remove_var("RUST_LOG"); + set_verbose(true); + assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("trace")); +} + +#[test] +fn set_verbose_preserves_existing_log_level() { + let _guard = env_lock().lock().unwrap(); + std::env::set_var("RUST_LOG", "debug"); + set_verbose(true); + assert_eq!(std::env::var("RUST_LOG").as_deref(), Ok("debug")); + std::env::remove_var("RUST_LOG"); +} + +#[test] +fn set_verbose_initializes_logger_without_setting_trace() { + let _guard = env_lock().lock().unwrap(); + std::env::remove_var("RUST_LOG"); + set_verbose(false); + assert!(std::env::var("RUST_LOG").is_err()); +} + +#[test] +fn check_overwrite_allows_missing_file() { + let path = unique_path("missing.txt"); + assert!(!path.exists()); + check_overwrite(path.to_str().unwrap(), false).unwrap(); +} + +#[test] +fn check_overwrite_allows_existing_file_when_forced() { + let path = unique_path("existing.txt"); + fs::write(&path, "hello").unwrap(); + check_overwrite(path.to_str().unwrap(), true).unwrap(); + fs::remove_file(path).unwrap(); +} diff --git a/ben/src/cli/pben.rs b/ben/src/cli/pben/mod.rs similarity index 63% rename from ben/src/cli/pben.rs rename to ben/src/cli/pben/mod.rs index 8b7948d..d8ec616 100644 --- a/ben/src/cli/pben.rs +++ b/ben/src/cli/pben/mod.rs @@ -254,148 +254,4 @@ fn assignment_encode_xben(reader: R, writer: W) -> } #[cfg(test)] -mod tests { - use super::*; - use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; - use crate::codec::encode::encode_jsonl_to_ben; - use clap::{CommandFactory, Parser}; - use std::io::{BufReader, Cursor}; - - #[test] - fn clap_metadata_uses_package_version() { - let mut command = Args::command(); - let help = command.render_long_help().to_string(); - - assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); - assert!(help.contains("PCOMPRESS")); - assert!(help.contains("--mode")); - } - - #[test] - fn parse_pc_to_xben_args() { - let args = Args::try_parse_from([ - "pben", - "--mode", - "pc-to-xben", - "--input-file", - "input.pc", - "--output-file", - "output.xben", - "--verbose", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::PcToXben); - assert_eq!(args.input_file.as_deref(), Some("input.pc")); - assert_eq!(args.output_file.as_deref(), Some("output.xben")); - assert!(args.verbose); - } - - #[test] - fn derive_output_path_replaces_expected_suffixes() { - assert_eq!( - derive_output_path(Mode::BenToPc, "plans.ben"), - "plans.pcompress" - ); - assert_eq!( - derive_output_path(Mode::PcToBen, "plans.pcompress"), - "plans.ben" - ); - assert_eq!(derive_output_path(Mode::PcToXben, "plans.pc"), "plans.xben"); - } - - #[test] - fn assignment_decode_ben_writes_json_lines() { - let jsonl = br#"{"assignment":[1,1,2],"sample":1} -{"assignment":[2,3,3],"sample":2} -"#; - let mut ben = Vec::new(); - encode_jsonl_to_ben(BufReader::new(&jsonl[..]), &mut ben, BenVariant::Standard).unwrap(); - - let mut out = Vec::new(); - assignment_decode_ben(Cursor::new(ben), &mut out).unwrap(); - - assert_eq!(String::from_utf8(out).unwrap(), "[0,0,1]\n[1,2,2]\n"); - } - - #[test] - fn assignment_encode_ben_offsets_values_and_writes_ben() { - let input = b"[0,0,1]\n[1,1,2]\n"; - let mut ben = Vec::new(); - assignment_encode_ben(BufReader::new(&input[..]), &mut ben).unwrap(); - - let mut out = Vec::new(); - decode_ben_to_jsonl(Cursor::new(ben), &mut out).unwrap(); - - let rendered = String::from_utf8(out).unwrap(); - assert!(rendered.contains(r#""assignment":[1,1,2]"#)); - assert!(rendered.contains(r#""assignment":[2,2,3]"#)); - } - - #[test] - fn resolved_output_path_returns_none_when_both_paths_absent() { - // When neither output_file nor input_file is given, stdout mode: Ok(None). - let result = resolved_output_path(Mode::BenToPc, None, None, false).unwrap(); - assert!(result.is_none()); - } - - #[test] - fn assignment_decode_ben_propagates_read_error() { - // assignment_decode_ben propagates I/O errors from the BEN reader. - struct AlwaysErrors; - impl io::Read for AlwaysErrors { - fn read(&mut self, _: &mut [u8]) -> io::Result { - Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) - } - } - let mut out = Vec::new(); - let err = assignment_decode_ben(AlwaysErrors, &mut out).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); - } - - #[test] - fn assignment_encode_xben_offsets_values_and_writes_xben() { - let input = b"[0,1,1]\n[2,2,0]\n"; - - let mut xben = Vec::new(); - assignment_encode_xben(BufReader::new(&input[..]), &mut xben).unwrap(); - - let mut out = Vec::new(); - decode_xben_to_jsonl(Cursor::new(xben), &mut out).unwrap(); - - let rendered = String::from_utf8(out).unwrap(); - assert!(rendered.contains(r#""assignment":[1,2,2]"#)); - assert!(rendered.contains(r#""assignment":[3,3,1]"#)); - } - - #[test] - fn assignment_decode_ben_iterator_error_propagates() { - // Provides a valid BEN banner so AssignmentReader::new succeeds, - // then returns a non-EOF error on the next read so the iterator - // fires the Err(e) => return Err(e) arm (line 204). - use std::io::Read; - use crate::format::banners::STANDARD_BEN_BANNER; - - struct BannerThenError { - banner: &'static [u8], - pos: usize, - } - impl Read for BannerThenError { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.pos < self.banner.len() { - let n = buf.len().min(self.banner.len() - self.pos); - buf[..n].copy_from_slice(&self.banner[self.pos..self.pos + n]); - self.pos += n; - Ok(n) - } else { - Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) - } - } - } - - let reader = BannerThenError { banner: STANDARD_BEN_BANNER, pos: 0 }; - let mut out = Vec::new(); - let err = assignment_decode_ben(reader, &mut out).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); - } -} +mod tests; diff --git a/ben/src/cli/pben/tests.rs b/ben/src/cli/pben/tests.rs new file mode 100644 index 0000000..5ce5b17 --- /dev/null +++ b/ben/src/cli/pben/tests.rs @@ -0,0 +1,143 @@ +use super::*; +use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; +use crate::codec::encode::encode_jsonl_to_ben; +use clap::{CommandFactory, Parser}; +use std::io::{BufReader, Cursor}; + +#[test] +fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("PCOMPRESS")); + assert!(help.contains("--mode")); +} + +#[test] +fn parse_pc_to_xben_args() { + let args = Args::try_parse_from([ + "pben", + "--mode", + "pc-to-xben", + "--input-file", + "input.pc", + "--output-file", + "output.xben", + "--verbose", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::PcToXben); + assert_eq!(args.input_file.as_deref(), Some("input.pc")); + assert_eq!(args.output_file.as_deref(), Some("output.xben")); + assert!(args.verbose); +} + +#[test] +fn derive_output_path_replaces_expected_suffixes() { + assert_eq!( + derive_output_path(Mode::BenToPc, "plans.ben"), + "plans.pcompress" + ); + assert_eq!( + derive_output_path(Mode::PcToBen, "plans.pcompress"), + "plans.ben" + ); + assert_eq!(derive_output_path(Mode::PcToXben, "plans.pc"), "plans.xben"); +} + +#[test] +fn assignment_decode_ben_writes_json_lines() { + let jsonl = br#"{"assignment":[1,1,2],"sample":1} +{"assignment":[2,3,3],"sample":2} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(&jsonl[..]), &mut ben, BenVariant::Standard).unwrap(); + + let mut out = Vec::new(); + assignment_decode_ben(Cursor::new(ben), &mut out).unwrap(); + + assert_eq!(String::from_utf8(out).unwrap(), "[0,0,1]\n[1,2,2]\n"); +} + +#[test] +fn assignment_encode_ben_offsets_values_and_writes_ben() { + let input = b"[0,0,1]\n[1,1,2]\n"; + let mut ben = Vec::new(); + assignment_encode_ben(BufReader::new(&input[..]), &mut ben).unwrap(); + + let mut out = Vec::new(); + decode_ben_to_jsonl(Cursor::new(ben), &mut out).unwrap(); + + let rendered = String::from_utf8(out).unwrap(); + assert!(rendered.contains(r#""assignment":[1,1,2]"#)); + assert!(rendered.contains(r#""assignment":[2,2,3]"#)); +} + +#[test] +fn resolved_output_path_returns_none_when_both_paths_absent() { + // When neither output_file nor input_file is given, stdout mode: Ok(None). + let result = resolved_output_path(Mode::BenToPc, None, None, false).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn assignment_decode_ben_propagates_read_error() { + // assignment_decode_ben propagates I/O errors from the BEN reader. + struct AlwaysErrors; + impl io::Read for AlwaysErrors { + fn read(&mut self, _: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + let mut out = Vec::new(); + let err = assignment_decode_ben(AlwaysErrors, &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +#[test] +fn assignment_encode_xben_offsets_values_and_writes_xben() { + let input = b"[0,1,1]\n[2,2,0]\n"; + + let mut xben = Vec::new(); + assignment_encode_xben(BufReader::new(&input[..]), &mut xben).unwrap(); + + let mut out = Vec::new(); + decode_xben_to_jsonl(Cursor::new(xben), &mut out).unwrap(); + + let rendered = String::from_utf8(out).unwrap(); + assert!(rendered.contains(r#""assignment":[1,2,2]"#)); + assert!(rendered.contains(r#""assignment":[3,3,1]"#)); +} + +#[test] +fn assignment_decode_ben_iterator_error_propagates() { + // Provides a valid BEN banner so AssignmentReader::new succeeds, + // then returns a non-EOF error on the next read so the iterator + // fires the Err(e) => return Err(e) arm (line 204). + use std::io::Read; + use crate::format::banners::STANDARD_BEN_BANNER; + + struct BannerThenError { + banner: &'static [u8], + pos: usize, + } + impl Read for BannerThenError { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos < self.banner.len() { + let n = buf.len().min(self.banner.len() - self.pos); + buf[..n].copy_from_slice(&self.banner[self.pos..self.pos + n]); + self.pos += n; + Ok(n) + } else { + Err(io::Error::new(io::ErrorKind::BrokenPipe, "broken")) + } + } + } + + let reader = BannerThenError { banner: STANDARD_BEN_BANNER, pos: 0 }; + let mut out = Vec::new(); + let err = assignment_decode_ben(reader, &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} diff --git a/ben/src/cli/reben.rs b/ben/src/cli/reben.rs deleted file mode 100644 index 93f05e1..0000000 --- a/ben/src/cli/reben.rs +++ /dev/null @@ -1,977 +0,0 @@ -use crate::cli::common::set_verbose; -use crate::{ - json::graph::{sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod}, - ops::relabel::{ - convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, - relabel_ben_file_as_variant_limit, relabel_ben_file_limit, relabel_ben_file_with_map, - relabel_ben_file_with_map_as_variant, relabel_ben_file_with_map_as_variant_limit, - relabel_ben_file_with_map_limit, - }, - BenVariant, -}; -use clap::{Parser, ValueEnum}; -use serde_json::{json, Value}; -use std::{ - collections::HashMap, - fs::File, - io::{BufReader, BufWriter, Write}, -}; - -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -/// Defines the mode of operation. -enum Mode { - /// Sort a JSON dual graph by a key and emit a relabeling map. - Json, - /// Relabel or canonicalize a BEN file. - Ben, -} - -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -/// Topology-based ordering methods for JSON graph relabeling. -enum OrderingMethod { - /// Recursive multilevel clustering based on local neighborhoods. - #[clap(alias = "mlc")] - MultiLevelCluster, - /// Reverse Cuthill-McKee ordering. - #[clap(alias = "rcm")] - ReverseCuthillMckee, -} - -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -/// BEN variants supported for BEN-mode output. -enum BenCliVariant { - Standard, - MkvChain, - #[clap(alias = "twodelta")] - TwoDelta, -} - -#[derive(Parser, Debug)] -#[command( - name = "Relabeling Binary Ensemble CLI Tool", - about = concat!( - "This is a command line tool for relabeling binary ensembles ", - "to help improve compression ratios for BEN and XBEN files." - ), - version -)] -/// Defines the command line arguments accepted by the program. -// TODO: Change the name of shape_file to dual_graph_file. -struct Args { - /// Input file to read from. - #[arg()] - input_file: String, - /// Output file to write to. - #[arg(short, long)] - output_file: Option, - /// Key to sort the JSON or BEN file by. - #[arg(short, long)] - key: Option, - /// Topology-based ordering method to use instead of a key sort. - #[arg(long, value_enum)] - ordering: Option, - /// Shape file to use for sorting the BEN file. Only needed - /// in BEN mode when a map is not provided. - #[arg(short, long)] - shape_file: Option, - /// Map file to use for relabeling the BEN file. - #[arg(short = 'p', long)] - map_file: Option, - /// Mode to run the program in (either JSON or BEN). - /// The JSON mode will sort a JSON file by a given key or graph-ordering - /// method. The BEN mode will relabel a BEN file according to a map file - /// or a graph-ordering request (which also requires a dual-graph file). If no - /// map file or key is provided, the BEN mode will canonicalize - /// the assignment vectors in the BEN file. - #[arg(short, long)] - mode: Mode, - /// Only relabel the first `n` expanded samples in BEN mode. - #[arg(long)] - n_items: Option, - /// BEN variant to use for the BEN-mode output file. - #[arg(long, value_enum)] - output_variant: Option, - /// Rewrite the BEN stream without canonicalizing or map relabeling. - #[arg(long)] - convert_only: bool, - /// Verbosity level for the program. - #[arg(short, long)] - verbose: bool, -} - -/// Parse CLI arguments and execute the selected `reben` mode. -pub fn run() { - let args = Args::parse(); - set_verbose(args.verbose); - - if let Err(err) = run_with_args(args) { - eprintln!("Error: {err}"); - std::process::exit(1); - } -} - -fn run_with_args(args: Args) -> Result<(), String> { - match args.mode.clone() { - Mode::Json => run_json_mode(args), - Mode::Ben => run_ben_mode(args), - } -} - -fn run_json_mode(args: Args) -> Result<(), String> { - if args.n_items.is_some() { - return Err("--n-items is only supported in BEN mode.".to_string()); - } - - let input_file = File::open(&args.input_file) - .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; - let reader = BufReader::new(input_file); - let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", label).as_str() - } - }; - - let output_file = File::create(&output_file_name) - .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; - let writer = BufWriter::new(output_file); - - let map = if let Some(key) = args.key.as_ref() { - sort_json_file_by_key(reader, writer, key) - } else { - let ordering = args - .ordering - .as_ref() - .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; - sort_json_file_by_ordering(reader, writer, to_graph_ordering(ordering)) - } - .map_err(|e| format!("Could not sort input graph: {e}"))?; - - let map_file_name = args.input_file.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", label).as_str() - + "_map.json"; - let map_file = File::create(&map_file_name) - .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": args.key.as_ref(), - "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; - Ok(()) -} - -fn run_ben_mode(args: Args) -> Result<(), String> { - if args.convert_only && args.output_variant.is_none() { - return Err("--convert-only requires --output-variant.".to_string()); - } - if args.convert_only - && (args.map_file.is_some() || args.key.is_some() || args.ordering.is_some()) - { - return Err("--convert-only cannot be combined with relabeling options.".to_string()); - } - - let input_file = File::open(&args.input_file) - .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; - let reader = BufReader::new(input_file); - let output_variant = args.output_variant.as_ref().map(to_ben_variant); - - if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { - if args.convert_only { - tracing::trace!("Converting BEN file to requested variant."); - } else { - tracing::trace!("Canonicalizing assignment vectors in ben file."); - } - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - if let Some(variant) = output_variant { - args.input_file.trim_end_matches(".ben").to_owned() - + format!("_{}.ben", ben_variant_name(variant)).as_str() - } else { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + "_canonicalized_assignments.jsonl.ben" - } - } - }; - - let output_file = File::create(&output_file_name) - .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; - let writer = BufWriter::new(output_file); - - if args.convert_only { - let variant = output_variant.expect("checked above"); - if let Some(limit) = args.n_items { - convert_ben_file_limit(reader, writer, variant, limit) - } else { - convert_ben_file(reader, writer, variant) - } - } else if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_as_variant_limit(reader, writer, variant, limit) - } else { - relabel_ben_file_as_variant(reader, writer, variant) - } - } else if let Some(limit) = args.n_items { - relabel_ben_file_limit(reader, writer, limit) - } else { - relabel_ben_file(reader, writer) - } - .map_err(|e| format!("BEN relabeling failed: {e}"))?; - return Ok(()); - } - - if args.map_file.is_some() && (args.key.is_some() || args.ordering.is_some()) { - return Err(concat!( - "Cannot provide both a map file and a sorting option. ", - "Please provide either the map file or the key/ordering and the ", - "(JSON formatted) dual-graph file needed to generate a map file." - ) - .to_string()); - } - - let mut map_file_name = String::new(); - if args.key.is_some() || args.ordering.is_some() { - let shape = args.shape_file.as_ref().ok_or_else(|| { - "No shape file provided to go with the requested ordering.".to_string() - })?; - let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; - tracing::trace!("Creating map file for ordering: {}", label); - - let output_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}.json", label).as_str(); - - let output_file = File::create(&output_file_name) - .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; - let writer = BufWriter::new(output_file); - - let shape_file = - File::open(shape).map_err(|e| format!("Could not open shape file {shape:?}: {e}"))?; - let shape_reader = BufReader::new(shape_file); - let map = if let Some(key) = args.key.as_ref() { - sort_json_file_by_key(shape_reader, writer, key) - } else { - let ordering = args - .ordering - .as_ref() - .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; - sort_json_file_by_ordering(shape_reader, writer, to_graph_ordering(ordering)) - } - .map_err(|e| format!("Could not sort shape file: {e}"))?; - - map_file_name = shape.trim_end_matches(".json").to_owned() - + format!("_sorted_by_{}", label).as_str() - + "_map.json"; - let map_file = File::create(&map_file_name) - .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; - let mut map_writer = BufWriter::new(map_file); - - let map_json = json!({ - "input_file": args.input_file, - "output_file": output_file_name, - "key": args.key.as_ref(), - "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map - }); - - map_writer - .write_all(map_json.to_string().as_bytes()) - .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; - } - - if map_file_name.is_empty() { - map_file_name = args - .map_file - .as_ref() - .ok_or_else(|| "Provide --map-file, --key, or --ordering in BEN mode.".to_string())? - .to_owned(); - } - - let (new_to_old_node_map, label) = read_relabel_map_file(&map_file_name)?; - - let output_file_name = match args.output_file { - Some(name) => name, - None => { - args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + format!("_sorted_by_{}.jsonl.ben", label).as_str() - } - }; - let output_file = File::create(&output_file_name) - .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; - let writer = BufWriter::new(output_file); - - tracing::trace!( - "Relabeling ben file according to map file {}", - map_file_name, - ); - - if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_with_map_as_variant_limit( - reader, - writer, - new_to_old_node_map, - variant, - limit, - ) - } else { - relabel_ben_file_with_map_as_variant(reader, writer, new_to_old_node_map, variant) - } - } else if let Some(limit) = args.n_items { - relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) - } else { - relabel_ben_file_with_map(reader, writer, new_to_old_node_map) - } - .map_err(|e| format!("BEN relabeling with map {map_file_name:?} failed: {e}"))?; - Ok(()) -} - -fn read_relabel_map_file(map_file_name: &str) -> Result<(HashMap, String), String> { - let map_file = File::open(map_file_name) - .map_err(|e| format!("Could not open map file {map_file_name:?}: {e}"))?; - let map_reader = BufReader::new(map_file); - - let data: Value = serde_json::from_reader(map_reader) - .map_err(|e| format!("Could not parse map file {map_file_name:?} as JSON: {e}"))?; - - let map_obj = data - .get("relabeling_old_to_new_nodes_map") - .and_then(Value::as_object) - .ok_or_else(|| { - format!( - "Map file {map_file_name:?} must contain object field \ - relabeling_old_to_new_nodes_map" - ) - })?; - - let mut new_to_old_node_map = HashMap::with_capacity(map_obj.len()); - for (old_idx_text, new_idx_value) in map_obj { - let old_idx = old_idx_text.parse::().map_err(|e| { - format!( - "Map file {map_file_name:?} contains invalid old node index {old_idx_text:?}: {e}" - ) - })?; - let new_idx = new_idx_value.as_u64().ok_or_else(|| { - format!( - "Map file {map_file_name:?} maps old node {old_idx} to non-integer value \ - {new_idx_value}" - ) - })? as usize; - new_to_old_node_map.insert(new_idx, old_idx); - } - - let label = data["key"] - .as_str() - .map(ToOwned::to_owned) - .or_else(|| data["ordering_method"].as_str().map(ToOwned::to_owned)) - .unwrap_or_else(|| "map".to_string()); - - Ok((new_to_old_node_map, label)) -} - -/// Convert a CLI ordering method variant to the library's graph ordering type. -/// -/// # Arguments -/// -/// * `ordering` - The CLI ordering method selected by the user. -/// -/// # Returns -/// -/// Returns the corresponding `GraphOrderingMethod`. -fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { - match ordering { - OrderingMethod::MultiLevelCluster => GraphOrderingMethod::MultiLevelCluster, - OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, - } -} - -/// Return the kebab-case display name for an ordering method. -/// -/// # Arguments -/// -/// * `ordering` - The CLI ordering method variant. -/// -/// # Returns -/// -/// Returns a static string identifying the ordering method. -fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { - match ordering { - OrderingMethod::MultiLevelCluster => "multi-level-cluster", - OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", - } -} - -/// Return the lowercase display name for a BEN variant. -/// -/// # Arguments -/// -/// * `variant` - The BEN variant to name. -/// -/// # Returns -/// -/// Returns a static string identifying the variant. -fn ben_variant_name(variant: BenVariant) -> &'static str { - match variant { - BenVariant::Standard => "standard", - BenVariant::MkvChain => "mkvchain", - BenVariant::TwoDelta => "twodelta", - } -} - -/// Convert a CLI BEN variant to the library's `BenVariant` type. -/// -/// # Arguments -/// -/// * `variant` - The CLI BEN variant selected by the user. -/// -/// # Returns -/// -/// Returns the corresponding `BenVariant`. -fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { - match variant { - BenCliVariant::Standard => BenVariant::Standard, - BenCliVariant::MkvChain => BenVariant::MkvChain, - BenCliVariant::TwoDelta => BenVariant::TwoDelta, - } -} - -/// Derive a human-readable label from the key or ordering method for file naming. -/// -/// # Arguments -/// -/// * `key` - An optional JSON key used for sorting. -/// * `ordering` - An optional topology-based ordering method. -/// -/// # Returns -/// -/// Returns the label string, or `None` if neither option is provided. -fn relabeling_label( - key: Option<&str>, - ordering: Option<&OrderingMethod>, -) -> Result { - match (key, ordering) { - (Some(_), Some(_)) => Err("Provide either --key or --ordering, not both.".to_string()), - (Some(key), None) => Ok(key.to_string()), - (None, Some(ordering)) => Ok(ordering_method_name(ordering).to_string()), - (None, None) => Err("Provide either --key or --ordering.".to_string()), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::codec::encode::encode_jsonl_to_ben; - use clap::{CommandFactory, Parser}; - use std::{ - fs, - io::Cursor, - time::{SystemTime, UNIX_EPOCH}, - }; - - fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("reben-{name}-{nonce}")) - } - - /// Write a minimal Standard BEN file to a temp path and return the path. - fn write_temp_ben(name: &str) -> std::path::PathBuf { - let path = unique_path(name); - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - fs::write(&path, &ben).unwrap(); - path - } - - #[test] - fn clap_metadata_uses_package_version() { - let mut command = Args::command(); - let help = command.render_long_help().to_string(); - - assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); - assert!(help.contains("Relabeling Binary Ensemble CLI Tool")); - assert!(help.contains("--shape-file")); - assert!(help.contains("canonicalize")); - } - - #[test] - fn parse_json_mode_args() { - let args = Args::try_parse_from([ - "reben", - "dual_graph.json", - "--mode", - "json", - "--key", - "GEOID20", - "--output-file", - "sorted.json", - "--verbose", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::Json); - assert_eq!(args.input_file, "dual_graph.json"); - assert_eq!(args.key.as_deref(), Some("GEOID20")); - assert_eq!(args.output_file.as_deref(), Some("sorted.json")); - assert!(args.verbose); - } - - #[test] - fn parse_json_mode_ordering_args() { - let args = Args::try_parse_from([ - "reben", - "dual_graph.json", - "--mode", - "json", - "--ordering", - "multi-level-cluster", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::Json); - assert_eq!(args.ordering, Some(OrderingMethod::MultiLevelCluster)); - assert!(args.key.is_none()); - } - - #[test] - fn parse_ben_mode_n_items_args() { - let args = Args::try_parse_from([ - "reben", - "samples.jsonl.ben", - "--mode", - "ben", - "--n-items", - "25", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::Ben); - assert_eq!(args.n_items, Some(25)); - } - - #[test] - fn parse_ben_mode_output_variant_args() { - let args = Args::try_parse_from([ - "reben", - "samples.jsonl.ben", - "--mode", - "ben", - "--output-variant", - "twodelta", - "--convert-only", - ]) - .unwrap(); - - assert_eq!(args.mode, Mode::Ben); - assert_eq!(args.output_variant, Some(BenCliVariant::TwoDelta)); - assert!(args.convert_only); - } - - #[test] - fn run_json_mode_rejects_n_items() { - let args = Args::try_parse_from([ - "reben", "x.json", "--mode", "json", "--key", "k", "--n-items", "5", - ]) - .unwrap(); - let err = run_json_mode(args).unwrap_err(); - assert!(err.contains("--n-items")); - } - - #[test] - fn run_ben_mode_rejects_convert_only_without_variant() { - let args = Args::try_parse_from([ - "reben", "x.ben", "--mode", "ben", "--convert-only", - ]) - .unwrap(); - let err = run_ben_mode(args).unwrap_err(); - assert!(err.contains("--output-variant")); - } - - #[test] - fn run_ben_mode_rejects_convert_only_with_relabeling() { - let args = Args::try_parse_from([ - "reben", "x.ben", "--mode", "ben", - "--convert-only", "--output-variant", "standard", "--key", "k", - ]) - .unwrap(); - let err = run_ben_mode(args).unwrap_err(); - assert!(err.contains("--convert-only cannot be combined")); - } - - #[test] - fn ben_variant_name_covers_all_variants() { - assert_eq!(ben_variant_name(BenVariant::Standard), "standard"); - assert_eq!(ben_variant_name(BenVariant::MkvChain), "mkvchain"); - assert_eq!(ben_variant_name(BenVariant::TwoDelta), "twodelta"); - } - - #[test] - fn to_ben_variant_covers_standard() { - assert_eq!(to_ben_variant(&BenCliVariant::Standard), BenVariant::Standard); - } - - #[test] - fn relabeling_label_errors_on_both_key_and_ordering() { - let err = relabeling_label( - Some("k"), - Some(&OrderingMethod::MultiLevelCluster), - ) - .unwrap_err(); - assert!(err.contains("not both")); - } - - #[test] - fn relabeling_label_errors_on_neither() { - let err = relabeling_label(None, None).unwrap_err(); - assert!(err.contains("either")); - } - - #[test] - fn run_ben_mode_with_n_items_limit() { - let input = write_temp_ben("n_items_input.jsonl.ben"); - let out = unique_path("n_items_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", - input.to_str().unwrap(), - "--mode", "ben", - "--n-items", "1", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - let _ = fs::remove_file(&input); - let _ = fs::remove_file(&out); - } - - #[test] - fn run_json_mode_with_ordering_derives_output_name() { - // Create a minimal graph JSON file. - let shape = unique_path("ordering_shape.json"); - fs::write( - &shape, - br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, - ) - .unwrap(); - let args = Args::try_parse_from([ - "reben", - shape.to_str().unwrap(), - "--mode", "json", - "--ordering", "reverse-cuthill-mckee", - ]) - .unwrap(); - let result = run_json_mode(args); - // Clean up derived output file. - let derived = shape.to_str().unwrap() - .trim_end_matches(".json") - .to_owned() - + "_sorted_by_reverse-cuthill-mckee_map.json"; - let derived2 = shape.to_str().unwrap() - .trim_end_matches(".json") - .to_owned() - + "_sorted_by_reverse-cuthill-mckee.jsonl.ben"; - let _ = fs::remove_file(&derived); - let _ = fs::remove_file(&derived2); - let _ = fs::remove_file(&shape); - result.unwrap(); - } - - #[test] - fn run_ben_mode_with_map_file_and_n_items() { - use crate::codec::encode::encode_jsonl_to_ben; - use std::io::Cursor; - - // Build a 3-node BEN file. - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - let ben_path = unique_path("map_n_items.jsonl.ben"); - fs::write(&ben_path, &ben).unwrap(); - - let map_path = unique_path("map_n_items_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", - ) - .unwrap(); - - let out = unique_path("map_n_items_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", - ben_path.to_str().unwrap(), - "--mode", "ben", - "--map-file", map_path.to_str().unwrap(), - "--n-items", "1", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - - for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } - } - - #[test] - fn run_ben_mode_with_map_file_no_limit() { - use crate::codec::encode::encode_jsonl_to_ben; - use std::io::Cursor; - - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - let ben_path = unique_path("map_nolimit.jsonl.ben"); - fs::write(&ben_path, &ben).unwrap(); - - let map_path = unique_path("map_nolimit_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", - ) - .unwrap(); - - let out = unique_path("map_nolimit_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", - ben_path.to_str().unwrap(), - "--mode", "ben", - "--map-file", map_path.to_str().unwrap(), - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - - for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } - } - - #[test] - fn run_ben_mode_with_output_variant_and_n_items() { - let input = write_temp_ben("var_n_items.jsonl.ben"); - let out = unique_path("var_n_items_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", input.to_str().unwrap(), - "--mode", "ben", - "--output-variant", "standard", - "--n-items", "1", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - let _ = fs::remove_file(&input); - let _ = fs::remove_file(&out); - } - - #[test] - fn run_ben_mode_with_shape_file_and_ordering() { - // Covers the shape_file + ordering path (lines 265-269). - // Creates a map from the shape file ordering, then relabels the BEN. - let input = write_temp_ben("shape_order_input.jsonl.ben"); - let shape = unique_path("shape_order_shape.json"); - fs::write( - &shape, - br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, - ) - .unwrap(); - let out = unique_path("shape_order_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", input.to_str().unwrap(), - "--mode", "ben", - "--shape-file", shape.to_str().unwrap(), - "--ordering", "reverse-cuthill-mckee", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - let result = run_ben_mode(args); - // Clean up the map file the function derives automatically. - let map = shape.to_str().unwrap() - .trim_end_matches(".json") - .to_owned() - + "_sorted_by_reverse-cuthill-mckee_map.json"; - let sorted_json = shape.to_str().unwrap() - .trim_end_matches(".json") - .to_owned() - + "_sorted_by_reverse-cuthill-mckee.json"; - let _ = fs::remove_file(&map); - let _ = fs::remove_file(&sorted_json); - for p in [&input, &shape, &out] { let _ = fs::remove_file(p); } - result.unwrap(); - } - - #[test] - fn run_ben_mode_with_map_file_and_output_variant_n_items() { - use crate::codec::encode::encode_jsonl_to_ben; - use std::io::Cursor; - - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - let ben_path = unique_path("map_var_n.jsonl.ben"); - fs::write(&ben_path, &ben).unwrap(); - - let map_path = unique_path("map_var_n_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", - ) - .unwrap(); - let out = unique_path("map_var_n_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", ben_path.to_str().unwrap(), - "--mode", "ben", - "--map-file", map_path.to_str().unwrap(), - "--output-variant", "standard", - "--n-items", "1", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } - } - - #[test] - fn run_ben_mode_with_map_file_and_output_variant_no_limit() { - use crate::codec::encode::encode_jsonl_to_ben; - use std::io::Cursor; - - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - let ben_path = unique_path("map_var_nolim.jsonl.ben"); - fs::write(&ben_path, &ben).unwrap(); - - let map_path = unique_path("map_var_nolim_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", - ) - .unwrap(); - let out = unique_path("map_var_nolim_output.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", ben_path.to_str().unwrap(), - "--mode", "ben", - "--map-file", map_path.to_str().unwrap(), - "--output-variant", "standard", - "--output-file", out.to_str().unwrap(), - ]) - .unwrap(); - run_ben_mode(args).unwrap(); - for p in [&ben_path, &map_path, &out] { let _ = fs::remove_file(p); } - } - - #[test] - fn run_ben_mode_map_file_without_output_file_derives_name() { - // Covers the None branch of output_file (lines 306-307). - use crate::codec::encode::encode_jsonl_to_ben; - use std::io::Cursor; - - let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); - let input = unique_path("map_derive.jsonl.ben"); - fs::write(&input, &ben).unwrap(); - - let map_path = unique_path("map_derive_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1},\"key\":\"sort\"}", - ) - .unwrap(); - let args = Args::try_parse_from([ - "reben", input.to_str().unwrap(), - "--mode", "ben", - "--map-file", map_path.to_str().unwrap(), - ]) - .unwrap(); - let result = run_ben_mode(args); - // Derived output: input stripped of ".jsonl.ben" + "_sorted_by_{label}.jsonl.ben" - let derived = input.to_str().unwrap() - .trim_end_matches(".jsonl.ben") - .to_owned() - + "_sorted_by_sort.jsonl.ben"; - let _ = fs::remove_file(&derived); - for p in [&input, &map_path] { let _ = fs::remove_file(p); } - result.unwrap(); - } - - #[test] - fn read_relabel_map_file_rejects_non_integer_index() { - let map_path = unique_path("bad_index_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"not_a_number\":0}}", - ) - .unwrap(); - let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); - assert!(err.contains("invalid old node index")); - let _ = fs::remove_file(&map_path); - } - - #[test] - fn read_relabel_map_file_rejects_non_integer_value() { - let map_path = unique_path("bad_value_map.json"); - fs::write( - &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":\"not_a_number\"}}", - ) - .unwrap(); - let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); - assert!(err.contains("non-integer")); - let _ = fs::remove_file(&map_path); - } - - #[test] - fn run_ben_mode_canonicalize_derives_output_name() { - let input = write_temp_ben("canon.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", - input.to_str().unwrap(), - "--mode", "ben", - ]) - .unwrap(); - // run_ben_mode should succeed and write a derived output file. - let result = run_ben_mode(args); - // Clean up any produced files. - let derived = input - .to_str() - .unwrap() - .trim_end_matches(".jsonl.ben") - .to_owned() - + "_canonicalized_assignments.jsonl.ben"; - let _ = fs::remove_file(&derived); - fs::remove_file(&input).unwrap(); - result.unwrap(); - } - - #[test] - fn run_ben_mode_with_output_variant_derives_name() { - let input = write_temp_ben("variant.ben"); - let args = Args::try_parse_from([ - "reben", - input.to_str().unwrap(), - "--mode", "ben", - "--output-variant", "standard", - ]) - .unwrap(); - let result = run_ben_mode(args); - let derived = input - .to_str() - .unwrap() - .trim_end_matches(".ben") - .to_owned() - + "_standard.ben"; - let _ = fs::remove_file(&derived); - fs::remove_file(&input).unwrap(); - result.unwrap(); - } -} diff --git a/ben/src/cli/reben/args.rs b/ben/src/cli/reben/args.rs new file mode 100644 index 0000000..5bde2a0 --- /dev/null +++ b/ben/src/cli/reben/args.rs @@ -0,0 +1,83 @@ +use clap::{Parser, ValueEnum}; + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +pub(super) enum Mode { + /// Sort a JSON dual graph by a key and emit a relabeling map. + Json, + /// Relabel or canonicalize a BEN file. + Ben, +} + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Topology-based ordering methods for JSON graph relabeling. +pub(super) enum OrderingMethod { + /// Recursive multilevel clustering based on local neighborhoods. + #[clap(alias = "mlc")] + MultiLevelCluster, + /// Reverse Cuthill-McKee ordering. + #[clap(alias = "rcm")] + ReverseCuthillMckee, +} + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// BEN variants supported for BEN-mode output. +pub(super) enum BenCliVariant { + Standard, + MkvChain, + #[clap(alias = "twodelta")] + TwoDelta, +} + +#[derive(Parser, Debug)] +#[command( + name = "Relabeling Binary Ensemble CLI Tool", + about = concat!( + "This is a command line tool for relabeling binary ensembles ", + "to help improve compression ratios for BEN and XBEN files." + ), + version +)] +/// Defines the command line arguments accepted by the program. +// TODO: Change the name of shape_file to dual_graph_file. +pub(super) struct Args { + /// Input file to read from. + #[arg()] + pub input_file: String, + /// Output file to write to. + #[arg(short, long)] + pub output_file: Option, + /// Key to sort the JSON or BEN file by. + #[arg(short, long)] + pub key: Option, + /// Topology-based ordering method to use instead of a key sort. + #[arg(long, value_enum)] + pub ordering: Option, + /// Shape file to use for sorting the BEN file. Only needed + /// in BEN mode when a map is not provided. + #[arg(short, long)] + pub shape_file: Option, + /// Map file to use for relabeling the BEN file. + #[arg(short = 'p', long)] + pub map_file: Option, + /// Mode to run the program in (either JSON or BEN). + /// The JSON mode will sort a JSON file by a given key or graph-ordering + /// method. The BEN mode will relabel a BEN file according to a map file + /// or a graph-ordering request (which also requires a dual-graph file). If no + /// map file or key is provided, the BEN mode will canonicalize + /// the assignment vectors in the BEN file. + #[arg(short, long)] + pub mode: Mode, + /// Only relabel the first `n` expanded samples in BEN mode. + #[arg(long)] + pub n_items: Option, + /// BEN variant to use for the BEN-mode output file. + #[arg(long, value_enum)] + pub output_variant: Option, + /// Rewrite the BEN stream without canonicalizing or map relabeling. + #[arg(long)] + pub convert_only: bool, + /// Verbosity level for the program. + #[arg(short, long)] + pub verbose: bool, +} diff --git a/ben/src/cli/reben/ben_mode.rs b/ben/src/cli/reben/ben_mode.rs new file mode 100644 index 0000000..a46b9cf --- /dev/null +++ b/ben/src/cli/reben/ben_mode.rs @@ -0,0 +1,181 @@ +use super::args::Args; +use super::helpers::{ + ben_variant_name, ordering_method_name, read_relabel_map_file, relabeling_label, + to_ben_variant, to_graph_ordering, +}; +use crate::json::graph::{sort_json_file_by_key, sort_json_file_by_ordering}; +use crate::ops::relabel::{ + convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, + relabel_ben_file_as_variant_limit, relabel_ben_file_limit, relabel_ben_file_with_map, + relabel_ben_file_with_map_as_variant, relabel_ben_file_with_map_as_variant_limit, + relabel_ben_file_with_map_limit, +}; +use serde_json::json; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { + if args.convert_only && args.output_variant.is_none() { + return Err("--convert-only requires --output-variant.".to_string()); + } + if args.convert_only + && (args.map_file.is_some() || args.key.is_some() || args.ordering.is_some()) + { + return Err("--convert-only cannot be combined with relabeling options.".to_string()); + } + + let input_file = File::open(&args.input_file) + .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; + let reader = BufReader::new(input_file); + let output_variant = args.output_variant.as_ref().map(to_ben_variant); + + if args.map_file.is_none() && args.key.is_none() && args.ordering.is_none() { + if args.convert_only { + tracing::trace!("Converting BEN file to requested variant."); + } else { + tracing::trace!("Canonicalizing assignment vectors in ben file."); + } + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + if let Some(variant) = output_variant { + args.input_file.trim_end_matches(".ben").to_owned() + + format!("_{}.ben", ben_variant_name(variant)).as_str() + } else { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + "_canonicalized_assignments.jsonl.ben" + } + } + }; + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + if args.convert_only { + let variant = output_variant.expect("checked above"); + if let Some(limit) = args.n_items { + convert_ben_file_limit(reader, writer, variant, limit) + } else { + convert_ben_file(reader, writer, variant) + } + } else if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_as_variant_limit(reader, writer, variant, limit) + } else { + relabel_ben_file_as_variant(reader, writer, variant) + } + } else if let Some(limit) = args.n_items { + relabel_ben_file_limit(reader, writer, limit) + } else { + relabel_ben_file(reader, writer) + } + .map_err(|e| format!("BEN relabeling failed: {e}"))?; + return Ok(()); + } + + if args.map_file.is_some() && (args.key.is_some() || args.ordering.is_some()) { + return Err(concat!( + "Cannot provide both a map file and a sorting option. ", + "Please provide either the map file or the key/ordering and the ", + "(JSON formatted) dual-graph file needed to generate a map file." + ) + .to_string()); + } + + let mut map_file_name = String::new(); + if args.key.is_some() || args.ordering.is_some() { + let shape = args.shape_file.as_ref().ok_or_else(|| { + "No shape file provided to go with the requested ordering.".to_string() + })?; + let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; + tracing::trace!("Creating map file for ordering: {}", label); + + let output_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", label).as_str(); + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + let shape_file = + File::open(shape).map_err(|e| format!("Could not open shape file {shape:?}: {e}"))?; + let shape_reader = BufReader::new(shape_file); + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(shape_reader, writer, key) + } else { + let ordering = args + .ordering + .as_ref() + .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; + sort_json_file_by_ordering(shape_reader, writer, to_graph_ordering(ordering)) + } + .map_err(|e| format!("Could not sort shape file: {e}"))?; + + map_file_name = shape.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", label).as_str() + + "_map.json"; + let map_file = File::create(&map_file_name) + .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), + "relabeling_old_to_new_nodes_map": map + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; + } + + if map_file_name.is_empty() { + map_file_name = args + .map_file + .as_ref() + .ok_or_else(|| "Provide --map-file, --key, or --ordering in BEN mode.".to_string())? + .to_owned(); + } + + let (new_to_old_node_map, label) = read_relabel_map_file(&map_file_name)?; + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".jsonl.ben").to_owned() + + format!("_sorted_by_{}.jsonl.ben", label).as_str() + } + }; + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + tracing::trace!( + "Relabeling ben file according to map file {}", + map_file_name, + ); + + if let Some(variant) = output_variant { + if let Some(limit) = args.n_items { + relabel_ben_file_with_map_as_variant_limit( + reader, + writer, + new_to_old_node_map, + variant, + limit, + ) + } else { + relabel_ben_file_with_map_as_variant(reader, writer, new_to_old_node_map, variant) + } + } else if let Some(limit) = args.n_items { + relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) + } else { + relabel_ben_file_with_map(reader, writer, new_to_old_node_map) + } + .map_err(|e| format!("BEN relabeling with map {map_file_name:?} failed: {e}"))?; + Ok(()) +} diff --git a/ben/src/cli/reben/helpers.rs b/ben/src/cli/reben/helpers.rs new file mode 100644 index 0000000..f2d5ba5 --- /dev/null +++ b/ben/src/cli/reben/helpers.rs @@ -0,0 +1,99 @@ +use super::args::{BenCliVariant, OrderingMethod}; +use crate::json::graph::GraphOrderingMethod; +use crate::BenVariant; +use serde_json::Value; +use std::collections::HashMap; +use std::fs::File; +use std::io::BufReader; + +pub(super) fn read_relabel_map_file( + map_file_name: &str, +) -> Result<(HashMap, String), String> { + let map_file = File::open(map_file_name) + .map_err(|e| format!("Could not open map file {map_file_name:?}: {e}"))?; + let map_reader = BufReader::new(map_file); + + let data: Value = serde_json::from_reader(map_reader) + .map_err(|e| format!("Could not parse map file {map_file_name:?} as JSON: {e}"))?; + + let map_obj = data + .get("relabeling_old_to_new_nodes_map") + .and_then(Value::as_object) + .ok_or_else(|| { + format!( + "Map file {map_file_name:?} must contain object field \ + relabeling_old_to_new_nodes_map" + ) + })?; + + let mut new_to_old_node_map = HashMap::with_capacity(map_obj.len()); + for (old_idx_text, new_idx_value) in map_obj { + let old_idx = old_idx_text.parse::().map_err(|e| { + format!( + "Map file {map_file_name:?} contains invalid old node index {old_idx_text:?}: {e}" + ) + })?; + let new_idx = new_idx_value.as_u64().ok_or_else(|| { + format!( + "Map file {map_file_name:?} maps old node {old_idx} to non-integer value \ + {new_idx_value}" + ) + })? as usize; + new_to_old_node_map.insert(new_idx, old_idx); + } + + let label = data["key"] + .as_str() + .map(ToOwned::to_owned) + .or_else(|| data["ordering_method"].as_str().map(ToOwned::to_owned)) + .unwrap_or_else(|| "map".to_string()); + + Ok((new_to_old_node_map, label)) +} + +/// Convert a CLI ordering method variant to the library's graph ordering type. +pub(super) fn to_graph_ordering(ordering: &OrderingMethod) -> GraphOrderingMethod { + match ordering { + OrderingMethod::MultiLevelCluster => GraphOrderingMethod::MultiLevelCluster, + OrderingMethod::ReverseCuthillMckee => GraphOrderingMethod::ReverseCuthillMckee, + } +} + +/// Return the kebab-case display name for an ordering method. +pub(super) fn ordering_method_name(ordering: &OrderingMethod) -> &'static str { + match ordering { + OrderingMethod::MultiLevelCluster => "multi-level-cluster", + OrderingMethod::ReverseCuthillMckee => "reverse-cuthill-mckee", + } +} + +/// Return the lowercase display name for a BEN variant. +pub(super) fn ben_variant_name(variant: BenVariant) -> &'static str { + match variant { + BenVariant::Standard => "standard", + BenVariant::MkvChain => "mkvchain", + BenVariant::TwoDelta => "twodelta", + } +} + +/// Convert a CLI BEN variant to the library's `BenVariant` type. +pub(super) fn to_ben_variant(variant: &BenCliVariant) -> BenVariant { + match variant { + BenCliVariant::Standard => BenVariant::Standard, + BenCliVariant::MkvChain => BenVariant::MkvChain, + BenCliVariant::TwoDelta => BenVariant::TwoDelta, + } +} + +/// Derive a human-readable label from the key or ordering method for file naming. +pub(super) fn relabeling_label( + key: Option<&str>, + ordering: Option<&OrderingMethod>, +) -> Result { + match (key, ordering) { + (Some(_), Some(_)) => Err("Provide either --key or --ordering, not both.".to_string()), + (Some(key), None) => Ok(key.to_string()), + (None, Some(ordering)) => Ok(ordering_method_name(ordering).to_string()), + (None, None) => Err("Provide either --key or --ordering.".to_string()), + } +} diff --git a/ben/src/cli/reben/json_mode.rs b/ben/src/cli/reben/json_mode.rs new file mode 100644 index 0000000..164a0da --- /dev/null +++ b/ben/src/cli/reben/json_mode.rs @@ -0,0 +1,60 @@ +use super::args::Args; +use super::helpers::{ordering_method_name, relabeling_label, to_graph_ordering}; +use crate::json::graph::{sort_json_file_by_key, sort_json_file_by_ordering}; +use serde_json::json; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +pub(super) fn run_json_mode(args: Args) -> Result<(), String> { + if args.n_items.is_some() { + return Err("--n-items is only supported in BEN mode.".to_string()); + } + + let input_file = File::open(&args.input_file) + .map_err(|e| format!("Could not open input file {:?}: {e}", args.input_file))?; + let reader = BufReader::new(input_file); + let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; + + let output_file_name = match args.output_file { + Some(name) => name, + None => { + args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}.json", label).as_str() + } + }; + + let output_file = File::create(&output_file_name) + .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; + let writer = BufWriter::new(output_file); + + let map = if let Some(key) = args.key.as_ref() { + sort_json_file_by_key(reader, writer, key) + } else { + let ordering = args + .ordering + .as_ref() + .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; + sort_json_file_by_ordering(reader, writer, to_graph_ordering(ordering)) + } + .map_err(|e| format!("Could not sort input graph: {e}"))?; + + let map_file_name = args.input_file.trim_end_matches(".json").to_owned() + + format!("_sorted_by_{}", label).as_str() + + "_map.json"; + let map_file = File::create(&map_file_name) + .map_err(|e| format!("Could not create map file {map_file_name:?}: {e}"))?; + let mut map_writer = BufWriter::new(map_file); + + let map_json = json!({ + "input_file": args.input_file, + "output_file": output_file_name, + "key": args.key.as_ref(), + "ordering_method": args.ordering.as_ref().map(ordering_method_name), + "relabeling_old_to_new_nodes_map": map + }); + + map_writer + .write_all(map_json.to_string().as_bytes()) + .map_err(|e| format!("Could not write map file {map_file_name:?}: {e}"))?; + Ok(()) +} diff --git a/ben/src/cli/reben/mod.rs b/ben/src/cli/reben/mod.rs new file mode 100644 index 0000000..d54cd50 --- /dev/null +++ b/ben/src/cli/reben/mod.rs @@ -0,0 +1,34 @@ +//! `reben` CLI: relabel and canonicalize BEN files using JSON dual-graph orderings. + +mod args; +mod ben_mode; +mod helpers; +mod json_mode; + +#[cfg(test)] +mod tests; + +use args::{Args, Mode}; +use ben_mode::run_ben_mode; +use json_mode::run_json_mode; + +use crate::cli::common::set_verbose; +use clap::Parser; + +/// Parse CLI arguments and execute the selected `reben` mode. +pub fn run() { + let args = Args::parse(); + set_verbose(args.verbose); + + if let Err(err) = run_with_args(args) { + eprintln!("Error: {err}"); + std::process::exit(1); + } +} + +fn run_with_args(args: Args) -> Result<(), String> { + match args.mode.clone() { + Mode::Json => run_json_mode(args), + Mode::Ben => run_ben_mode(args), + } +} diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs new file mode 100644 index 0000000..7322531 --- /dev/null +++ b/ben/src/cli/reben/tests.rs @@ -0,0 +1,565 @@ +use super::args::{Args, BenCliVariant, Mode, OrderingMethod}; +use super::ben_mode::run_ben_mode; +use super::helpers::{ + ben_variant_name, read_relabel_map_file, relabeling_label, to_ben_variant, +}; +use super::json_mode::run_json_mode; +use crate::codec::encode::encode_jsonl_to_ben; +use crate::BenVariant; +use clap::{CommandFactory, Parser}; +use std::{ + fs, + io::Cursor, + time::{SystemTime, UNIX_EPOCH}, +}; + +fn unique_path(name: &str) -> std::path::PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("reben-{name}-{nonce}")) +} + +/// Write a minimal Standard BEN file to a temp path and return the path. +fn write_temp_ben(name: &str) -> std::path::PathBuf { + let path = unique_path(name); + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + fs::write(&path, &ben).unwrap(); + path +} + +#[test] +fn clap_metadata_uses_package_version() { + let mut command = Args::command(); + let help = command.render_long_help().to_string(); + + assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); + assert!(help.contains("Relabeling Binary Ensemble CLI Tool")); + assert!(help.contains("--shape-file")); + assert!(help.contains("canonicalize")); +} + +#[test] +fn parse_json_mode_args() { + let args = Args::try_parse_from([ + "reben", + "dual_graph.json", + "--mode", + "json", + "--key", + "GEOID20", + "--output-file", + "sorted.json", + "--verbose", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Json); + assert_eq!(args.input_file, "dual_graph.json"); + assert_eq!(args.key.as_deref(), Some("GEOID20")); + assert_eq!(args.output_file.as_deref(), Some("sorted.json")); + assert!(args.verbose); +} + +#[test] +fn parse_json_mode_ordering_args() { + let args = Args::try_parse_from([ + "reben", + "dual_graph.json", + "--mode", + "json", + "--ordering", + "multi-level-cluster", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Json); + assert_eq!(args.ordering, Some(OrderingMethod::MultiLevelCluster)); + assert!(args.key.is_none()); +} + +#[test] +fn parse_ben_mode_n_items_args() { + let args = Args::try_parse_from([ + "reben", + "samples.jsonl.ben", + "--mode", + "ben", + "--n-items", + "25", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Ben); + assert_eq!(args.n_items, Some(25)); +} + +#[test] +fn parse_ben_mode_output_variant_args() { + let args = Args::try_parse_from([ + "reben", + "samples.jsonl.ben", + "--mode", + "ben", + "--output-variant", + "twodelta", + "--convert-only", + ]) + .unwrap(); + + assert_eq!(args.mode, Mode::Ben); + assert_eq!(args.output_variant, Some(BenCliVariant::TwoDelta)); + assert!(args.convert_only); +} + +#[test] +fn run_json_mode_rejects_n_items() { + let args = Args::try_parse_from([ + "reben", "x.json", "--mode", "json", "--key", "k", "--n-items", "5", + ]) + .unwrap(); + let err = run_json_mode(args).unwrap_err(); + assert!(err.contains("--n-items")); +} + +#[test] +fn run_ben_mode_rejects_convert_only_without_variant() { + let args = Args::try_parse_from([ + "reben", + "x.ben", + "--mode", + "ben", + "--convert-only", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + assert!(err.contains("--output-variant")); +} + +#[test] +fn run_ben_mode_rejects_convert_only_with_relabeling() { + let args = Args::try_parse_from([ + "reben", + "x.ben", + "--mode", + "ben", + "--convert-only", + "--output-variant", + "standard", + "--key", + "k", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + assert!(err.contains("--convert-only cannot be combined")); +} + +#[test] +fn ben_variant_name_covers_all_variants() { + assert_eq!(ben_variant_name(BenVariant::Standard), "standard"); + assert_eq!(ben_variant_name(BenVariant::MkvChain), "mkvchain"); + assert_eq!(ben_variant_name(BenVariant::TwoDelta), "twodelta"); +} + +#[test] +fn to_ben_variant_covers_standard() { + assert_eq!( + to_ben_variant(&BenCliVariant::Standard), + BenVariant::Standard + ); +} + +#[test] +fn relabeling_label_errors_on_both_key_and_ordering() { + let err = + relabeling_label(Some("k"), Some(&OrderingMethod::MultiLevelCluster)).unwrap_err(); + assert!(err.contains("not both")); +} + +#[test] +fn relabeling_label_errors_on_neither() { + let err = relabeling_label(None, None).unwrap_err(); + assert!(err.contains("either")); +} + +#[test] +fn run_ben_mode_with_n_items_limit() { + let input = write_temp_ben("n_items_input.jsonl.ben"); + let out = unique_path("n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--n-items", + "1", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); +} + +#[test] +fn run_json_mode_with_ordering_derives_output_name() { + // Create a minimal graph JSON file. + let shape = unique_path("ordering_shape.json"); + fs::write( + &shape, + br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, + ) + .unwrap(); + let args = Args::try_parse_from([ + "reben", + shape.to_str().unwrap(), + "--mode", + "json", + "--ordering", + "reverse-cuthill-mckee", + ]) + .unwrap(); + let result = run_json_mode(args); + // Clean up derived output file. + let derived = shape + .to_str() + .unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee_map.json"; + let derived2 = shape + .to_str() + .unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee.jsonl.ben"; + let _ = fs::remove_file(&derived); + let _ = fs::remove_file(&derived2); + let _ = fs::remove_file(&shape); + result.unwrap(); +} + +#[test] +fn run_ben_mode_with_map_file_and_n_items() { + // Build a 3-node BEN file. + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_n_items.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_n_items_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + + let out = unique_path("map_n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--n-items", + "1", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + + for p in [&ben_path, &map_path, &out] { + let _ = fs::remove_file(p); + } +} + +#[test] +fn run_ben_mode_with_map_file_no_limit() { + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_nolimit.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_nolimit_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + + let out = unique_path("map_nolimit_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + + for p in [&ben_path, &map_path, &out] { + let _ = fs::remove_file(p); + } +} + +#[test] +fn run_ben_mode_with_output_variant_and_n_items() { + let input = write_temp_ben("var_n_items.jsonl.ben"); + let out = unique_path("var_n_items_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "standard", + "--n-items", + "1", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); +} + +#[test] +fn run_ben_mode_with_shape_file_and_ordering() { + // Covers the shape_file + ordering path. + // Creates a map from the shape file ordering, then relabels the BEN. + let input = write_temp_ben("shape_order_input.jsonl.ben"); + let shape = unique_path("shape_order_shape.json"); + fs::write( + &shape, + br#"{"nodes":[{"id":0},{"id":1},{"id":2}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#, + ) + .unwrap(); + let out = unique_path("shape_order_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--shape-file", + shape.to_str().unwrap(), + "--ordering", + "reverse-cuthill-mckee", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + // Clean up the map file the function derives automatically. + let map = shape + .to_str() + .unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee_map.json"; + let sorted_json = shape + .to_str() + .unwrap() + .trim_end_matches(".json") + .to_owned() + + "_sorted_by_reverse-cuthill-mckee.json"; + let _ = fs::remove_file(&map); + let _ = fs::remove_file(&sorted_json); + for p in [&input, &shape, &out] { + let _ = fs::remove_file(p); + } + result.unwrap(); +} + +#[test] +fn run_ben_mode_with_map_file_and_output_variant_n_items() { + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_var_n.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_var_n_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + let out = unique_path("map_var_n_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-variant", + "standard", + "--n-items", + "1", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + for p in [&ben_path, &map_path, &out] { + let _ = fs::remove_file(p); + } +} + +#[test] +fn run_ben_mode_with_map_file_and_output_variant_no_limit() { + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben_path = unique_path("map_var_nolim.jsonl.ben"); + fs::write(&ben_path, &ben).unwrap(); + + let map_path = unique_path("map_var_nolim_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + ) + .unwrap(); + let out = unique_path("map_var_nolim_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + ben_path.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + "--output-variant", + "standard", + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + run_ben_mode(args).unwrap(); + for p in [&ben_path, &map_path, &out] { + let _ = fs::remove_file(p); + } +} + +#[test] +fn run_ben_mode_map_file_without_output_file_derives_name() { + // Covers the None branch of output_file. + let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; + let mut ben = Vec::new(); + encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let input = unique_path("map_derive.jsonl.ben"); + fs::write(&input, &ben).unwrap(); + + let map_path = unique_path("map_derive_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1},\"key\":\"sort\"}", + ) + .unwrap(); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + map_path.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + // Derived output: input stripped of ".jsonl.ben" + "_sorted_by_{label}.jsonl.ben" + let derived = input + .to_str() + .unwrap() + .trim_end_matches(".jsonl.ben") + .to_owned() + + "_sorted_by_sort.jsonl.ben"; + let _ = fs::remove_file(&derived); + for p in [&input, &map_path] { + let _ = fs::remove_file(p); + } + result.unwrap(); +} + +#[test] +fn read_relabel_map_file_rejects_non_integer_index() { + let map_path = unique_path("bad_index_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"not_a_number\":0}}", + ) + .unwrap(); + let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + assert!(err.contains("invalid old node index")); + let _ = fs::remove_file(&map_path); +} + +#[test] +fn read_relabel_map_file_rejects_non_integer_value() { + let map_path = unique_path("bad_value_map.json"); + fs::write( + &map_path, + b"{\"relabeling_old_to_new_nodes_map\":{\"0\":\"not_a_number\"}}", + ) + .unwrap(); + let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + assert!(err.contains("non-integer")); + let _ = fs::remove_file(&map_path); +} + +#[test] +fn run_ben_mode_canonicalize_derives_output_name() { + let input = write_temp_ben("canon.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + ]) + .unwrap(); + let result = run_ben_mode(args); + let derived = input + .to_str() + .unwrap() + .trim_end_matches(".jsonl.ben") + .to_owned() + + "_canonicalized_assignments.jsonl.ben"; + let _ = fs::remove_file(&derived); + fs::remove_file(&input).unwrap(); + result.unwrap(); +} + +#[test] +fn run_ben_mode_with_output_variant_derives_name() { + let input = write_temp_ben("variant.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--output-variant", + "standard", + ]) + .unwrap(); + let result = run_ben_mode(args); + let derived = input + .to_str() + .unwrap() + .trim_end_matches(".ben") + .to_owned() + + "_standard.ben"; + let _ = fs::remove_file(&derived); + fs::remove_file(&input).unwrap(); + result.unwrap(); +} From ae44ce5e5c9266692032cb564d25aafa07aed749 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:51:08 -0600 Subject: [PATCH 077/221] reog pyben side to parallel ben side --- pyben/.gitignore | 1 + pyben/docs/user/using_pyben.ipynb | 82 ++- pyben/src/decode/decoder.rs | 579 ++++++++++++++++++++ pyben/src/decode/helpers.rs | 134 +++++ pyben/src/decode/mod.rs | 867 +----------------------------- pyben/src/decode/py_funcs.rs | 76 +++ pyben/src/decode/types.rs | 95 ++++ pyben/src/encode/encoder.rs | 234 ++++++++ pyben/src/encode/helpers.rs | 69 +++ pyben/src/encode/mod.rs | 423 +-------------- pyben/src/encode/py_funcs.rs | 86 +++ pyben/src/encode/types.rs | 41 ++ 12 files changed, 1368 insertions(+), 1319 deletions(-) create mode 100644 pyben/src/decode/decoder.rs create mode 100644 pyben/src/decode/helpers.rs create mode 100644 pyben/src/decode/py_funcs.rs create mode 100644 pyben/src/decode/types.rs create mode 100644 pyben/src/encode/encoder.rs create mode 100644 pyben/src/encode/helpers.rs create mode 100644 pyben/src/encode/py_funcs.rs create mode 100644 pyben/src/encode/types.rs diff --git a/pyben/.gitignore b/pyben/.gitignore index ddc4f84..d54fae9 100755 --- a/pyben/.gitignore +++ b/pyben/.gitignore @@ -6,5 +6,6 @@ dist/ __pycache__ *.pyc *.so +*.profraw docs/_build \ No newline at end of file diff --git a/pyben/docs/user/using_pyben.ipynb b/pyben/docs/user/using_pyben.ipynb index 5896142..24902b5 100644 --- a/pyben/docs/user/using_pyben.ipynb +++ b/pyben/docs/user/using_pyben.ipynb @@ -142,22 +142,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "9296ca41", "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'compress_jsonl_to_ben' from 'pyben' (unknown location)", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mImportError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyben\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 2\u001b[39m compress_jsonl_to_ben, compress_jsonl_to_xben, compress_ben_to_xben, decompress_ben_to_jsonl, decompress_xben_to_jsonl, decompress_xben_to_ben\n\u001b[32m 3\u001b[39m )\n", - "\u001b[31mImportError\u001b[39m: cannot import name 'compress_jsonl_to_ben' from 'pyben' (unknown location)" - ] - } - ], + "outputs": [], "source": [ "from binary_ensemble import (\n", " compress_jsonl_to_ben, compress_jsonl_to_xben, compress_ben_to_xben, decompress_ben_to_jsonl, decompress_xben_to_jsonl, decompress_xben_to_ben\n", @@ -177,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "1e1e32b0", "metadata": {}, "outputs": [], @@ -199,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2f1ce280", "metadata": {}, "outputs": [ @@ -254,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "81b1f724", "metadata": {}, "outputs": [], @@ -313,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a4e512b3", "metadata": {}, "outputs": [], @@ -365,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "eb43be57", "metadata": {}, "outputs": [], @@ -419,14 +407,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "dec15cda", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6e2bbae3f017444e9dfa43e6a72d7352", + "model_id": "cece4170d7034a36bcf1626ef58d9412", "version_major": 2, "version_minor": 0 }, @@ -439,7 +427,7 @@ } ], "source": [ - "from pyben import PyBenEncoder\n", + "from binary_ensemble import PyBenEncoder\n", "\n", "graph_node_order = list(graph.nodes)\n", "\n", @@ -465,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "801c6fb7", "metadata": {}, "outputs": [ @@ -474,20 +462,20 @@ "output_type": "stream", "text": [ "Sample: 1, Cut Edge Count: 56\n", - "Sample: 1001, Cut Edge Count: 42\n", - "Sample: 2001, Cut Edge Count: 38\n", - "Sample: 3001, Cut Edge Count: 43\n", - "Sample: 4001, Cut Edge Count: 40\n", - "Sample: 5001, Cut Edge Count: 39\n", - "Sample: 6001, Cut Edge Count: 38\n", - "Sample: 7001, Cut Edge Count: 44\n", - "Sample: 8001, Cut Edge Count: 39\n", - "Sample: 9001, Cut Edge Count: 38\n" + "Sample: 1001, Cut Edge Count: 41\n", + "Sample: 2001, Cut Edge Count: 44\n", + "Sample: 3001, Cut Edge Count: 41\n", + "Sample: 4001, Cut Edge Count: 32\n", + "Sample: 5001, Cut Edge Count: 38\n", + "Sample: 6001, Cut Edge Count: 36\n", + "Sample: 7001, Cut Edge Count: 41\n", + "Sample: 8001, Cut Edge Count: 37\n", + "Sample: 9001, Cut Edge Count: 41\n" ] } ], "source": [ - "from pyben import PyBenDecoder\n", + "from binary_ensemble import PyBenDecoder\n", "import pandas as pd\n", "\n", "\n", @@ -521,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "ff8e8e4a", "metadata": {}, "outputs": [], @@ -536,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "24761ca6", "metadata": {}, "outputs": [ @@ -557,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "0a815edf", "metadata": {}, "outputs": [ @@ -581,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "3be48458", "metadata": {}, "outputs": [ @@ -619,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "51d9f27a", "metadata": {}, "outputs": [ @@ -627,7 +615,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1525384/229284435.py:1: UserWarning: XBEN may take a second to start decoding.\n", + "/tmp/ipykernel_239360/229284435.py:1: UserWarning: XBEN may take a second to start decoding.\n", " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_indices([1, 23978, 100000]):\n" ] }, @@ -648,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "a51d0019", "metadata": {}, "outputs": [ @@ -656,7 +644,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1525384/1010090289.py:1: UserWarning: XBEN may take a second to start decoding.\n", + "/tmp/ipykernel_239360/1010090289.py:1: UserWarning: XBEN may take a second to start decoding.\n", " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_range(1000,1005):\n" ] }, @@ -680,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "eeb1c112", "metadata": {}, "outputs": [ @@ -688,7 +676,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1525384/49125867.py:1: UserWarning: XBEN may take a second to start decoding.\n", + "/tmp/ipykernel_239360/49125867.py:1: UserWarning: XBEN may take a second to start decoding.\n", " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n" ] }, @@ -714,6 +702,14 @@ "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n", " print(assignment[:10])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "179c6f2d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pyben/src/decode/decoder.rs b/pyben/src/decode/decoder.rs new file mode 100644 index 0000000..cdef149 --- /dev/null +++ b/pyben/src/decode/decoder.rs @@ -0,0 +1,579 @@ +use super::helpers::{ + build_bundle_iter, build_frames_for_subsample, build_plain_iter, detect_is_bundle, + scan_bundle_samples, warn_xben_startup, +}; +use super::types::{ActiveSelection, BundleState, DecoderBackend, DecoderMode, DynIter}; +use binary_ensemble::io::bundle::format::{ + ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + ASSET_TYPE_RELABEL_MAP, +}; +use binary_ensemble::io::bundle::BendlReader; +use binary_ensemble::io::reader::{count_samples_from_file, Selection, SubsampleFrameDecoder}; +use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyUserWarning}; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, BufWriter, Write}; +use std::path::PathBuf; + +#[pyclass(module = "binary_ensemble", unsendable)] +pub struct PyBenDecoder { + path: PathBuf, + mode: DecoderMode, + backend: DecoderBackend, + iter: DynIter, + current_assignment: Option>, + remaining_count: u16, + base_len: Option, + len_hint: Option, + active_selection: ActiveSelection, +} + +#[pymethods] +impl PyBenDecoder { + /// Open a decoder on a `.ben`, `.xben`, or `.bendl` file. + /// + /// The file's leading bytes are sniffed to decide whether it is a + /// bundle. When the file is a `.bendl`, the bundle's header decides + /// the BEN/XBEN format and the `mode` argument is ignored; when the + /// file is a plain stream, `mode` selects between the BEN and XBEN + /// readers and defaults to `"ben"`. + /// + /// # Arguments + /// + /// * `file_path` - Path to the input file. + /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain + /// streams; bundles use `assignment_format` from the header. + #[new] + #[pyo3(signature = (file_path, mode = "ben"))] + #[pyo3(text_signature = "(file_path, mode='ben')")] + fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { + // Validate the mode string up front so "Unknown mode" is reported + // regardless of whether the file exists or turns out to be a bundle. + let parsed_mode = DecoderMode::parse(mode)?; + let is_bundle = detect_is_bundle(&file_path).map_err(|e| { + PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) + })?; + + if is_bundle { + let file = File::open(&file_path).map_err(|e| { + PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) + })?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + file_path.display() + )) + })?; + let fmt = reader.assignment_format().ok_or_else(|| { + PyException::new_err( + "Bundle header has an unrecognized assignment_format field.", + ) + })?; + let derived_mode = DecoderMode::from_assignment_format(fmt); + let (stream_offset, stream_len) = + reader.assignment_stream_range().map_err(|e| { + PyException::new_err(format!( + "Failed to determine stream region in {}: {e}", + file_path.display() + )) + })?; + let state = BundleState { + reader, + stream_offset, + stream_len, + }; + + // Emit the XBEN startup warning once, up front. + if matches!(derived_mode, DecoderMode::XBen) { + warn_xben_startup(py)?; + } + + let iter = build_bundle_iter(&file_path, &state, derived_mode)?; + Ok(Self { + path: file_path, + mode: derived_mode, + backend: DecoderBackend::Bundle(state), + iter, + current_assignment: None, + remaining_count: 0, + base_len: None, + len_hint: None, + active_selection: ActiveSelection::None, + }) + } else { + if matches!(parsed_mode, DecoderMode::XBen) { + warn_xben_startup(py)?; + } + let iter = build_plain_iter(&file_path, parsed_mode)?; + Ok(Self { + path: file_path, + mode: parsed_mode, + backend: DecoderBackend::Plain, + iter, + current_assignment: None, + remaining_count: 0, + base_len: None, + len_hint: None, + active_selection: ActiveSelection::None, + }) + } + } + + /// Return `self` as an iterator, rebuilding the underlying frame + /// walker so iteration can be restarted. + /// + /// Calling `iter(dec)` (or using `for x in dec: …`) more than once + /// is supported: each call reopens the stream region from the start + /// and, if a subsample selection is active, reapplies it. + fn __iter__(mut slf: PyRefMut) -> PyResult> { + slf.current_assignment = None; + slf.remaining_count = 0; + + let path = slf.path.clone(); + let mode = slf.mode; + let selection = slf.active_selection.clone(); + + let new_iter: DynIter = match selection { + ActiveSelection::None => match &slf.backend { + DecoderBackend::Plain => build_plain_iter(&path, mode)?, + DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, + }, + sel => { + let frames = build_frames_for_subsample(&path, mode, &slf.backend)?; + let ben_sel = sel + .to_selection() + .expect("active subsample selection must be convertible"); + Box::new(SubsampleFrameDecoder::new(frames, ben_sel)) + } + }; + + slf.iter = new_iter; + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult>> { + if slf.remaining_count > 0 { + slf.remaining_count -= 1; + let a = slf.current_assignment.as_ref().unwrap().clone(); + return Ok(Some(a)); + } + match slf.iter.next() { + Some(Ok((assignment, count))) => { + if count == 0 { + return Err(PyException::new_err( + "Decoder yielded a zero-count record; data may be corrupted.", + )); + } + slf.current_assignment = Some(assignment.clone()); + slf.remaining_count = count - 1; + Ok(Some(assignment)) + } + Some(Err(e)) => Err(PyException::new_err(format!( + "Error decoding next item: {e}" + ))), + None => Ok(None), + } + } + + // Because we want progress bars!!! + fn __len__(mut slf: PyRefMut, py: Python<'_>) -> PyResult { + if let Some(len_hint) = slf.len_hint { + return Ok(len_hint); + } + + let base_len = ensure_base_len(&mut slf, py)?; + slf.len_hint = Some(base_len); + Ok(base_len) + } + + #[pyo3(text_signature = "(self)")] + fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { + // Always reports the total number of samples in the source file, + // even after `subsample_*` has been applied. We deliberately do + // not touch `len_hint` here: when a subsample selection is + // active, `len_hint` tracks the filtered count that `__len__` + // should return, and clobbering it would break `len(dec)` after + // a `count_samples()` call. + ensure_base_len(&mut slf, py) + } + + #[pyo3(text_signature = "(self, indices, /)")] + fn subsample_indices<'py>( + mut slf: PyRefMut<'py, Self>, + mut indices: Vec, + py: Python<'_>, + ) -> PyResult> { + if !indices.iter().is_sorted() { + // We need to sort and deduplicate the indices + // This is a bit annoying, but it is necessary to ensure that we can + // efficiently iterate over the underlying data. + // We use unstable sort because we don't care about the order of equal elements + // and it is faster than stable sort. + let warnings = py.import("warnings")?; + let kwargs = PyDict::new(py); + // kwargs.set_item("stacklevel", 2)?; + + warnings.call_method( + "warn", + ( + "Indices must be sorted and unique; sorting and deduplicating.", + py.get_type::(), + ), + Some(&kwargs), + )?; + } + indices.sort_unstable(); + indices.dedup(); + + if indices.is_empty() { + return Err(PyException::new_err("indices must not be empty")); + } + let base_len = ensure_base_len(&mut slf, py)?; + if indices[0] <= 0 { + return Err(PyException::new_err("indices must be 1-based")); + } + if indices.last().unwrap() > &base_len { + return Err(PyException::new_err(format!( + "indices must be <= number of samples in base data ({})", + base_len + ))); + } + let len_hint = indices.len(); + + slf.active_selection = ActiveSelection::Indices(indices.clone()); + let sel = Selection::Indices(indices.into_iter().peekable()); + reset_with_selection(&mut slf, sel, len_hint)?; + Ok(slf.into()) + } + + #[pyo3(text_signature = "(self, start, end, /)")] + fn subsample_range<'py>( + mut slf: PyRefMut<'py, Self>, + start: usize, + end: usize, + py: Python<'_>, + ) -> PyResult> { + if start == 0 || end < start { + return Err(PyException::new_err( + "range must be 1-based and end >= start", + )); + } + let base_len = ensure_base_len(&mut slf, py)?; + if end > base_len { + return Err(PyException::new_err(format!( + "end must be <= number of samples in base data ({})", + base_len + ))); + } + + slf.active_selection = ActiveSelection::Range { start, end }; + let sel = Selection::Range { start, end }; + let len_hint = end - start + 1; + reset_with_selection(&mut slf, sel, len_hint)?; + Ok(slf.into()) + } + + #[pyo3(signature = (step, offset=1))] + fn subsample_every<'py>( + mut slf: PyRefMut<'py, Self>, + step: usize, + offset: usize, + py: Python<'_>, + ) -> PyResult> { + if step == 0 || offset == 0 { + return Err(PyException::new_err("step and offset must be >= 1")); + } + let base_len = ensure_base_len(&mut slf, py)?; + if offset > base_len { + return Err(PyException::new_err(format!( + "offset must be <= number of samples in base data ({})", + base_len + ))); + } + slf.active_selection = ActiveSelection::Every { step, offset }; + let sel = Selection::Every { step, offset }; + let len_hint = (base_len + step - 1 - (offset - 1)) / step; + reset_with_selection(&mut slf, sel, len_hint)?; + Ok(slf.into()) + } + + // --------------------------------------------------------------------- + // Bundle-inspection surface. + // + // These methods only make sense when the decoder was opened on a + // `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear + // error pointing the user at the right tool. + // --------------------------------------------------------------------- + + /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a + /// plain `.ben`/`.xben` stream (`False`). + #[pyo3(text_signature = "(self)")] + fn is_bundle(&self) -> bool { + self.backend.is_bundle() + } + + /// Return the container format of the underlying assignment stream + /// as `"ben"` or `"xben"`. + #[pyo3(text_signature = "(self)")] + fn assignment_format(&self) -> &'static str { + self.mode.as_str() + } + + /// Return the bundle's format version as a `(major, minor)` tuple. + /// Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn version(&self) -> PyResult<(u16, u16)> { + let state = self.require_bundle("version()")?; + let h = state.reader.header(); + Ok((h.major_version, h.minor_version)) + } + + /// Whether the bundle was successfully finalized. Errors on plain + /// streams. + #[pyo3(text_signature = "(self)")] + fn is_complete(&self) -> PyResult { + let state = self.require_bundle("is_complete()")?; + Ok(state.reader.is_complete()) + } + + /// Names of every entry in the bundle's directory, in directory + /// order. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn asset_names(&self) -> PyResult> { + let state = self.require_bundle("asset_names()")?; + Ok(state + .reader + .assets() + .iter() + .map(|e| e.name.clone()) + .collect()) + } + + /// Return the full bundle directory as a list of dicts with keys + /// `name`, `type`, `offset`, `len`, and `flags` (a list of string + /// tags). Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { + let state = self.require_bundle("list_assets()")?; + let entries = state.reader.assets(); + let mut out = Vec::with_capacity(entries.len()); + for entry in entries { + let d = PyDict::new(py); + d.set_item("name", &entry.name)?; + d.set_item("type", entry.asset_type)?; + d.set_item("offset", entry.payload_offset)?; + d.set_item("len", entry.payload_len)?; + let mut flags: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flags.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flags.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flags.push("checksum"); + } + d.set_item("flags", flags)?; + out.push(d); + } + Ok(out) + } + + /// Read the (decoded) bytes of a named asset as a Python `bytes` + /// object. Errors on plain streams. + #[pyo3(text_signature = "(self, name, /)")] + fn read_asset_bytes(&mut self, name: &str) -> PyResult> { + let state = self.require_bundle_mut("read_asset_bytes()")?; + let entry = state + .reader + .find_asset_by_name(name) + .cloned() + .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; + state + .reader + .asset_bytes(&entry) + .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) + } + + /// Parse a JSON asset into a Python object (dict, list, …). Errors + /// on plain streams and when the asset does not exist or is not + /// valid UTF-8 / JSON. + #[pyo3(text_signature = "(self, name, /)")] + fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { + let bytes = self.read_asset_bytes(name)?; + let json_mod = py.import("json")?; + let text = std::str::from_utf8(&bytes) + .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; + let parsed = json_mod.call_method1("loads", (text,))?; + Ok(parsed.into()) + } + + /// Read the bundle's `graph.json` asset as a parsed JSON object. + /// Returns `None` if the bundle does not carry a graph asset. Errors + /// on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_graph()")?; + if state.reader.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "graph.json")?)) + } + + /// Read the bundle's `metadata.json` asset as a parsed JSON object, + /// or `None` if absent. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_metadata()")?; + if state + .reader + .find_asset_by_type(ASSET_TYPE_METADATA) + .is_none() + { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "metadata.json")?)) + } + + /// Read the bundle's `relabel_map.json` asset as a parsed JSON + /// object, or `None` if absent. Errors on plain streams. + #[pyo3(text_signature = "(self)")] + fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { + { + let state = self.require_bundle_mut("read_relabel_map()")?; + if state + .reader + .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) + .is_none() + { + return Ok(None); + } + } + Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) + } + + /// Copy the embedded assignment stream region verbatim to + /// `out_path`. The resulting file can be opened directly with + /// `PyBenDecoder(out_path, mode=dec.assignment_format())`. + /// Errors on plain streams. + #[pyo3(signature = (out_path, overwrite=false))] + #[pyo3(text_signature = "(self, out_path, overwrite=False)")] + fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { + let state = self.require_bundle_mut("extract_stream()")?; + if out_path.exists() && !overwrite { + return Err(PyIOError::new_err(format!( + "Output file {} already exists (use overwrite=True to replace).", + out_path.display() + ))); + } + let out = if overwrite { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&out_path) + } else { + OpenOptions::new() + .write(true) + .create_new(true) + .open(&out_path) + } + .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; + let mut out = BufWriter::new(out); + + let mut stream = state + .reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; + io::copy(&mut stream, &mut out) + .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; + out.flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + Ok(()) + } +} + +impl PyBenDecoder { + /// Borrow the bundle state or raise a clear Python error explaining + /// that the decoder was opened on a plain stream. + fn require_bundle(&self, op: &str) -> PyResult<&BundleState> { + match &self.backend { + DecoderBackend::Bundle(state) => Ok(state), + DecoderBackend::Plain => Err(PyException::new_err(format!( + "{op} is only available on .bendl bundles; this decoder was opened \ + on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ + via PyBenEncoder with ben_file_only=False) to get bundle features.", + self.mode.as_str() + ))), + } + } + + fn require_bundle_mut(&mut self, op: &str) -> PyResult<&mut BundleState> { + match &mut self.backend { + DecoderBackend::Bundle(state) => Ok(state), + DecoderBackend::Plain => Err(PyException::new_err(format!( + "{op} is only available on .bendl bundles; this decoder was opened \ + on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ + via PyBenEncoder with ben_file_only=False) to get bundle features.", + self.mode.as_str() + ))), + } + } +} + +fn reset_with_selection( + decoder: &mut PyBenDecoder, + selection: Selection, + len_hint: usize, +) -> PyResult<()> { + let frames = build_frames_for_subsample(&decoder.path, decoder.mode, &decoder.backend)?; + let frame_decoder = SubsampleFrameDecoder::new(frames, selection); + decoder.iter = Box::new(frame_decoder); + decoder.current_assignment = None; + decoder.remaining_count = 0; + decoder.len_hint = Some(len_hint); + Ok(()) +} + +fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { + if let Some(base_len) = decoder.base_len { + return Ok(base_len); + } + + let base_len = match &decoder.backend { + DecoderBackend::Plain => { + let path = decoder.path.clone(); + let mode = decoder.mode.as_str().to_string(); + py.detach(|| count_samples_from_file(&path, &mode)) + .map_err(|e| { + PyException::new_err(format!( + "Failed to count samples in {}: {e}", + path.display() + )) + })? + } + DecoderBackend::Bundle(state) => { + // Prefer the authoritative sample_count carried in the + // bundle header, which is set for finalized bundles and is + // O(1). Fall back to scanning the stream region when the + // header has no count (unfinalized append target, or a + // header byte we cannot interpret). + if let Some(n) = state.reader.sample_count() { + if n >= 0 { + n as usize + } else { + scan_bundle_samples(&decoder.path, state, decoder.mode)? + } + } else { + scan_bundle_samples(&decoder.path, state, decoder.mode)? + } + } + }; + decoder.base_len = Some(base_len); + Ok(base_len) +} diff --git a/pyben/src/decode/helpers.rs b/pyben/src/decode/helpers.rs new file mode 100644 index 0000000..edcc6ca --- /dev/null +++ b/pyben/src/decode/helpers.rs @@ -0,0 +1,134 @@ +use super::types::{BundleState, DecoderBackend, DecoderMode, DynIter}; +use crate::common::open_input; +use binary_ensemble::io::bundle::format::BENDL_MAGIC; +use binary_ensemble::io::reader::{ + build_frame_iter, build_frame_iter_from_reader, count_samples_from_frame_iter, + AssignmentReader, XZAssignmentReader, +}; +use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use std::fs::File; +use std::io::{self, BufReader, Read, Seek, SeekFrom}; +use std::path::Path; + +pub(super) fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { + let warnings = py.import("warnings")?; + let kwargs = PyDict::new(py); + + warnings.call_method( + "warn", + ( + "XBEN may take a second to start decoding.", + py.get_type::(), + ), + Some(&kwargs), + )?; + + Ok(()) +} + +/// Sniff the first 8 bytes of a file and decide whether it starts with +/// the `BENDL` magic. +pub(super) fn detect_is_bundle(path: &Path) -> io::Result { + let mut file = File::open(path)?; + let mut magic = [0u8; 8]; + match file.read_exact(&mut magic) { + Ok(()) => Ok(magic == BENDL_MAGIC), + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => Ok(false), + Err(e) => Err(e), + } +} + +/// Build a plain-stream iterator from `path` using `mode`. +pub(super) fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { + let reader = open_input(&path.to_path_buf())?; + match mode { + DecoderMode::Ben => { + let ben = AssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; + Ok(Box::new(ben)) + } + DecoderMode::XBen => { + let xben = XZAssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; + Ok(Box::new(xben)) + } + } +} + +/// Open a second file handle on the bundle path, seek to the stream +/// region, and wrap it in the appropriate assignment reader so the +/// decoder iterator only walks the embedded stream. +pub(super) fn build_bundle_iter( + path: &Path, + state: &BundleState, + mode: DecoderMode, +) -> PyResult { + let reader = open_bundle_stream_reader(path, state)?; + match mode { + DecoderMode::Ben => { + let ben = AssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; + Ok(Box::new(ben)) + } + DecoderMode::XBen => { + let xben = XZAssignmentReader::new(reader) + .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; + Ok(Box::new(xben)) + } + } +} + +/// Create a `Read`-only handle bounded to the bundle's assignment stream +/// region. +pub(super) fn open_bundle_stream_reader( + path: &Path, + state: &BundleState, +) -> PyResult>> { + let file = File::open(path) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", path.display())))?; + let mut buf = BufReader::new(file); + buf.seek(SeekFrom::Start(state.stream_offset)) + .map_err(|e| PyIOError::new_err(format!("Failed to seek into bundle stream: {e}")))?; + Ok(buf.take(state.stream_len)) +} + +pub(super) fn build_frames_for_subsample( + path: &Path, + mode: DecoderMode, + backend: &DecoderBackend, +) -> PyResult { + match backend { + DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + path.display() + )) + }), + DecoderBackend::Bundle(state) => { + let reader = open_bundle_stream_reader(path, state)?; + build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from bundle {}: {e}", + path.display() + )) + }) + } + } +} + +pub(super) fn scan_bundle_samples( + path: &Path, + state: &BundleState, + mode: DecoderMode, +) -> PyResult { + let reader = open_bundle_stream_reader(path, state)?; + let iter = build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + PyException::new_err(format!( + "Failed to open bundle stream for sample count: {e}" + )) + })?; + count_samples_from_frame_iter(iter) + .map_err(|e| PyException::new_err(format!("Failed to count samples in bundle: {e}"))) +} diff --git a/pyben/src/decode/mod.rs b/pyben/src/decode/mod.rs index ffb6ae4..4c6708f 100644 --- a/pyben/src/decode/mod.rs +++ b/pyben/src/decode/mod.rs @@ -1,862 +1,9 @@ -use crate::common::{open_input, open_output, validate_input_output_paths}; -use binary_ensemble::codec::decode::{ - decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, -}; -use binary_ensemble::io::bundle::format::{ - AssignmentFormat, BENDL_MAGIC, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, - ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, -}; -use binary_ensemble::io::bundle::BendlReader; -use binary_ensemble::io::reader::{ - build_frame_iter, build_frame_iter_from_reader, count_samples_from_file, - count_samples_from_frame_iter, AssignmentReader, MkvRecord, Selection, SubsampleFrameDecoder, - XZAssignmentReader, -}; -use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyUserWarning}; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use std::fs::{File, OpenOptions}; -use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; -use std::path::{Path, PathBuf}; +//! Python bindings for BEN/XBEN decoding and `.bendl` bundle inspection. -type DynIter = Box> + Send>; +mod decoder; +mod helpers; +mod py_funcs; +mod types; -#[derive(Clone, Copy)] -enum DecoderMode { - Ben, - XBen, -} - -impl DecoderMode { - fn parse(mode: &str) -> PyResult { - match mode { - "ben" => Ok(Self::Ben), - "xben" => Ok(Self::XBen), - _ => Err(PyException::new_err( - "Unknown mode. Supported modes are 'ben' and 'xben'.", - )), - } - } - - fn as_str(&self) -> &'static str { - match self { - Self::Ben => "ben", - Self::XBen => "xben", - } - } - - fn from_assignment_format(fmt: AssignmentFormat) -> Self { - match fmt { - AssignmentFormat::Ben => Self::Ben, - AssignmentFormat::Xben => Self::XBen, - } - } -} - -/// Cached bundle state for a decoder opened on a `.bendl` file. -/// -/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC -/// inspection and asset-read calls without disturbing the iterator (which -/// reads the stream region through a separate file handle). -struct BundleState { - reader: BendlReader>, - stream_offset: u64, - stream_len: u64, -} - -/// What the decoder was actually opened on. -enum DecoderBackend { - Plain, - Bundle(BundleState), -} - -impl DecoderBackend { - fn is_bundle(&self) -> bool { - matches!(self, DecoderBackend::Bundle(_)) - } -} - -/// Stored form of the most recently installed subsampling selection. -/// -/// The iterator is single-pass, so to support restarting iteration -/// (e.g. `for x in dec: ... ; for x in dec: ...`) the decoder remembers -/// the active selection and rebuilds a fresh frame decoder on every -/// call to `__iter__`. -#[derive(Clone)] -enum ActiveSelection { - None, - Indices(Vec), - Range { start: usize, end: usize }, - Every { step: usize, offset: usize }, -} - -impl ActiveSelection { - fn to_selection(&self) -> Option { - match self { - Self::None => None, - Self::Indices(v) => Some(Selection::Indices(v.clone().into_iter().peekable())), - Self::Range { start, end } => Some(Selection::Range { - start: *start, - end: *end, - }), - Self::Every { step, offset } => Some(Selection::Every { - step: *step, - offset: *offset, - }), - } - } -} - -#[pyclass(module = "binary_ensemble", unsendable)] -pub struct PyBenDecoder { - path: PathBuf, - mode: DecoderMode, - backend: DecoderBackend, - iter: DynIter, - current_assignment: Option>, - remaining_count: u16, - base_len: Option, - len_hint: Option, - active_selection: ActiveSelection, -} - -#[pymethods] -impl PyBenDecoder { - /// Open a decoder on a `.ben`, `.xben`, or `.bendl` file. - /// - /// The file's leading bytes are sniffed to decide whether it is a - /// bundle. When the file is a `.bendl`, the bundle's header decides - /// the BEN/XBEN format and the `mode` argument is ignored; when the - /// file is a plain stream, `mode` selects between the BEN and XBEN - /// readers and defaults to `"ben"`. - /// - /// # Arguments - /// - /// * `file_path` - Path to the input file. - /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain - /// streams; bundles use `assignment_format` from the header. - #[new] - #[pyo3(signature = (file_path, mode = "ben"))] - #[pyo3(text_signature = "(file_path, mode='ben')")] - fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { - // Validate the mode string up front so "Unknown mode" is reported - // regardless of whether the file exists or turns out to be a bundle. - let parsed_mode = DecoderMode::parse(mode)?; - let is_bundle = detect_is_bundle(&file_path).map_err(|e| { - PyIOError::new_err(format!( - "Failed to open {}: {e}", - file_path.display() - )) - })?; - - if is_bundle { - let file = File::open(&file_path).map_err(|e| { - PyIOError::new_err(format!( - "Failed to open {}: {e}", - file_path.display() - )) - })?; - let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { - PyException::new_err(format!( - "Failed to parse bundle header in {}: {e}", - file_path.display() - )) - })?; - let fmt = reader.assignment_format().ok_or_else(|| { - PyException::new_err( - "Bundle header has an unrecognized assignment_format field.", - ) - })?; - let derived_mode = DecoderMode::from_assignment_format(fmt); - let (stream_offset, stream_len) = reader.assignment_stream_range() - .map_err(|e| PyException::new_err(format!( - "Failed to determine stream region in {}: {e}", - file_path.display() - )))?; - let state = BundleState { - reader, - stream_offset, - stream_len, - }; - - // Emit the XBEN startup warning once, up front. - if matches!(derived_mode, DecoderMode::XBen) { - warn_xben_startup(py)?; - } - - let iter = build_bundle_iter(&file_path, &state, derived_mode)?; - Ok(Self { - path: file_path, - mode: derived_mode, - backend: DecoderBackend::Bundle(state), - iter, - current_assignment: None, - remaining_count: 0, - base_len: None, - len_hint: None, - active_selection: ActiveSelection::None, - }) - } else { - if matches!(parsed_mode, DecoderMode::XBen) { - warn_xben_startup(py)?; - } - let iter = build_plain_iter(&file_path, parsed_mode)?; - Ok(Self { - path: file_path, - mode: parsed_mode, - backend: DecoderBackend::Plain, - iter, - current_assignment: None, - remaining_count: 0, - base_len: None, - len_hint: None, - active_selection: ActiveSelection::None, - }) - } - } - - /// Return `self` as an iterator, rebuilding the underlying frame - /// walker so iteration can be restarted. - /// - /// Calling `iter(dec)` (or using `for x in dec: …`) more than once - /// is supported: each call reopens the stream region from the start - /// and, if a subsample selection is active, reapplies it. - fn __iter__(mut slf: PyRefMut) -> PyResult> { - slf.current_assignment = None; - slf.remaining_count = 0; - - let path = slf.path.clone(); - let mode = slf.mode; - let selection = slf.active_selection.clone(); - - let new_iter: DynIter = match selection { - ActiveSelection::None => match &slf.backend { - DecoderBackend::Plain => build_plain_iter(&path, mode)?, - DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, - }, - sel => { - let frames = build_frames_for_subsample(&path, mode, &slf.backend)?; - let ben_sel = sel - .to_selection() - .expect("active subsample selection must be convertible"); - Box::new(SubsampleFrameDecoder::new(frames, ben_sel)) - } - }; - - slf.iter = new_iter; - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult>> { - if slf.remaining_count > 0 { - slf.remaining_count -= 1; - let a = slf.current_assignment.as_ref().unwrap().clone(); - return Ok(Some(a)); - } - match slf.iter.next() { - Some(Ok((assignment, count))) => { - if count == 0 { - return Err(PyException::new_err( - "Decoder yielded a zero-count record; data may be corrupted.", - )); - } - slf.current_assignment = Some(assignment.clone()); - slf.remaining_count = count - 1; - Ok(Some(assignment)) - } - Some(Err(e)) => Err(PyException::new_err(format!( - "Error decoding next item: {e}" - ))), - None => Ok(None), - } - } - - // Because we want progress bars!!! - fn __len__(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - if let Some(len_hint) = slf.len_hint { - return Ok(len_hint); - } - - let base_len = ensure_base_len(&mut slf, py)?; - slf.len_hint = Some(base_len); - Ok(base_len) - } - - #[pyo3(text_signature = "(self)")] - fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - // Always reports the total number of samples in the source file, - // even after `subsample_*` has been applied. We deliberately do - // not touch `len_hint` here: when a subsample selection is - // active, `len_hint` tracks the filtered count that `__len__` - // should return, and clobbering it would break `len(dec)` after - // a `count_samples()` call. - ensure_base_len(&mut slf, py) - } - - #[pyo3(text_signature = "(self, indices, /)")] - fn subsample_indices<'py>( - mut slf: PyRefMut<'py, Self>, - mut indices: Vec, - py: Python<'_>, - ) -> PyResult> { - if !indices.iter().is_sorted() { - // We need to sort and deduplicate the indices - // This is a bit annoying, but it is necessary to ensure that we can - // efficiently iterate over the underlying data. - // We use unstable sort because we don't care about the order of equal elements - // and it is faster than stable sort. - let warnings = py.import("warnings")?; - let kwargs = PyDict::new(py); - // kwargs.set_item("stacklevel", 2)?; - - warnings.call_method( - "warn", - ( - "Indices must be sorted and unique; sorting and deduplicating.", - py.get_type::(), - ), - Some(&kwargs), - )?; - } - indices.sort_unstable(); - indices.dedup(); - - if indices.is_empty() { - return Err(PyException::new_err("indices must not be empty")); - } - let base_len = ensure_base_len(&mut slf, py)?; - if indices[0] <= 0 { - return Err(PyException::new_err("indices must be 1-based")); - } - if indices.last().unwrap() > &base_len { - return Err(PyException::new_err(format!( - "indices must be <= number of samples in base data ({})", - base_len - ))); - } - let len_hint = indices.len(); - - slf.active_selection = ActiveSelection::Indices(indices.clone()); - let sel = Selection::Indices(indices.into_iter().peekable()); - reset_with_selection(&mut slf, sel, len_hint)?; - Ok(slf.into()) - } - - #[pyo3(text_signature = "(self, start, end, /)")] - fn subsample_range<'py>( - mut slf: PyRefMut<'py, Self>, - start: usize, - end: usize, - py: Python<'_>, - ) -> PyResult> { - if start == 0 || end < start { - return Err(PyException::new_err( - "range must be 1-based and end >= start", - )); - } - let base_len = ensure_base_len(&mut slf, py)?; - if end > base_len { - return Err(PyException::new_err(format!( - "end must be <= number of samples in base data ({})", - base_len - ))); - } - - slf.active_selection = ActiveSelection::Range { start, end }; - let sel = Selection::Range { start, end }; - let len_hint = end - start + 1; - reset_with_selection(&mut slf, sel, len_hint)?; - Ok(slf.into()) - } - - #[pyo3(signature = (step, offset=1))] - fn subsample_every<'py>( - mut slf: PyRefMut<'py, Self>, - step: usize, - offset: usize, - py: Python<'_>, - ) -> PyResult> { - if step == 0 || offset == 0 { - return Err(PyException::new_err("step and offset must be >= 1")); - } - let base_len = ensure_base_len(&mut slf, py)?; - if offset > base_len { - return Err(PyException::new_err(format!( - "offset must be <= number of samples in base data ({})", - base_len - ))); - } - slf.active_selection = ActiveSelection::Every { step, offset }; - let sel = Selection::Every { step, offset }; - let len_hint = (base_len + step - 1 - (offset - 1)) / step; - reset_with_selection(&mut slf, sel, len_hint)?; - Ok(slf.into()) - } - - // --------------------------------------------------------------------- - // Bundle-inspection surface. - // - // These methods only make sense when the decoder was opened on a - // `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear - // error pointing the user at the right tool. - // --------------------------------------------------------------------- - - /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a - /// plain `.ben`/`.xben` stream (`False`). - #[pyo3(text_signature = "(self)")] - fn is_bundle(&self) -> bool { - self.backend.is_bundle() - } - - /// Return the container format of the underlying assignment stream - /// as `"ben"` or `"xben"`. - #[pyo3(text_signature = "(self)")] - fn assignment_format(&self) -> &'static str { - self.mode.as_str() - } - - /// Return the bundle's format version as a `(major, minor)` tuple. - /// Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn version(&self) -> PyResult<(u16, u16)> { - let state = self.require_bundle("version()")?; - let h = state.reader.header(); - Ok((h.major_version, h.minor_version)) - } - - /// Whether the bundle was successfully finalized. Errors on plain - /// streams. - #[pyo3(text_signature = "(self)")] - fn is_complete(&self) -> PyResult { - let state = self.require_bundle("is_complete()")?; - Ok(state.reader.is_complete()) - } - - /// Names of every entry in the bundle's directory, in directory - /// order. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn asset_names(&self) -> PyResult> { - let state = self.require_bundle("asset_names()")?; - Ok(state - .reader - .assets() - .iter() - .map(|e| e.name.clone()) - .collect()) - } - - /// Return the full bundle directory as a list of dicts with keys - /// `name`, `type`, `offset`, `len`, and `flags` (a list of string - /// tags). Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { - let state = self.require_bundle("list_assets()")?; - let entries = state.reader.assets(); - let mut out = Vec::with_capacity(entries.len()); - for entry in entries { - let d = PyDict::new(py); - d.set_item("name", &entry.name)?; - d.set_item("type", entry.asset_type)?; - d.set_item("offset", entry.payload_offset)?; - d.set_item("len", entry.payload_len)?; - let mut flags: Vec<&str> = Vec::new(); - if entry.asset_flags & ASSET_FLAG_JSON != 0 { - flags.push("json"); - } - if entry.asset_flags & ASSET_FLAG_XZ != 0 { - flags.push("xz"); - } - if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { - flags.push("checksum"); - } - d.set_item("flags", flags)?; - out.push(d); - } - Ok(out) - } - - /// Read the (decoded) bytes of a named asset as a Python `bytes` - /// object. Errors on plain streams. - #[pyo3(text_signature = "(self, name, /)")] - fn read_asset_bytes(&mut self, name: &str) -> PyResult> { - let state = self.require_bundle_mut("read_asset_bytes()")?; - let entry = state - .reader - .find_asset_by_name(name) - .cloned() - .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; - state - .reader - .asset_bytes(&entry) - .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) - } - - /// Parse a JSON asset into a Python object (dict, list, …). Errors - /// on plain streams and when the asset does not exist or is not - /// valid UTF-8 / JSON. - #[pyo3(text_signature = "(self, name, /)")] - fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { - let bytes = self.read_asset_bytes(name)?; - let json_mod = py.import("json")?; - let text = std::str::from_utf8(&bytes) - .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; - let parsed = json_mod.call_method1("loads", (text,))?; - Ok(parsed.into()) - } - - /// Read the bundle's `graph.json` asset as a parsed JSON object. - /// Returns `None` if the bundle does not carry a graph asset. Errors - /// on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_graph()")?; - if state.reader.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "graph.json")?)) - } - - /// Read the bundle's `metadata.json` asset as a parsed JSON object, - /// or `None` if absent. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_metadata()")?; - if state.reader.find_asset_by_type(ASSET_TYPE_METADATA).is_none() { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "metadata.json")?)) - } - - /// Read the bundle's `relabel_map.json` asset as a parsed JSON - /// object, or `None` if absent. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_relabel_map()")?; - if state - .reader - .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) - .is_none() - { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) - } - - /// Copy the embedded assignment stream region verbatim to - /// `out_path`. The resulting file can be opened directly with - /// `PyBenDecoder(out_path, mode=dec.assignment_format())`. - /// Errors on plain streams. - #[pyo3(signature = (out_path, overwrite=false))] - #[pyo3(text_signature = "(self, out_path, overwrite=False)")] - fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { - let state = self.require_bundle_mut("extract_stream()")?; - if out_path.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_path.display() - ))); - } - let out = if overwrite { - OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&out_path) - } else { - OpenOptions::new() - .write(true) - .create_new(true) - .open(&out_path) - } - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; - let mut out = BufWriter::new(out); - - let mut stream = state - .reader - .assignment_stream_reader() - .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; - io::copy(&mut stream, &mut out) - .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; - out.flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; - Ok(()) - } -} - -impl PyBenDecoder { - /// Borrow the bundle state or raise a clear Python error explaining - /// that the decoder was opened on a plain stream. - fn require_bundle(&self, op: &str) -> PyResult<&BundleState> { - match &self.backend { - DecoderBackend::Bundle(state) => Ok(state), - DecoderBackend::Plain => Err(PyException::new_err(format!( - "{op} is only available on .bendl bundles; this decoder was opened \ - on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ - via PyBenEncoder with ben_file_only=False) to get bundle features.", - self.mode.as_str() - ))), - } - } - - fn require_bundle_mut(&mut self, op: &str) -> PyResult<&mut BundleState> { - match &mut self.backend { - DecoderBackend::Bundle(state) => Ok(state), - DecoderBackend::Plain => Err(PyException::new_err(format!( - "{op} is only available on .bendl bundles; this decoder was opened \ - on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ - via PyBenEncoder with ben_file_only=False) to get bundle features.", - self.mode.as_str() - ))), - } - } -} - -fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { - let warnings = py.import("warnings")?; - let kwargs = PyDict::new(py); - - warnings.call_method( - "warn", - ( - "XBEN may take a second to start decoding.", - py.get_type::(), - ), - Some(&kwargs), - )?; - - Ok(()) -} - -/// Sniff the first 8 bytes of a file and decide whether it starts with -/// the `BENDL` magic. -fn detect_is_bundle(path: &Path) -> io::Result { - let mut file = File::open(path)?; - let mut magic = [0u8; 8]; - match file.read_exact(&mut magic) { - Ok(()) => Ok(magic == BENDL_MAGIC), - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => Ok(false), - Err(e) => Err(e), - } -} - -/// Build a plain-stream iterator from `path` using `mode`. -fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { - let reader = open_input(&path.to_path_buf())?; - match mode { - DecoderMode::Ben => { - let ben = AssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; - Ok(Box::new(ben)) - } - DecoderMode::XBen => { - let xben = XZAssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; - Ok(Box::new(xben)) - } - } -} - -/// Open a second file handle on the bundle path, seek to the stream -/// region, and wrap it in the appropriate assignment reader so the -/// decoder iterator only walks the embedded stream. -fn build_bundle_iter( - path: &Path, - state: &BundleState, - mode: DecoderMode, -) -> PyResult { - let reader = open_bundle_stream_reader(path, state)?; - match mode { - DecoderMode::Ben => { - let ben = AssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; - Ok(Box::new(ben)) - } - DecoderMode::XBen => { - let xben = XZAssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; - Ok(Box::new(xben)) - } - } -} - -/// Create a `Read`-only handle bounded to the bundle's assignment stream -/// region. -fn open_bundle_stream_reader( - path: &Path, - state: &BundleState, -) -> PyResult>> { - let file = File::open(path) - .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", path.display())))?; - let mut buf = BufReader::new(file); - buf.seek(SeekFrom::Start(state.stream_offset)).map_err(|e| { - PyIOError::new_err(format!("Failed to seek into bundle stream: {e}")) - })?; - Ok(buf.take(state.stream_len)) -} - -fn build_frames_for_subsample( - path: &Path, - mode: DecoderMode, - backend: &DecoderBackend, -) -> PyResult { - match backend { - DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), mode.as_str()).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - path.display() - )) - }), - DecoderBackend::Bundle(state) => { - let reader = open_bundle_stream_reader(path, state)?; - build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from bundle {}: {e}", - path.display() - )) - }) - } - } -} - -fn reset_with_selection( - decoder: &mut PyBenDecoder, - selection: Selection, - len_hint: usize, -) -> PyResult<()> { - let frames = build_frames_for_subsample(&decoder.path, decoder.mode, &decoder.backend)?; - let frame_decoder = SubsampleFrameDecoder::new(frames, selection); - decoder.iter = Box::new(frame_decoder); - decoder.current_assignment = None; - decoder.remaining_count = 0; - decoder.len_hint = Some(len_hint); - Ok(()) -} - -fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { - if let Some(base_len) = decoder.base_len { - return Ok(base_len); - } - - let base_len = match &decoder.backend { - DecoderBackend::Plain => { - let path = decoder.path.clone(); - let mode = decoder.mode.as_str().to_string(); - py.detach(|| count_samples_from_file(&path, &mode)) - .map_err(|e| { - PyException::new_err(format!( - "Failed to count samples in {}: {e}", - path.display() - )) - })? - } - DecoderBackend::Bundle(state) => { - // Prefer the authoritative sample_count carried in the - // bundle header, which is set for finalized bundles and is - // O(1). Fall back to scanning the stream region when the - // header has no count (unfinalized append target, or a - // header byte we cannot interpret). - if let Some(n) = state.reader.sample_count() { - if n >= 0 { - n as usize - } else { - scan_bundle_samples(&decoder.path, state, decoder.mode)? - } - } else { - scan_bundle_samples(&decoder.path, state, decoder.mode)? - } - } - }; - decoder.base_len = Some(base_len); - Ok(base_len) -} - -fn scan_bundle_samples( - path: &Path, - state: &BundleState, - mode: DecoderMode, -) -> PyResult { - let reader = open_bundle_stream_reader(path, state)?; - let iter = build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { - PyException::new_err(format!( - "Failed to open bundle stream for sample count: {e}" - )) - })?; - count_samples_from_frame_iter(iter).map_err(|e| { - PyException::new_err(format!("Failed to count samples in bundle: {e}")) - }) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_xben_to_ben( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - decode_xben_to_ben(reader, writer).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert XBEN to BEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; - - Ok(()) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_xben_to_jsonl( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - decode_xben_to_jsonl(reader, writer).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert XBEN to JSONL from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; - - Ok(()) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_ben_to_jsonl( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - decode_ben_to_jsonl(reader, writer).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert BEN to JSONL from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; - - Ok(()) -} +pub use decoder::PyBenDecoder; +pub use py_funcs::{decompress_ben_to_jsonl, decompress_xben_to_ben, decompress_xben_to_jsonl}; diff --git a/pyben/src/decode/py_funcs.rs b/pyben/src/decode/py_funcs.rs new file mode 100644 index 0000000..071c80c --- /dev/null +++ b/pyben/src/decode/py_funcs.rs @@ -0,0 +1,76 @@ +use crate::common::{open_input, open_output, validate_input_output_paths}; +use binary_ensemble::codec::decode::{ + decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, +}; +use pyo3::exceptions::PyIOError; +use pyo3::prelude::*; +use std::path::PathBuf; + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] +pub fn decompress_xben_to_ben( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, +) -> PyResult<()> { + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + decode_xben_to_ben(reader, writer).map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert XBEN to BEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; + + Ok(()) +} + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] +pub fn decompress_xben_to_jsonl( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, +) -> PyResult<()> { + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + decode_xben_to_jsonl(reader, writer).map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert XBEN to JSONL from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; + + Ok(()) +} + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] +pub fn decompress_ben_to_jsonl( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, +) -> PyResult<()> { + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + decode_ben_to_jsonl(reader, writer).map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert BEN to JSONL from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; + + Ok(()) +} diff --git a/pyben/src/decode/types.rs b/pyben/src/decode/types.rs new file mode 100644 index 0000000..94cc0e7 --- /dev/null +++ b/pyben/src/decode/types.rs @@ -0,0 +1,95 @@ +use binary_ensemble::io::bundle::format::AssignmentFormat; +use binary_ensemble::io::bundle::BendlReader; +use binary_ensemble::io::reader::{MkvRecord, Selection}; +use pyo3::exceptions::PyException; +use pyo3::prelude::*; +use std::fs::File; +use std::io::{self, BufReader}; + +pub(super) type DynIter = Box> + Send>; + +#[derive(Clone, Copy)] +pub(super) enum DecoderMode { + Ben, + XBen, +} + +impl DecoderMode { + pub(super) fn parse(mode: &str) -> PyResult { + match mode { + "ben" => Ok(Self::Ben), + "xben" => Ok(Self::XBen), + _ => Err(PyException::new_err( + "Unknown mode. Supported modes are 'ben' and 'xben'.", + )), + } + } + + pub(super) fn as_str(&self) -> &'static str { + match self { + Self::Ben => "ben", + Self::XBen => "xben", + } + } + + pub(super) fn from_assignment_format(fmt: AssignmentFormat) -> Self { + match fmt { + AssignmentFormat::Ben => Self::Ben, + AssignmentFormat::Xben => Self::XBen, + } + } +} + +/// Cached bundle state for a decoder opened on a `.bendl` file. +/// +/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC +/// inspection and asset-read calls without disturbing the iterator (which +/// reads the stream region through a separate file handle). +pub(super) struct BundleState { + pub reader: BendlReader>, + pub stream_offset: u64, + pub stream_len: u64, +} + +/// What the decoder was actually opened on. +pub(super) enum DecoderBackend { + Plain, + Bundle(BundleState), +} + +impl DecoderBackend { + pub(super) fn is_bundle(&self) -> bool { + matches!(self, DecoderBackend::Bundle(_)) + } +} + +/// Stored form of the most recently installed subsampling selection. +/// +/// The iterator is single-pass, so to support restarting iteration +/// (e.g. `for x in dec: ... ; for x in dec: ...`) the decoder remembers +/// the active selection and rebuilds a fresh frame decoder on every +/// call to `__iter__`. +#[derive(Clone)] +pub(super) enum ActiveSelection { + None, + Indices(Vec), + Range { start: usize, end: usize }, + Every { step: usize, offset: usize }, +} + +impl ActiveSelection { + pub(super) fn to_selection(&self) -> Option { + match self { + Self::None => None, + Self::Indices(v) => Some(Selection::Indices(v.clone().into_iter().peekable())), + Self::Range { start, end } => Some(Selection::Range { + start: *start, + end: *end, + }), + Self::Every { step, offset } => Some(Selection::Every { + step: *step, + offset: *offset, + }), + } + } +} diff --git a/pyben/src/encode/encoder.rs b/pyben/src/encode/encoder.rs new file mode 100644 index 0000000..aa0301a --- /dev/null +++ b/pyben/src/encode/encoder.rs @@ -0,0 +1,234 @@ +use super::helpers::{parse_graph_input, xz_compress}; +use super::types::{OutputMode, SharedFileSlot, SharedFileWriter}; +use crate::common::{open_output, parse_variant}; +use binary_ensemble::io::bundle::format::{ + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, + ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH, COMPLETE_YES, HEADER_SIZE, +}; +use binary_ensemble::io::writer::AssignmentWriter; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use std::cell::RefCell; +use std::io::{Seek, SeekFrom, Write}; +use std::path::PathBuf; +use std::rc::Rc; + +#[pyclass(unsendable)] +pub struct PyBenEncoder { + file: Option, + encoder: Option>, + mode: OutputMode, +} + +#[pymethods] +impl PyBenEncoder { + /// Open a new encoder. The default output is a `.bendl` bundle with + /// an embedded assignment stream and an optional embedded graph; set + /// `ben_file_only=True` to emit a plain `.ben` file instead. + /// + /// # Arguments + /// + /// * `file_path` - Output path. Must not exist unless `overwrite=True`. + /// * `overwrite` - Replace an existing file at `file_path`. + /// * `variant` - BEN variant for the assignment stream (`"standard"`, + /// `"mkv_chain"`, or `"twodelta"`). + /// * `graph` - Optional graph to embed as the `graph.json` asset when + /// writing a bundle. Accepts a `pathlib.Path` / `str` path, a + /// `bytes` object containing UTF-8 JSON, a Python `dict` / `list` + /// that will be serialized with `json.dumps`, or a file-like object + /// with a `.read()` method. Passing a graph alongside + /// `ben_file_only=True` is an error. + /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no + /// bundle framing. Defaults to `False`. + #[new] + #[pyo3(signature = ( + file_path, + overwrite = false, + variant = None, + graph = None, + ben_file_only = false, + ))] + #[pyo3( + text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)" + )] + fn new( + py: Python<'_>, + file_path: PathBuf, + overwrite: bool, + variant: Option, + graph: Option>, + ben_file_only: bool, + ) -> PyResult { + let ben_var = parse_variant(variant.as_deref())?; + + if ben_file_only && graph.is_some() { + return Err(PyValueError::new_err( + "graph= cannot be combined with ben_file_only=True (the graph \ + would have nowhere to live in a plain .ben file).", + )); + } + + let buf = open_output(&file_path, overwrite)?; + let file: SharedFileSlot = Rc::new(RefCell::new(buf)); + + let mode = if ben_file_only { + OutputMode::BenOnly + } else { + let graph_bytes = match graph { + Some(obj) => Some(parse_graph_input(py, &obj)?), + None => None, + }; + + // Write a provisional bundle header and any graph asset before + // the assignment stream begins. + let mut header = BendlHeader::provisional(AssignmentFormat::Ben, HEADER_SIZE as u64); + let mut entries: Vec = Vec::new(); + { + let mut slot = file.borrow_mut(); + slot.seek(SeekFrom::Start(0)) + .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; + header.write_to(&mut *slot).map_err(|e| { + PyIOError::new_err(format!("Failed to write bundle header: {e}")) + })?; + + if let Some(bytes) = graph_bytes { + let compressed = xz_compress(&bytes).map_err(|e| { + PyIOError::new_err(format!("Failed to xz-compress graph asset: {e}")) + })?; + let payload_offset = slot.stream_position().map_err(|e| { + PyIOError::new_err(format!("Failed to query output position: {e}")) + })?; + slot.write_all(&compressed).map_err(|e| { + PyIOError::new_err(format!("Failed to write graph asset payload: {e}")) + })?; + entries.push(BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: CANONICAL_NAME_GRAPH.to_string(), + payload_offset, + payload_len: compressed.len() as u64, + checksum: None, + }); + } + } + + let stream_start = file + .borrow_mut() + .stream_position() + .map_err(|e| PyIOError::new_err(format!("Failed to query output position: {e}")))?; + header.stream_offset = stream_start; + + OutputMode::Bundle { + header, + entries, + stream_start, + sample_count: 0, + } + }; + + // Construct the AssignmentWriter on a clone of the shared slot. + // This writes the BEN banner as its first action, which in the + // bundle case becomes the first byte of the stream region. + let encoder = AssignmentWriter::new(SharedFileWriter(Rc::clone(&file)), ben_var) + .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {e}")))?; + + Ok(PyBenEncoder { + file: Some(file), + encoder: Some(encoder), + mode, + }) + } + + /// Encode a single assignment and append it to the output stream. + #[pyo3(signature = (assignment))] + #[pyo3(text_signature = "(assignment)")] + fn write(&mut self, assignment: Vec) -> PyResult<()> { + let enc = self + .encoder + .as_mut() + .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; + enc.write_assignment(assignment) + .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {e}")))?; + if let OutputMode::Bundle { sample_count, .. } = &mut self.mode { + *sample_count += 1; + } + Ok(()) + } + + /// Flush the assignment stream and, for bundle output, patch the + /// header and write the trailing directory. Idempotent. + fn close(&mut self) -> PyResult<()> { + // Finish the assignment stream and drop the inner encoder so its + // Rc handle to the shared file slot is released. + if let Some(mut enc) = self.encoder.take() { + enc.finish().map_err(|e| { + PyIOError::new_err(format!("Failed to flush encoder when closing: {e}")) + })?; + drop(enc); + } + + let file = match self.file.take() { + Some(f) => f, + None => return Ok(()), + }; + + match &mut self.mode { + OutputMode::BenOnly => { + file.borrow_mut() + .flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + } + OutputMode::Bundle { + header, + entries, + stream_start, + sample_count, + } => { + let mut slot = file.borrow_mut(); + let stream_end = slot.stream_position().map_err(|e| { + PyIOError::new_err(format!("Failed to query output position: {e}")) + })?; + let stream_len = stream_end.saturating_sub(*stream_start); + + let directory_offset = stream_end; + let directory_bytes = encode_directory(entries).map_err(|e| { + PyException::new_err(format!("Failed to encode bundle directory: {e}")) + })?; + slot.write_all(&directory_bytes).map_err(|e| { + PyIOError::new_err(format!("Failed to write bundle directory: {e}")) + })?; + let directory_len = directory_bytes.len() as u64; + + header.stream_offset = *stream_start; + header.stream_len = stream_len; + header.directory_offset = directory_offset; + header.directory_len = directory_len; + header.sample_count = *sample_count; + header.complete = COMPLETE_YES; + + slot.seek(SeekFrom::Start(0)) + .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; + header.write_to(&mut *slot).map_err(|e| { + PyIOError::new_err(format!("Failed to patch bundle header: {e}")) + })?; + slot.flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + } + } + Ok(()) + } + + fn __enter__(slf: pyo3::PyRefMut) -> pyo3::PyRefMut { + slf + } + + fn __exit__( + &mut self, + _exc_type: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, + _exc_value: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, + _traceback: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, + ) -> PyResult { + self.close()?; + Ok(false) + } +} diff --git a/pyben/src/encode/helpers.rs b/pyben/src/encode/helpers.rs new file mode 100644 index 0000000..125a319 --- /dev/null +++ b/pyben/src/encode/helpers.rs @@ -0,0 +1,69 @@ +use binary_ensemble::io::bundle::format::DEFAULT_XZ_PRESET; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyDict, PyList}; +use std::io::{self, Write}; +use std::path::PathBuf; +use xz2::write::XzEncoder; + +/// xz-compress a byte slice with the bundle's default preset. +pub(super) fn xz_compress(bytes: &[u8]) -> io::Result> { + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder.write_all(bytes)?; + encoder.finish() +} + +/// Normalize a user-supplied graph argument into raw UTF-8 JSON bytes. +/// +/// Accepted forms: +/// +/// - `dict` / `list`: serialized via `json.dumps`. +/// - `bytes` / `bytearray`: used verbatim. +/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): +/// `.read()` is called and the result is coerced to bytes. +/// - `pathlib.Path` or `str`: treated as a filesystem path to read. +pub(super) fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { + // Dict / list → json.dumps. + if obj.is_instance_of::() || obj.is_instance_of::() { + let json_mod = py.import("json")?; + let dumped = json_mod.call_method1("dumps", (obj,))?; + let s: String = dumped.extract()?; + return Ok(s.into_bytes()); + } + + // Raw bytes / bytearray. + if let Ok(b) = obj.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = obj.extract::>() { + return Ok(b); + } + + // File-like: must have .read(). Check before str/path, since a plain + // `str` / `Path` has no `.read()` attribute and will fall through. + if obj.hasattr("read")? { + let data = obj.call_method0("read")?; + if let Ok(b) = data.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = data.extract::>() { + return Ok(b); + } + if let Ok(s) = data.extract::() { + return Ok(s.into_bytes()); + } + return Err(PyException::new_err( + "graph .read() must return bytes or str", + )); + } + + // Path / str → read the file at that path. + let path: PathBuf = obj.extract().map_err(|_| { + PyValueError::new_err( + "graph must be a dict/list, bytes, a file-like with .read(), or a path", + ) + })?; + std::fs::read(&path).map_err(|e| { + PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) + }) +} diff --git a/pyben/src/encode/mod.rs b/pyben/src/encode/mod.rs index 540e533..cd0c20a 100644 --- a/pyben/src/encode/mod.rs +++ b/pyben/src/encode/mod.rs @@ -1,418 +1,9 @@ -use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; -use binary_ensemble::codec::encode::{ - encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, -}; -use binary_ensemble::io::bundle::format::{ - encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, - ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH, COMPLETE_YES, DEFAULT_XZ_PRESET, - HEADER_SIZE, -}; -use binary_ensemble::io::writer::AssignmentWriter; -use pyo3::exceptions::{PyException, PyIOError, PyValueError}; -use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyDict, PyList}; -use std::cell::RefCell; -use std::fs::File; -use std::io::{self, BufWriter, Seek, SeekFrom, Write}; -use std::path::PathBuf; -use std::rc::Rc; -use xz2::write::XzEncoder; +//! Python bindings for BEN/XBEN encoding and `.bendl` bundle authoring. -/// Handle to the underlying output file shared between the live -/// `AssignmentWriter` and the `PyBenEncoder` that owns it. Needed so the -/// encoder can reach the buffered file after the inner assignment writer -/// has finished, in order to patch the bundle header and write the -/// trailing directory. -type SharedFileSlot = Rc>>; +mod encoder; +mod helpers; +mod py_funcs; +mod types; -/// Wrapper around a shared buffered file that implements `Write`. The -/// `AssignmentWriter` holds one of these and delegates every write into -/// the shared slot. -struct SharedFileWriter(SharedFileSlot); - -impl Write for SharedFileWriter { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.0.borrow_mut().write(buf) - } - - fn flush(&mut self) -> io::Result<()> { - self.0.borrow_mut().flush() - } -} - -/// Output container produced by `PyBenEncoder`. -enum OutputMode { - /// Plain `.ben` file: just the assignment stream, no header or directory. - BenOnly, - /// `.bendl` bundle: provisional header up front, optional graph asset, - /// then the assignment stream, then a directory written at close time. - Bundle { - header: BendlHeader, - entries: Vec, - stream_start: u64, - sample_count: i64, - }, -} - -#[pyclass(unsendable)] -pub struct PyBenEncoder { - file: Option, - encoder: Option>, - mode: OutputMode, -} - -#[pymethods] -impl PyBenEncoder { - /// Open a new encoder. The default output is a `.bendl` bundle with - /// an embedded assignment stream and an optional embedded graph; set - /// `ben_file_only=True` to emit a plain `.ben` file instead. - /// - /// # Arguments - /// - /// * `file_path` - Output path. Must not exist unless `overwrite=True`. - /// * `overwrite` - Replace an existing file at `file_path`. - /// * `variant` - BEN variant for the assignment stream (`"standard"`, - /// `"mkv_chain"`, or `"twodelta"`). - /// * `graph` - Optional graph to embed as the `graph.json` asset when - /// writing a bundle. Accepts a `pathlib.Path` / `str` path, a - /// `bytes` object containing UTF-8 JSON, a Python `dict` / `list` - /// that will be serialized with `json.dumps`, or a file-like object - /// with a `.read()` method. Passing a graph alongside - /// `ben_file_only=True` is an error. - /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no - /// bundle framing. Defaults to `False`. - #[new] - #[pyo3(signature = ( - file_path, - overwrite = false, - variant = None, - graph = None, - ben_file_only = false, - ))] - #[pyo3( - text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)" - )] - fn new( - py: Python<'_>, - file_path: PathBuf, - overwrite: bool, - variant: Option, - graph: Option>, - ben_file_only: bool, - ) -> PyResult { - let ben_var = parse_variant(variant.as_deref())?; - - if ben_file_only && graph.is_some() { - return Err(PyValueError::new_err( - "graph= cannot be combined with ben_file_only=True (the graph \ - would have nowhere to live in a plain .ben file).", - )); - } - - let buf = open_output(&file_path, overwrite)?; - let file: SharedFileSlot = Rc::new(RefCell::new(buf)); - - let mode = if ben_file_only { - OutputMode::BenOnly - } else { - let graph_bytes = match graph { - Some(obj) => Some(parse_graph_input(py, &obj)?), - None => None, - }; - - // Write a provisional bundle header and any graph asset before - // the assignment stream begins. - let mut header = BendlHeader::provisional(AssignmentFormat::Ben, HEADER_SIZE as u64); - let mut entries: Vec = Vec::new(); - { - let mut slot = file.borrow_mut(); - slot.seek(SeekFrom::Start(0)) - .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; - header.write_to(&mut *slot).map_err(|e| { - PyIOError::new_err(format!("Failed to write bundle header: {e}")) - })?; - - if let Some(bytes) = graph_bytes { - let compressed = xz_compress(&bytes).map_err(|e| { - PyIOError::new_err(format!("Failed to xz-compress graph asset: {e}")) - })?; - let payload_offset = slot.stream_position().map_err(|e| { - PyIOError::new_err(format!("Failed to query output position: {e}")) - })?; - slot.write_all(&compressed).map_err(|e| { - PyIOError::new_err(format!("Failed to write graph asset payload: {e}")) - })?; - entries.push(BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), - payload_offset, - payload_len: compressed.len() as u64, - checksum: None, - }); - } - } - - let stream_start = file - .borrow_mut() - .stream_position() - .map_err(|e| PyIOError::new_err(format!("Failed to query output position: {e}")))?; - header.stream_offset = stream_start; - - OutputMode::Bundle { - header, - entries, - stream_start, - sample_count: 0, - } - }; - - // Construct the AssignmentWriter on a clone of the shared slot. - // This writes the BEN banner as its first action, which in the - // bundle case becomes the first byte of the stream region. - let encoder = AssignmentWriter::new(SharedFileWriter(Rc::clone(&file)), ben_var) - .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {e}")))?; - - Ok(PyBenEncoder { - file: Some(file), - encoder: Some(encoder), - mode, - }) - } - - /// Encode a single assignment and append it to the output stream. - #[pyo3(signature = (assignment))] - #[pyo3(text_signature = "(assignment)")] - fn write(&mut self, assignment: Vec) -> PyResult<()> { - let enc = self - .encoder - .as_mut() - .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; - enc.write_assignment(assignment) - .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {e}")))?; - if let OutputMode::Bundle { sample_count, .. } = &mut self.mode { - *sample_count += 1; - } - Ok(()) - } - - /// Flush the assignment stream and, for bundle output, patch the - /// header and write the trailing directory. Idempotent. - fn close(&mut self) -> PyResult<()> { - // Finish the assignment stream and drop the inner encoder so its - // Rc handle to the shared file slot is released. - if let Some(mut enc) = self.encoder.take() { - enc.finish().map_err(|e| { - PyIOError::new_err(format!("Failed to flush encoder when closing: {e}")) - })?; - drop(enc); - } - - let file = match self.file.take() { - Some(f) => f, - None => return Ok(()), - }; - - match &mut self.mode { - OutputMode::BenOnly => { - file.borrow_mut() - .flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; - } - OutputMode::Bundle { - header, - entries, - stream_start, - sample_count, - } => { - let mut slot = file.borrow_mut(); - let stream_end = slot.stream_position().map_err(|e| { - PyIOError::new_err(format!("Failed to query output position: {e}")) - })?; - let stream_len = stream_end.saturating_sub(*stream_start); - - let directory_offset = stream_end; - let directory_bytes = encode_directory(entries).map_err(|e| { - PyException::new_err(format!("Failed to encode bundle directory: {e}")) - })?; - slot.write_all(&directory_bytes).map_err(|e| { - PyIOError::new_err(format!("Failed to write bundle directory: {e}")) - })?; - let directory_len = directory_bytes.len() as u64; - - header.stream_offset = *stream_start; - header.stream_len = stream_len; - header.directory_offset = directory_offset; - header.directory_len = directory_len; - header.sample_count = *sample_count; - header.complete = COMPLETE_YES; - - slot.seek(SeekFrom::Start(0)) - .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; - header.write_to(&mut *slot).map_err(|e| { - PyIOError::new_err(format!("Failed to patch bundle header: {e}")) - })?; - slot.flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; - } - } - Ok(()) - } - - fn __enter__(slf: pyo3::PyRefMut) -> pyo3::PyRefMut { - slf - } - - fn __exit__( - &mut self, - _exc_type: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, - _exc_value: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, - _traceback: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, - ) -> PyResult { - self.close()?; - Ok(false) - } -} - -/// xz-compress a byte slice with the bundle's default preset. -fn xz_compress(bytes: &[u8]) -> io::Result> { - let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); - encoder.write_all(bytes)?; - encoder.finish() -} - -/// Normalize a user-supplied graph argument into raw UTF-8 JSON bytes. -/// -/// Accepted forms: -/// -/// - `dict` / `list`: serialized via `json.dumps`. -/// - `bytes` / `bytearray`: used verbatim. -/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): -/// `.read()` is called and the result is coerced to bytes. -/// - `pathlib.Path` or `str`: treated as a filesystem path to read. -fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { - // Dict / list → json.dumps. - if obj.is_instance_of::() || obj.is_instance_of::() { - let json_mod = py.import("json")?; - let dumped = json_mod.call_method1("dumps", (obj,))?; - let s: String = dumped.extract()?; - return Ok(s.into_bytes()); - } - - // Raw bytes / bytearray. - if let Ok(b) = obj.downcast::() { - return Ok(b.as_bytes().to_vec()); - } - if let Ok(b) = obj.extract::>() { - return Ok(b); - } - - // File-like: must have .read(). Check before str/path, since a plain - // `str` / `Path` has no `.read()` attribute and will fall through. - if obj.hasattr("read")? { - let data = obj.call_method0("read")?; - if let Ok(b) = data.downcast::() { - return Ok(b.as_bytes().to_vec()); - } - if let Ok(b) = data.extract::>() { - return Ok(b); - } - if let Ok(s) = data.extract::() { - return Ok(s.into_bytes()); - } - return Err(PyException::new_err( - "graph .read() must return bytes or str", - )); - } - - // Path / str → read the file at that path. - let path: PathBuf = obj.extract().map_err(|_| { - PyValueError::new_err( - "graph must be a dict/list, bytes, a file-like with .read(), or a path", - ) - })?; - std::fs::read(&path).map_err(|e| { - PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) - }) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, n_threads = None, compression_level = None))] -#[pyo3( - text_signature = "(in_file, out_file, overwrite=false, n_threads=None, compression_level=None)" -)] -pub fn compress_ben_to_xben( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, - n_threads: Option, - compression_level: Option, -) -> PyResult<()> { - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert BEN to XBEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; - - Ok(()) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain')")] -pub fn compress_jsonl_to_ben( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, - variant: &str, -) -> PyResult<()> { - let ben_var = parse_variant(Some(variant))?; - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - encode_jsonl_to_ben(reader, writer, ben_var).map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert JSONL to BEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; - Ok(()) -} - -#[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None))] -#[pyo3( - text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain', n_threads=None, compression_level=None)" -)] -pub fn compress_jsonl_to_xben( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, - variant: &str, - n_threads: Option, - compression_level: Option, -) -> PyResult<()> { - let ben_var = parse_variant(Some(variant))?; - validate_input_output_paths(&in_file, &out_file)?; - let reader = open_input(&in_file)?; - let writer = open_output(&out_file, overwrite)?; - - encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err( - |e| { - PyIOError::new_err(format!( - "Failed to convert JSONL to XBEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - }, - )?; - Ok(()) -} +pub use encoder::PyBenEncoder; +pub use py_funcs::{compress_ben_to_xben, compress_jsonl_to_ben, compress_jsonl_to_xben}; diff --git a/pyben/src/encode/py_funcs.rs b/pyben/src/encode/py_funcs.rs new file mode 100644 index 0000000..e90d833 --- /dev/null +++ b/pyben/src/encode/py_funcs.rs @@ -0,0 +1,86 @@ +use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; +use binary_ensemble::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; +use pyo3::exceptions::PyIOError; +use pyo3::prelude::*; +use std::path::PathBuf; + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false, n_threads = None, compression_level = None))] +#[pyo3( + text_signature = "(in_file, out_file, overwrite=false, n_threads=None, compression_level=None)" +)] +pub fn compress_ben_to_xben( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, + n_threads: Option, + compression_level: Option, +) -> PyResult<()> { + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert BEN to XBEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; + + Ok(()) +} + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain')")] +pub fn compress_jsonl_to_ben( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, + variant: &str, +) -> PyResult<()> { + let ben_var = parse_variant(Some(variant))?; + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + encode_jsonl_to_ben(reader, writer, ben_var).map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert JSONL to BEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; + Ok(()) +} + +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None))] +#[pyo3( + text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain', n_threads=None, compression_level=None)" +)] +pub fn compress_jsonl_to_xben( + in_file: PathBuf, + out_file: PathBuf, + overwrite: bool, + variant: &str, + n_threads: Option, + compression_level: Option, +) -> PyResult<()> { + let ben_var = parse_variant(Some(variant))?; + validate_input_output_paths(&in_file, &out_file)?; + let reader = open_input(&in_file)?; + let writer = open_output(&out_file, overwrite)?; + + encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err( + |e| { + PyIOError::new_err(format!( + "Failed to convert JSONL to XBEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + }, + )?; + Ok(()) +} diff --git a/pyben/src/encode/types.rs b/pyben/src/encode/types.rs new file mode 100644 index 0000000..02d8ee3 --- /dev/null +++ b/pyben/src/encode/types.rs @@ -0,0 +1,41 @@ +use binary_ensemble::io::bundle::format::{BendlDirectoryEntry, BendlHeader}; +use std::cell::RefCell; +use std::fs::File; +use std::io::{self, BufWriter, Write}; +use std::rc::Rc; + +/// Handle to the underlying output file shared between the live +/// `AssignmentWriter` and the `PyBenEncoder` that owns it. Needed so the +/// encoder can reach the buffered file after the inner assignment writer +/// has finished, in order to patch the bundle header and write the +/// trailing directory. +pub(super) type SharedFileSlot = Rc>>; + +/// Wrapper around a shared buffered file that implements `Write`. The +/// `AssignmentWriter` holds one of these and delegates every write into +/// the shared slot. +pub(super) struct SharedFileWriter(pub SharedFileSlot); + +impl Write for SharedFileWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0.borrow_mut().write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.0.borrow_mut().flush() + } +} + +/// Output container produced by `PyBenEncoder`. +pub(super) enum OutputMode { + /// Plain `.ben` file: just the assignment stream, no header or directory. + BenOnly, + /// `.bendl` bundle: provisional header up front, optional graph asset, + /// then the assignment stream, then a directory written at close time. + Bundle { + header: BendlHeader, + entries: Vec, + stream_start: u64, + sample_count: i64, + }, +} From aa8ad81725de8f52fe0cb0578fafad9f207b5fd1 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 1 May 2026 09:22:12 -0600 Subject: [PATCH 078/221] reorg to make cli easier to test --- Taskfile.yml | 27 +- ben/src/bin/ben.rs | 5 +- ben/src/bin/bendl.rs | 5 +- ben/src/bin/pben.rs | 7 +- ben/src/bin/reben.rs | 5 +- ben/src/cli/ben/mod.rs | 375 +------------------------ ben/src/cli/ben/modes/decode.rs | 47 ++++ ben/src/cli/ben/modes/encode.rs | 54 ++++ ben/src/cli/ben/modes/mod.rs | 13 + ben/src/cli/ben/modes/read.rs | 29 ++ ben/src/cli/ben/modes/xdecode.rs | 25 ++ ben/src/cli/ben/modes/xencode.rs | 97 +++++++ ben/src/cli/ben/modes/xz_compress.rs | 30 ++ ben/src/cli/ben/modes/xz_decompress.rs | 36 +++ ben/src/cli/ben/tests.rs | 10 +- ben/src/cli/bendl/mod.rs | 12 +- ben/src/cli/bendl/tests.rs | 15 +- ben/src/cli/common/error.rs | 109 +++++++ ben/src/cli/common/mod.rs | 55 +++- ben/src/cli/common/tests.rs | 37 ++- ben/src/cli/pben/mod.rs | 22 +- ben/src/cli/reben/mod.rs | 10 +- ben/src/cli/reben/tests.rs | 18 +- ben/src/codec/decode/mod.rs | 4 + ben/src/codec/decode/path.rs | 58 ++++ ben/src/codec/encode/mod.rs | 4 + ben/src/codec/encode/path.rs | 180 ++++++++++++ ben/src/lib.rs | 3 + ben/src/ops/extract/mod.rs | 22 +- ben/src/ops/extract/tests.rs | 26 ++ ben/src/test_utils.rs | 129 +++++++++ ben/tests/common/mod.rs | 27 ++ ben/tests/test_assignment_reader.rs | 14 +- ben/tests/test_cli.rs | 32 +-- ben/tests/test_coverage.rs | 14 +- ben/tests/test_impls_pipeline.rs | 26 +- 36 files changed, 1073 insertions(+), 509 deletions(-) create mode 100644 ben/src/cli/ben/modes/decode.rs create mode 100644 ben/src/cli/ben/modes/encode.rs create mode 100644 ben/src/cli/ben/modes/mod.rs create mode 100644 ben/src/cli/ben/modes/read.rs create mode 100644 ben/src/cli/ben/modes/xdecode.rs create mode 100644 ben/src/cli/ben/modes/xencode.rs create mode 100644 ben/src/cli/ben/modes/xz_compress.rs create mode 100644 ben/src/cli/ben/modes/xz_decompress.rs create mode 100644 ben/src/cli/common/error.rs create mode 100644 ben/src/codec/decode/path.rs create mode 100644 ben/src/codec/encode/path.rs create mode 100644 ben/src/test_utils.rs create mode 100644 ben/tests/common/mod.rs diff --git a/Taskfile.yml b/Taskfile.yml index a9696bd..b5dd178 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -7,7 +7,7 @@ vars: LOCAL_BIN: '{{.HOME}}/.local/bin' LLVM_BIN: '{{.HOME}}/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/bin' COV_TARGET_DIR: '{{.ROOT_DIR}}/target/llvm-cov-target' - PYBEN_TEST_PATHS: 'tests/test_python_pipelines.py' + PYBEN_TEST_PATHS: 'tests/' tasks: default: @@ -101,8 +101,8 @@ tasks: cmds: - uv run maturin build --release - test-rust: - desc: Run Rust tests for the workspace + test-rust-fast: + desc: Run the default Rust test suite (excludes #[ignore] tests) silent: true deps: - ensure-toolchain @@ -111,6 +111,23 @@ tasks: cmds: - cargo test + test-rust-slow: + desc: Run only the `#[ignore]`-gated slow / stress Rust tests + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo test -- --ignored + + test-rust: + desc: Run Rust tests for the workspace (fast suite plus #[ignore]-gated stress tests) + silent: true + cmds: + - task: test-rust-fast + - task: test-rust-slow + test-python: desc: Run the pyben Python tests silent: true @@ -137,7 +154,7 @@ tasks: env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)(bin|cli)/' + - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)bin/' coverage-pyben: desc: Run Python-driven Rust coverage for pyben @@ -208,7 +225,7 @@ tasks: ben_report_file=/tmp/ben-coverage-report.txt; pyben_report_file=/tmp/pyben-coverage-report.txt; - cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)(bin|cli)/'"'"' > "$ben_report_file"; + cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)bin/'"'"' > "$ben_report_file"; ben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_report_file")"; cargo llvm-cov clean --workspace >/dev/null; diff --git a/ben/src/bin/ben.rs b/ben/src/bin/ben.rs index ffe2698..6972558 100755 --- a/ben/src/bin/ben.rs +++ b/ben/src/bin/ben.rs @@ -1,4 +1,7 @@ /// Entry point for the `ben` CLI binary. fn main() { - binary_ensemble::cli::ben::run(); + if let Err(err) = binary_ensemble::cli::ben::run() { + eprintln!("Error: {err}"); + std::process::exit(1); + } } diff --git a/ben/src/bin/bendl.rs b/ben/src/bin/bendl.rs index 5033a34..efa6c99 100644 --- a/ben/src/bin/bendl.rs +++ b/ben/src/bin/bendl.rs @@ -1,4 +1,7 @@ /// Entry point for the `bendl` CLI binary. fn main() { - binary_ensemble::cli::bendl::run(); + if let Err(err) = binary_ensemble::cli::bendl::run() { + eprintln!("Error: {err}"); + std::process::exit(1); + } } diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs index 2409401..cca6ca2 100755 --- a/ben/src/bin/pben.rs +++ b/ben/src/bin/pben.rs @@ -1,4 +1,7 @@ /// Entry point for the `pben` CLI binary. -fn main() -> std::io::Result<()> { - binary_ensemble::cli::pben::run() +fn main() { + if let Err(err) = binary_ensemble::cli::pben::run() { + eprintln!("Error: {err}"); + std::process::exit(1); + } } diff --git a/ben/src/bin/reben.rs b/ben/src/bin/reben.rs index f94f8d2..dad21db 100755 --- a/ben/src/bin/reben.rs +++ b/ben/src/bin/reben.rs @@ -1,4 +1,7 @@ /// Entry point for the `reben` CLI binary. fn main() { - binary_ensemble::cli::reben::run(); + if let Err(err) = binary_ensemble::cli::reben::run() { + eprintln!("Error: {err}"); + std::process::exit(1); + } } diff --git a/ben/src/cli/ben/mod.rs b/ben/src/cli/ben/mod.rs index 0a350cf..4f06544 100644 --- a/ben/src/cli/ben/mod.rs +++ b/ben/src/cli/ben/mod.rs @@ -2,381 +2,36 @@ mod args; mod bundle; +mod modes; mod paths; #[cfg(test)] mod tests; -use args::{resolve_variant, Args, Mode}; -use bundle::{run_encode_bundle_with_graph, run_xencode_bundle_with_graph}; -use paths::{decode_setup, encode_setup, open_derived_writer, open_reader, open_writer}; +use args::{Args, Mode}; -use crate::cli::common::{check_overwrite, set_verbose}; -use crate::codec::decode::{ - decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, -}; -use crate::codec::encode::{ - encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, -}; -use crate::ops::extract::extract_assignment_ben; +use crate::cli::common::{set_verbose, CliError, CliResult}; use clap::Parser; -use std::fs::File; -use std::io::{BufReader, BufWriter, Write}; -use std::path::Path; -/// Parse CLI arguments and execute the selected `ben` sub-mode. -pub fn run() { +/// Parse CLI arguments and dispatch to the per-mode handler in [`modes`]. +pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); // --graph is only meaningful for the stream-producing modes. if args.graph.is_some() && args.mode != Mode::Encode && args.mode != Mode::XEncode { - eprintln!("Error: --graph is only supported with --mode encode or --mode x-encode"); - return; + return Err(CliError::other( + "--graph is only supported with --mode encode or --mode x-encode", + )); } match args.mode { - Mode::Encode => { - tracing::trace!("Running in encode mode"); - - // --graph path: produce a .bendl bundle with the BEN stream - // plus a post-stream graph asset. - if let Some(graph_path) = args.graph.as_ref() { - let in_file = match args.input_file.as_ref() { - Some(f) => f, - None => { - eprintln!("Error: --graph requires an input file (stdin not supported)."); - return; - } - }; - if args.print { - eprintln!("Error: --graph is incompatible with --print."); - return; - } - let out_path = match encode_setup( - args.mode.clone(), - in_file.clone(), - args.output_file.clone(), - args.overwrite, - true, - ) { - Ok(path) => path, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = - run_encode_bundle_with_graph(Path::new(in_file), &out_path, variant, graph_path) - { - eprintln!("Error: {:?}", err); - } - return; - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(in_file) if !args.print => match encode_setup( - args.mode.clone(), - in_file.clone(), - args.output_file.clone(), - args.overwrite, - false, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = encode_jsonl_to_ben(reader, writer, variant) { - eprintln!("Error: {:?}", err); - } - } - Mode::XEncode => { - tracing::trace!("Running in xencode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_xben = args.jsonl_and_xben; - - if let Some(in_file) = args.input_file.as_ref() { - if in_file.ends_with(".ben") { - ben_and_xben = true; - } else if in_file.ends_with(".jsonl") { - jsonl_and_xben = true; - } - } - - // --graph path: produce a .bendl bundle with the XBEN stream - // plus a post-stream graph asset. - if let Some(graph_path) = args.graph.as_ref() { - let in_file = match args.input_file.as_ref() { - Some(f) => f, - None => { - eprintln!("Error: --graph requires an input file (stdin not supported)."); - return; - } - }; - if args.print { - eprintln!("Error: --graph is incompatible with --print."); - return; - } - if !ben_and_xben && !jsonl_and_xben { - eprintln!("Error: Unsupported file type(s) for xencode mode"); - return; - } - let out_path = match encode_setup( - args.mode.clone(), - in_file.clone(), - args.output_file.clone(), - args.overwrite, - true, - ) { - Ok(path) => path, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - let variant = resolve_variant(args.variant, args.save_all); - if let Err(err) = run_xencode_bundle_with_graph( - Path::new(in_file), - &out_path, - variant, - ben_and_xben, - args.n_cpus, - args.compression_level, - args.chunk_size, - graph_path, - ) { - eprintln!("Error: {:?}", err); - } - return; - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(in_file) if !args.print => match encode_setup( - args.mode.clone(), - in_file.clone(), - args.output_file.clone(), - args.overwrite, - false, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if ben_and_xben { - if let Err(err) = encode_ben_to_xben( - reader, - writer, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_xben { - let variant = resolve_variant(args.variant, args.save_all); - if let Err(e) = encode_jsonl_to_xben( - reader, - writer, - variant, - args.n_cpus, - args.compression_level, - args.chunk_size, - ) { - eprintln!("Error: {:?}", e); - } - } else { - eprintln!("Error: Unsupported file type(s) for xencode mode"); - } - } - Mode::Decode => { - tracing::trace!("Running in decode mode"); - - let mut ben_and_xben = args.ben_and_xben; - let mut jsonl_and_ben = args.jsonl_and_ben; - - if let Some(file) = args.input_file.as_ref() { - if file.ends_with(".ben") { - jsonl_and_ben = true; - } else if file.ends_with(".xben") { - ben_and_xben = true; - } - } - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(file) if !args.print => { - match decode_setup( - file.clone(), - args.output_file.clone(), - false, - args.overwrite, - ) { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - } - } - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if ben_and_xben { - if let Err(err) = decode_xben_to_ben(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else if jsonl_and_ben { - if let Err(err) = decode_ben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } else { - eprintln!("Error: Unsupported file type(s) for decode mode"); - } - } - Mode::XDecode => { - tracing::trace!("Running in x-decode mode"); - - let reader = open_reader(args.input_file.as_deref()); - let writer = match args.input_file.as_ref() { - Some(file) if !args.print => { - match decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite) - { - Ok(path) => open_derived_writer(path), - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - } - } - _ => match open_writer(args.output_file.as_deref(), args.print, args.overwrite) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }, - }; - - if let Err(err) = decode_xben_to_jsonl(reader, writer) { - eprintln!("Error: {:?}", err); - } - } - Mode::Read => { - tracing::trace!("Running in read mode"); - let reader = BufReader::new( - File::open( - &args - .input_file - .expect("Must provide input file for read mode."), - ) - .unwrap(), - ); - - if args.sample_number.is_none() { - eprintln!("Error: Sample number is required in read mode"); - return; - } - - let mut writer = match open_writer(args.output_file.as_deref(), args.print, false) { - Ok(writer) => writer, - Err(err) => { - eprintln!("Error: {:?}", err); - return; - } - }; - - args.sample_number - .map(|n| match extract_assignment_ben(reader, n) { - Ok(vec) => writer.write_all(format!("{:?}\n", vec).as_bytes()).unwrap(), - Err(e) => eprintln!("Error: {:?}", e), - }); - } - Mode::XzCompress => { - tracing::trace!("Running in xz compress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-compress mode."); - let reader = BufReader::new(File::open(&in_file_name).unwrap()); - - let out_file_name = match args.output_file { - Some(name) => name, - None => in_file_name + ".xz", - }; - - if let Err(err) = check_overwrite(&out_file_name, args.overwrite) { - eprintln!("Error: {:?}", err); - return; - } - - let writer = BufWriter::new(File::create(out_file_name).unwrap()); - - if let Err(err) = xz_compress(reader, writer, args.n_cpus, args.compression_level) { - eprintln!("Error: {:?}", err); - } - tracing::trace!("Done!"); - } - Mode::XzDecompress => { - tracing::trace!("Running in xz decompress mode"); - - let in_file_name = args - .input_file - .expect("Must provide input file for xz-decompress mode."); - - if !in_file_name.ends_with(".xz") { - eprintln!("Error: Unsupported file type for xz decompress mode"); - return; - } - - let output_file_name = match args.output_file { - Some(name) => name, - None => in_file_name[..in_file_name.len() - 3].to_string(), - }; - - if let Err(err) = check_overwrite(&output_file_name, args.overwrite) { - eprintln!("Error: {:?}", err); - return; - } - - let reader = BufReader::new(File::open(&in_file_name).unwrap()); - let writer = BufWriter::new(File::create(output_file_name).unwrap()); - - if let Err(err) = xz_decompress(reader, writer) { - eprintln!("Error: {:?}", err); - } - } + Mode::Encode => modes::encode::run(args), + Mode::XEncode => modes::xencode::run(args), + Mode::Decode => modes::decode::run(args), + Mode::XDecode => modes::xdecode::run(args), + Mode::Read => modes::read::run(args), + Mode::XzCompress => modes::xz_compress::run(args), + Mode::XzDecompress => modes::xz_decompress::run(args), } } diff --git a/ben/src/cli/ben/modes/decode.rs b/ben/src/cli/ben/modes/decode.rs new file mode 100644 index 0000000..e673d6a --- /dev/null +++ b/ben/src/cli/ben/modes/decode.rs @@ -0,0 +1,47 @@ +//! `ben --mode decode` handler. + +use super::super::args::Args; +use super::super::paths::{decode_setup, open_derived_writer, open_reader, open_writer}; + +use crate::cli::common::{CliError, CliResult}; +use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; + +/// Execute the `decode` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in decode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_ben = args.jsonl_and_ben; + + if let Some(file) = args.input_file.as_ref() { + if file.ends_with(".ben") { + jsonl_and_ben = true; + } else if file.ends_with(".xben") { + ben_and_xben = true; + } + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + let path = decode_setup( + file.clone(), + args.output_file.clone(), + false, + args.overwrite, + )?; + open_derived_writer(path) + } + _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, + }; + + if ben_and_xben { + decode_xben_to_ben(reader, writer)?; + Ok(()) + } else if jsonl_and_ben { + decode_ben_to_jsonl(reader, writer)?; + Ok(()) + } else { + Err(CliError::other("Unsupported file type(s) for decode mode")) + } +} diff --git a/ben/src/cli/ben/modes/encode.rs b/ben/src/cli/ben/modes/encode.rs new file mode 100644 index 0000000..48f9c78 --- /dev/null +++ b/ben/src/cli/ben/modes/encode.rs @@ -0,0 +1,54 @@ +//! `ben --mode encode` handler. + +use super::super::args::{resolve_variant, Args}; +use super::super::bundle::run_encode_bundle_with_graph; +use super::super::paths::{encode_setup, open_derived_writer, open_reader, open_writer}; + +use crate::cli::common::{CliError, CliResult}; +use crate::codec::encode::encode_jsonl_to_ben; +use std::path::Path; + +/// Execute the `encode` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in encode mode"); + + // --graph path: produce a .bendl bundle with the BEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = args.input_file.as_ref().ok_or_else(|| { + CliError::other("--graph requires an input file (stdin not supported).") + })?; + if args.print { + return Err(CliError::other("--graph is incompatible with --print.")); + } + let out_path = encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + )?; + let variant = resolve_variant(args.variant, args.save_all); + run_encode_bundle_with_graph(Path::new(in_file), &out_path, variant, graph_path)?; + return Ok(()); + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => { + let path = encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + false, + )?; + open_derived_writer(path) + } + _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, + }; + + let variant = resolve_variant(args.variant, args.save_all); + encode_jsonl_to_ben(reader, writer, variant)?; + Ok(()) +} diff --git a/ben/src/cli/ben/modes/mod.rs b/ben/src/cli/ben/modes/mod.rs new file mode 100644 index 0000000..f3fb0aa --- /dev/null +++ b/ben/src/cli/ben/modes/mod.rs @@ -0,0 +1,13 @@ +//! Per-mode handlers for the `ben` CLI. +//! +//! The dispatcher in `super::run` matches on the parsed `Mode` enum and +//! forwards to one of these handlers. Splitting one handler per file keeps +//! each mode under ~40 lines and makes them individually testable. + +pub(super) mod decode; +pub(super) mod encode; +pub(super) mod read; +pub(super) mod xdecode; +pub(super) mod xencode; +pub(super) mod xz_compress; +pub(super) mod xz_decompress; diff --git a/ben/src/cli/ben/modes/read.rs b/ben/src/cli/ben/modes/read.rs new file mode 100644 index 0000000..e297fc4 --- /dev/null +++ b/ben/src/cli/ben/modes/read.rs @@ -0,0 +1,29 @@ +//! `ben --mode read` handler. + +use super::super::args::Args; +use super::super::paths::open_writer; + +use crate::cli::common::{CliError, CliResult}; +use crate::ops::extract::extract_assignment_ben; +use std::fs::File; +use std::io::{BufReader, Write}; + +/// Execute the `read` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in read mode"); + + let in_file = args + .input_file + .ok_or_else(|| CliError::other("Must provide input file for read mode."))?; + let reader = BufReader::new(File::open(&in_file)?); + + let sample_number = args + .sample_number + .ok_or_else(|| CliError::other("Sample number is required in read mode"))?; + + let mut writer = open_writer(args.output_file.as_deref(), args.print, false)?; + let vec = extract_assignment_ben(reader, sample_number) + .map_err(|e| CliError::other(format!("{e}")))?; + writer.write_all(format!("{:?}\n", vec).as_bytes())?; + Ok(()) +} diff --git a/ben/src/cli/ben/modes/xdecode.rs b/ben/src/cli/ben/modes/xdecode.rs new file mode 100644 index 0000000..436f298 --- /dev/null +++ b/ben/src/cli/ben/modes/xdecode.rs @@ -0,0 +1,25 @@ +//! `ben --mode x-decode` handler. + +use super::super::args::Args; +use super::super::paths::{decode_setup, open_derived_writer, open_reader, open_writer}; + +use crate::cli::common::CliResult; +use crate::codec::decode::decode_xben_to_jsonl; + +/// Execute the `x-decode` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in x-decode mode"); + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(file) if !args.print => { + let path = + decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite)?; + open_derived_writer(path) + } + _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, + }; + + decode_xben_to_jsonl(reader, writer)?; + Ok(()) +} diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs new file mode 100644 index 0000000..1f67eb0 --- /dev/null +++ b/ben/src/cli/ben/modes/xencode.rs @@ -0,0 +1,97 @@ +//! `ben --mode x-encode` handler. + +use super::super::args::{resolve_variant, Args}; +use super::super::bundle::run_xencode_bundle_with_graph; +use super::super::paths::{encode_setup, open_derived_writer, open_reader, open_writer}; + +use crate::cli::common::{CliError, CliResult}; +use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_xben}; +use std::path::Path; + +/// Execute the `x-encode` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in xencode mode"); + + let mut ben_and_xben = args.ben_and_xben; + let mut jsonl_and_xben = args.jsonl_and_xben; + + if let Some(in_file) = args.input_file.as_ref() { + if in_file.ends_with(".ben") { + ben_and_xben = true; + } else if in_file.ends_with(".jsonl") { + jsonl_and_xben = true; + } + } + + // --graph path: produce a .bendl bundle with the XBEN stream + // plus a post-stream graph asset. + if let Some(graph_path) = args.graph.as_ref() { + let in_file = args.input_file.as_ref().ok_or_else(|| { + CliError::other("--graph requires an input file (stdin not supported).") + })?; + if args.print { + return Err(CliError::other("--graph is incompatible with --print.")); + } + if !ben_and_xben && !jsonl_and_xben { + return Err(CliError::other("Unsupported file type(s) for xencode mode")); + } + let out_path = encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + true, + )?; + let variant = resolve_variant(args.variant, args.save_all); + run_xencode_bundle_with_graph( + Path::new(in_file), + &out_path, + variant, + ben_and_xben, + args.n_cpus, + args.compression_level, + args.chunk_size, + graph_path, + )?; + return Ok(()); + } + + let reader = open_reader(args.input_file.as_deref()); + let writer = match args.input_file.as_ref() { + Some(in_file) if !args.print => { + let path = encode_setup( + args.mode.clone(), + in_file.clone(), + args.output_file.clone(), + args.overwrite, + false, + )?; + open_derived_writer(path) + } + _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, + }; + + if ben_and_xben { + encode_ben_to_xben( + reader, + writer, + args.n_cpus, + args.compression_level, + args.chunk_size, + )?; + Ok(()) + } else if jsonl_and_xben { + let variant = resolve_variant(args.variant, args.save_all); + encode_jsonl_to_xben( + reader, + writer, + variant, + args.n_cpus, + args.compression_level, + args.chunk_size, + )?; + Ok(()) + } else { + Err(CliError::other("Unsupported file type(s) for xencode mode")) + } +} diff --git a/ben/src/cli/ben/modes/xz_compress.rs b/ben/src/cli/ben/modes/xz_compress.rs new file mode 100644 index 0000000..b80a019 --- /dev/null +++ b/ben/src/cli/ben/modes/xz_compress.rs @@ -0,0 +1,30 @@ +//! `ben --mode xz-compress` handler. + +use super::super::args::Args; + +use crate::cli::common::{check_overwrite, CliError, CliResult}; +use crate::codec::encode::xz_compress; +use std::fs::File; +use std::io::{BufReader, BufWriter}; + +/// Execute the `xz-compress` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in xz compress mode"); + + let in_file_name = args + .input_file + .ok_or_else(|| CliError::other("Must provide input file for xz-compress mode."))?; + let reader = BufReader::new(File::open(&in_file_name)?); + + let out_file_name = match args.output_file { + Some(name) => name, + None => in_file_name + ".xz", + }; + + check_overwrite(&out_file_name, args.overwrite)?; + let writer = BufWriter::new(File::create(out_file_name)?); + + xz_compress(reader, writer, args.n_cpus, args.compression_level)?; + tracing::trace!("Done!"); + Ok(()) +} diff --git a/ben/src/cli/ben/modes/xz_decompress.rs b/ben/src/cli/ben/modes/xz_decompress.rs new file mode 100644 index 0000000..19496ae --- /dev/null +++ b/ben/src/cli/ben/modes/xz_decompress.rs @@ -0,0 +1,36 @@ +//! `ben --mode xz-decompress` handler. + +use super::super::args::Args; + +use crate::cli::common::{check_overwrite, CliError, CliResult}; +use crate::codec::decode::xz_decompress; +use std::fs::File; +use std::io::{BufReader, BufWriter}; + +/// Execute the `xz-decompress` sub-mode. +pub(in crate::cli::ben) fn run(args: Args) -> CliResult { + tracing::trace!("Running in xz decompress mode"); + + let in_file_name = args + .input_file + .ok_or_else(|| CliError::other("Must provide input file for xz-decompress mode."))?; + + if !in_file_name.ends_with(".xz") { + return Err(CliError::other( + "Unsupported file type for xz decompress mode", + )); + } + + let output_file_name = match args.output_file { + Some(name) => name, + None => in_file_name[..in_file_name.len() - 3].to_string(), + }; + + check_overwrite(&output_file_name, args.overwrite)?; + + let reader = BufReader::new(File::open(&in_file_name)?); + let writer = BufWriter::new(File::create(output_file_name)?); + + xz_decompress(reader, writer)?; + Ok(()) +} diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index 23f22d4..6585025 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -6,19 +6,11 @@ use super::bundle::{ use super::paths::{ count_jsonl_lines, decode_setup, encode_setup, open_derived_writer, open_reader, open_writer, }; +use crate::test_utils::unique_path; use crate::BenVariant; use clap::{CommandFactory, Parser}; use std::fs; use std::io::{self, Write}; -use std::time::{SystemTime, UNIX_EPOCH}; - -fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("ben-cli-ben-{name}-{nonce}")) -} #[test] fn clap_metadata_uses_package_version() { diff --git a/ben/src/cli/bendl/mod.rs b/ben/src/cli/bendl/mod.rs index 11a1419..bb59494 100644 --- a/ben/src/cli/bendl/mod.rs +++ b/ben/src/cli/bendl/mod.rs @@ -26,23 +26,19 @@ use create::run_create; use extract::run_extract; use inspect::run_inspect; -use crate::cli::common::set_verbose; +use crate::cli::common::{set_verbose, CliError, CliResult}; use clap::Parser; /// Parse CLI arguments and execute the selected subcommand. -pub fn run() { +pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); - let result = match args.command { + match args.command { Command::Create(a) => run_create(a), Command::Inspect(a) => run_inspect(a), Command::Extract(a) => run_extract(a), Command::Append(a) => run_append(a), - }; - - if let Err(err) = result { - eprintln!("Error: {err}"); - std::process::exit(1); } + .map_err(CliError::from) } diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index 8d231b4..c6bb697 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -7,27 +7,16 @@ use super::inspect::run_inspect; use crate::codec::encode::encode_jsonl_to_ben; use crate::io::bundle::format::AssignmentFormat; use crate::io::bundle::{BendlReader, BendlWriter}; +use crate::test_utils::{sample_bendl_bytes, unique_path}; use clap::Parser; use std::io::{BufReader, Cursor}; use std::path::PathBuf; use std::time::{SystemTime, UNIX_EPOCH}; -fn unique_path(name: &str) -> PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("bendl-cli-{name}-{nonce}")) -} - /// Write a minimal finalized .bendl file and return its path. fn write_temp_bendl(name: &str, format: AssignmentFormat) -> PathBuf { let path = unique_path(name); - let stream = b"STANDARD BEN FILE\x00fake"; - let mut buf: Vec = Vec::new(); - let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); - writer.write_stream_bytes(stream, 1).unwrap(); - writer.finish().unwrap(); + let buf = sample_bendl_bytes(b"STANDARD BEN FILE\x00fake", format); std::fs::write(&path, &buf).unwrap(); path } diff --git a/ben/src/cli/common/error.rs b/ben/src/cli/common/error.rs new file mode 100644 index 0000000..3942a57 --- /dev/null +++ b/ben/src/cli/common/error.rs @@ -0,0 +1,109 @@ +//! Error type used by the top-level `run()` functions of every CLI binary. +//! +//! The shape is intentionally narrow: a few specific variants for cases where +//! a caller (or test) might want to match the error type, plus an `Other` +//! catch-all that preserves the older `Result<(), String>` ergonomic so the +//! existing per-command runners still propagate cleanly via `?`. + +use std::fmt; +use std::io; +use std::path::PathBuf; + +/// Error returned by the top-level CLI `run()` functions. +#[derive(Debug)] +pub enum CliError { + /// An underlying I/O error (file open, read, write, etc.). + Io(io::Error), + /// The output path already existed and the user declined to overwrite. + OverwriteRefused(PathBuf), + /// A free-form error message. Used as a catch-all so existing + /// `Result<(), String>` runners still flow through unchanged. + Other(String), +} + +/// Convenience alias for `Result`. +pub type CliResult = Result; + +impl CliError { + /// Construct a free-form error from anything that displays to a string. + pub fn other>(s: S) -> Self { + CliError::Other(s.into()) + } +} + +impl fmt::Display for CliError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CliError::Io(e) => write!(f, "{e}"), + CliError::OverwriteRefused(p) => { + write!(f, "user declined to overwrite {}", p.display()) + } + CliError::Other(msg) => f.write_str(msg), + } + } +} + +impl std::error::Error for CliError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + CliError::Io(e) => Some(e), + _ => None, + } + } +} + +impl From for CliError { + fn from(e: io::Error) -> Self { + CliError::Io(e) + } +} + +impl From for CliError { + fn from(s: String) -> Self { + CliError::Other(s) + } +} + +impl From<&str> for CliError { + fn from(s: &str) -> Self { + CliError::Other(s.to_string()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn io_error_round_trips_via_from() { + let original = io::Error::new(io::ErrorKind::NotFound, "missing"); + let cli: CliError = original.into(); + assert!(matches!(cli, CliError::Io(_))); + assert_eq!(cli.to_string(), "missing"); + } + + #[test] + fn other_constructor_accepts_string_and_str() { + let a: CliError = "boom".into(); + assert_eq!(a.to_string(), "boom"); + let b: CliError = String::from("kapow").into(); + assert_eq!(b.to_string(), "kapow"); + let c = CliError::other("bang"); + assert_eq!(c.to_string(), "bang"); + } + + #[test] + fn overwrite_refused_displays_path() { + let e = CliError::OverwriteRefused(PathBuf::from("/tmp/out.bin")); + assert!(e.to_string().contains("/tmp/out.bin")); + assert!(e.to_string().contains("declined")); + } + + #[test] + fn io_source_propagates() { + use std::error::Error; + let original = io::Error::new(io::ErrorKind::Other, "deep"); + let cli = CliError::Io(original); + assert!(cli.source().is_some()); + } +} diff --git a/ben/src/cli/common/mod.rs b/ben/src/cli/common/mod.rs index 0f17d10..60afb38 100644 --- a/ben/src/cli/common/mod.rs +++ b/ben/src/cli/common/mod.rs @@ -1,3 +1,6 @@ +pub mod error; +pub use error::{CliError, CliResult}; + use std::io::{self, Result}; use std::path::Path; @@ -21,6 +24,39 @@ pub fn set_verbose(verbose: bool) { crate::logging::init_logging(); } +/// Decide whether overwriting an output path should proceed, given the +/// state observed by the caller. +/// +/// This is the pure half of [`check_overwrite`]: it does no I/O, so it can +/// be unit-tested by enumerating the four reachable states (file missing / +/// `overwrite` flag set / user said yes / user said anything else). +/// +/// # Arguments +/// +/// * `file_exists` - Whether the candidate output path already exists. +/// * `overwrite` - Whether the caller passed `--overwrite` to skip prompting. +/// * `response` - The line the user typed in response to the overwrite +/// prompt, or `None` if no prompt was issued. +/// +/// # Returns +/// +/// Returns `true` when the caller may safely overwrite; `false` when the +/// user (or the absence of a yes-response) indicates the operation should +/// be aborted. +pub(crate) fn check_overwrite_pure( + file_exists: bool, + overwrite: bool, + response: Option<&str>, +) -> bool { + if !file_exists || overwrite { + return true; + } + matches!( + response.map(|s| s.trim().to_lowercase()).as_deref(), + Some("y") | Some("yes") + ) +} + /// Confirm whether an existing output path may be overwritten. /// /// If `overwrite` is `false` and the destination already exists, the user is @@ -36,19 +72,24 @@ pub fn set_verbose(verbose: bool) { /// /// Returns `Ok(())` when the output path may be used. pub fn check_overwrite(file_name: &str, overwrite: bool) -> Result<()> { - if Path::new(file_name).exists() && !overwrite { + let exists = Path::new(file_name).exists(); + let response = if exists && !overwrite { eprint!( "File {:?} already exists, do you want to overwrite it? (y/[n]): ", file_name ); - let mut user_input = String::new(); - io::stdin().read_line(&mut user_input).unwrap(); + let mut buf = String::new(); + io::stdin().read_line(&mut buf)?; eprintln!(); - if user_input.trim().to_lowercase() != "y" { - return Err(io::Error::from(io::ErrorKind::AlreadyExists)); - } + Some(buf) + } else { + None + }; + if check_overwrite_pure(exists, overwrite, response.as_deref()) { + Ok(()) + } else { + Err(io::Error::from(io::ErrorKind::AlreadyExists)) } - Ok(()) } #[cfg(test)] diff --git a/ben/src/cli/common/tests.rs b/ben/src/cli/common/tests.rs index d5b1d03..ca2885d 100644 --- a/ben/src/cli/common/tests.rs +++ b/ben/src/cli/common/tests.rs @@ -1,21 +1,13 @@ use super::*; +use crate::test_utils::unique_path; use std::fs; use std::sync::{Mutex, OnceLock}; -use std::time::{SystemTime, UNIX_EPOCH}; fn env_lock() -> &'static Mutex<()> { static LOCK: OnceLock> = OnceLock::new(); LOCK.get_or_init(|| Mutex::new(())) } -fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("ben-cli-common-{name}-{nonce}")) -} - #[test] fn set_verbose_sets_rust_log() { let _guard = env_lock().lock().unwrap(); @@ -55,3 +47,30 @@ fn check_overwrite_allows_existing_file_when_forced() { check_overwrite(path.to_str().unwrap(), true).unwrap(); fs::remove_file(path).unwrap(); } + +#[test] +fn check_overwrite_pure_passes_when_file_missing() { + assert!(check_overwrite_pure(false, false, None)); + assert!(check_overwrite_pure(false, true, None)); +} + +#[test] +fn check_overwrite_pure_passes_when_overwrite_flag_set() { + assert!(check_overwrite_pure(true, true, None)); +} + +#[test] +fn check_overwrite_pure_accepts_y_and_yes_responses() { + assert!(check_overwrite_pure(true, false, Some("y\n"))); + assert!(check_overwrite_pure(true, false, Some("Y\n"))); + assert!(check_overwrite_pure(true, false, Some("yes\n"))); + assert!(check_overwrite_pure(true, false, Some(" YES "))); +} + +#[test] +fn check_overwrite_pure_rejects_other_responses() { + assert!(!check_overwrite_pure(true, false, Some("n\n"))); + assert!(!check_overwrite_pure(true, false, Some("\n"))); + assert!(!check_overwrite_pure(true, false, Some("maybe\n"))); + assert!(!check_overwrite_pure(true, false, None)); +} diff --git a/ben/src/cli/pben/mod.rs b/ben/src/cli/pben/mod.rs index d8ec616..dc0da7f 100644 --- a/ben/src/cli/pben/mod.rs +++ b/ben/src/cli/pben/mod.rs @@ -1,4 +1,4 @@ -use crate::cli::common::{check_overwrite, set_verbose}; +use crate::cli::common::{check_overwrite, set_verbose, CliError, CliResult}; use crate::io::reader::AssignmentReader; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; @@ -7,7 +7,7 @@ use pipe::pipe; use serde_json::json; use std::{ fs::File, - io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}, + io::{self, BufRead, BufReader, BufWriter, Read, Write}, }; use xz2::write::XzEncoder; @@ -52,7 +52,7 @@ struct Args { } /// Parse CLI arguments and execute the selected `pben` conversion. -pub fn run() -> Result<()> { +pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); @@ -61,7 +61,7 @@ pub fn run() -> Result<()> { tracing::trace!("Converting BEN to PCOMPRESS"); let ben_reader: Box = match args.input_file.as_ref() { - Some(file) => Box::new(BufReader::new(File::open(file).unwrap())), + Some(file) => Box::new(BufReader::new(File::open(file)?)), None => Box::new(io::stdin()), }; @@ -71,7 +71,7 @@ pub fn run() -> Result<()> { args.output_file.as_deref(), args.overwrite, )? { - Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), + Some(file) => BufWriter::new(Box::new(File::create(file)?)), None => BufWriter::new(Box::new(io::stdout())), }; @@ -92,7 +92,7 @@ pub fn run() -> Result<()> { .input_file .as_ref() { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), None => BufReader::new(Box::new(io::stdin())), }; @@ -102,7 +102,7 @@ pub fn run() -> Result<()> { args.output_file.as_deref(), args.overwrite, )? { - Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), + Some(file) => BufWriter::new(Box::new(File::create(file)?)), None => BufWriter::new(Box::new(io::stdout())), }; @@ -114,7 +114,7 @@ pub fn run() -> Result<()> { }); let mut buf_pipe_reader = BufReader::new(pipe_reader); - assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer) + assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) } Mode::PcToXben => { tracing::trace!("Converting PCOMPRESS to XBEN"); @@ -123,7 +123,7 @@ pub fn run() -> Result<()> { .input_file .as_ref() { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file).unwrap()))), + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), None => BufReader::new(Box::new(io::stdin())), }; @@ -133,7 +133,7 @@ pub fn run() -> Result<()> { args.output_file.as_deref(), args.overwrite, )? { - Some(file) => BufWriter::new(Box::new(File::create(file).unwrap())), + Some(file) => BufWriter::new(Box::new(File::create(file)?)), None => BufWriter::new(Box::new(io::stdout())), }; @@ -145,7 +145,7 @@ pub fn run() -> Result<()> { }); let mut buf_pipe_reader = BufReader::new(pipe_reader); - assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer) + assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) } } } diff --git a/ben/src/cli/reben/mod.rs b/ben/src/cli/reben/mod.rs index d54cd50..9ff88e3 100644 --- a/ben/src/cli/reben/mod.rs +++ b/ben/src/cli/reben/mod.rs @@ -12,18 +12,14 @@ use args::{Args, Mode}; use ben_mode::run_ben_mode; use json_mode::run_json_mode; -use crate::cli::common::set_verbose; +use crate::cli::common::{set_verbose, CliError, CliResult}; use clap::Parser; /// Parse CLI arguments and execute the selected `reben` mode. -pub fn run() { +pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); - - if let Err(err) = run_with_args(args) { - eprintln!("Error: {err}"); - std::process::exit(1); - } + run_with_args(args).map_err(CliError::from) } fn run_with_args(args: Args) -> Result<(), String> { diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 7322531..2ca34eb 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -5,28 +5,16 @@ use super::helpers::{ }; use super::json_mode::run_json_mode; use crate::codec::encode::encode_jsonl_to_ben; +use crate::test_utils::{sample_ben_bytes, unique_path}; use crate::BenVariant; use clap::{CommandFactory, Parser}; -use std::{ - fs, - io::Cursor, - time::{SystemTime, UNIX_EPOCH}, -}; - -fn unique_path(name: &str) -> std::path::PathBuf { - let nonce = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - std::env::temp_dir().join(format!("reben-{name}-{nonce}")) -} +use std::{fs, io::Cursor}; /// Write a minimal Standard BEN file to a temp path and return the path. fn write_temp_ben(name: &str) -> std::path::PathBuf { let path = unique_path(name); let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n{\"assignment\":[2,1,3],\"sample\":2}\n"; - let mut ben = Vec::new(); - encode_jsonl_to_ben(Cursor::new(jsonl), &mut ben, BenVariant::Standard).unwrap(); + let ben = sample_ben_bytes(jsonl, BenVariant::Standard); fs::write(&path, &ben).unwrap(); path } diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index fae720f..664ac33 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -5,12 +5,16 @@ mod ben32; pub(crate) mod errors; pub(crate) use errors::DecodeError; mod jsonl; +pub mod path; mod twodelta; mod xz; pub use ben::decode_ben_line; pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; +pub use path::{ + decode_ben_to_jsonl_path, decode_xben_to_ben_path, decode_xben_to_jsonl_path, xz_decompress_path, +}; pub(crate) use twodelta::apply_twodelta_runs_to_assignment; pub use twodelta::decode_twodelta_frame; pub use xz::{decode_xben_to_ben, xz_decompress}; diff --git a/ben/src/codec/decode/path.rs b/ben/src/codec/decode/path.rs new file mode 100644 index 0000000..e424c65 --- /dev/null +++ b/ben/src/codec/decode/path.rs @@ -0,0 +1,58 @@ +//! Path-based convenience wrappers around the streaming decoders. +//! +//! Each wrapper opens a buffered reader on the input and a buffered writer on +//! the output, then delegates to the corresponding streaming function. The +//! wrappers exist so that CLI dispatch and library consumers do not have to +//! repeat the `BufReader`/`BufWriter`/`File` plumbing at every callsite. + +use std::fs::File; +use std::io::{BufReader, BufWriter, Result}; +use std::path::Path; + +use super::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress}; + +/// Decode a BEN file at `input` into a JSONL file at `output`. +pub fn decode_ben_to_jsonl_path(input: &Path, output: &Path) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + decode_ben_to_jsonl(reader, writer) +} + +/// Decode an XBEN file at `input` into a JSONL file at `output`. +pub fn decode_xben_to_jsonl_path(input: &Path, output: &Path) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + decode_xben_to_jsonl(reader, writer) +} + +/// Decode an XBEN file at `input` into a BEN file at `output`. +pub fn decode_xben_to_ben_path(input: &Path, output: &Path) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + decode_xben_to_ben(reader, writer) +} + +/// Decompress an `.xz` file at `input` into a plain file at `output`. +pub fn xz_decompress_path(input: &Path, output: &Path) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + xz_decompress(reader, writer) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::unique_path; + + #[test] + fn decode_path_propagates_open_error() { + let missing = unique_path("nonexistent.ben"); + let out = unique_path("decode-fail.jsonl"); + let err = decode_ben_to_jsonl_path(&missing, &out).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::NotFound); + let _ = std::fs::remove_file(&out); + } + + // The happy-path round-trip tests for these decoders live alongside the + // matching encoders in `super::super::encode::path::tests`. +} diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index ed20b6d..6c80e91 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -3,6 +3,7 @@ mod ben; pub mod errors; mod jsonl; +pub mod path; mod twodelta; mod xz; @@ -14,6 +15,9 @@ pub(crate) use twodelta::encode_twodelta_frame_with_hint; #[cfg(test)] pub(crate) use ben::encode_ben32_line; pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +pub use path::{ + encode_ben_to_xben_path, encode_jsonl_to_ben_path, encode_jsonl_to_xben_path, xz_compress_path, +}; pub use xz::{encode_ben_to_xben, xz_compress}; #[cfg(test)] diff --git a/ben/src/codec/encode/path.rs b/ben/src/codec/encode/path.rs new file mode 100644 index 0000000..3ae9849 --- /dev/null +++ b/ben/src/codec/encode/path.rs @@ -0,0 +1,180 @@ +//! Path-based convenience wrappers around the streaming encoders. +//! +//! Each wrapper opens a buffered reader on the input and a buffered writer on +//! the output, then delegates to the corresponding streaming function. The +//! wrappers exist so that CLI dispatch and library consumers do not have to +//! repeat the `BufReader`/`BufWriter`/`File` plumbing at every callsite. + +use std::fs::File; +use std::io::{BufReader, BufWriter, Result}; +use std::path::Path; + +use crate::BenVariant; + +use super::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress}; + +/// Encode a JSONL file at `input` into a BEN file at `output`. +pub fn encode_jsonl_to_ben_path(input: &Path, output: &Path, variant: BenVariant) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + encode_jsonl_to_ben(reader, writer, variant) +} + +/// Encode a JSONL file at `input` into an XBEN file at `output`. +pub fn encode_jsonl_to_xben_path( + input: &Path, + output: &Path, + variant: BenVariant, + n_threads: Option, + compression_level: Option, + chunk_size: Option, +) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + encode_jsonl_to_xben(reader, writer, variant, n_threads, compression_level, chunk_size) +} + +/// Encode a BEN file at `input` into an XBEN file at `output`. +pub fn encode_ben_to_xben_path( + input: &Path, + output: &Path, + n_threads: Option, + compression_level: Option, + chunk_size: Option, +) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + encode_ben_to_xben(reader, writer, n_threads, compression_level, chunk_size) +} + +/// Compress an arbitrary file at `input` into an `.xz` file at `output`. +pub fn xz_compress_path( + input: &Path, + output: &Path, + n_threads: Option, + compression_level: Option, +) -> Result<()> { + let reader = BufReader::new(File::open(input)?); + let writer = BufWriter::new(File::create(output)?); + xz_compress(reader, writer, n_threads, compression_level) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::{jsonl_from_assignments, unique_path}; + + #[test] + fn jsonl_to_ben_path_round_trips() { + use crate::codec::decode::path::decode_ben_to_jsonl_path; + + let jsonl_in = unique_path("path-encode-jsonl.jsonl"); + let ben_out = unique_path("path-encode-jsonl.ben"); + let jsonl_back = unique_path("path-encode-jsonl-back.jsonl"); + + std::fs::write( + &jsonl_in, + jsonl_from_assignments(&[vec![1, 2, 3], vec![2, 1, 3]]), + ) + .unwrap(); + + encode_jsonl_to_ben_path(&jsonl_in, &ben_out, BenVariant::Standard).unwrap(); + decode_ben_to_jsonl_path(&ben_out, &jsonl_back).unwrap(); + + let s = std::fs::read_to_string(&jsonl_back).unwrap(); + assert!(s.contains("[1,2,3]")); + assert!(s.contains("[2,1,3]")); + + for p in [&jsonl_in, &ben_out, &jsonl_back] { + let _ = std::fs::remove_file(p); + } + } + + #[test] + fn jsonl_to_xben_path_round_trips() { + use crate::codec::decode::path::decode_xben_to_jsonl_path; + + let jsonl_in = unique_path("path-encode-xben.jsonl"); + let xben_out = unique_path("path-encode-xben.xben"); + let jsonl_back = unique_path("path-encode-xben-back.jsonl"); + + std::fs::write( + &jsonl_in, + jsonl_from_assignments(&[vec![1, 2, 3], vec![2, 1, 3]]), + ) + .unwrap(); + + encode_jsonl_to_xben_path( + &jsonl_in, + &xben_out, + BenVariant::Standard, + Some(1), + Some(1), + None, + ) + .unwrap(); + decode_xben_to_jsonl_path(&xben_out, &jsonl_back).unwrap(); + + let s = std::fs::read_to_string(&jsonl_back).unwrap(); + assert!(s.contains("[1,2,3]")); + + for p in [&jsonl_in, &xben_out, &jsonl_back] { + let _ = std::fs::remove_file(p); + } + } + + #[test] + fn ben_to_xben_path_round_trips() { + use crate::codec::decode::path::decode_xben_to_ben_path; + + let jsonl_in = unique_path("path-bxb.jsonl"); + let ben = unique_path("path-bxb.ben"); + let xben = unique_path("path-bxb.xben"); + let ben_back = unique_path("path-bxb-back.ben"); + + std::fs::write( + &jsonl_in, + jsonl_from_assignments(&[vec![1, 2, 3]]), + ) + .unwrap(); + encode_jsonl_to_ben_path(&jsonl_in, &ben, BenVariant::Standard).unwrap(); + encode_ben_to_xben_path(&ben, &xben, Some(1), Some(1), None).unwrap(); + decode_xben_to_ben_path(&xben, &ben_back).unwrap(); + + // Round trip: ben_back should be byte-equivalent to ben (same banner, same content). + assert_eq!(std::fs::read(&ben).unwrap(), std::fs::read(&ben_back).unwrap()); + + for p in [&jsonl_in, &ben, &xben, &ben_back] { + let _ = std::fs::remove_file(p); + } + } + + #[test] + fn xz_compress_path_round_trips() { + use crate::codec::decode::path::xz_decompress_path; + + let plain = unique_path("path-xz.txt"); + let xz_out = unique_path("path-xz.txt.xz"); + let plain_back = unique_path("path-xz-back.txt"); + + std::fs::write(&plain, b"hello world\n").unwrap(); + xz_compress_path(&plain, &xz_out, Some(1), Some(1)).unwrap(); + xz_decompress_path(&xz_out, &plain_back).unwrap(); + + assert_eq!(std::fs::read(&plain_back).unwrap(), b"hello world\n"); + + for p in [&plain, &xz_out, &plain_back] { + let _ = std::fs::remove_file(p); + } + } + + #[test] + fn encode_path_propagates_open_error() { + let missing = unique_path("nonexistent.jsonl"); + let out = unique_path("encode-fail.ben"); + let err = encode_jsonl_to_ben_path(&missing, &out, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::NotFound); + // Note: out is created before the read fails, so its absence is not asserted. + let _ = std::fs::remove_file(&out); + } +} diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 95b174a..8e25c94 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -40,6 +40,9 @@ pub mod ops; /// Miscellaneous utilities that do not fit into the other modules. pub mod util; +#[doc(hidden)] +pub mod test_utils; + /// Print an in-place progress update when trace logging is enabled. /// /// This is intentionally separate from normal structured logging because many diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 54af481..ea5fa08 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -3,8 +3,10 @@ use crate::codec::decode::decode_ben32_line; use crate::io::reader::{AssignmentReader, XZAssignmentReader}; use serde_json::Error as SerdeError; +use std::fs::File; use std::io::Cursor; -use std::io::{self, Read}; +use std::io::{self, BufReader, Read}; +use std::path::Path; use thiserror::Error; #[derive(Debug, Error)] @@ -120,5 +122,23 @@ pub fn extract_assignment_xben( }) } +/// Extract a single 1-based sample from a BEN file at `input`. +pub fn extract_assignment_ben_path( + input: &Path, + sample_number: usize, +) -> Result, SampleError> { + let reader = BufReader::new(File::open(input).map_err(SampleError::new_io_error)?); + extract_assignment_ben(reader, sample_number) +} + +/// Extract a single 1-based sample from an XBEN file at `input`. +pub fn extract_assignment_xben_path( + input: &Path, + sample_number: usize, +) -> Result, SampleError> { + let reader = BufReader::new(File::open(input).map_err(SampleError::new_io_error)?); + extract_assignment_xben(reader, sample_number) +} + #[cfg(test)] mod tests; diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 46969ec..a12d25f 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -187,3 +187,29 @@ fn test_sample_error_new_io_error() { assert!(matches!(sample_err, SampleError::IoError(_))); assert_eq!(sample_err.to_string(), "IO Error: file gone"); } + +#[test] +fn extract_assignment_ben_path_returns_assignment() { + use crate::test_utils::{jsonl_from_assignments, sample_ben_bytes, unique_path}; + + let ben_bytes = sample_ben_bytes( + &jsonl_from_assignments(&[vec![1, 2, 3], vec![3, 2, 1]]), + BenVariant::Standard, + ); + let path = unique_path("extract-path.ben"); + std::fs::write(&path, &ben_bytes).unwrap(); + + assert_eq!(extract_assignment_ben_path(&path, 1).unwrap(), vec![1, 2, 3]); + assert_eq!(extract_assignment_ben_path(&path, 2).unwrap(), vec![3, 2, 1]); + + let _ = std::fs::remove_file(&path); +} + +#[test] +fn extract_assignment_ben_path_propagates_missing_file() { + use crate::test_utils::unique_path; + + let missing = unique_path("nonexistent.ben"); + let err = extract_assignment_ben_path(&missing, 1).unwrap_err(); + assert!(matches!(err, SampleError::IoError(_))); +} diff --git a/ben/src/test_utils.rs b/ben/src/test_utils.rs new file mode 100644 index 0000000..6d54cd8 --- /dev/null +++ b/ben/src/test_utils.rs @@ -0,0 +1,129 @@ +//! Test helpers shared across unit and integration tests. +//! +//! This module is always-compiled (not `#[cfg(test)]`) so integration tests +//! in `ben/tests/` — which are separate crates — can reuse the same helpers +//! as unit tests inside `ben/src/.../tests.rs`. It is `#[doc(hidden)]` and +//! is not part of the stable public API. + +use std::io::{Cursor, Write}; +use std::path::PathBuf; +use std::time::{SystemTime, UNIX_EPOCH}; + +use serde_json::json; + +use crate::codec::encode::encode_jsonl_to_ben; +use crate::io::bundle::format::AssignmentFormat; +use crate::io::bundle::BendlWriter; +use crate::BenVariant; + +/// Return a unique temp path of the form `binary-ensemble-{name}-{nonce}` in +/// the system temp directory. The nonce is the current monotonic-ish time in +/// nanoseconds, sufficient to avoid collisions between parallel test runs. +pub fn unique_path(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("binary-ensemble-{name}-{nonce}")) +} + +/// Build a JSONL byte buffer from a sequence of assignment vectors, +/// numbering samples from 1. +pub fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { + let mut buf = Vec::new(); + for (i, a) in assignments.iter().enumerate() { + writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); + } + buf +} + +/// Expand an RLE sequence `(value, length)` into a flat assignment vector, +/// truncating at `cap`. +pub fn expand_rle(rle: &[(u16, u16)], cap: usize) -> Vec { + let mut v = Vec::with_capacity(cap); + for &(val, len) in rle { + let take = (len as usize).min(cap.saturating_sub(v.len())); + v.extend(std::iter::repeat_n(val, take)); + if v.len() >= cap { + break; + } + } + v +} + +/// Encode the given JSONL bytes as a BEN byte vector, including the 17-byte +/// banner. Panics on encoder error; intended only for fixture construction. +pub fn sample_ben_bytes(jsonl: &[u8], variant: BenVariant) -> Vec { + let mut out = Vec::new(); + encode_jsonl_to_ben(jsonl, &mut out, variant).unwrap(); + out +} + +/// Build a minimal finalized `.bendl` byte vector containing the given +/// pre-encoded assignment stream bytes. Panics on writer error; intended +/// only for fixture construction. +pub fn sample_bendl_bytes(stream: &[u8], format: AssignmentFormat) -> Vec { + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); + writer.write_stream_bytes(stream, 1).unwrap(); + writer.finish().unwrap(); + } + buf +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unique_path_includes_name_and_is_unique() { + let a = unique_path("hello"); + let b = unique_path("hello"); + assert!(a.file_name().unwrap().to_string_lossy().contains("hello")); + assert_ne!(a, b); + } + + #[test] + fn jsonl_from_assignments_emits_one_line_per_sample() { + let out = jsonl_from_assignments(&[vec![1, 2, 3], vec![2, 1, 3]]); + let s = std::str::from_utf8(&out).unwrap(); + assert_eq!(s.lines().count(), 2); + assert!(s.contains("\"sample\":1")); + assert!(s.contains("\"sample\":2")); + assert!(s.contains("[1,2,3]")); + } + + #[test] + fn expand_rle_truncates_at_cap() { + let v = expand_rle(&[(1, 5), (2, 5)], 7); + assert_eq!(v, vec![1, 1, 1, 1, 1, 2, 2]); + } + + #[test] + fn expand_rle_handles_zero_cap() { + let v = expand_rle(&[(1, 5)], 0); + assert!(v.is_empty()); + } + + #[test] + fn sample_ben_bytes_round_trips_via_decode() { + use crate::codec::decode::decode_ben_to_jsonl; + let jsonl = jsonl_from_assignments(&[vec![1, 2, 3]]); + let ben = sample_ben_bytes(&jsonl, BenVariant::Standard); + let mut decoded = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + assert!(s.contains("[1,2,3]")); + } + + #[test] + fn sample_bendl_bytes_yields_complete_bundle() { + use crate::io::bundle::BendlReader; + use std::io::BufReader; + + let bytes = sample_bendl_bytes(b"STANDARD BEN FILE\x00fake", AssignmentFormat::Ben); + let reader = BendlReader::open(BufReader::new(Cursor::new(bytes))).unwrap(); + assert!(reader.is_complete()); + } +} diff --git a/ben/tests/common/mod.rs b/ben/tests/common/mod.rs new file mode 100644 index 0000000..20ca75d --- /dev/null +++ b/ben/tests/common/mod.rs @@ -0,0 +1,27 @@ +//! Helpers shared across `ben/tests/*.rs` integration tests. +//! +//! Each integration test crate declares `mod common;` to opt in. The +//! module re-exports the in-crate test utilities from +//! `binary_ensemble::test_utils` and adds integration-only helpers +//! (subprocess paths, etc.). + +#![allow(dead_code, unused_imports)] + +pub use binary_ensemble::test_utils::{ + expand_rle, jsonl_from_assignments, sample_ben_bytes, sample_bendl_bytes, unique_path, +}; + +/// Path to a compiled binary for shelling out from integration tests. +/// +/// Returns the same `env!("CARGO_BIN_EXE_*")` value the existing test +/// helpers use; centralised here so future CLI tests can pick up the +/// canonical lookup table. +pub fn binary_path(name: &str) -> &'static str { + match name { + "ben" => env!("CARGO_BIN_EXE_ben"), + "pben" => env!("CARGO_BIN_EXE_pben"), + "reben" => env!("CARGO_BIN_EXE_reben"), + "bendl" => env!("CARGO_BIN_EXE_bendl"), + _ => panic!("unknown binary {name}"), + } +} diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index 1afdc8c..26e8d85 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -12,21 +12,15 @@ use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::util::rle::rle_to_vec; use binary_ensemble::BenVariant; -use serde_json::json; -use std::io::{self, Cursor, Write}; +use std::io::{self, Cursor}; + +mod common; +use common::jsonl_from_assignments; // ────────────────────────────────────────────────────────────────────────────── // Shared helpers // ────────────────────────────────────────────────────────────────────────────── -fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { - let mut buf = Vec::new(); - for (i, a) in assignments.iter().enumerate() { - writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); - } - buf -} - fn encode_ben(assignments: &[Vec], variant: BenVariant) -> Vec { let jsonl = jsonl_from_assignments(assignments); let mut ben = Vec::new(); diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index d997b19..76197e4 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -479,7 +479,7 @@ fn ben_cli_reports_expected_error_paths() { &["--mode", "x-encode", bogus_txt.to_str().unwrap()], temp.path(), ); - assert_success(&xencode); + assert_failure(&xencode); assert!(String::from_utf8_lossy(&xencode.stderr) .contains("Unsupported file type(s) for xencode mode")); @@ -488,7 +488,7 @@ fn ben_cli_reports_expected_error_paths() { &["--mode", "decode", bogus_jsonl.to_str().unwrap()], temp.path(), ); - assert_success(&decode); + assert_failure(&decode); assert!( String::from_utf8_lossy(&decode.stderr).contains("Unsupported file type for decode mode") ); @@ -498,7 +498,7 @@ fn ben_cli_reports_expected_error_paths() { &["--mode", "read", bogus_jsonl.to_str().unwrap()], temp.path(), ); - assert_success(&read); + assert_failure(&read); assert!( String::from_utf8_lossy(&read.stderr).contains("Sample number is required in read mode") ); @@ -508,12 +508,12 @@ fn ben_cli_reports_expected_error_paths() { &["--mode", "xz-decompress", bogus_xz.to_str().unwrap()], temp.path(), ); - assert_success(&xz); + assert_failure(&xz); assert!(String::from_utf8_lossy(&xz.stderr) .contains("Unsupported file type for xz decompress mode")); let bad_xben = run_stdin_stdout("ben", &["--mode", "x-decode"], temp.path(), b"not-an-xben"); - assert_success(&bad_xben); + assert_failure(&bad_xben); assert!(String::from_utf8_lossy(&bad_xben.stderr).contains("Error:")); let bad_decode_ben = run_stdin_stdout( @@ -522,7 +522,7 @@ fn ben_cli_reports_expected_error_paths() { temp.path(), b"not-a-ben", ); - assert_success(&bad_decode_ben); + assert_failure(&bad_decode_ben); assert!(String::from_utf8_lossy(&bad_decode_ben.stderr).contains("Error:")); let bad_decode_xben = run_stdin_stdout( @@ -531,7 +531,7 @@ fn ben_cli_reports_expected_error_paths() { temp.path(), b"not-an-xben", ); - assert_success(&bad_decode_xben); + assert_failure(&bad_decode_xben); assert!(String::from_utf8_lossy(&bad_decode_xben.stderr).contains("Error:")); } @@ -712,8 +712,8 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { b"n\n", ), ] { - assert_success(&output); - assert!(String::from_utf8_lossy(&output.stderr).contains("AlreadyExists")); + assert_failure(&output); + assert!(String::from_utf8_lossy(&output.stderr).contains("already")); } let invalid_ben_to_xben = run( @@ -728,11 +728,11 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&invalid_ben_to_xben); + assert_failure(&invalid_ben_to_xben); assert!(String::from_utf8_lossy(&invalid_ben_to_xben.stderr).contains("Error:")); let unsupported_decode = run_stdin_stdout("ben", &["--mode", "decode"], temp.path(), b""); - assert_success(&unsupported_decode); + assert_failure(&unsupported_decode); assert!(String::from_utf8_lossy(&unsupported_decode.stderr) .contains("Unsupported file type(s) for decode mode")); @@ -748,7 +748,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&read_too_large); + assert_failure(&read_too_large); assert!(String::from_utf8_lossy(&read_too_large.stderr).contains("Error:")); let invalid_decode_ben = run( @@ -763,7 +763,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&invalid_decode_ben); + assert_failure(&invalid_decode_ben); assert!(String::from_utf8_lossy(&invalid_decode_ben.stderr).contains("Error:")); let invalid_decode_xben = run( @@ -778,7 +778,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&invalid_decode_xben); + assert_failure(&invalid_decode_xben); assert!(String::from_utf8_lossy(&invalid_decode_xben.stderr).contains("Error:")); let invalid_xdecode = run( @@ -793,7 +793,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&invalid_xdecode); + assert_failure(&invalid_xdecode); assert!(String::from_utf8_lossy(&invalid_xdecode.stderr).contains("Error:")); let invalid_xz_decompress = run( @@ -808,7 +808,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { ], temp.path(), ); - assert_success(&invalid_xz_decompress); + assert_failure(&invalid_xz_decompress); } #[test] diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index f1b9d84..b98a2c0 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -32,21 +32,15 @@ use binary_ensemble::BenVariant; use serde_json::json; use std::collections::HashMap; -use std::io::{self, BufReader, Cursor, Write}; +use std::io::{self, BufReader, Cursor}; + +mod common; +use common::jsonl_from_assignments; // ────────────────────────────────────────────────────────────────────────────── // Helpers // ────────────────────────────────────────────────────────────────────────────── -/// Build a minimal JSONL payload for the given list of assignment vectors. -fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { - let mut buf = Vec::new(); - for (i, a) in assignments.iter().enumerate() { - writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); - } - buf -} - /// Encode assignments as a Standard BEN byte vector (including the 17-byte banner). fn encode_standard_ben(assignments: &[Vec]) -> Vec { let jsonl = jsonl_from_assignments(assignments); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index d401d8a..ce83284 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -23,30 +23,10 @@ use std::io::{BufReader, Cursor, Write}; use std::path::PathBuf; use std::time::{SystemTime, UNIX_EPOCH}; -// ---------- Helpers ---------- - -/// Expand an RLE sequence into a flat assignment Vec. -fn expand_rle(rle: &[(u16, u16)], cap: usize) -> Vec { - let mut v = Vec::with_capacity(cap); - for &(val, len) in rle { - let take = (len as usize).min(cap.saturating_sub(v.len())); - v.extend(std::iter::repeat(val).take(take)); - if v.len() >= cap { - break; - } - } - v -} +mod common; +use common::{expand_rle, jsonl_from_assignments}; -/// Generate a JSONL buffer from a sequence of assignment vectors. -fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { - let mut buf = Vec::new(); - for (i, a) in assignments.iter().enumerate() { - let line = json!({ "assignment": a, "sample": i + 1 }).to_string(); - writeln!(&mut buf, "{line}").unwrap(); - } - buf -} +// ---------- Helpers ---------- /// From a decoded `(assignment, count)` stream, reconstitute JSONL. fn jsonl_from_records(records: &[(Vec, u16)], start_at: usize) -> Vec { From 3c868057ef88b29a31c377608bea93ed1b28c4fe Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 3 May 2026 22:55:54 -0600 Subject: [PATCH 079/221] fix twodelta inconsistency --- ben/src/codec/frames/tests.rs | 25 +++++++++++++++++++++++ ben/src/codec/frames/twodelta_decode.rs | 3 ++- ben/src/codec/frames/twodelta_encode.rs | 23 +++++++++++++++------ ben/src/io/writer/xz_assignment_writer.rs | 8 ++++++-- ben/tests/test_coverage.rs | 2 ++ 5 files changed, 52 insertions(+), 9 deletions(-) diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index e6702db..b0b9ff9 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -540,9 +540,34 @@ fn twodelta_from_run_lengths_then_from_parts_roundtrip() { encoded.pair, encoded.max_len_bit_count, encoded.payload().to_vec(), + encoded.count, ); assert_eq!(reconstructed.run_length_vector, run_lengths); assert_eq!(reconstructed.pair, (10, 20)); + assert_eq!(reconstructed.count, encoded.count); + assert_eq!(reconstructed.raw_bytes, encoded.raw_bytes); +} + +#[test] +fn twodelta_from_parts_preserves_nontrivial_count() { + use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; + // Regression: from_parts previously hardcoded count = 1 in raw_bytes, + // so reconstructed frames silently emitted the wrong trailing count + // bytes. Verify count > 1 now round-trips through from_parts. + let run_lengths = vec![5u16, 3, 7, 1, 2]; + let encoded = TwoDeltaEncodeFrame::from_run_lengths((10, 20), run_lengths.clone(), Some(42)); + + let reconstructed = TwoDeltaEncodeFrame::from_parts( + encoded.pair, + encoded.max_len_bit_count, + encoded.payload().to_vec(), + 42, + ); + + assert_eq!(reconstructed.count, 42); + let trailing = &reconstructed.raw_bytes[reconstructed.raw_bytes.len() - 2..]; + assert_eq!(trailing, &42u16.to_be_bytes()); + assert_eq!(reconstructed.raw_bytes, encoded.raw_bytes); } #[test] diff --git a/ben/src/codec/frames/twodelta_decode.rs b/ben/src/codec/frames/twodelta_decode.rs index 1de0b3a..691a2aa 100644 --- a/ben/src/codec/frames/twodelta_decode.rs +++ b/ben/src/codec/frames/twodelta_decode.rs @@ -47,7 +47,8 @@ impl BenDecode for TwoDeltaDecodeFrame { let count = reader.read_u16::()?; - let encode_frame = TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload); + let encode_frame = + TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload, count); Ok(Some(TwoDeltaDecodeFrame { pair: encode_frame.pair, diff --git a/ben/src/codec/frames/twodelta_encode.rs b/ben/src/codec/frames/twodelta_encode.rs index 73e6cbe..56cb451 100644 --- a/ben/src/codec/frames/twodelta_encode.rs +++ b/ben/src/codec/frames/twodelta_encode.rs @@ -15,8 +15,10 @@ pub struct TwoDeltaEncodeFrame { pub n_bytes: u32, // The run-length vector that was encoded into this frame, stored here for reference. pub run_length_vector: Vec, - // The full serialized TwoDelta frame bytes, including the header and payload. + // The full serialized TwoDelta frame bytes, including the header, payload, and count. pub raw_bytes: Vec, + // The number of times this frame is repeated. Mirrors the trailing u16 in `raw_bytes`. + pub count: u16, } impl TwoDeltaEncodeFrame { @@ -115,6 +117,7 @@ impl TwoDeltaEncodeFrame { n_bytes, run_length_vector, raw_bytes, + count, } } @@ -133,20 +136,27 @@ impl TwoDeltaEncodeFrame { /// * `max_len_bit_count` - The bit width of each packed run length, as read from the /// frame header. /// * `payload` - The raw packed payload bytes, not including the 9-byte header. + /// * `count` - The repetition count for the frame, as read from the trailing `u16` + /// in the wire format. /// /// # Returns /// - /// A `TwoDeltaEncodeFrame` with both `raw_bytes` (header + payload) and the decoded - /// `run_length_vector` populated. - pub fn from_parts(pair: (u16, u16), max_len_bit_count: u8, payload: Vec) -> Self { + /// A `TwoDeltaEncodeFrame` with `raw_bytes` (header + payload + count), the decoded + /// `run_length_vector`, and `count` populated. + pub fn from_parts( + pair: (u16, u16), + max_len_bit_count: u8, + payload: Vec, + count: u16, + ) -> Self { let n_bytes = payload.len() as u32; - let mut raw_bytes = Vec::with_capacity(9 + payload.len()); + let mut raw_bytes = Vec::with_capacity(9 + payload.len() + 2); raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); raw_bytes.push(max_len_bit_count); raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); raw_bytes.extend_from_slice(&payload); - raw_bytes.extend_from_slice(&1u16.to_be_bytes()); + raw_bytes.extend_from_slice(&count.to_be_bytes()); let mut run_length_vector = Vec::new(); let mut buffer: u32 = 0; @@ -172,6 +182,7 @@ impl TwoDeltaEncodeFrame { n_bytes, run_length_vector, raw_bytes, + count, } } } diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index b680ece..04b7645 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -320,8 +320,12 @@ impl XZAssignmentWriter { let count = reader.read_u16::()?; // Unpack bitpacked run lengths. - let frame = - TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), delta_max_len_bits, payload); + let frame = TwoDeltaEncodeFrame::from_parts( + (pair_a, pair_b), + delta_max_len_bits, + payload, + count, + ); let run_lengths = frame.run_length_vector; self.chunk_buffer.push(BufferedDeltaFrame { diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index b98a2c0..5db5b96 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -1590,11 +1590,13 @@ fn twodelta_frame_from_parts_round_trip() { pair, original.max_len_bit_count, original.payload().to_vec(), + original.count, ); assert_eq!(original.as_slice(), reconstructed.as_slice()); assert_eq!(original.pair, reconstructed.pair); assert_eq!(original.max_len_bit_count, reconstructed.max_len_bit_count); assert_eq!(original.n_bytes, reconstructed.n_bytes); + assert_eq!(original.count, reconstructed.count); } #[test] From 288045d3b52bbe08209d4ed45be2e6a13ea216df Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 5 May 2026 07:48:18 -0600 Subject: [PATCH 080/221] fix up some ambiguous / clashing terminology --- ben/src/cli/ben/bundle.rs | 4 +- ben/src/cli/ben/tests.rs | 6 +-- ben/src/cli/bendl/append.rs | 8 ++-- ben/src/cli/bendl/args.rs | 12 +++--- ben/src/cli/bendl/create.rs | 8 ++-- ben/src/cli/bendl/inspect.rs | 2 +- ben/src/cli/bendl/tests.rs | 36 +++++++++--------- ben/src/cli/reben/ben_mode.rs | 6 +-- ben/src/cli/reben/helpers.rs | 6 +-- ben/src/cli/reben/json_mode.rs | 2 +- ben/src/cli/reben/tests.rs | 24 ++++++------ ben/src/codec/encode/errors.rs | 2 +- ben/src/codec/encode/twodelta.rs | 10 ++--- ben/src/codec/frames/twodelta_decode.rs | 2 +- ben/src/codec/frames/twodelta_encode.rs | 8 ++-- ben/src/io/bundle/format.rs | 50 ++++++++++++------------- ben/src/io/bundle/reader.rs | 24 ++++++------ ben/src/io/bundle/tests/format.rs | 28 +++++++------- ben/src/io/bundle/tests/reader.rs | 36 +++++++++--------- ben/src/io/bundle/tests/writer.rs | 32 ++++++++-------- ben/src/io/bundle/writer.rs | 26 ++++++------- ben/src/ops/relabel/mod.rs | 10 ++--- ben/src/ops/relabel/tests.rs | 10 ++--- ben/src/test_utils.rs | 2 +- ben/tests/test_cli.rs | 6 +-- ben/tests/test_coverage.rs | 2 +- ben/tests/test_stress_edges.rs | 4 +- pyben/src/decode/decoder.rs | 12 +++--- pyben/src/encode/encoder.rs | 6 +-- pyben/tests/test_bundle.py | 16 ++++---- 30 files changed, 200 insertions(+), 200 deletions(-) diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 83644f7..e674eaa 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -1,6 +1,6 @@ use super::paths::count_jsonl_lines; use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; -use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH}; +use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, STANDARDIZED_NAME_GRAPH}; use crate::io::bundle::writer::BendlAppender; use crate::io::bundle::{AddAssetOptions, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; @@ -27,7 +27,7 @@ pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<() appender .add_asset( ASSET_TYPE_GRAPH, - CANONICAL_NAME_GRAPH, + STANDARDIZED_NAME_GRAPH, &graph_bytes, AddAssetOptions::defaults().json(), ) diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index 6585025..586ac99 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -363,7 +363,7 @@ fn run_encode_bundle_with_graph_creates_bendl() { let file = fs::File::open(&out).unwrap(); let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert!(reader.find_asset_by_name("graph.json").is_some()); assert_eq!(reader.sample_count(), Some(2)); @@ -394,7 +394,7 @@ fn run_xencode_bundle_with_graph_from_jsonl_creates_bendl() { let file = fs::File::open(&out).unwrap(); let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert!(reader.find_asset_by_name("graph.json").is_some()); fs::remove_file(&jsonl).unwrap(); @@ -432,7 +432,7 @@ fn run_xencode_bundle_with_graph_from_ben_creates_bendl() { let file = fs::File::open(&out).unwrap(); let reader = BendlReader::open(std::io::BufReader::new(file)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert!(reader.find_asset_by_name("graph.json").is_some()); fs::remove_file(&ben_path).unwrap(); diff --git a/ben/src/cli/bendl/append.rs b/ben/src/cli/bendl/append.rs index cc801d0..6ac77b6 100644 --- a/ben/src/cli/bendl/append.rs +++ b/ben/src/cli/bendl/append.rs @@ -1,7 +1,7 @@ use super::args::{AppendArgs, NamedAsset}; use super::helpers::append_file_asset; use crate::io::bundle::format::{ - ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, }; use crate::io::bundle::writer::BendlAppender; use crate::io::bundle::AddAssetOptions; @@ -36,11 +36,11 @@ pub(super) fn run_append(args: AppendArgs) -> Result<(), String> { append_file_asset(&mut appender, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; added += 1; } - if let Some(ref path) = args.relabel_map { + if let Some(ref path) = args.node_permutation_map { append_file_asset( &mut appender, - ASSET_TYPE_RELABEL_MAP, - "relabel_map.json", + ASSET_TYPE_NODE_PERMUTATION_MAP, + "node_permutation_map.json", path, AddAssetOptions::defaults().json(), )?; diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs index 4522ab2..998772f 100644 --- a/ben/src/cli/bendl/args.rs +++ b/ben/src/cli/bendl/args.rs @@ -62,15 +62,15 @@ pub(super) struct CreateArgs { #[arg(short = 'o', long)] pub output: PathBuf, /// Optional `graph.json` asset path. Will be stored under the - /// canonical name `graph.json` and xz-compressed by default. + /// standardized name `graph.json` and xz-compressed by default. #[arg(long)] pub graph: Option, - /// Optional `metadata.json` asset path. Stored under canonical name. + /// Optional `metadata.json` asset path. Stored under standardized name. #[arg(long)] pub metadata: Option, - /// Optional `relabel_map.json` asset path. Stored under canonical name. + /// Optional `node_permutation_map.json` asset path. Stored under standardized name. #[arg(long)] - pub relabel_map: Option, + pub node_permutation_map: Option, /// Additional custom assets, specified as `NAME=PATH`. May be repeated. #[arg(long = "asset")] pub assets: Vec, @@ -118,9 +118,9 @@ pub(super) struct AppendArgs { /// Optional `metadata.json` asset path to add. #[arg(long)] pub metadata: Option, - /// Optional `relabel_map.json` asset path to add. + /// Optional `node_permutation_map.json` asset path to add. #[arg(long)] - pub relabel_map: Option, + pub node_permutation_map: Option, /// Additional custom assets, specified as `NAME=PATH`. May be repeated. #[arg(long = "asset")] pub assets: Vec, diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs index 7709680..9262fca 100644 --- a/ben/src/cli/bendl/create.rs +++ b/ben/src/cli/bendl/create.rs @@ -2,7 +2,7 @@ use super::args::{CreateArgs, NamedAsset}; use super::helpers::{add_file_asset, format_from_path, mode_str}; use crate::cli::common::check_overwrite; use crate::io::bundle::format::{ - ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_RELABEL_MAP, + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, }; use crate::io::bundle::{AddAssetOptions, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; @@ -46,11 +46,11 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { }; add_file_asset(&mut writer, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; } - if let Some(ref path) = args.relabel_map { + if let Some(ref path) = args.node_permutation_map { add_file_asset( &mut writer, - ASSET_TYPE_RELABEL_MAP, - "relabel_map.json", + ASSET_TYPE_NODE_PERMUTATION_MAP, + "node_permutation_map.json", path, AddAssetOptions::defaults().json(), )?; diff --git a/ben/src/cli/bendl/inspect.rs b/ben/src/cli/bendl/inspect.rs index c66bbca..1679698 100644 --- a/ben/src/cli/bendl/inspect.rs +++ b/ben/src/cli/bendl/inspect.rs @@ -18,7 +18,7 @@ pub(super) fn run_inspect(args: InspectArgs) -> Result<(), String> { "version: {}.{}", header.major_version, header.minor_version ); - println!("complete: {}", reader.is_complete()); + println!("finalized: {}", reader.is_finalized()); println!( "assignment_format: {}", match reader.assignment_format() { diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index c6bb697..2a2c849 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -26,7 +26,7 @@ fn write_temp_bendl_xben_variant_works() { // Exercises the Xben branch of write_temp_bendl. let path = write_temp_bendl("xben_helper_check.bendl", AssignmentFormat::Xben); let reader = BendlReader::open(BufReader::new(std::fs::File::open(&path).unwrap())).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); let _ = std::fs::remove_file(&path); } @@ -79,7 +79,7 @@ fn run_create_with_relabel_map_and_custom_asset() { output: out.clone(), graph: None, metadata: None, - relabel_map: Some(relabel.clone()), + node_permutation_map: Some(relabel.clone()), assets: vec![asset_str.parse().unwrap()], overwrite: false, graph_raw: false, @@ -87,7 +87,7 @@ fn run_create_with_relabel_map_and_custom_asset() { run_create(args).unwrap(); let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); - assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + assert!(reader.find_asset_by_name("node_permutation_map.json").is_some()); assert!(reader.find_asset_by_name("myblob").is_some()); for p in [&ben, &relabel, &custom, &out] { @@ -134,7 +134,7 @@ fn run_append_no_assets_is_noop() { input: bendl.clone(), graph: None, metadata: None, - relabel_map: None, + node_permutation_map: None, assets: vec![], graph_raw: false, }; @@ -142,7 +142,7 @@ fn run_append_no_assets_is_noop() { // File should be unchanged (bundle is still valid). let reader = BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); let _ = std::fs::remove_file(&bendl); } @@ -158,7 +158,7 @@ fn run_append_with_metadata_and_relabel_map() { input: bendl.clone(), graph: None, metadata: Some(meta.clone()), - relabel_map: Some(relabel.clone()), + node_permutation_map: Some(relabel.clone()), assets: vec![], graph_raw: false, }; @@ -167,7 +167,7 @@ fn run_append_with_metadata_and_relabel_map() { let reader = BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); assert!(reader.find_asset_by_name("metadata.json").is_some()); - assert!(reader.find_asset_by_name("relabel_map.json").is_some()); + assert!(reader.find_asset_by_name("node_permutation_map.json").is_some()); for p in [&bendl, &meta, &relabel] { let _ = std::fs::remove_file(p); @@ -196,7 +196,7 @@ fn run_create_with_graph_raw_flag() { output: out.clone(), graph: Some(graph.clone()), metadata: None, - relabel_map: None, + node_permutation_map: None, assets: vec![], overwrite: false, graph_raw: true, @@ -214,16 +214,16 @@ fn run_create_with_graph_raw_flag() { #[test] fn run_inspect_unknown_format_and_no_sample_count() { use crate::io::bundle::format::{ - BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, HEADER_SIZE, + BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, HEADER_SIZE, }; // Build a header with an unknown assignment format byte and - // complete=0 so sample_count() returns None. + // finalized=0 so sample_count() returns None. let mut header = [0u8; HEADER_SIZE]; header[0..8].copy_from_slice(&BENDL_MAGIC); header[8..10].copy_from_slice(&BENDL_MAJOR_VERSION.to_le_bytes()); header[10..12].copy_from_slice(&BENDL_MINOR_VERSION.to_le_bytes()); - header[12] = COMPLETE_NO; + header[12] = FINALIZED_NO; header[13] = 0xFF; // unknown format byte // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 let stream_offset = HEADER_SIZE as u64; @@ -250,7 +250,7 @@ fn run_append_with_graph_raw_and_graph_asset() { input: bendl.clone(), graph: Some(graph.clone()), metadata: None, - relabel_map: None, + node_permutation_map: None, assets: vec![], graph_raw: true, }; @@ -297,7 +297,7 @@ fn run_create_errors_on_missing_metadata_file() { output: out.clone(), graph: None, metadata: Some(unique_path("nonexistent_meta.json")), - relabel_map: None, + node_permutation_map: None, assets: vec![], overwrite: false, graph_raw: false, @@ -331,7 +331,7 @@ fn run_create_errors_on_missing_relabel_map_file() { output: out.clone(), graph: None, metadata: None, - relabel_map: Some(unique_path("nonexistent_relabel.json")), + node_permutation_map: Some(unique_path("nonexistent_relabel.json")), assets: vec![], overwrite: false, graph_raw: false, @@ -367,7 +367,7 @@ fn run_create_errors_on_missing_custom_asset_file() { output: out.clone(), graph: None, metadata: None, - relabel_map: None, + node_permutation_map: None, assets: vec![asset_str.parse().unwrap()], overwrite: false, graph_raw: false, @@ -423,7 +423,7 @@ fn run_append_errors_on_missing_metadata_file() { input: bendl.clone(), graph: None, metadata: Some(unique_path("nonexistent_meta.json")), - relabel_map: None, + node_permutation_map: None, assets: vec![], graph_raw: false, }; @@ -439,7 +439,7 @@ fn run_append_errors_on_missing_relabel_map_file() { input: bendl.clone(), graph: None, metadata: None, - relabel_map: Some(unique_path("nonexistent_relabel.json")), + node_permutation_map: Some(unique_path("nonexistent_relabel.json")), assets: vec![], graph_raw: false, }; @@ -457,7 +457,7 @@ fn run_append_errors_on_missing_custom_asset_file() { input: bendl.clone(), graph: None, metadata: None, - relabel_map: None, + node_permutation_map: None, assets: vec![asset_str.parse().unwrap()], graph_raw: false, }; diff --git a/ben/src/cli/reben/ben_mode.rs b/ben/src/cli/reben/ben_mode.rs index a46b9cf..fd2c901 100644 --- a/ben/src/cli/reben/ben_mode.rs +++ b/ben/src/cli/reben/ben_mode.rs @@ -1,6 +1,6 @@ use super::args::Args; use super::helpers::{ - ben_variant_name, ordering_method_name, read_relabel_map_file, relabeling_label, + ben_variant_name, ordering_method_name, read_node_permutation_map_file, relabeling_label, to_ben_variant, to_graph_ordering, }; use crate::json::graph::{sort_json_file_by_key, sort_json_file_by_ordering}; @@ -125,7 +125,7 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { "output_file": output_file_name, "key": args.key.as_ref(), "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map + "node_permutation_old_to_new": map }); map_writer @@ -141,7 +141,7 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { .to_owned(); } - let (new_to_old_node_map, label) = read_relabel_map_file(&map_file_name)?; + let (new_to_old_node_map, label) = read_node_permutation_map_file(&map_file_name)?; let output_file_name = match args.output_file { Some(name) => name, diff --git a/ben/src/cli/reben/helpers.rs b/ben/src/cli/reben/helpers.rs index f2d5ba5..90d98da 100644 --- a/ben/src/cli/reben/helpers.rs +++ b/ben/src/cli/reben/helpers.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::BufReader; -pub(super) fn read_relabel_map_file( +pub(super) fn read_node_permutation_map_file( map_file_name: &str, ) -> Result<(HashMap, String), String> { let map_file = File::open(map_file_name) @@ -17,12 +17,12 @@ pub(super) fn read_relabel_map_file( .map_err(|e| format!("Could not parse map file {map_file_name:?} as JSON: {e}"))?; let map_obj = data - .get("relabeling_old_to_new_nodes_map") + .get("node_permutation_old_to_new") .and_then(Value::as_object) .ok_or_else(|| { format!( "Map file {map_file_name:?} must contain object field \ - relabeling_old_to_new_nodes_map" + node_permutation_old_to_new" ) })?; diff --git a/ben/src/cli/reben/json_mode.rs b/ben/src/cli/reben/json_mode.rs index 164a0da..585f331 100644 --- a/ben/src/cli/reben/json_mode.rs +++ b/ben/src/cli/reben/json_mode.rs @@ -50,7 +50,7 @@ pub(super) fn run_json_mode(args: Args) -> Result<(), String> { "output_file": output_file_name, "key": args.key.as_ref(), "ordering_method": args.ordering.as_ref().map(ordering_method_name), - "relabeling_old_to_new_nodes_map": map + "node_permutation_old_to_new": map }); map_writer diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 2ca34eb..062cefb 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -1,7 +1,7 @@ use super::args::{Args, BenCliVariant, Mode, OrderingMethod}; use super::ben_mode::run_ben_mode; use super::helpers::{ - ben_variant_name, read_relabel_map_file, relabeling_label, to_ben_variant, + ben_variant_name, read_node_permutation_map_file, relabeling_label, to_ben_variant, }; use super::json_mode::run_json_mode; use crate::codec::encode::encode_jsonl_to_ben; @@ -243,7 +243,7 @@ fn run_ben_mode_with_map_file_and_n_items() { let map_path = unique_path("map_n_items_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + b"{\"node_permutation_old_to_new\":{\"0\":2,\"1\":0,\"2\":1}}", ) .unwrap(); @@ -279,7 +279,7 @@ fn run_ben_mode_with_map_file_no_limit() { let map_path = unique_path("map_nolimit_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + b"{\"node_permutation_old_to_new\":{\"0\":2,\"1\":0,\"2\":1}}", ) .unwrap(); @@ -382,7 +382,7 @@ fn run_ben_mode_with_map_file_and_output_variant_n_items() { let map_path = unique_path("map_var_n_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + b"{\"node_permutation_old_to_new\":{\"0\":2,\"1\":0,\"2\":1}}", ) .unwrap(); let out = unique_path("map_var_n_output.jsonl.ben"); @@ -418,7 +418,7 @@ fn run_ben_mode_with_map_file_and_output_variant_no_limit() { let map_path = unique_path("map_var_nolim_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1}}", + b"{\"node_permutation_old_to_new\":{\"0\":2,\"1\":0,\"2\":1}}", ) .unwrap(); let out = unique_path("map_var_nolim_output.jsonl.ben"); @@ -453,7 +453,7 @@ fn run_ben_mode_map_file_without_output_file_derives_name() { let map_path = unique_path("map_derive_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":2,\"1\":0,\"2\":1},\"key\":\"sort\"}", + b"{\"node_permutation_old_to_new\":{\"0\":2,\"1\":0,\"2\":1},\"key\":\"sort\"}", ) .unwrap(); let args = Args::try_parse_from([ @@ -481,27 +481,27 @@ fn run_ben_mode_map_file_without_output_file_derives_name() { } #[test] -fn read_relabel_map_file_rejects_non_integer_index() { +fn read_node_permutation_map_file_rejects_non_integer_index() { let map_path = unique_path("bad_index_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"not_a_number\":0}}", + b"{\"node_permutation_old_to_new\":{\"not_a_number\":0}}", ) .unwrap(); - let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + let err = read_node_permutation_map_file(map_path.to_str().unwrap()).unwrap_err(); assert!(err.contains("invalid old node index")); let _ = fs::remove_file(&map_path); } #[test] -fn read_relabel_map_file_rejects_non_integer_value() { +fn read_node_permutation_map_file_rejects_non_integer_value() { let map_path = unique_path("bad_value_map.json"); fs::write( &map_path, - b"{\"relabeling_old_to_new_nodes_map\":{\"0\":\"not_a_number\"}}", + b"{\"node_permutation_old_to_new\":{\"0\":\"not_a_number\"}}", ) .unwrap(); - let err = read_relabel_map_file(map_path.to_str().unwrap()).unwrap_err(); + let err = read_node_permutation_map_file(map_path.to_str().unwrap()).unwrap_err(); assert!(err.contains("non-integer")); let _ = fs::remove_file(&map_path); } diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index 359542f..7a6e512 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -16,7 +16,7 @@ pub enum EncodeError { #[error("line {line}: value `{value}` cannot be represented as u16")] InvalidAssignmentValue { line: usize, value: u64 }, - #[error("TwoDelta transition involves more than two distinct assignment ids")] + #[error("TwoDelta transition involves more than two distinct district ids")] TwoDeltaTooManyIds, #[error("TwoDelta received identical assignment to previous frame")] diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 7c473d6..17f595d 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -12,7 +12,7 @@ use std::io::{Error, ErrorKind, Result}; /// * `new_assignment` - The full assignment vector for the sample being encoded. /// * `delta_pair` - An optional hint asserting which pair of ids is involved in the /// transition. Must be provided together with `previous_masks`, and the two ids must be distinct. -/// * `previous_masks` - An optional mutable map from assignment id to the sorted list of positions +/// * `previous_masks` - An optional mutable map from district id to the sorted list of positions /// it occupies in `previous_assignment`. When provided, the map is updated in-place to /// reflect `new_assignment` before returning. /// @@ -24,7 +24,7 @@ use std::io::{Error, ErrorKind, Result}; /// # TwoDelta encoding /// /// A TwoDelta frame is valid only when every position that changes between -/// `previous_assignment` and `new_assignment` involves exactly two assignment ids +/// `previous_assignment` and `new_assignment` involves exactly two district ids /// (call them A and B), and no position outside that pair changes. The frame stores /// the pair and the lengths of alternating runs of A and B over the positions /// occupied by the pair, ordered by position. The first run always corresponds to @@ -38,7 +38,7 @@ use std::io::{Error, ErrorKind, Result}; /// the transition. Must be provided together with `previous_masks`. The pair must have two /// distinct ids — passing `(x, x)` is an error. /// -/// - `previous_masks`: A mutable map from assignment id to the sorted list of positions it +/// - `previous_masks`: A mutable map from district id to the sorted list of positions it /// occupies in `previous_assignment`. When provided, the function reads positions /// directly from the map instead of scanning the assignment vector, and updates /// the map in-place to reflect `new_assignment` before returning. The previous_masks must @@ -119,7 +119,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( /// /// # Arguments /// -/// * `pair` - The two assignment ids to validate and order. +/// * `pair` - The two district ids to validate and order. /// * `previous_masks` - The position mask map to look up entries in. /// /// # Returns @@ -421,7 +421,7 @@ fn construct_twodelta_frame_from_scratch( /// the scan entirely. /// /// The transition is valid only when all changed positions involve exactly two -/// assignment ids and positions outside that pair remain unchanged. +/// district ids and positions outside that pair remain unchanged. /// /// # Arguments /// diff --git a/ben/src/codec/frames/twodelta_decode.rs b/ben/src/codec/frames/twodelta_decode.rs index 691a2aa..82330dd 100644 --- a/ben/src/codec/frames/twodelta_decode.rs +++ b/ben/src/codec/frames/twodelta_decode.rs @@ -11,7 +11,7 @@ use std::io::{self, Read}; /// `pair`, `run_lengths`, and `count`. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TwoDeltaDecodeFrame { - /// The ordered pair of assignment ids involved in the delta. + /// The ordered pair of district ids involved in the delta. pub pair: (u16, u16), /// The unpacked run-length vector over the positions occupied by the pair. pub run_lengths: Vec, diff --git a/ben/src/codec/frames/twodelta_encode.rs b/ben/src/codec/frames/twodelta_encode.rs index 56cb451..2663666 100644 --- a/ben/src/codec/frames/twodelta_encode.rs +++ b/ben/src/codec/frames/twodelta_encode.rs @@ -1,11 +1,11 @@ /// Canonical representation of a TwoDelta frame. /// -/// A TwoDelta frame stores the two assignment ids that may change relative to +/// A TwoDelta frame stores the two district ids that may change relative to /// the previous sample and then encodes the lengths of alternating runs over /// just those two ids. The first run always corresponds to `pair.0`. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TwoDeltaEncodeFrame { - // The pair of assignment ids that are encoded in this frame, stored here for reference. + // The pair of district ids that are encoded in this frame, stored here for reference. // Canonically, `pair.0` is the id for the first run in the run-length vector and `pair.1` // is the id for the second run. pub pair: (u16, u16), @@ -57,7 +57,7 @@ impl TwoDeltaEncodeFrame { /// /// # Arguments /// - /// * `pair` - The ordered pair of assignment ids. `pair.0` corresponds to the first run. + /// * `pair` - The ordered pair of district ids. `pair.0` corresponds to the first run. /// * `run_length_vector` - The lengths of alternating runs of `pair.0` and `pair.1` /// over the positions occupied by the pair, in position order. /// @@ -132,7 +132,7 @@ impl TwoDeltaEncodeFrame { /// /// # Arguments /// - /// * `pair` - The ordered pair of assignment ids as read from the frame header. + /// * `pair` - The ordered pair of district ids as read from the frame header. /// * `max_len_bit_count` - The bit width of each packed run length, as read from the /// frame header. /// * `payload` - The raw packed payload bytes, not including the 9-byte header. diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 034d0ff..c7fefac 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -27,10 +27,10 @@ pub const BENDL_MINOR_VERSION: u16 = 0; /// Size of the fixed header in bytes. pub const HEADER_SIZE: usize = 64; -/// `complete` flag value for incomplete (unfinalized) bundles. -pub const COMPLETE_NO: u8 = 0; -/// `complete` flag value for finalized bundles. -pub const COMPLETE_YES: u8 = 1; +/// `finalized` flag value for incomplete (unfinalized) bundles. +pub const FINALIZED_NO: u8 = 0; +/// `finalized` flag value for finalized bundles. +pub const FINALIZED_YES: u8 = 1; // --------------------------------------------------------------------------- // Assignment format identifiers @@ -74,32 +74,32 @@ impl AssignmentFormat { } // --------------------------------------------------------------------------- -// Asset types, flags, canonical names +// Asset types, flags, standardized names // --------------------------------------------------------------------------- /// Asset type id for `metadata.json`. pub const ASSET_TYPE_METADATA: u16 = 1; /// Asset type id for `graph.json`. pub const ASSET_TYPE_GRAPH: u16 = 2; -/// Asset type id for `relabel_map.json`. -pub const ASSET_TYPE_RELABEL_MAP: u16 = 3; +/// Asset type id for `node_permutation_map.json`. +pub const ASSET_TYPE_NODE_PERMUTATION_MAP: u16 = 3; /// Asset type id for a custom user asset (name chosen by writer). pub const ASSET_TYPE_CUSTOM: u16 = 4; -/// Canonical name for the `metadata.json` asset. -pub const CANONICAL_NAME_METADATA: &str = "metadata.json"; -/// Canonical name for the `graph.json` asset. -pub const CANONICAL_NAME_GRAPH: &str = "graph.json"; -/// Canonical name for the `relabel_map.json` asset. -pub const CANONICAL_NAME_RELABEL_MAP: &str = "relabel_map.json"; +/// Standardized name for the `metadata.json` asset. +pub const STANDARDIZED_NAME_METADATA: &str = "metadata.json"; +/// Standardized name for the `graph.json` asset. +pub const STANDARDIZED_NAME_GRAPH: &str = "graph.json"; +/// Standardized name for the `node_permutation_map.json` asset. +pub const STANDARDIZED_NAME_NODE_PERMUTATION_MAP: &str = "node_permutation_map.json"; -/// Return the canonical name reserved for a known singleton asset type, +/// Return the standardized name reserved for a known singleton asset type, /// or `None` for custom or unknown types. -pub fn canonical_name_for(asset_type: u16) -> Option<&'static str> { +pub fn standardized_name_for(asset_type: u16) -> Option<&'static str> { match asset_type { - ASSET_TYPE_METADATA => Some(CANONICAL_NAME_METADATA), - ASSET_TYPE_GRAPH => Some(CANONICAL_NAME_GRAPH), - ASSET_TYPE_RELABEL_MAP => Some(CANONICAL_NAME_RELABEL_MAP), + ASSET_TYPE_METADATA => Some(STANDARDIZED_NAME_METADATA), + ASSET_TYPE_GRAPH => Some(STANDARDIZED_NAME_GRAPH), + ASSET_TYPE_NODE_PERMUTATION_MAP => Some(STANDARDIZED_NAME_NODE_PERMUTATION_MAP), _ => None, } } @@ -138,7 +138,7 @@ pub struct BendlHeader { /// Additive backward-compatible version. pub minor_version: u16, /// `1` if the bundle was successfully finalized, else `0`. - pub complete: u8, + pub finalized: u8, /// Container format of the embedded assignment stream. pub assignment_format: u8, /// Padding after `assignment_format`; writers set to zero, readers ignore. @@ -167,7 +167,7 @@ impl BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: assignment_format.to_u8(), reserved_0: 0, flags: 0, @@ -180,8 +180,8 @@ impl BendlHeader { } /// Whether the bundle has been finalized. - pub fn is_complete(&self) -> bool { - self.complete == COMPLETE_YES + pub fn is_finalized(&self) -> bool { + self.finalized == FINALIZED_YES } /// Typed view of the embedded assignment format. @@ -195,7 +195,7 @@ impl BendlHeader { out[0..8].copy_from_slice(&self.magic); out[8..10].copy_from_slice(&self.major_version.to_le_bytes()); out[10..12].copy_from_slice(&self.minor_version.to_le_bytes()); - out[12] = self.complete; + out[12] = self.finalized; out[13] = self.assignment_format; out[14..16].copy_from_slice(&self.reserved_0.to_le_bytes()); out[16..24].copy_from_slice(&self.flags.to_le_bytes()); @@ -228,7 +228,7 @@ impl BendlHeader { magic, major_version, minor_version, - complete: bytes[12], + finalized: bytes[12], assignment_format: bytes[13], reserved_0: u16::from_le_bytes(bytes[14..16].try_into().unwrap()), flags: u64::from_le_bytes(bytes[16..24].try_into().unwrap()), @@ -268,7 +268,7 @@ pub struct BendlDirectoryEntry { pub asset_type: u16, /// Encoding/compression flags for this asset. pub asset_flags: u16, - /// UTF-8 asset name. Must match the canonical name for singleton types. + /// UTF-8 asset name. Must match the standardized name for singleton types. pub name: String, /// Absolute file offset of the asset payload. pub payload_offset: u64, diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 6b41a98..b2baf4d 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -13,7 +13,7 @@ use std::io::{self, Read, Seek, SeekFrom, Take}; use xz2::read::XzDecoder; use super::format::{ - canonical_name_for, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, + standardized_name_for, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_XZ, }; @@ -62,14 +62,14 @@ impl BendlReader { } /// Whether the bundle was successfully finalized. - pub fn is_complete(&self) -> bool { - self.header.is_complete() + pub fn is_finalized(&self) -> bool { + self.header.is_finalized() } /// The sample count recorded in the header, or `None` if not /// authoritative (i.e. the bundle is still incomplete). pub fn sample_count(&self) -> Option { - if self.header.is_complete() { + if self.header.is_finalized() { Some(self.header.sample_count) } else { None @@ -93,8 +93,8 @@ impl BendlReader { /// Look up the unique directory entry with the given asset type, if /// any. Singleton types (`metadata.json`, `graph.json`, - /// `relabel_map.json`) use this to grab their payload without caring - /// about the canonical name. + /// `node_permutation_map.json`) use this to grab their payload without caring + /// about the standardized name. pub fn find_asset_by_type(&self, asset_type: u16) -> Option<&BendlDirectoryEntry> { self.directory.iter().find(|e| e.asset_type == asset_type) } @@ -106,7 +106,7 @@ impl BendlReader { /// stream is taken as EOF (or the directory start, if a provisional /// directory was written). pub fn assignment_stream_range(&mut self) -> io::Result<(u64, u64)> { - if self.header.is_complete() { + if self.header.is_finalized() { Ok((self.header.stream_offset, self.header.stream_len)) } else { let end = if self.header.directory_offset != 0 { @@ -214,7 +214,7 @@ pub(crate) fn validate_directory_entries( if !seen_names.insert(entry.name.as_str()) { return Err(BundleValidationError::DuplicateName(entry.name.clone())); } - if let Some(canonical) = canonical_name_for(entry.asset_type) { + if let Some(canonical) = standardized_name_for(entry.asset_type) { if entry.name != canonical { return Err(BundleValidationError::WrongCanonicalName { asset_type: entry.asset_type, @@ -274,12 +274,12 @@ pub enum BundleValidationError { #[error("duplicate asset name: {0:?}")] DuplicateName(String), - /// An entry with a known singleton type is not using its canonical name. - #[error("asset type {asset_type} must use canonical name {expected:?}, found {found:?}")] + /// An entry with a known singleton type is not using its standardized name. + #[error("asset type {asset_type} must use standardized name {expected:?}, found {found:?}")] WrongCanonicalName { - /// The asset type whose canonical name was violated. + /// The asset type whose standardized name was violated. asset_type: u16, - /// The canonical name the writer should have used. + /// The standardized name the writer should have used. expected: String, /// The name that was actually written. found: String, diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index bafccdc..cc03c79 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -9,25 +9,25 @@ fn magic_is_eight_bytes_and_matches_spec() { } #[test] -fn canonical_name_lookup() { +fn standardized_name_lookup() { assert_eq!( - canonical_name_for(ASSET_TYPE_METADATA), + standardized_name_for(ASSET_TYPE_METADATA), Some("metadata.json") ); - assert_eq!(canonical_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); + assert_eq!(standardized_name_for(ASSET_TYPE_GRAPH), Some("graph.json")); assert_eq!( - canonical_name_for(ASSET_TYPE_RELABEL_MAP), - Some("relabel_map.json") + standardized_name_for(ASSET_TYPE_NODE_PERMUTATION_MAP), + Some("node_permutation_map.json") ); - assert_eq!(canonical_name_for(ASSET_TYPE_CUSTOM), None); - assert_eq!(canonical_name_for(9999), None); + assert_eq!(standardized_name_for(ASSET_TYPE_CUSTOM), None); + assert_eq!(standardized_name_for(9999), None); } #[test] fn default_compression_policy() { assert!(default_compresses_by_type(ASSET_TYPE_GRAPH)); assert!(!default_compresses_by_type(ASSET_TYPE_METADATA)); - assert!(!default_compresses_by_type(ASSET_TYPE_RELABEL_MAP)); + assert!(!default_compresses_by_type(ASSET_TYPE_NODE_PERMUTATION_MAP)); assert!(!default_compresses_by_type(ASSET_TYPE_CUSTOM)); } @@ -52,7 +52,7 @@ fn header_round_trip_provisional() { let header = BendlHeader::provisional(AssignmentFormat::Xben, 64); let decoded = BendlHeader::from_bytes(&header.to_bytes()).unwrap(); assert_eq!(header, decoded); - assert!(!decoded.is_complete()); + assert!(!decoded.is_finalized()); assert_eq!( decoded.assignment_format_typed(), Some(AssignmentFormat::Xben) @@ -68,7 +68,7 @@ fn header_round_trip_finalized() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: ASSIGNMENT_FORMAT_BEN, reserved_0: 0, flags: 0x0000_0000_0000_000F, @@ -81,7 +81,7 @@ fn header_round_trip_finalized() { let bytes = header.to_bytes(); let decoded = BendlHeader::from_bytes(&bytes).unwrap(); assert_eq!(decoded, header); - assert!(decoded.is_complete()); + assert!(decoded.is_finalized()); } #[test] @@ -108,7 +108,7 @@ fn directory_entry_round_trip_no_checksum() { let entry = BendlDirectoryEntry { asset_type: ASSET_TYPE_GRAPH, asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), + name: STANDARDIZED_NAME_GRAPH.to_string(), payload_offset: 128, payload_len: 4096, checksum: None, @@ -146,7 +146,7 @@ fn directory_table_round_trip() { BendlDirectoryEntry { asset_type: ASSET_TYPE_GRAPH, asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), + name: STANDARDIZED_NAME_GRAPH.to_string(), payload_offset: 64, payload_len: 2048, checksum: None, @@ -154,7 +154,7 @@ fn directory_table_round_trip() { BendlDirectoryEntry { asset_type: ASSET_TYPE_METADATA, asset_flags: ASSET_FLAG_JSON, - name: CANONICAL_NAME_METADATA.to_string(), + name: STANDARDIZED_NAME_METADATA.to_string(), payload_offset: 2112, payload_len: 128, checksum: None, diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index c7b850e..01a98bc 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -5,8 +5,8 @@ use xz2::write::XzEncoder; use crate::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, - ASSET_TYPE_RELABEL_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_NO, - COMPLETE_YES, HEADER_SIZE, + ASSET_TYPE_NODE_PERMUTATION_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, + FINALIZED_YES, HEADER_SIZE, }; use crate::io::bundle::reader::{ validate_directory_entries, BendlReader, BundleAssignmentReaderError, BundleValidationError, @@ -75,7 +75,7 @@ fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -94,7 +94,7 @@ fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { fn open_finalized_bundle_and_read_metadata() { let (bytes, _, _, _) = build_finalized_bundle(); let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(42)); assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); assert_eq!(reader.assets().len(), 2); @@ -151,7 +151,7 @@ fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -165,7 +165,7 @@ fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { bytes.extend_from_slice(&fake_stream); let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); + assert!(!reader.is_finalized()); assert_eq!(reader.sample_count(), None); assert!(reader.assets().is_empty()); @@ -275,7 +275,7 @@ fn build_basic_finalized_bundle() -> Vec { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -415,7 +415,7 @@ fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -428,7 +428,7 @@ fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { let mut bytes = Vec::new(); bytes.extend_from_slice(&header.to_bytes()); let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); + assert!(!reader.is_finalized()); assert_eq!(reader.sample_count(), None); } @@ -464,7 +464,7 @@ fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -490,7 +490,7 @@ fn validate_directory_catches_duplicate_singleton_types() { // Two entries of type METADATA. The second one uses a non-canonical // name to confirm the canonical-name check fires (it lands first // here, and is the path we cover; the singleton check is exercised - // elsewhere via duplicate canonical names). + // elsewhere via duplicate standardized names). let entries = vec![ BendlDirectoryEntry { asset_type: ASSET_TYPE_METADATA, @@ -542,9 +542,9 @@ fn validate_directory_accepts_well_formed_multi_singleton_bundle() { checksum: None, }, BendlDirectoryEntry { - asset_type: ASSET_TYPE_RELABEL_MAP, + asset_type: ASSET_TYPE_NODE_PERMUTATION_MAP, asset_flags: ASSET_FLAG_JSON, - name: "relabel_map.json".to_string(), + name: "node_permutation_map.json".to_string(), payload_offset: 72, payload_len: 4, checksum: None, @@ -610,7 +610,7 @@ fn stress_thousand_custom_assets_round_trip() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -663,7 +663,7 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -698,7 +698,7 @@ fn reader_scales_to_very_wide_stream_offset_field() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -743,7 +743,7 @@ fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -759,7 +759,7 @@ fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { bytes.extend_from_slice(fake_dir); let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); - assert!(!reader.is_complete()); + assert!(!reader.is_finalized()); let (offset, len) = reader.assignment_stream_range().unwrap(); assert_eq!(offset, stream_start); diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 262e519..aeb12f9 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -3,7 +3,7 @@ use std::io::{self, Cursor, Read, Write}; use crate::io::bundle::format::{ AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, - BENDL_MINOR_VERSION, COMPLETE_NO, COMPLETE_YES, HEADER_SIZE, + BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, }; use crate::io::bundle::reader::{BendlReader, BundleAssignmentReader}; use crate::io::bundle::writer::{ @@ -25,7 +25,7 @@ fn minimal_bundle_round_trip_through_reader() { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(7)); assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); assert_eq!(reader.assets().len(), 1); @@ -114,7 +114,7 @@ fn writer_rejects_second_graph() { } #[test] -fn writer_rejects_wrong_canonical_name_for_singleton() { +fn writer_rejects_wrong_standardized_name_for_singleton() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let err = writer .add_json_asset(ASSET_TYPE_GRAPH, "graph_but_wrong_name.json", b"{}") @@ -173,7 +173,7 @@ fn asset_only_bundle_finalizes_with_empty_stream() { let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(0)); assert_eq!(reader.header().stream_len, 0); } @@ -236,7 +236,7 @@ fn append_adds_new_asset_and_preserves_old_entries() { assert!(reader.find_asset_by_name("metadata.json").is_some()); assert!(reader.find_asset_by_name("graph.json").is_some()); // Finalized bundle invariants still hold. - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(3)); } @@ -376,7 +376,7 @@ fn append_rejects_incomplete_bundle() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_NO, + finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -405,7 +405,7 @@ fn append_rejects_complete_bundle_with_zero_directory() { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, @@ -516,7 +516,7 @@ fn write_ben_stream_round_trips_through_assignment_reader() { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); // Four write_assignment calls → sample_count == 4. assert_eq!(reader.sample_count(), Some(samples.len() as i64)); assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); @@ -559,7 +559,7 @@ fn write_xben_stream_round_trips_through_assignment_reader() { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(samples.len() as i64)); assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); @@ -656,7 +656,7 @@ fn fully_empty_bundle_finalizes_and_round_trips() { let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); assert_eq!(reader.sample_count(), Some(0)); assert_eq!(reader.header().stream_len, 0); assert_eq!(reader.assets().len(), 0); @@ -899,7 +899,7 @@ fn finished_writer_rejects_further_operations() { let buf = writer.finish().unwrap().into_inner(); // The resulting buffer is a valid finalized bundle. let reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); } #[test] @@ -948,10 +948,10 @@ fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { #[test] fn writer_rejected_add_leaves_singleton_slot_usable() { // A rejected singleton add must not consume the singleton slot — - // otherwise a future valid add with the correct canonical name + // otherwise a future valid add with the correct standardized name // would spuriously fail with DuplicateSingletonType. let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - // First try with wrong canonical name — rejected. + // First try with wrong standardized name — rejected. let _ = writer .add_json_asset(ASSET_TYPE_GRAPH, "not_graph.json", b"{}") .unwrap_err(); @@ -1064,7 +1064,7 @@ fn randomized_round_trip_many_custom_assets() { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - assert!(reader.is_complete(), "seed {seed}: not finalized"); + assert!(reader.is_finalized(), "seed {seed}: not finalized"); assert_eq!(reader.sample_count(), Some(sample_count)); reader .validate_directory() @@ -1128,7 +1128,7 @@ fn five_successive_appends_preserve_everything() { // Re-open and verify the full set is intact and sample_count // still matches the baseline (append must not touch it). let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); - assert!(reader.is_complete(), "round {round}"); + assert!(reader.is_finalized(), "round {round}"); assert_eq!( reader.sample_count(), baseline_samples, @@ -1324,5 +1324,5 @@ fn finish_from_finished_state_errors() { let buf = writer.finish().unwrap(); // Verify the result is usable let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); } diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 2f98a91..eb7cc06 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -34,9 +34,9 @@ use thiserror::Error; use xz2::write::XzEncoder; use super::format::{ - canonical_name_for, default_compresses_by_type, encode_directory, read_directory, + standardized_name_for, default_compresses_by_type, encode_directory, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_JSON, - ASSET_FLAG_XZ, COMPLETE_YES, DEFAULT_XZ_PRESET, HEADER_SIZE, + ASSET_FLAG_XZ, FINALIZED_YES, DEFAULT_XZ_PRESET, HEADER_SIZE, }; /// Ability to truncate an underlying seekable target to a given length. @@ -178,7 +178,7 @@ impl BendlWriter { } // Canonical-name rule for known singleton types. - if let Some(canonical) = canonical_name_for(asset_type) { + if let Some(canonical) = standardized_name_for(asset_type) { if name != canonical { return Err(BendlWriteError::WrongCanonicalName { asset_type, @@ -196,7 +196,7 @@ impl BendlWriter { // Roll back the singleton insertion before returning, so // the writer remains in a consistent state. (Only known // singleton types would have been inserted above.) - if canonical_name_for(asset_type).is_some() { + if standardized_name_for(asset_type).is_some() { self.singleton_types.remove(&asset_type); } return Err(BendlWriteError::DuplicateName(name.to_string())); @@ -408,7 +408,7 @@ impl BendlWriter { self.header.directory_len = directory_len; self.header.stream_len = stream_len; self.header.sample_count = sample_count; - self.header.complete = COMPLETE_YES; + self.header.finalized = FINALIZED_YES; self.inner.seek(SeekFrom::Start(0))?; self.header.write_to(&mut self.inner)?; @@ -532,12 +532,12 @@ pub enum BendlWriteError { #[error("duplicate singleton asset type: {0}")] DuplicateSingletonType(u16), - /// A singleton asset was added under the wrong canonical name. - #[error("asset type {asset_type} must use canonical name {expected:?}, got {found:?}")] + /// A singleton asset was added under the wrong standardized name. + #[error("asset type {asset_type} must use standardized name {expected:?}, got {found:?}")] WrongCanonicalName { - /// The asset type whose canonical name was violated. + /// The asset type whose standardized name was violated. asset_type: u16, - /// The canonical name the caller should have used. + /// The standardized name the caller should have used. expected: String, /// The name the caller actually provided. found: String, @@ -623,7 +623,7 @@ impl BendlAppender { pub fn open(mut inner: W) -> Result { inner.seek(SeekFrom::Start(0))?; let header = BendlHeader::read_from(&mut inner).map_err(BendlWriteError::Format)?; - if !header.is_complete() { + if !header.is_finalized() { return Err(BendlWriteError::BundleIncomplete); } if header.directory_offset == 0 || header.directory_len == 0 { @@ -647,7 +647,7 @@ impl BendlAppender { let mut existing_singleton_types = HashSet::new(); for entry in &existing_entries { existing_names.insert(entry.name.clone()); - if canonical_name_for(entry.asset_type).is_some() { + if standardized_name_for(entry.asset_type).is_some() { existing_singleton_types.insert(entry.asset_type); } } @@ -678,7 +678,7 @@ impl BendlAppender { options: AddAssetOptions, ) -> Result<(), BendlWriteError> { // Canonical-name rule. - if let Some(canonical) = canonical_name_for(asset_type) { + if let Some(canonical) = standardized_name_for(asset_type) { if name != canonical { return Err(BendlWriteError::WrongCanonicalName { asset_type, @@ -703,7 +703,7 @@ impl BendlAppender { .unwrap_or_else(|| default_compresses_by_type(asset_type)); self.pending_names.insert(name.to_string()); - if canonical_name_for(asset_type).is_some() { + if standardized_name_for(asset_type).is_some() { self.pending_singleton_types.insert(asset_type); } self.pending.push(PendingAsset { diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index faadfc8..d225845 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -45,7 +45,7 @@ fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result< Ok(permutation) } -/// Canonicalize an assignment vector by remapping labels in first-seen order. +/// Remap an assignment vector's district labels in first-seen order. /// /// # Arguments /// @@ -55,7 +55,7 @@ fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result< /// /// Returns a new vector with labels replaced by sequential integers starting at 1, /// assigned in the order they first appear. -fn canonicalize_assignment(assignment: &[u16]) -> Vec { +fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { let mut label_map = HashMap::new(); let mut next_label = 0u16; let mut out = Vec::with_capacity(assignment.len()); @@ -451,7 +451,7 @@ fn relabel_ben_file_impl( &mut writer, variant, max_samples, - |assignment| Ok(canonicalize_assignment(assignment)), + |assignment| Ok(first_seen_relabel_assignment(assignment)), )? } } @@ -735,7 +735,7 @@ pub fn relabel_ben_file_as_variant( let chained = Cursor::new(check_buffer).chain(reader); relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - Ok(canonicalize_assignment(&assignment)) + Ok(first_seen_relabel_assignment(&assignment)) }) } @@ -767,7 +767,7 @@ pub fn relabel_ben_file_as_variant_limit( writer, target_variant, Some(max_samples), - |assignment| Ok(canonicalize_assignment(assignment)), + |assignment| Ok(first_seen_relabel_assignment(assignment)), ) } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 3ac5f61..d14ec7b 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -962,13 +962,13 @@ fn test_permute_assignment_index_out_of_range() { assert!(err.to_string().contains("old index")); } -// ── canonicalize_assignment ────────────────────────────────────────── +// ── first_seen_relabel_assignment ────────────────────────────────────────── #[test] -fn test_canonicalize_assignment() { - assert_eq!(canonicalize_assignment(&[5, 3, 5, 7]), vec![1, 2, 1, 3]); - assert_eq!(canonicalize_assignment(&[]), Vec::::new()); - assert_eq!(canonicalize_assignment(&[42]), vec![1]); +fn test_first_seen_relabel_assignment() { + assert_eq!(first_seen_relabel_assignment(&[5, 3, 5, 7]), vec![1, 2, 1, 3]); + assert_eq!(first_seen_relabel_assignment(&[]), Vec::::new()); + assert_eq!(first_seen_relabel_assignment(&[42]), vec![1]); } // ── relabel_ben_lines_with_map: LengthMismatch ───────────────────── diff --git a/ben/src/test_utils.rs b/ben/src/test_utils.rs index 6d54cd8..0946860 100644 --- a/ben/src/test_utils.rs +++ b/ben/src/test_utils.rs @@ -124,6 +124,6 @@ mod tests { let bytes = sample_bendl_bytes(b"STANDARD BEN FILE\x00fake", AssignmentFormat::Ben); let reader = BendlReader::open(BufReader::new(Cursor::new(bytes))).unwrap(); - assert!(reader.is_complete()); + assert!(reader.is_finalized()); } } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 76197e4..f4ecd3a 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -935,7 +935,7 @@ fn reben_cli_rejects_map_referencing_missing_assignment_index() { fs::write( &map_path, - r#"{"key":"map","relabeling_old_to_new_nodes_map":{"0":0,"2":1}}"#, + r#"{"key":"map","node_permutation_old_to_new":{"0":0,"2":1}}"#, ) .unwrap(); @@ -980,7 +980,7 @@ fn reben_cli_rejects_map_referencing_missing_assignment_index() { let stderr = String::from_utf8_lossy(&malformed.stderr); assert!( stderr.contains("Error: Map file") - && stderr.contains("relabeling_old_to_new_nodes_map") + && stderr.contains("node_permutation_old_to_new") && !stderr.contains("panicked"), "stderr:\n{stderr}" ); @@ -1638,7 +1638,7 @@ fn bendl_cli_create_inspect_extract_append_roundtrip() { ); assert_success(&inspect); let inspect_out = String::from_utf8_lossy(&inspect.stdout); - assert!(inspect_out.contains("complete: true")); + assert!(inspect_out.contains("finalized: true")); assert!(inspect_out.contains("assignment_format: ben")); assert!(inspect_out.contains("graph.json")); assert!(inspect_out.contains("metadata.json")); diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 5db5b96..bef2563 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -1513,7 +1513,7 @@ fn encode_twodelta_frame_more_than_two_values_errors() { let next = vec![3u16, 1, 2]; let err = encode_twodelta_frame(&prev, &next, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert!(err.to_string().contains("two distinct assignment ids")); + assert!(err.to_string().contains("two distinct district ids")); } #[test] diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 7c439ee..c3d961b 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -9,7 +9,7 @@ use binary_ensemble::format::banners::{ }; use binary_ensemble::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_TYPE_CUSTOM, - ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, COMPLETE_YES, + ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_YES, HEADER_SIZE, }; use binary_ensemble::io::bundle::writer::{ @@ -52,7 +52,7 @@ fn minimal_bendl_with_entries( magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, - complete: COMPLETE_YES, + finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), reserved_0: 0, flags: 0, diff --git a/pyben/src/decode/decoder.rs b/pyben/src/decode/decoder.rs index cdef149..3619d74 100644 --- a/pyben/src/decode/decoder.rs +++ b/pyben/src/decode/decoder.rs @@ -5,7 +5,7 @@ use super::helpers::{ use super::types::{ActiveSelection, BundleState, DecoderBackend, DecoderMode, DynIter}; use binary_ensemble::io::bundle::format::{ ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, - ASSET_TYPE_RELABEL_MAP, + ASSET_TYPE_NODE_PERMUTATION_MAP, }; use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::{count_samples_from_file, Selection, SubsampleFrameDecoder}; @@ -334,7 +334,7 @@ impl PyBenDecoder { #[pyo3(text_signature = "(self)")] fn is_complete(&self) -> PyResult { let state = self.require_bundle("is_complete()")?; - Ok(state.reader.is_complete()) + Ok(state.reader.is_finalized()) } /// Names of every entry in the bundle's directory, in directory @@ -440,21 +440,21 @@ impl PyBenDecoder { Ok(Some(self.read_json_asset(py, "metadata.json")?)) } - /// Read the bundle's `relabel_map.json` asset as a parsed JSON - /// object, or `None` if absent. Errors on plain streams. + /// Read the bundle's `node_permutation_map.json` asset as a parsed + /// JSON object, or `None` if absent. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { { let state = self.require_bundle_mut("read_relabel_map()")?; if state .reader - .find_asset_by_type(ASSET_TYPE_RELABEL_MAP) + .find_asset_by_type(ASSET_TYPE_NODE_PERMUTATION_MAP) .is_none() { return Ok(None); } } - Ok(Some(self.read_json_asset(py, "relabel_map.json")?)) + Ok(Some(self.read_json_asset(py, "node_permutation_map.json")?)) } /// Copy the embedded assignment stream region verbatim to diff --git a/pyben/src/encode/encoder.rs b/pyben/src/encode/encoder.rs index aa0301a..2b3bd37 100644 --- a/pyben/src/encode/encoder.rs +++ b/pyben/src/encode/encoder.rs @@ -3,7 +3,7 @@ use super::types::{OutputMode, SharedFileSlot, SharedFileWriter}; use crate::common::{open_output, parse_variant}; use binary_ensemble::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, - ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, CANONICAL_NAME_GRAPH, COMPLETE_YES, HEADER_SIZE, + ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, STANDARDIZED_NAME_GRAPH, FINALIZED_YES, HEADER_SIZE, }; use binary_ensemble::io::writer::AssignmentWriter; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; @@ -104,7 +104,7 @@ impl PyBenEncoder { entries.push(BendlDirectoryEntry { asset_type: ASSET_TYPE_GRAPH, asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: CANONICAL_NAME_GRAPH.to_string(), + name: STANDARDIZED_NAME_GRAPH.to_string(), payload_offset, payload_len: compressed.len() as u64, checksum: None, @@ -204,7 +204,7 @@ impl PyBenEncoder { header.directory_offset = directory_offset; header.directory_len = directory_len; header.sample_count = *sample_count; - header.complete = COMPLETE_YES; + header.finalized = FINALIZED_YES; slot.seek(SeekFrom::Start(0)) .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; diff --git a/pyben/tests/test_bundle.py b/pyben/tests/test_bundle.py index eb8d0e5..ae08e0a 100644 --- a/pyben/tests/test_bundle.py +++ b/pyben/tests/test_bundle.py @@ -49,7 +49,7 @@ ASSET_TYPE_METADATA = 1 ASSET_TYPE_GRAPH = 2 -ASSET_TYPE_RELABEL_MAP = 3 +ASSET_TYPE_NODE_PERMUTATION_MAP = 3 ASSET_TYPE_CUSTOM = 4 ASSET_FLAG_JSON = 1 << 0 @@ -311,8 +311,8 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: compress=True, ), _Asset( - asset_type=ASSET_TYPE_RELABEL_MAP, - name="relabel_map.json", + asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, + name="node_permutation_map.json", payload=relabel_json, is_json=True, compress=False, @@ -336,7 +336,7 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: assert reader.assignment_format() == "ben" names = reader.asset_names() - assert names == ["metadata.json", "graph.json", "relabel_map.json", "notes.bin"] + assert names == ["metadata.json", "graph.json", "node_permutation_map.json", "notes.bin"] assets = reader.list_assets() assert [a["name"] for a in assets] == names @@ -355,7 +355,7 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: # Raw byte access (decompresses xz transparently). assert reader.read_asset_bytes("metadata.json") == metadata_json assert reader.read_asset_bytes("graph.json") == graph_json - assert reader.read_asset_bytes("relabel_map.json") == relabel_json + assert reader.read_asset_bytes("node_permutation_map.json") == relabel_json assert reader.read_asset_bytes("notes.bin") == custom_blob # Typed JSON helpers. @@ -1500,8 +1500,8 @@ def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: compress=True, ), _Asset( - asset_type=ASSET_TYPE_RELABEL_MAP, - name="relabel_map.json", + asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, + name="node_permutation_map.json", payload=relabel_json, is_json=True, ), @@ -1520,7 +1520,7 @@ def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: assert dec.asset_names() == [ "metadata.json", "graph.json", - "relabel_map.json", + "node_permutation_map.json", "notes.bin", ] assets = dec.list_assets() From 1c5255a69ee83c250d47fb1dec0ef7a2ef190e19 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 5 May 2026 07:59:08 -0600 Subject: [PATCH 081/221] tweaks to cli behavior --- ben/src/bin/pben.rs | 7 ------- ben/src/bin/pcben.rs | 7 +++++++ ben/src/cli/ben/args.rs | 4 ++-- ben/src/cli/ben/mod.rs | 2 +- ben/src/cli/ben/modes/{read.rs => lookup.rs} | 8 +++---- ben/src/cli/ben/modes/mod.rs | 2 +- ben/src/cli/mod.rs | 2 +- ben/src/cli/{pben => pcben}/mod.rs | 6 +++--- ben/src/cli/{pben => pcben}/tests.rs | 2 +- ben/src/cli/reben/ben_mode.rs | 2 +- ben/src/cli/reben/tests.rs | 2 +- ben/tests/common/mod.rs | 2 +- ben/tests/test_cli.rs | 22 ++++++++++---------- 13 files changed, 34 insertions(+), 34 deletions(-) delete mode 100755 ben/src/bin/pben.rs create mode 100755 ben/src/bin/pcben.rs rename ben/src/cli/ben/modes/{read.rs => lookup.rs} (87%) rename ben/src/cli/{pben => pcben}/mod.rs (97%) rename ben/src/cli/{pben => pcben}/tests.rs (99%) diff --git a/ben/src/bin/pben.rs b/ben/src/bin/pben.rs deleted file mode 100755 index cca6ca2..0000000 --- a/ben/src/bin/pben.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// Entry point for the `pben` CLI binary. -fn main() { - if let Err(err) = binary_ensemble::cli::pben::run() { - eprintln!("Error: {err}"); - std::process::exit(1); - } -} diff --git a/ben/src/bin/pcben.rs b/ben/src/bin/pcben.rs new file mode 100755 index 0000000..4358ff0 --- /dev/null +++ b/ben/src/bin/pcben.rs @@ -0,0 +1,7 @@ +/// Entry point for the `pcben` CLI binary. +fn main() { + if let Err(err) = binary_ensemble::cli::pcben::run() { + eprintln!("Error: {err}"); + std::process::exit(1); + } +} diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index 8594f84..e6629ae 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -39,8 +39,8 @@ pub(super) enum Mode { Decode, /// Fully decode XBEN into JSONL. XDecode, - /// Read a single sample from a BEN file. - Read, + /// Look up a single sample from a BEN file (random-access decode). + Lookup, /// Compress an arbitrary stream with XZ. XzCompress, /// Decompress an `.xz` file. diff --git a/ben/src/cli/ben/mod.rs b/ben/src/cli/ben/mod.rs index 4f06544..b9d1edc 100644 --- a/ben/src/cli/ben/mod.rs +++ b/ben/src/cli/ben/mod.rs @@ -30,7 +30,7 @@ pub fn run() -> CliResult { Mode::XEncode => modes::xencode::run(args), Mode::Decode => modes::decode::run(args), Mode::XDecode => modes::xdecode::run(args), - Mode::Read => modes::read::run(args), + Mode::Lookup => modes::lookup::run(args), Mode::XzCompress => modes::xz_compress::run(args), Mode::XzDecompress => modes::xz_decompress::run(args), } diff --git a/ben/src/cli/ben/modes/read.rs b/ben/src/cli/ben/modes/lookup.rs similarity index 87% rename from ben/src/cli/ben/modes/read.rs rename to ben/src/cli/ben/modes/lookup.rs index e297fc4..6f4cd9a 100644 --- a/ben/src/cli/ben/modes/read.rs +++ b/ben/src/cli/ben/modes/lookup.rs @@ -8,18 +8,18 @@ use crate::ops::extract::extract_assignment_ben; use std::fs::File; use std::io::{BufReader, Write}; -/// Execute the `read` sub-mode. +/// Execute the `lookup` sub-mode. pub(in crate::cli::ben) fn run(args: Args) -> CliResult { - tracing::trace!("Running in read mode"); + tracing::trace!("Running in lookup mode"); let in_file = args .input_file - .ok_or_else(|| CliError::other("Must provide input file for read mode."))?; + .ok_or_else(|| CliError::other("Must provide input file for lookup mode."))?; let reader = BufReader::new(File::open(&in_file)?); let sample_number = args .sample_number - .ok_or_else(|| CliError::other("Sample number is required in read mode"))?; + .ok_or_else(|| CliError::other("Sample number is required in lookup mode"))?; let mut writer = open_writer(args.output_file.as_deref(), args.print, false)?; let vec = extract_assignment_ben(reader, sample_number) diff --git a/ben/src/cli/ben/modes/mod.rs b/ben/src/cli/ben/modes/mod.rs index f3fb0aa..adb90dc 100644 --- a/ben/src/cli/ben/modes/mod.rs +++ b/ben/src/cli/ben/modes/mod.rs @@ -6,7 +6,7 @@ pub(super) mod decode; pub(super) mod encode; -pub(super) mod read; +pub(super) mod lookup; pub(super) mod xdecode; pub(super) mod xencode; pub(super) mod xz_compress; diff --git a/ben/src/cli/mod.rs b/ben/src/cli/mod.rs index 9b0161b..8dc8d4a 100644 --- a/ben/src/cli/mod.rs +++ b/ben/src/cli/mod.rs @@ -3,5 +3,5 @@ pub mod ben; pub mod bendl; pub mod common; -pub mod pben; +pub mod pcben; pub mod reben; diff --git a/ben/src/cli/pben/mod.rs b/ben/src/cli/pcben/mod.rs similarity index 97% rename from ben/src/cli/pben/mod.rs rename to ben/src/cli/pcben/mod.rs index dc0da7f..183398d 100644 --- a/ben/src/cli/pben/mod.rs +++ b/ben/src/cli/pcben/mod.rs @@ -51,7 +51,7 @@ struct Args { verbose: bool, } -/// Parse CLI arguments and execute the selected `pben` conversion. +/// Parse CLI arguments and execute the selected `pcben` conversion. pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); @@ -150,7 +150,7 @@ pub fn run() -> CliResult { } } -/// Resolve the output file path for a `pben` mode. +/// Resolve the output file path for a `pcben` mode. fn resolved_output_path( mode: Mode, input_file: Option<&str>, @@ -168,7 +168,7 @@ fn resolved_output_path( Ok(Some(path)) } -/// Derive the default output file name for a `pben` conversion mode. +/// Derive the default output file name for a `pcben` conversion mode. fn derive_output_path(mode: Mode, input_file: &str) -> String { match mode { Mode::BenToPc => input_file diff --git a/ben/src/cli/pben/tests.rs b/ben/src/cli/pcben/tests.rs similarity index 99% rename from ben/src/cli/pben/tests.rs rename to ben/src/cli/pcben/tests.rs index 5ce5b17..9f0e0cb 100644 --- a/ben/src/cli/pben/tests.rs +++ b/ben/src/cli/pcben/tests.rs @@ -17,7 +17,7 @@ fn clap_metadata_uses_package_version() { #[test] fn parse_pc_to_xben_args() { let args = Args::try_parse_from([ - "pben", + "pcben", "--mode", "pc-to-xben", "--input-file", diff --git a/ben/src/cli/reben/ben_mode.rs b/ben/src/cli/reben/ben_mode.rs index fd2c901..f44bc25 100644 --- a/ben/src/cli/reben/ben_mode.rs +++ b/ben/src/cli/reben/ben_mode.rs @@ -44,7 +44,7 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { + format!("_{}.ben", ben_variant_name(variant)).as_str() } else { args.input_file.trim_end_matches(".jsonl.ben").to_owned() - + "_canonicalized_assignments.jsonl.ben" + + "_first_seen_relabeled.jsonl.ben" } } }; diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 062cefb..67896b3 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -522,7 +522,7 @@ fn run_ben_mode_canonicalize_derives_output_name() { .unwrap() .trim_end_matches(".jsonl.ben") .to_owned() - + "_canonicalized_assignments.jsonl.ben"; + + "_first_seen_relabeled.jsonl.ben"; let _ = fs::remove_file(&derived); fs::remove_file(&input).unwrap(); result.unwrap(); diff --git a/ben/tests/common/mod.rs b/ben/tests/common/mod.rs index 20ca75d..0a87f43 100644 --- a/ben/tests/common/mod.rs +++ b/ben/tests/common/mod.rs @@ -19,7 +19,7 @@ pub use binary_ensemble::test_utils::{ pub fn binary_path(name: &str) -> &'static str { match name { "ben" => env!("CARGO_BIN_EXE_ben"), - "pben" => env!("CARGO_BIN_EXE_pben"), + "pcben" => env!("CARGO_BIN_EXE_pcben"), "reben" => env!("CARGO_BIN_EXE_reben"), "bendl" => env!("CARGO_BIN_EXE_bendl"), _ => panic!("unknown binary {name}"), diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index f4ecd3a..28e3bfe 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -37,7 +37,7 @@ impl Drop for TempDir { fn bin_path(name: &str) -> &'static str { match name { "ben" => env!("CARGO_BIN_EXE_ben"), - "pben" => env!("CARGO_BIN_EXE_pben"), + "pcben" => env!("CARGO_BIN_EXE_pcben"), "reben" => env!("CARGO_BIN_EXE_reben"), "bendl" => env!("CARGO_BIN_EXE_bendl"), _ => panic!("unknown binary {name}"), @@ -115,7 +115,7 @@ fn sample_graph() -> &'static str { #[test] fn all_clis_report_help_and_package_version() { - for bin in ["ben", "pben", "reben", "bendl"] { + for bin in ["ben", "pcben", "reben", "bendl"] { let help = run(bin, &["--help"], Path::new(".")); assert_success(&help); let help_text = String::from_utf8_lossy(&help.stdout); @@ -173,7 +173,7 @@ fn ben_cli_encode_decode_read_and_x_modes_roundtrip() { "ben", &[ "--mode", - "read", + "lookup", ben_path.to_str().unwrap(), "--sample-number", "2", @@ -495,12 +495,12 @@ fn ben_cli_reports_expected_error_paths() { let read = run( "ben", - &["--mode", "read", bogus_jsonl.to_str().unwrap()], + &["--mode", "lookup", bogus_jsonl.to_str().unwrap()], temp.path(), ); assert_failure(&read); assert!( - String::from_utf8_lossy(&read.stderr).contains("Sample number is required in read mode") + String::from_utf8_lossy(&read.stderr).contains("Sample number is required in lookup mode") ); let xz = run( @@ -677,7 +677,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { "ben", &[ "--mode", - "read", + "lookup", ben_path.to_str().unwrap(), "--sample-number", "1", @@ -740,7 +740,7 @@ fn ben_cli_reports_overwrite_denials_and_remaining_error_modes() { "ben", &[ "--mode", - "read", + "lookup", ben_path.to_str().unwrap(), "--sample-number", "99", @@ -1499,7 +1499,7 @@ fn reben_cli_supports_multi_level_cluster_ordering() { #[test] fn pben_cli_converts_between_formats() { - let temp = TempDir::new("pben"); + let temp = TempDir::new("pcben"); let jsonl_path = temp.path().join("samples.jsonl"); let ben_path = temp.path().join("samples.ben"); let pc_path = temp.path().join("samples.pc"); @@ -1517,7 +1517,7 @@ fn pben_cli_converts_between_formats() { fs::write(&ben_path, ben_bytes).unwrap(); let ben_to_pc = run( - "pben", + "pcben", &[ "--mode", "ben-to-pc", @@ -1532,7 +1532,7 @@ fn pben_cli_converts_between_formats() { assert!(pc_path.exists()); let pc_to_ben = run( - "pben", + "pcben", &[ "--mode", "pc-to-ben", @@ -1546,7 +1546,7 @@ fn pben_cli_converts_between_formats() { assert_success(&pc_to_ben); let pc_to_xben = run( - "pben", + "pcben", &[ "--mode", "pc-to-xben", From e6a98eefe13a8370f771d08c47046fd8424fa3a7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 5 May 2026 08:15:03 -0600 Subject: [PATCH 082/221] change pyben -> ben-py --- Cargo.lock | 36 +- Cargo.toml | 2 +- README.md | 10 +- TODO.md | 4 +- Taskfile.yml | 84 ++-- {pyben => ben-py}/.gitignore | 0 {pyben => ben-py}/.python-version | 0 {pyben => ben-py}/Cargo.toml | 2 +- {pyben => ben-py}/README.md | 0 ben-py/binary_ensemble/__init__.py | 21 + {pyben => ben-py}/binary_ensemble/_core.pyi | 0 {pyben => ben-py}/binary_ensemble/py.typed | 0 {pyben => ben-py}/docs/Makefile | 0 {pyben => ben-py}/docs/_static/css/custom.css | 0 {pyben => ben-py}/docs/conf.py | 0 {pyben => ben-py}/docs/index.rst | 0 {pyben => ben-py}/docs/make.bat | 0 {pyben => ben-py}/docs/user/.gitignore | 0 {pyben => ben-py}/docs/user/using_pyben.ipynb | 0 {pyben => ben-py}/pyproject.toml | 0 {pyben => ben-py}/src/common.rs | 0 {pyben => ben-py}/src/decode/decoder.rs | 2 +- {pyben => ben-py}/src/decode/helpers.rs | 0 {pyben => ben-py}/src/decode/mod.rs | 2 +- {pyben => ben-py}/src/decode/py_funcs.rs | 16 +- {pyben => ben-py}/src/decode/types.rs | 0 {pyben => ben-py}/src/encode/encoder.rs | 2 +- {pyben => ben-py}/src/encode/helpers.rs | 0 {pyben => ben-py}/src/encode/mod.rs | 2 +- {pyben => ben-py}/src/encode/py_funcs.rs | 23 +- {pyben => ben-py}/src/encode/types.rs | 0 ben-py/src/lib.rs | 23 + {pyben => ben-py}/tests/test_bundle.py | 292 +++++------ .../tests/test_python_pipelines.py | 452 +++++++++--------- {pyben => ben-py}/uv.lock | 0 pyben/binary_ensemble/__init__.py | 21 - pyben/src/lib.rs | 23 - 37 files changed, 511 insertions(+), 506 deletions(-) rename {pyben => ben-py}/.gitignore (100%) rename {pyben => ben-py}/.python-version (100%) rename {pyben => ben-py}/Cargo.toml (97%) rename {pyben => ben-py}/README.md (100%) create mode 100644 ben-py/binary_ensemble/__init__.py rename {pyben => ben-py}/binary_ensemble/_core.pyi (100%) rename {pyben => ben-py}/binary_ensemble/py.typed (100%) rename {pyben => ben-py}/docs/Makefile (100%) rename {pyben => ben-py}/docs/_static/css/custom.css (100%) rename {pyben => ben-py}/docs/conf.py (100%) rename {pyben => ben-py}/docs/index.rst (100%) rename {pyben => ben-py}/docs/make.bat (100%) rename {pyben => ben-py}/docs/user/.gitignore (100%) rename {pyben => ben-py}/docs/user/using_pyben.ipynb (100%) rename {pyben => ben-py}/pyproject.toml (100%) rename {pyben => ben-py}/src/common.rs (100%) rename {pyben => ben-py}/src/decode/decoder.rs (99%) rename {pyben => ben-py}/src/decode/helpers.rs (100%) rename {pyben => ben-py}/src/decode/mod.rs (62%) rename {pyben => ben-py}/src/decode/py_funcs.rs (81%) rename {pyben => ben-py}/src/decode/types.rs (100%) rename {pyben => ben-py}/src/encode/encoder.rs (99%) rename {pyben => ben-py}/src/encode/helpers.rs (100%) rename {pyben => ben-py}/src/encode/mod.rs (63%) rename {pyben => ben-py}/src/encode/py_funcs.rs (80%) rename {pyben => ben-py}/src/encode/types.rs (100%) create mode 100755 ben-py/src/lib.rs rename {pyben => ben-py}/tests/test_bundle.py (93%) rename {pyben => ben-py}/tests/test_python_pipelines.py (75%) rename {pyben => ben-py}/uv.lock (100%) delete mode 100644 pyben/binary_ensemble/__init__.py delete mode 100755 pyben/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 44e33e6..3fc2f64 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -93,6 +93,24 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "ben-py" +version = "1.0.0" +dependencies = [ + "binary-ensemble", + "byteorder", + "clap 4.5.48", + "lipsum", + "pcompress", + "pipe", + "pyo3", + "rand 0.9.2", + "rand_chacha 0.9.0", + "rand_distr", + "serde_json", + "xz2", +] + [[package]] name = "binary-ensemble" version = "1.0.0" @@ -741,24 +759,6 @@ dependencies = [ "unarray", ] -[[package]] -name = "pyben" -version = "1.0.0" -dependencies = [ - "binary-ensemble", - "byteorder", - "clap 4.5.48", - "lipsum", - "pcompress", - "pipe", - "pyo3", - "rand 0.9.2", - "rand_chacha 0.9.0", - "rand_distr", - "serde_json", - "xz2", -] - [[package]] name = "pyo3" version = "0.26.0" diff --git a/Cargo.toml b/Cargo.toml index db34469..bf51c76 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ resolver = "2" members = [ "ben", - "pyben", + "ben-py", ] exclude = [ diff --git a/README.md b/README.md index 42c2f25..8a55931 100755 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ run things through `reben` before compressing into an XBEN format. In our runnin then compress this file back down to an XBEN format using ``` -ben -m x-encode 100k_CO_chain_canonicalized_assignments.jsonl.ben +ben -m x-encode 100k_CO_chain_first_seen_relabeled.jsonl.ben ``` DON'T ACTUALLY DO THIS, IT WILL TAKE OVER AN HOUR!!! @@ -194,12 +194,12 @@ In our example, the CO_small.json file has the GEOID20 key that we would like to the command ``` -reben -m ben -s CO_small.json -k GEOID20 100k_CO_chain_canonicalized_assignments.jsonl.ben +reben -m ben -s CO_small.json -k GEOID20 100k_CO_chain_first_seen_relabeled.jsonl.ben ``` This will produce the files -- 100k_CO_chain_canonicalized_assignments_sorted_by_GEOID20.jsonl.ben (~550Mb) +- 100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.ben (~550Mb) - CO_small_sorted_by_GEOID20_map.json (a map file containing the new data) - CO_small_sorted_by_GEOID20.json (a dual-graph file with the nodes shifted around) @@ -207,11 +207,11 @@ Notice, our BEN file has now shrunk from ~7Gb to around 0.5Gb, which is pretty g further compress this file using the `x-encode` mode of the `ben` CLI ``` -ben -m x-encode 100k_CO_chain_canonicalized_assignments_sorted_by_GEOID20.jsonl.ben +ben -m x-encode 100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.ben ``` And this will produce the file -`100k_CO_chain_canonicalized_assignments_sorted_by_GEOID20.jsonl.xben` which will only be ~6Mb! That +`100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.xben` which will only be ~6Mb! That is over a 1000x improvement over the original BEN file, and over a 4500x improvement on the JSONL file! diff --git a/TODO.md b/TODO.md index af0c650..2665aae 100755 --- a/TODO.md +++ b/TODO.md @@ -39,6 +39,6 @@ - [ ] Add a compression mechanism on CutEdges similar to NBEC from Todd Proebsting -- [ ] Maybe change around the MkvChain mode to canonicalize the assignments and then have a header - that keeps track of the relabeling. +- [ ] Maybe change around the MkvChain mode to first-seen relabel the assignments and then have a + header that keeps track of the district relabeling. diff --git a/Taskfile.yml b/Taskfile.yml index b5dd178..1d6d572 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -68,36 +68,36 @@ tasks: - ensure-rust-{{OS}} - ensure-uv-{{OS}} - pyben-sync: - desc: Sync the pyben development environment + ben-py-sync: + desc: Sync the ben-py development environment silent: true deps: - ensure-toolchain env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' - dir: pyben + dir: ben-py cmds: - uv sync --all-groups - pyben-develop: - desc: Build the editable pyben extension + ben-py-develop: + desc: Build the editable ben-py extension silent: true deps: - - pyben-sync + - ben-py-sync env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' - dir: pyben + dir: ben-py cmds: - uv run maturin develop release: - desc: Build a release wheel for pyben + desc: Build a release wheel for ben-py silent: true deps: - - pyben-sync + - ben-py-sync env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' - dir: pyben + dir: ben-py cmds: - uv run maturin build --release @@ -129,13 +129,13 @@ tasks: - task: test-rust-slow test-python: - desc: Run the pyben Python tests + desc: Run the ben-py Python tests silent: true deps: - - pyben-develop + - ben-py-develop env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' - dir: pyben + dir: ben-py cmds: - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} @@ -156,8 +156,8 @@ tasks: cmds: - cargo llvm-cov --package binary-ensemble --summary-only --ignore-filename-regex '(^|/)bin/' - coverage-pyben: - desc: Run Python-driven Rust coverage for pyben + coverage-ben-py: + desc: Run Python-driven Rust coverage for ben-py silent: true deps: - ensure-toolchain @@ -165,26 +165,26 @@ tasks: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - cargo llvm-cov clean --workspace - - cargo llvm-cov -p pyben --no-report + - cargo llvm-cov -p ben-py --no-report - >- bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; - cd "{{.ROOT_DIR}}/pyben"; + cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' - >- {{.LLVM_BIN}}/llvm-profdata merge -sparse {{.ROOT_DIR}}/target/*.profraw {{.COV_TARGET_DIR}}/*.profraw - -o /tmp/pyben.profdata + -o /tmp/ben-py.profdata - >- {{.LLVM_BIN}}/llvm-cov report {{.COV_TARGET_DIR}}/debug/libpyben_core.so - -instr-profile=/tmp/pyben.profdata + -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='/.cargo/registry|/rustc/|^/mnt/.*/ben/src/' - coverage-pyben-html: - desc: Generate an HTML Rust coverage report for pyben at /tmp/pyben-coverage.html + coverage-ben-py-html: + desc: Generate an HTML Rust coverage report for ben-py at /tmp/ben-py-coverage.html silent: true deps: - ensure-toolchain @@ -192,26 +192,26 @@ tasks: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - cargo llvm-cov clean --workspace - - cargo llvm-cov -p pyben --no-report + - cargo llvm-cov -p ben-py --no-report - >- bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; - cd "{{.ROOT_DIR}}/pyben"; + cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' - >- {{.LLVM_BIN}}/llvm-profdata merge -sparse {{.ROOT_DIR}}/target/*.profraw {{.COV_TARGET_DIR}}/*.profraw - -o /tmp/pyben.profdata + -o /tmp/ben-py.profdata - >- bash -lc '{{.LLVM_BIN}}/llvm-cov show {{.COV_TARGET_DIR}}/debug/libpyben_core.so - -instr-profile=/tmp/pyben.profdata + -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='\"'\"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'\"'\"' - --format=html > /tmp/pyben-coverage.html' + --format=html > /tmp/ben-py-coverage.html' coverage-summary: - desc: Run ben and pyben coverage and print both reports plus a combined summary table + desc: Run ben and ben-py coverage and print both reports plus a combined summary table silent: true deps: - ensure-toolchain @@ -223,30 +223,30 @@ tasks: - >- bash -lc ' ben_report_file=/tmp/ben-coverage-report.txt; - pyben_report_file=/tmp/pyben-coverage-report.txt; + ben_py_report_file=/tmp/ben-py-coverage-report.txt; cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)bin/'"'"' > "$ben_report_file"; ben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_report_file")"; cargo llvm-cov clean --workspace >/dev/null; - cargo llvm-cov -p pyben --no-report >/dev/null; + cargo llvm-cov -p ben-py --no-report >/dev/null; eval "$(cargo llvm-cov show-env --sh)"; export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; - cd "{{.ROOT_DIR}}/pyben"; + cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}" >/dev/null; uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} >/dev/null; cd "{{.ROOT_DIR}}"; - {{.LLVM_BIN}}/llvm-profdata merge -sparse target/*.profraw {{.COV_TARGET_DIR}}/*.profraw -o /tmp/pyben.profdata >/dev/null; - {{.LLVM_BIN}}/llvm-cov report {{.COV_TARGET_DIR}}/debug/libpyben_core.so -instr-profile=/tmp/pyben.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$pyben_report_file"; - pyben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$pyben_report_file")"; + {{.LLVM_BIN}}/llvm-profdata merge -sparse target/*.profraw {{.COV_TARGET_DIR}}/*.profraw -o /tmp/ben-py.profdata >/dev/null; + {{.LLVM_BIN}}/llvm-cov report {{.COV_TARGET_DIR}}/debug/libpyben_core.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$ben_py_report_file"; + ben_py_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_py_report_file")"; printf "\n%s\n\n" "BEN COVERAGE"; cat "$ben_report_file"; printf "\n%s\n\n" "PYBEN COVERAGE"; - cat "$pyben_report_file"; + cat "$ben_py_report_file"; printf "\n%-10s %-10s\n" "Target" "Lines"; printf "%-10s %-10s\n" "ben" "${ben_total:-n/a}"; - printf "%-10s %-10s\n" "pyben" "${pyben_total:-n/a}"; + printf "%-10s %-10s\n" "ben-py" "${ben_py_total:-n/a}"; ' clean-linux: &clean-unix @@ -255,9 +255,9 @@ tasks: silent: true cmds: - cargo clean - - rm -rf pyben/target pyben/dist pyben/pyben.egg-info pyben/src/pyben.c - - rm -rf pyben/pyben/*abi3.so pyben/pyben/pyben.*.pyd - - rm -rf pyben/.venv + - rm -rf ben-py/target ben-py/dist ben-py/binary_ensemble.egg-info ben-py/src/ben-py.c + - rm -rf ben-py/binary_ensemble/*abi3.so ben-py/binary_ensemble/*.pyd + - rm -rf ben-py/.venv - find . -type d -name "__pycache__" -exec rm -rf {} + - rm -rf docs/_build docs/user/example_data @@ -269,10 +269,10 @@ tasks: silent: true cmds: - cargo clean - - cmd /c "if exist pyben\\target rmdir /s /q pyben\\target" - - cmd /c "if exist pyben\\dist rmdir /s /q pyben\\dist" - - cmd /c "if exist pyben\\pyben.egg-info rmdir /s /q pyben\\pyben.egg-info" - - cmd /c "if exist pyben\\.venv rmdir /s /q pyben\\.venv" + - cmd /c "if exist ben-py\\target rmdir /s /q ben-py\\target" + - cmd /c "if exist ben-py\\dist rmdir /s /q ben-py\\dist" + - cmd /c "if exist ben-py\\binary_ensemble.egg-info rmdir /s /q ben-py\\binary_ensemble.egg-info" + - cmd /c "if exist ben-py\\.venv rmdir /s /q ben-py\\.venv" - powershell -NoProfile -Command "Get-ChildItem -Path . -Directory -Filter __pycache__ -Recurse | Remove-Item -Recurse -Force" - cmd /c "if exist docs\\_build rmdir /s /q docs\\_build" - cmd /c "if exist docs\\user\\example_data rmdir /s /q docs\\user\\example_data" diff --git a/pyben/.gitignore b/ben-py/.gitignore similarity index 100% rename from pyben/.gitignore rename to ben-py/.gitignore diff --git a/pyben/.python-version b/ben-py/.python-version similarity index 100% rename from pyben/.python-version rename to ben-py/.python-version diff --git a/pyben/Cargo.toml b/ben-py/Cargo.toml similarity index 97% rename from pyben/Cargo.toml rename to ben-py/Cargo.toml index 705ad06..77dcc5d 100755 --- a/pyben/Cargo.toml +++ b/ben-py/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pyben" +name = "ben-py" version.workspace = true edition = "2021" authors = ["Peter Rock "] diff --git a/pyben/README.md b/ben-py/README.md similarity index 100% rename from pyben/README.md rename to ben-py/README.md diff --git a/ben-py/binary_ensemble/__init__.py b/ben-py/binary_ensemble/__init__.py new file mode 100644 index 0000000..68227f4 --- /dev/null +++ b/ben-py/binary_ensemble/__init__.py @@ -0,0 +1,21 @@ +from ._core import ( + BenDecoder, + BenEncoder, + encode_jsonl_to_ben, + encode_ben_to_xben, + encode_jsonl_to_xben, + decode_ben_to_jsonl, + decode_xben_to_jsonl, + decode_xben_to_ben, +) + +__all__ = [ + "BenDecoder", + "BenEncoder", + "encode_jsonl_to_ben", + "encode_ben_to_xben", + "encode_jsonl_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", +] diff --git a/pyben/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi similarity index 100% rename from pyben/binary_ensemble/_core.pyi rename to ben-py/binary_ensemble/_core.pyi diff --git a/pyben/binary_ensemble/py.typed b/ben-py/binary_ensemble/py.typed similarity index 100% rename from pyben/binary_ensemble/py.typed rename to ben-py/binary_ensemble/py.typed diff --git a/pyben/docs/Makefile b/ben-py/docs/Makefile similarity index 100% rename from pyben/docs/Makefile rename to ben-py/docs/Makefile diff --git a/pyben/docs/_static/css/custom.css b/ben-py/docs/_static/css/custom.css similarity index 100% rename from pyben/docs/_static/css/custom.css rename to ben-py/docs/_static/css/custom.css diff --git a/pyben/docs/conf.py b/ben-py/docs/conf.py similarity index 100% rename from pyben/docs/conf.py rename to ben-py/docs/conf.py diff --git a/pyben/docs/index.rst b/ben-py/docs/index.rst similarity index 100% rename from pyben/docs/index.rst rename to ben-py/docs/index.rst diff --git a/pyben/docs/make.bat b/ben-py/docs/make.bat similarity index 100% rename from pyben/docs/make.bat rename to ben-py/docs/make.bat diff --git a/pyben/docs/user/.gitignore b/ben-py/docs/user/.gitignore similarity index 100% rename from pyben/docs/user/.gitignore rename to ben-py/docs/user/.gitignore diff --git a/pyben/docs/user/using_pyben.ipynb b/ben-py/docs/user/using_pyben.ipynb similarity index 100% rename from pyben/docs/user/using_pyben.ipynb rename to ben-py/docs/user/using_pyben.ipynb diff --git a/pyben/pyproject.toml b/ben-py/pyproject.toml similarity index 100% rename from pyben/pyproject.toml rename to ben-py/pyproject.toml diff --git a/pyben/src/common.rs b/ben-py/src/common.rs similarity index 100% rename from pyben/src/common.rs rename to ben-py/src/common.rs diff --git a/pyben/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs similarity index 99% rename from pyben/src/decode/decoder.rs rename to ben-py/src/decode/decoder.rs index 3619d74..418cfa6 100644 --- a/pyben/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -16,7 +16,7 @@ use std::fs::{File, OpenOptions}; use std::io::{self, BufReader, BufWriter, Write}; use std::path::PathBuf; -#[pyclass(module = "binary_ensemble", unsendable)] +#[pyclass(module = "binary_ensemble", name = "BenDecoder", unsendable)] pub struct PyBenDecoder { path: PathBuf, mode: DecoderMode, diff --git a/pyben/src/decode/helpers.rs b/ben-py/src/decode/helpers.rs similarity index 100% rename from pyben/src/decode/helpers.rs rename to ben-py/src/decode/helpers.rs diff --git a/pyben/src/decode/mod.rs b/ben-py/src/decode/mod.rs similarity index 62% rename from pyben/src/decode/mod.rs rename to ben-py/src/decode/mod.rs index 4c6708f..6ae56c8 100644 --- a/pyben/src/decode/mod.rs +++ b/ben-py/src/decode/mod.rs @@ -6,4 +6,4 @@ mod py_funcs; mod types; pub use decoder::PyBenDecoder; -pub use py_funcs::{decompress_ben_to_jsonl, decompress_xben_to_ben, decompress_xben_to_jsonl}; +pub use py_funcs::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; diff --git a/pyben/src/decode/py_funcs.rs b/ben-py/src/decode/py_funcs.rs similarity index 81% rename from pyben/src/decode/py_funcs.rs rename to ben-py/src/decode/py_funcs.rs index 071c80c..9a90fd0 100644 --- a/pyben/src/decode/py_funcs.rs +++ b/ben-py/src/decode/py_funcs.rs @@ -1,6 +1,8 @@ use crate::common::{open_input, open_output, validate_input_output_paths}; use binary_ensemble::codec::decode::{ - decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, + decode_ben_to_jsonl as core_decode_ben_to_jsonl, + decode_xben_to_ben as core_decode_xben_to_ben, + decode_xben_to_jsonl as core_decode_xben_to_jsonl, }; use pyo3::exceptions::PyIOError; use pyo3::prelude::*; @@ -9,7 +11,7 @@ use std::path::PathBuf; #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_xben_to_ben( +pub fn decode_xben_to_ben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -18,7 +20,7 @@ pub fn decompress_xben_to_ben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - decode_xben_to_ben(reader, writer).map_err(|e| { + core_decode_xben_to_ben(reader, writer).map_err(|e| { PyIOError::new_err(format!( "Failed to convert XBEN to BEN from {} to {}: {e}", in_file.display(), @@ -32,7 +34,7 @@ pub fn decompress_xben_to_ben( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_xben_to_jsonl( +pub fn decode_xben_to_jsonl( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -41,7 +43,7 @@ pub fn decompress_xben_to_jsonl( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - decode_xben_to_jsonl(reader, writer).map_err(|e| { + core_decode_xben_to_jsonl(reader, writer).map_err(|e| { PyIOError::new_err(format!( "Failed to convert XBEN to JSONL from {} to {}: {e}", in_file.display(), @@ -55,7 +57,7 @@ pub fn decompress_xben_to_jsonl( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decompress_ben_to_jsonl( +pub fn decode_ben_to_jsonl( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -64,7 +66,7 @@ pub fn decompress_ben_to_jsonl( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - decode_ben_to_jsonl(reader, writer).map_err(|e| { + core_decode_ben_to_jsonl(reader, writer).map_err(|e| { PyIOError::new_err(format!( "Failed to convert BEN to JSONL from {} to {}: {e}", in_file.display(), diff --git a/pyben/src/decode/types.rs b/ben-py/src/decode/types.rs similarity index 100% rename from pyben/src/decode/types.rs rename to ben-py/src/decode/types.rs diff --git a/pyben/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs similarity index 99% rename from pyben/src/encode/encoder.rs rename to ben-py/src/encode/encoder.rs index 2b3bd37..8d05ef7 100644 --- a/pyben/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -13,7 +13,7 @@ use std::io::{Seek, SeekFrom, Write}; use std::path::PathBuf; use std::rc::Rc; -#[pyclass(unsendable)] +#[pyclass(name = "BenEncoder", unsendable)] pub struct PyBenEncoder { file: Option, encoder: Option>, diff --git a/pyben/src/encode/helpers.rs b/ben-py/src/encode/helpers.rs similarity index 100% rename from pyben/src/encode/helpers.rs rename to ben-py/src/encode/helpers.rs diff --git a/pyben/src/encode/mod.rs b/ben-py/src/encode/mod.rs similarity index 63% rename from pyben/src/encode/mod.rs rename to ben-py/src/encode/mod.rs index cd0c20a..57e6ae6 100644 --- a/pyben/src/encode/mod.rs +++ b/ben-py/src/encode/mod.rs @@ -6,4 +6,4 @@ mod py_funcs; mod types; pub use encoder::PyBenEncoder; -pub use py_funcs::{compress_ben_to_xben, compress_jsonl_to_ben, compress_jsonl_to_xben}; +pub use py_funcs::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; diff --git a/pyben/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs similarity index 80% rename from pyben/src/encode/py_funcs.rs rename to ben-py/src/encode/py_funcs.rs index e90d833..2b93965 100644 --- a/pyben/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -1,5 +1,9 @@ use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; -use binary_ensemble::codec::encode::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; +use binary_ensemble::codec::encode::{ + encode_ben_to_xben as core_encode_ben_to_xben, + encode_jsonl_to_ben as core_encode_jsonl_to_ben, + encode_jsonl_to_xben as core_encode_jsonl_to_xben, +}; use pyo3::exceptions::PyIOError; use pyo3::prelude::*; use std::path::PathBuf; @@ -9,7 +13,7 @@ use std::path::PathBuf; #[pyo3( text_signature = "(in_file, out_file, overwrite=false, n_threads=None, compression_level=None)" )] -pub fn compress_ben_to_xben( +pub fn encode_ben_to_xben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -20,7 +24,7 @@ pub fn compress_ben_to_xben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { + core_encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { PyIOError::new_err(format!( "Failed to convert BEN to XBEN from {} to {}: {e}", in_file.display(), @@ -34,7 +38,7 @@ pub fn compress_ben_to_xben( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] #[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain')")] -pub fn compress_jsonl_to_ben( +pub fn encode_jsonl_to_ben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -45,7 +49,7 @@ pub fn compress_jsonl_to_ben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_jsonl_to_ben(reader, writer, ben_var).map_err(|e| { + core_encode_jsonl_to_ben(reader, writer, ben_var).map_err(|e| { PyIOError::new_err(format!( "Failed to convert JSONL to BEN from {} to {}: {e}", in_file.display(), @@ -60,7 +64,7 @@ pub fn compress_jsonl_to_ben( #[pyo3( text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain', n_threads=None, compression_level=None)" )] -pub fn compress_jsonl_to_xben( +pub fn encode_jsonl_to_xben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, @@ -73,14 +77,13 @@ pub fn compress_jsonl_to_xben( let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None).map_err( - |e| { + core_encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None) + .map_err(|e| { PyIOError::new_err(format!( "Failed to convert JSONL to XBEN from {} to {}: {e}", in_file.display(), out_file.display() )) - }, - )?; + })?; Ok(()) } diff --git a/pyben/src/encode/types.rs b/ben-py/src/encode/types.rs similarity index 100% rename from pyben/src/encode/types.rs rename to ben-py/src/encode/types.rs diff --git a/ben-py/src/lib.rs b/ben-py/src/lib.rs new file mode 100755 index 0000000..642db94 --- /dev/null +++ b/ben-py/src/lib.rs @@ -0,0 +1,23 @@ +use pyo3::prelude::*; +use pyo3::wrap_pyfunction; + +pub mod common; +pub mod decode; +pub mod encode; + +#[pymodule] +fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(crate::decode::decode_ben_to_jsonl, m)?)?; + m.add_function(wrap_pyfunction!(crate::decode::decode_xben_to_ben, m)?)?; + m.add_function(wrap_pyfunction!( + crate::decode::decode_xben_to_jsonl, + m + )?)?; + m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_ben, m)?)?; + m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_xben, m)?)?; + m.add_function(wrap_pyfunction!(crate::encode::encode_ben_to_xben, m)?)?; + + Ok(()) +} diff --git a/pyben/tests/test_bundle.py b/ben-py/tests/test_bundle.py similarity index 93% rename from pyben/tests/test_bundle.py rename to ben-py/tests/test_bundle.py index ae08e0a..ffb6f00 100644 --- a/pyben/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -1,4 +1,4 @@ -"""Tests for bundle (.bendl) support in PyBenDecoder. +"""Tests for bundle (.bendl) support in BenDecoder. These tests do not rely on the `bendl` CLI binary being built. Instead, they construct `.bendl` bundles directly in Python from the on-disk format spec @@ -6,8 +6,8 @@ self-contained and lets them stress odd byte layouts that a CLI-based helper could not produce (truncated files, bad magic, dangling offsets, etc). -Real BEN/XBEN stream payloads are produced via ``PyBenEncoder`` / -``compress_jsonl_to_xben`` so the stream region always matches what the +Real BEN/XBEN stream payloads are produced via ``BenEncoder`` / +``encode_jsonl_to_xben`` so the stream region always matches what the main compression pipeline would produce. """ @@ -25,10 +25,10 @@ import binary_ensemble from binary_ensemble import ( - PyBenDecoder, - PyBenEncoder, - compress_jsonl_to_ben, - compress_jsonl_to_xben, + BenDecoder, + BenEncoder, + encode_jsonl_to_ben, + encode_jsonl_to_xben, ) @@ -246,9 +246,9 @@ def _write_jsonl(samples: List[List[int]], path: Path) -> None: def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: - """Produce real BEN bytes for ``samples`` via ``PyBenEncoder``.""" + """Produce real BEN bytes for ``samples`` via ``BenEncoder``.""" ben_path = tmp / "inner.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant=variant, ben_file_only=True ) as enc: for a in samples: @@ -260,7 +260,7 @@ def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standar src = tmp / "src.jsonl" _write_jsonl(samples, src) out = tmp / "inner.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, out, overwrite=True, variant=variant, n_threads=1, compression_level=1 ) return out.read_bytes() @@ -277,8 +277,8 @@ def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: def test_module_exports_decoder_and_encoder() -> None: - assert "PyBenDecoder" in binary_ensemble.__all__ - assert "PyBenEncoder" in binary_ensemble.__all__ + assert "BenDecoder" in binary_ensemble.__all__ + assert "BenEncoder" in binary_ensemble.__all__ assert "PyBundleReader" not in binary_ensemble.__all__ @@ -328,7 +328,7 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "out.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) assert reader.is_complete() is True @@ -366,10 +366,10 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: # read_json_asset by name. assert reader.read_json_asset("metadata.json") == json.loads(metadata_json) - # extract_stream then decode via PyBenDecoder. + # extract_stream then decode via BenDecoder. extracted = tmp_path / "stream.ben" reader.extract_stream(extracted) - got = list(PyBenDecoder(extracted, mode="ben")) + got = list(BenDecoder(extracted, mode="ben")) assert got == samples # __repr__ should not crash. @@ -386,7 +386,7 @@ def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: assets=[], ) path = _write_bundle(tmp_path / "xout.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.assignment_format() == "xben" assert reader.is_complete() @@ -396,7 +396,7 @@ def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: # extract_stream → file must round-trip via the xben decoder. extracted = tmp_path / "stream.xben" reader.extract_stream(extracted) - assert list(PyBenDecoder(extracted, mode="xben")) == samples + assert list(BenDecoder(extracted, mode="xben")) == samples def test_bundle_reader_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: @@ -413,7 +413,7 @@ def test_bundle_reader_canonical_helpers_return_none_when_absent(tmp_path: Path) ], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.read_metadata() is None assert reader.read_graph() is None assert reader.read_relabel_map() is None @@ -423,7 +423,7 @@ def test_bundle_reader_asset_free_empty_stream(tmp_path: Path) -> None: # A bundle with no assets and an empty stream is legal (spec says so). bundle = build_bundle(stream_bytes=b"", sample_count=0, assets=[]) path = _write_bundle(tmp_path / "empty.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() assert reader.count_samples() == 0 assert reader.asset_names() == [] @@ -448,7 +448,7 @@ def test_read_asset_bytes_raises_keyerror_for_unknown_name(tmp_path: Path) -> No ], ) path = _write_bundle(tmp_path / "x.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) with pytest.raises(KeyError, match="no asset named"): reader.read_asset_bytes("missing.bin") with pytest.raises(KeyError): @@ -470,7 +470,7 @@ def test_read_json_asset_rejects_non_utf8_payload(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "bin.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) # Raw bytes come back fine. assert reader.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" # But the JSON helper must reject non-UTF8 bytes. @@ -492,7 +492,7 @@ def test_read_json_asset_rejects_malformed_json(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "m.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) # Raw bytes: fine. assert reader.read_asset_bytes("metadata.json") == b"not a json {{{" # Parsed via python's json module: must raise. @@ -511,7 +511,7 @@ def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "u.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.asset_names() == [name] assert reader.read_asset_bytes(name) == b"payload" @@ -529,7 +529,7 @@ def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) names = reader.asset_names() assert names == list(payloads.keys()) # Spot-check the contents round-trip. @@ -549,7 +549,7 @@ def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) sample_count=1, ) path = _write_bundle(tmp_path / "a.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) target = tmp_path / "already.ben" target.write_bytes(b"pre-existing") with pytest.raises(OSError, match="already exists"): @@ -564,12 +564,12 @@ def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: sample_count=2, ) path = _write_bundle(tmp_path / "b.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) target = tmp_path / "out.ben" target.write_bytes(b"filler") reader.extract_stream(target, overwrite=True) - # Re-opening the extracted file via PyBenDecoder confirms it's a valid .ben. - assert list(PyBenDecoder(target, mode="ben")) == [[1, 2], [3, 4]] + # Re-opening the extracted file via BenDecoder confirms it's a valid .ben. + assert list(BenDecoder(target, mode="ben")) == [[1, 2], [3, 4]] # --------------------------------------------------------------------------- @@ -579,7 +579,7 @@ def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: def test_open_rejects_missing_file(tmp_path: Path) -> None: with pytest.raises(OSError, match="Failed to open"): - PyBenDecoder(tmp_path / "does_not_exist.bendl") + BenDecoder(tmp_path / "does_not_exist.bendl") def test_open_rejects_bad_magic(tmp_path: Path) -> None: @@ -592,7 +592,7 @@ def test_open_rejects_bad_magic(tmp_path: Path) -> None: # Bad magic → detect_is_bundle returns False → treated as plain BEN # stream → fails because the bytes aren't a valid BEN banner. with pytest.raises(Exception): - PyBenDecoder(path) + BenDecoder(path) def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: @@ -603,14 +603,14 @@ def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "oldfuture.bendl", bundle) with pytest.raises(Exception, match="Failed to parse bundle header"): - PyBenDecoder(path) + BenDecoder(path) def test_open_rejects_truncated_header(tmp_path: Path) -> None: path = tmp_path / "short.bendl" path.write_bytes(b"BENDL\x00\x00\x01\x00") # magic plus 2 bytes — not enough with pytest.raises(Exception, match="Failed to parse bundle header"): - PyBenDecoder(path) + BenDecoder(path) def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> None: @@ -628,7 +628,7 @@ def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> Non struct.pack_into(" None: @@ -640,7 +640,7 @@ def test_open_rejects_bundle_with_chopped_directory_bytes(tmp_path: Path) -> Non # Drop the final two bytes of the directory. path = _write_bundle(tmp_path / "chop.bendl", bundle[:-2]) with pytest.raises(Exception): - PyBenDecoder(path) + BenDecoder(path) def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: @@ -656,7 +656,7 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "dup.bendl", duplicate_names) with pytest.raises(Exception, match="malformed directory"): - PyBenDecoder(path) + BenDecoder(path) wrong_singleton_name = build_bundle( stream_bytes=stream, @@ -672,7 +672,7 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "singleton.bendl", wrong_singleton_name) with pytest.raises(Exception, match="malformed directory"): - PyBenDecoder(path) + BenDecoder(path) def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) -> None: @@ -689,7 +689,7 @@ def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) path = _write_bundle(tmp_path / "trailing_dir.bendl", bytes(bundle)) with pytest.raises(Exception, match="trailing byte"): - PyBenDecoder(path) + BenDecoder(path) def test_incomplete_bundle_scans_stream_for_sample_count(tmp_path: Path) -> None: @@ -707,18 +707,18 @@ def test_incomplete_bundle_scans_stream_for_sample_count(tmp_path: Path) -> None sample_count=-1, ) path = _write_bundle(tmp_path / "incomplete.bendl", header + stream) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() is False assert reader.count_samples() == 1 assert reader.asset_names() == [] # extract_stream should still write out bytes that decode as BEN. out = tmp_path / "extracted.ben" reader.extract_stream(out) - assert list(PyBenDecoder(out, mode="ben")) == [[1, 2, 3]] + assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] def test_unknown_assignment_format_byte_rejects_at_construction(tmp_path: Path) -> None: - # Assignment format byte = 99 → unrecognized. PyBenDecoder must + # Assignment format byte = 99 → unrecognized. BenDecoder must # reject the bundle at construction time. bundle = bytearray( build_bundle( @@ -731,7 +731,7 @@ def test_unknown_assignment_format_byte_rejects_at_construction(tmp_path: Path) bundle[13] = 99 path = _write_bundle(tmp_path / "wtfmt.bendl", bytes(bundle)) with pytest.raises(Exception, match="unrecognized assignment_format"): - PyBenDecoder(path) + BenDecoder(path) def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: @@ -758,7 +758,7 @@ def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: # Flip a byte well past the magic so the decoder reads it and fails. bundle[xz_start + 20] ^= 0xFF path = _write_bundle(tmp_path / "badxz.bendl", bytes(bundle)) - reader = PyBenDecoder(path) + reader = BenDecoder(path) # Opening works — the header/directory are intact. with pytest.raises(OSError): reader.read_asset_bytes("graph.json") @@ -773,7 +773,7 @@ def test_directory_entry_with_zero_length_custom_payload(tmp_path: Path) -> None ], ) path = _write_bundle(tmp_path / "zlen.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.read_asset_bytes("empty.bin") == b"" entry = next(a for a in reader.list_assets() if a["name"] == "empty.bin") assert entry["len"] == 0 @@ -791,7 +791,7 @@ def test_repr_on_incomplete_bundle(tmp_path: Path) -> None: sample_count=-1, ) path = _write_bundle(tmp_path / "rep.bendl", header + stream) - reader = PyBenDecoder(path) + reader = BenDecoder(path) # Incomplete bundle should open without error. assert reader.is_complete() is False assert reader.asset_names() == [] @@ -829,7 +829,7 @@ def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) - partial = full_ben[: len(full_ben) - 3] path = _write_bundle(tmp_path / "crashed.bendl", _incomplete_bundle(partial)) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() is False assert reader.assignment_format() == "ben" # count_samples scans the truncated stream; it may error or return a @@ -846,7 +846,7 @@ def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) - assert extracted.read_bytes() == partial # The extracted file opens as a BEN stream (banner is intact). - dec = PyBenDecoder(extracted, mode="ben") + dec = BenDecoder(extracted, mode="ben") # Iterating through the truncated stream must either yield a strict # prefix of the samples and then raise, or raise on the very first # frame — both are acceptable outcomes. What is NOT acceptable is @@ -867,14 +867,14 @@ def test_interrupted_ben_stream_inside_banner_fails_to_open_decoder( full_ben = _ben_bytes_for([[1, 2, 3]], tmp_path) path = _write_bundle(tmp_path / "head_cut.bendl", _incomplete_bundle(full_ben[:8])) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() is False extracted = tmp_path / "head_cut.ben" reader.extract_stream(extracted) # The decoder must reject a BEN file whose banner is incomplete. with pytest.raises(Exception, match="Failed to create BenDecoder"): - PyBenDecoder(extracted, mode="ben") + BenDecoder(extracted, mode="ben") def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: @@ -882,7 +882,7 @@ def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: # before any stream bytes landed. path = _write_bundle(tmp_path / "zero.bendl", _incomplete_bundle(b"")) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() is False assert reader.asset_names() == [] # Zero stream bytes → scan fails (no BEN banner). @@ -894,7 +894,7 @@ def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: assert extracted.read_bytes() == b"" # A zero-byte .ben has no banner → decoder construction must fail. with pytest.raises(Exception, match="Failed to create BenDecoder"): - PyBenDecoder(extracted, mode="ben") + BenDecoder(extracted, mode="ben") def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) -> None: @@ -917,7 +917,7 @@ def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) # The reader's open() succeeds — the header fields parse as-is and # validation is lazy. - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() # sample_count is what the header says. assert reader.count_samples() == len(samples) @@ -954,7 +954,7 @@ def test_read_metadata_after_extract_stream_still_works(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "seq.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) reader.extract_stream(tmp_path / "s.ben") assert reader.read_metadata() == {"x": 1} reader.extract_stream(tmp_path / "s2.ben", overwrite=True) @@ -981,7 +981,7 @@ def test_long_asset_name_near_u16_max(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "long.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.asset_names() == [long_name] assert reader.read_asset_bytes(long_name) == payload @@ -1029,7 +1029,7 @@ def test_list_assets_flag_fidelity(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "flags.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) got = reader.list_assets() assert len(got) == len(combos) for entry, want in zip(got, expected): @@ -1058,7 +1058,7 @@ def test_read_asset_bytes_is_idempotent(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "idem.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) for _ in range(5): assert reader.read_asset_bytes("raw.bin") == payload assert reader.read_asset_bytes("compressed.bin") == payload @@ -1095,7 +1095,7 @@ def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: assets=assets, ) path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.asset_names() == [name for name, _ in expected] # Sample every 37th asset and verify the payload decodes correctly @@ -1114,7 +1114,7 @@ def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: # Build 20 deliberately-different bundles from a seeded PRNG. Each one # mixes random asset sizes, random flags, random samples, and is then - # fully round-tripped through PyBenDecoder on a .bendl bundle. + # fully round-tripped through BenDecoder on a .bendl bundle. rng = random.Random(0xFEED_FACE) for trial in range(20): n_assets = rng.randint(0, 12) @@ -1147,7 +1147,7 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) assert reader.is_complete() assert reader.count_samples() == n_samples assert reader.asset_names() == [name for name, _ in truth] @@ -1156,7 +1156,7 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: extracted = tmp_path / f"fuzz-{trial}.ben" reader.extract_stream(extracted) - assert list(PyBenDecoder(extracted, mode="ben")) == samples + assert list(BenDecoder(extracted, mode="ben")) == samples def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: @@ -1192,7 +1192,7 @@ def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: ], ) path = _write_bundle(tmp_path / "interleave.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) # Strongly non-sequential access pattern. assert reader.read_asset_bytes("blob.bin") == custom @@ -1210,7 +1210,7 @@ def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: b = (tmp_path / "b.ben").read_bytes() c = (tmp_path / "c.ben").read_bytes() assert a == b == c - assert list(PyBenDecoder(tmp_path / "a.ben", mode="ben")) == samples + assert list(BenDecoder(tmp_path / "a.ben", mode="ben")) == samples def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) -> None: @@ -1219,14 +1219,14 @@ def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) - sample_count=1, ) path = _write_bundle(tmp_path / "mini.bendl", bundle) - reader = PyBenDecoder(path) + reader = BenDecoder(path) missing = tmp_path / "does" / "not" / "exist" / "out.ben" with pytest.raises(OSError): reader.extract_stream(missing) # --------------------------------------------------------------------------- -# PyBenEncoder bundle-output tests +# BenEncoder bundle-output tests # --------------------------------------------------------------------------- @@ -1247,11 +1247,11 @@ def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) - def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None: out = tmp_path / "stream.bendl" samples = [[1, 1, 2, 2], [3, 3, 2, 2], [3, 3, 3, 3]] - with PyBenEncoder(out, overwrite=True, variant="standard") as enc: + with BenEncoder(out, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) assert reader.is_complete() assert reader.count_samples() == len(samples) @@ -1262,19 +1262,19 @@ def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None extracted = tmp_path / "extracted.ben" reader.extract_stream(extracted) - assert list(PyBenDecoder(extracted, mode="ben")) == samples + assert list(BenDecoder(extracted, mode="ben")) == samples def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: out = tmp_path / "with_graph.bendl" samples = [[1, 1, 2, 2], [1, 1, 3, 3]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.is_complete() assert reader.count_samples() == len(samples) assert reader.asset_names() == ["graph.json"] @@ -1296,13 +1296,13 @@ def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: out = tmp_path / "with_graph_path.bendl" samples = [[0, 0, 1, 1]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=graph_path ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.asset_names() == ["graph.json"] assert reader.read_graph() == SAMPLE_GRAPH @@ -1315,13 +1315,13 @@ def test_pybenencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: out = tmp_path / "via-str.bendl" samples = [[0, 1, 0, 1]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=str(graph_path) ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1329,13 +1329,13 @@ def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: raw = json.dumps(SAMPLE_GRAPH).encode("utf-8") out = tmp_path / "via-bytes.bendl" samples = [[2, 2, 2, 2]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=raw ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1343,13 +1343,13 @@ def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: buf = io.BytesIO(json.dumps(SAMPLE_GRAPH).encode("utf-8")) out = tmp_path / "via-bytesio.bendl" samples = [[1, 2, 1, 2]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=buf ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1357,13 +1357,13 @@ def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: buf = io.StringIO(json.dumps(SAMPLE_GRAPH)) out = tmp_path / "via-stringio.bendl" samples = [[3, 3, 3, 3]] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=buf ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.read_graph() == SAMPLE_GRAPH @@ -1371,17 +1371,17 @@ def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> No out = tmp_path / "full.bendl" rng = random.Random(0xCAFE) samples = [[rng.randint(1, 8) for _ in range(12)] for _ in range(15)] - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH ) as enc: for a in samples: enc.write(a) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.count_samples() == len(samples) extracted = tmp_path / "full.ben" reader.extract_stream(extracted) - assert list(PyBenDecoder(extracted, mode="ben")) == samples + assert list(BenDecoder(extracted, mode="ben")) == samples # And the graph still round-trips from the same reader. assert reader.read_graph() == SAMPLE_GRAPH @@ -1389,7 +1389,7 @@ def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> No def test_pybenencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: out = tmp_path / "ben-with-graph.ben" with pytest.raises(ValueError, match="ben_file_only"): - PyBenEncoder( + BenEncoder( out, overwrite=True, variant="standard", @@ -1402,26 +1402,26 @@ def test_pybenencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: # A ben_file_only=True output should be byte-identical to the legacy # plain-BEN path, so the header has no BENDL magic. out = tmp_path / "legacy.ben" - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", ben_file_only=True ) as enc: enc.write([1, 2, 3]) blob = out.read_bytes() assert not blob.startswith(BENDL_MAGIC) - # PyBenDecoder should still read it in ben mode. - assert list(PyBenDecoder(out, mode="ben")) == [[1, 2, 3]] + # BenDecoder should still read it in ben mode. + assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: out = tmp_path / "idem.bendl" - enc = PyBenEncoder(out, overwrite=True, variant="standard") + enc = BenEncoder(out, overwrite=True, variant="standard") enc.write([1, 1, 2]) enc.close() enc.close() # second close must be a no-op with pytest.raises(OSError, match="already been closed"): enc.write([1, 2, 3]) - reader = PyBenDecoder(out) + reader = BenDecoder(out) assert reader.is_complete() assert reader.count_samples() == 1 @@ -1429,11 +1429,11 @@ def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: out = tmp_path / "bad.bendl" with pytest.raises(ValueError, match="graph must be"): - PyBenEncoder(out, overwrite=True, variant="standard", graph=12345) + BenEncoder(out, overwrite=True, variant="standard", graph=12345) # --------------------------------------------------------------------------- -# PyBenDecoder opened directly on a .bendl bundle. +# BenDecoder opened directly on a .bendl bundle. # # The decoder auto-detects the BENDL magic and, when present, iterates only # the embedded stream region while exposing TOC / asset helpers on the side. @@ -1451,7 +1451,7 @@ def test_pybendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "stream.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() is True assert dec.assignment_format() == "ben" assert dec.is_complete() is True @@ -1469,7 +1469,7 @@ def test_pybendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "stream.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() is True assert dec.assignment_format() == "xben" assert len(dec) == len(samples) @@ -1514,7 +1514,7 @@ def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "rich.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) # TOC surface assert dec.asset_names() == [ @@ -1558,7 +1558,7 @@ def test_pybendecoder_bundle_canonical_helpers_return_none_when_absent( ], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.read_graph() is None assert dec.read_metadata() is None assert dec.read_relabel_map() is None @@ -1572,7 +1572,7 @@ def test_pybendecoder_bundle_subsample_range(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "range.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(3, 6) assert list(dec) == samples[2:6] @@ -1585,7 +1585,7 @@ def test_pybendecoder_bundle_subsample_indices(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "idx.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_indices([1, 4, 8]) assert list(dec) == [samples[0], samples[3], samples[7]] @@ -1598,7 +1598,7 @@ def test_pybendecoder_bundle_subsample_every(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "every.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_every(3, 2) assert list(dec) == [samples[1], samples[4], samples[7]] @@ -1614,7 +1614,7 @@ def test_pybendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "ignore_mode.bendl", bundle) - dec = PyBenDecoder(path, mode="xben") + dec = BenDecoder(path, mode="xben") assert dec.assignment_format() == "ben" assert list(dec) == samples @@ -1624,13 +1624,13 @@ def test_pybendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None # bundle surface is simply unavailable. samples = [[1, 2, 3], [4, 5, 6]] ben_path = tmp_path / "plain.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) assert dec.is_bundle() is False assert dec.assignment_format() == "ben" assert list(dec) == samples @@ -1654,12 +1654,12 @@ def test_pybendecoder_plain_stream_rejects_bundle_methods( tmp_path: Path, method_call ) -> None: ben_path = tmp_path / "plain.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: enc.write([1, 2, 3]) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) with pytest.raises(Exception, match="only available on .bendl bundles"): method_call(dec) @@ -1668,28 +1668,28 @@ def test_pybendecoder_plain_stream_error_mentions_ben_file_only( tmp_path: Path, ) -> None: ben_path = tmp_path / "plain.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: enc.write([1]) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) with pytest.raises(Exception, match="ben_file_only=False"): dec.read_graph() def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> None: - # End-to-end: a bundle written by PyBenEncoder (with a graph asset) - # must round-trip through a single PyBenDecoder call — no need to + # End-to-end: a bundle written by BenEncoder (with a graph asset) + # must round-trip through a single BenDecoder call — no need to # extract the stream first. out = tmp_path / "e2e.bendl" - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH ) as enc: for a in [[1, 2, 3], [2, 3, 4]]: enc.write(a) - dec = PyBenDecoder(out) + dec = BenDecoder(out) assert dec.is_bundle() is True assert dec.is_complete() is True assert dec.assignment_format() == "ben" @@ -1716,7 +1716,7 @@ def test_pybendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: path = tmp_path / "incomplete.bendl" path.write_bytes(header + stream) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() is True assert dec.is_complete() is False # len() forces the fallback scan, which must agree with the data. @@ -1746,7 +1746,7 @@ def test_pybendecoder_incomplete_bundle_count_samples_matches_len( path = tmp_path / "incomplete_count.bendl" path.write_bytes(header + stream) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.count_samples() == len(samples) assert len(dec) == len(samples) @@ -1763,7 +1763,7 @@ def test_pybendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "weird_fmt.bendl", bundle) with pytest.raises(Exception, match="unrecognized assignment_format"): - PyBenDecoder(path) + BenDecoder(path) def test_pybendecoder_empty_stream_bundle(tmp_path: Path) -> None: @@ -1772,7 +1772,7 @@ def test_pybendecoder_empty_stream_bundle(tmp_path: Path) -> None: bundle = build_bundle(stream_bytes=_ben_bytes_for([], tmp_path), sample_count=0) path = _write_bundle(tmp_path / "empty.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() is True assert len(dec) == 0 assert dec.count_samples() == 0 @@ -1800,7 +1800,7 @@ def test_pybendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> N ) path = _write_bundle(tmp_path / "interleave.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) it = iter(dec) assert next(it) == samples[0] @@ -1824,7 +1824,7 @@ def test_pybendecoder_bundle_subsample_range_rejects_out_of_bounds( sample_count=len(samples), ) path = _write_bundle(tmp_path / "range_bad.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception, match="end must be <= number of samples"): dec.subsample_range(1, 99) with pytest.raises(Exception, match="1-based"): @@ -1840,11 +1840,11 @@ def test_pybendecoder_bundle_subsample_indices_rejects_out_of_bounds( sample_count=len(samples), ) path = _write_bundle(tmp_path / "idx_bad.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception, match="number of samples"): dec.subsample_indices([1, 42]) # Empty index list is also rejected. - dec2 = PyBenDecoder(path) + dec2 = BenDecoder(path) with pytest.raises(Exception, match="must not be empty"): dec2.subsample_indices([]) @@ -1856,10 +1856,10 @@ def test_pybendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> sample_count=len(samples), ) path = _write_bundle(tmp_path / "every_bad.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception, match="offset must be <= number of samples"): dec.subsample_every(1, 99) - dec2 = PyBenDecoder(path) + dec2 = BenDecoder(path) with pytest.raises(Exception, match="step and offset must be >= 1"): dec2.subsample_every(0, 1) @@ -1869,12 +1869,12 @@ def test_pybendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: # but must return the same answer. samples = [[1, 2], [3, 4], [5, 6]] ben_path = tmp_path / "cached.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) assert len(dec) == len(samples) assert len(dec) == len(samples) # Explicit count_samples must also agree. @@ -1888,14 +1888,14 @@ def test_pybendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: path = tmp_path / "tiny.ben" path.write_bytes(b"abcd") with pytest.raises(Exception): - PyBenDecoder(path) + BenDecoder(path) def test_pybendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: path = tmp_path / "empty.ben" path.write_bytes(b"") with pytest.raises(Exception): - PyBenDecoder(path) + BenDecoder(path) def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: @@ -1913,7 +1913,7 @@ def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> ], ) path = _write_bundle(tmp_path / "bad_utf8.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) # Raw bytes are fine. assert dec.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" with pytest.raises(Exception, match="not valid UTF-8"): @@ -1934,20 +1934,20 @@ def test_pybendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> ], ) path = _write_bundle(tmp_path / "bad_json.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(json.JSONDecodeError): dec.read_metadata() def test_pybendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> None: - # A bundle built with PyBenEncoder compresses the graph asset as xz; - # read_graph() on PyBenDecoder must still return the decoded JSON. + # A bundle built with BenEncoder compresses the graph asset as xz; + # read_graph() on BenDecoder must still return the decoded JSON. out = tmp_path / "xz_graph.bendl" - with PyBenEncoder( + with BenEncoder( out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH ) as enc: enc.write([1, 2, 3]) - dec = PyBenDecoder(out) + dec = BenDecoder(out) # Spot-check that graph.json was actually stored compressed. by_name = {a["name"]: a for a in dec.list_assets()} assert "xz" in by_name["graph.json"]["flags"] @@ -1975,7 +1975,7 @@ def test_pybendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: ) path = _write_bundle(tmp_path / "xben_assets.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.assignment_format() == "xben" assert dec.asset_names() == ["metadata.json"] assert dec.read_metadata() == {"variant": "mkv_chain"} @@ -1992,7 +1992,7 @@ def test_pybendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> sample_count=len(samples), ) path = _write_bundle(tmp_path / "unsorted.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.warns(UserWarning, match="sorted and unique"): dec.subsample_indices([4, 1, 4, 1]) assert list(dec) == [[1], [4]] @@ -2005,11 +2005,11 @@ def test_pybendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: src = tmp_path / "src.jsonl" _write_jsonl(samples, src) xben_path = tmp_path / "plain.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben_path, overwrite=True, variant="standard", n_threads=1, compression_level=1 ) with pytest.warns(UserWarning): - dec = PyBenDecoder(xben_path, mode="xben") + dec = BenDecoder(xben_path, mode="xben") assert dec.is_bundle() is False assert dec.assignment_format() == "xben" assert list(dec) == samples @@ -2036,7 +2036,7 @@ def test_pybendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( path = tmp_path / "incomplete_toc.bendl" path.write_bytes(header + stream) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() is True assert dec.is_complete() is False assert dec.asset_names() == [] @@ -2055,7 +2055,7 @@ def test_pybendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: sample_count=len(samples), ) path = _write_bundle(tmp_path / "twice.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert list(dec) == samples # A second pass reopens the stream region from the start. assert list(dec) == samples @@ -2064,12 +2064,12 @@ def test_pybendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: def test_pybendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] ben_path = tmp_path / "twice.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) assert list(dec) == samples assert list(dec) == samples @@ -2084,7 +2084,7 @@ def test_pybendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> No sample_count=len(samples), ) path = _write_bundle(tmp_path / "range_twice.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(3, 6) expected = samples[2:6] assert list(dec) == expected @@ -2098,7 +2098,7 @@ def test_pybendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> sample_count=len(samples), ) path = _write_bundle(tmp_path / "idx_twice.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_indices([2, 5, 7]) expected = [samples[1], samples[4], samples[6]] assert list(dec) == expected @@ -2112,7 +2112,7 @@ def test_pybendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> No sample_count=len(samples), ) path = _write_bundle(tmp_path / "every_twice.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_every(3, 2) expected = [samples[1], samples[4], samples[7]] assert list(dec) == expected @@ -2128,7 +2128,7 @@ def test_pybendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> sample_count=len(samples), ) path = _write_bundle(tmp_path / "reselect.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(1, 3) assert list(dec) == samples[:3] dec.subsample_indices([4, 7]) @@ -2147,7 +2147,7 @@ def test_pybendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: sample_count=len(samples), ) path = _write_bundle(tmp_path / "partial.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) it = iter(dec) assert next(it) == samples[0] assert next(it) == samples[1] @@ -2167,7 +2167,7 @@ def test_pybendecoder_count_samples_after_subsample_preserves_len( sample_count=len(samples), ) path = _write_bundle(tmp_path / "count_after_sub.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 5) assert len(dec) == 4 assert dec.count_samples() == len(samples) @@ -2183,12 +2183,12 @@ def test_pybendecoder_count_samples_plain_after_subsample_preserves_len( # non-bundle branch of `ensure_base_len`. samples = [[i] for i in range(1, 11)] ben_path = tmp_path / "plain_count.ben" - with PyBenEncoder( + with BenEncoder( ben_path, overwrite=True, variant="standard", ben_file_only=True ) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(ben_path) + dec = BenDecoder(ben_path) dec.subsample_every(3, 1) expected = samples[::3] assert len(dec) == len(expected) @@ -2208,7 +2208,7 @@ def test_pybendecoder_subsample_then_count_samples_then_reiterate( sample_count=len(samples), ) path = _write_bundle(tmp_path / "sub_count_restart.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_indices([1, 4, 8]) assert dec.count_samples() == len(samples) expected = [samples[0], samples[3], samples[7]] @@ -2226,7 +2226,7 @@ def test_pybendecoder_bundle_read_json_asset_missing_name_raises_keyerror( sample_count=1, ) path = _write_bundle(tmp_path / "missing_json.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(KeyError, match="nope.json"): dec.read_json_asset("nope.json") @@ -2243,7 +2243,7 @@ def test_pybendecoder_bundle_len_uses_header_fast_path(tmp_path: Path) -> None: sample_count=len(samples), ) path = _write_bundle(tmp_path / "fast_len.bendl", bundle) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert len(dec) == len(samples) # A second call returns the cached value and must agree. assert len(dec) == len(samples) diff --git a/pyben/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py similarity index 75% rename from pyben/tests/test_python_pipelines.py rename to ben-py/tests/test_python_pipelines.py index f26959f..5f8ffea 100644 --- a/pyben/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -8,14 +8,14 @@ import binary_ensemble from binary_ensemble import ( - PyBenDecoder, - PyBenEncoder, - compress_ben_to_xben, - compress_jsonl_to_ben, - compress_jsonl_to_xben, - decompress_ben_to_jsonl, - decompress_xben_to_ben, - decompress_xben_to_jsonl, + BenDecoder, + BenEncoder, + encode_ben_to_xben, + encode_jsonl_to_ben, + encode_jsonl_to_xben, + decode_ben_to_jsonl, + decode_xben_to_ben, + decode_xben_to_jsonl, ) # ---------- Helpers ---------- @@ -102,8 +102,8 @@ def test_ben_pipeline(tmp_path: Path) -> None: ben = tmp_path / "out.ben" out_jsonl = tmp_path / "round.jsonl" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - decompress_ben_to_jsonl(ben, out_jsonl, overwrite=True) + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + decode_ben_to_jsonl(ben, out_jsonl, overwrite=True) assert src.read_bytes() == out_jsonl.read_bytes() @@ -119,8 +119,8 @@ def test_mkvben_pipeline(tmp_path: Path) -> None: ben = tmp_path / "out_mkv.ben" out_jsonl = tmp_path / "round_mkv.jsonl" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - decompress_ben_to_jsonl(ben, out_jsonl, overwrite=True) + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + decode_ben_to_jsonl(ben, out_jsonl, overwrite=True) assert src.read_bytes() == out_jsonl.read_bytes() @@ -137,11 +137,11 @@ def test_xben_pipeline(tmp_path: Path) -> None: ben = tmp_path / "out.ben" round_jsonl = tmp_path / "round.jsonl" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="standard", n_threads=1, compression_level=1 ) - decompress_xben_to_ben(xben, ben, overwrite=True) - decompress_ben_to_jsonl(ben, round_jsonl, overwrite=True) + decode_xben_to_ben(xben, ben, overwrite=True) + decode_ben_to_jsonl(ben, round_jsonl, overwrite=True) assert src.read_bytes() == round_jsonl.read_bytes() @@ -158,11 +158,11 @@ def test_xmkvben_pipeline(tmp_path: Path) -> None: ben = tmp_path / "out_mkv.ben" round_jsonl = tmp_path / "round_mkv.jsonl" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) - decompress_xben_to_ben(xben, ben, overwrite=True) - decompress_ben_to_jsonl(ben, round_jsonl, overwrite=True) + decode_xben_to_ben(xben, ben, overwrite=True) + decode_ben_to_jsonl(ben, round_jsonl, overwrite=True) assert src.read_bytes() == round_jsonl.read_bytes() @@ -179,14 +179,14 @@ def test_decoder_iterator_matches_jsonl_ben(tmp_path: Path) -> None: write_jsonl(seq, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") # Baseline: assignments from JSONL baseline = read_jsonl_assignments(src) - # PyBenDecoder over BEN + # BenDecoder over BEN got: list[list[int]] = [] - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") for a in dec: got.append(a) @@ -202,18 +202,18 @@ def test_decoder_iterator_matches_jsonl_xben(tmp_path: Path) -> None: write_jsonl(seq, src) xben = tmp_path / "out.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) # Baseline via full decompression roundtrip = tmp_path / "direct.jsonl" - decompress_xben_to_jsonl(xben, roundtrip, overwrite=True) + decode_xben_to_jsonl(xben, roundtrip, overwrite=True) baseline = read_jsonl_assignments(roundtrip) # Iterator directly over XBEN got: list[list[int]] = [] - dec = PyBenDecoder(xben, mode="xben") + dec = BenDecoder(xben, mode="xben") for a in dec: got.append(a) @@ -232,7 +232,7 @@ def test_subsample_indices(tmp_path: Path) -> None: write_jsonl(seq, src) xben = tmp_path / "out.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) @@ -241,7 +241,7 @@ def test_subsample_indices(tmp_path: Path) -> None: baseline = [seq[i - 1] for i in want] got: list[list[int]] = [] - dec = PyBenDecoder(xben, mode="xben").subsample_indices(want) + dec = BenDecoder(xben, mode="xben").subsample_indices(want) for a in dec: got.append(a) @@ -257,13 +257,13 @@ def test_subsample_range(tmp_path: Path) -> None: write_jsonl(seq, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") start, end = 11, 77 baseline = seq[start - 1 : end] got: list[list[int]] = [] - dec = PyBenDecoder(ben, mode="ben").subsample_range(start, end) + dec = BenDecoder(ben, mode="ben").subsample_range(start, end) for a in dec: got.append(a) @@ -279,7 +279,7 @@ def test_subsample_every(tmp_path: Path) -> None: write_jsonl(seq, src) xben = tmp_path / "out.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) @@ -287,7 +287,7 @@ def test_subsample_every(tmp_path: Path) -> None: baseline = [seq[i - 1] for i in range(offset, n_samples + 1, step)] got: list[list[int]] = [] - dec = PyBenDecoder(xben, mode="xben").subsample_every(step, offset) + dec = BenDecoder(xben, mode="xben").subsample_every(step, offset) for a in dec: got.append(a) @@ -303,14 +303,14 @@ def test_pybenencoder_roundtrip(tmp_path: Path) -> None: seq = gen_sequence_standard(rng, n_samples) ben = tmp_path / "out.ben" - with PyBenEncoder( + with BenEncoder( ben, overwrite=True, variant="standard", ben_file_only=True ) as enc: for a in seq: enc.write(a) # Use decoder to read back - got = list(PyBenDecoder(ben, mode="ben")) + got = list(BenDecoder(ben, mode="ben")) assert got == seq @@ -330,10 +330,10 @@ def test_ben_to_xben_and_back(tmp_path: Path) -> None: ben2 = tmp_path / "out.ben" out_jsonl = tmp_path / "out.jsonl" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - compress_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) - decompress_xben_to_ben(xben, ben2, overwrite=True) - decompress_ben_to_jsonl(ben2, out_jsonl, overwrite=True) + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + decode_xben_to_ben(xben, ben2, overwrite=True) + decode_ben_to_jsonl(ben2, out_jsonl, overwrite=True) assert src.read_bytes() == out_jsonl.read_bytes() @@ -346,9 +346,9 @@ def test_decoder_subsample_indices_rejects_empty_input(tmp_path: Path) -> None: write_jsonl(seq, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") with pytest.raises(Exception, match="indices must not be empty"): dec.subsample_indices([]) @@ -361,9 +361,9 @@ def test_decoder_subsample_every_rejects_offset_past_end(tmp_path: Path) -> None write_jsonl(seq, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") with pytest.raises(Exception, match="offset must be <="): dec.subsample_every(2, 99) @@ -376,22 +376,22 @@ def test_compress_helpers_reject_unknown_variants(tmp_path: Path) -> None: write_jsonl(seq, src) with pytest.raises(ValueError, match="Unknown variant"): - compress_jsonl_to_ben(src, tmp_path / "out.ben", overwrite=True, variant="weird") + encode_jsonl_to_ben(src, tmp_path / "out.ben", overwrite=True, variant="weird") with pytest.raises(ValueError, match="Unknown variant"): - compress_jsonl_to_xben(src, tmp_path / "out.xben", overwrite=True, variant="weird") + encode_jsonl_to_xben(src, tmp_path / "out.xben", overwrite=True, variant="weird") def test_module_exports_are_exposed() -> None: expected = { - "PyBenDecoder", - "PyBenEncoder", - "compress_jsonl_to_ben", - "compress_ben_to_xben", - "compress_jsonl_to_xben", - "decompress_ben_to_jsonl", - "decompress_xben_to_jsonl", - "decompress_xben_to_ben", + "BenDecoder", + "BenEncoder", + "encode_jsonl_to_ben", + "encode_ben_to_xben", + "encode_jsonl_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", } assert expected.issubset(set(binary_ensemble.__all__)) for name in expected: @@ -403,18 +403,18 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3]] default_ben = tmp_path / "default.ben" - with PyBenEncoder(default_ben, overwrite=True, ben_file_only=True) as enc: + with BenEncoder(default_ben, overwrite=True, ben_file_only=True) as enc: for sample in samples: enc.write(sample) - assert list(PyBenDecoder(default_ben, mode="ben")) == samples + assert list(BenDecoder(default_ben, mode="ben")) == samples src = tmp_path / "src.jsonl" write_jsonl(samples, src) alias_ben = tmp_path / "alias.ben" alias_xben = tmp_path / "alias.xben" - compress_jsonl_to_ben(src, alias_ben, overwrite=True, variant="markov") - compress_jsonl_to_xben( + encode_jsonl_to_ben(src, alias_ben, overwrite=True, variant="markov") + encode_jsonl_to_xben( src, alias_xben, overwrite=True, @@ -422,13 +422,13 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: n_threads=1, compression_level=1, ) - assert list(PyBenDecoder(alias_ben, mode="ben")) == samples - assert list(PyBenDecoder(alias_xben, mode="xben")) == samples + assert list(BenDecoder(alias_ben, mode="ben")) == samples + assert list(BenDecoder(alias_xben, mode="xben")) == samples def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: out = tmp_path / "out.ben" - enc = PyBenEncoder( + enc = BenEncoder( out, overwrite=True, variant="standard", ben_file_only=True ) enc.write([1, 2, 3]) @@ -438,14 +438,14 @@ def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: enc.write([1, 2, 3]) ctx_path = tmp_path / "ctx.ben" - with PyBenEncoder( + with BenEncoder( ctx_path, overwrite=True, variant="standard", ben_file_only=True ) as ctx_enc: ctx_enc.write([4, 5, 6]) - assert list(PyBenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] + assert list(BenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] invalid_path = tmp_path / "invalid_assignment.ben" - with PyBenEncoder( + with BenEncoder( invalid_path, overwrite=True, variant="standard", ben_file_only=True ) as invalid_enc: with pytest.raises(Exception): @@ -459,13 +459,13 @@ def test_pybenencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> N out.write_bytes(b"existing") with pytest.raises(ValueError, match="Unknown variant"): - PyBenEncoder(tmp_path / "bad.ben", overwrite=False, variant="weird") + BenEncoder(tmp_path / "bad.ben", overwrite=False, variant="weird") with pytest.raises(OSError, match="already exists"): - PyBenEncoder(out, overwrite=False, variant="standard") + BenEncoder(out, overwrite=False, variant="standard") with pytest.raises(OSError, match="Failed to create"): - PyBenEncoder( + BenEncoder( tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard", @@ -477,10 +477,10 @@ def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: write_jsonl([[1, 1, 2]], src) with pytest.raises(OSError, match="must differ"): - compress_jsonl_to_ben(src, src, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, src, overwrite=True, variant="standard") with pytest.raises(OSError, match="does not exist"): - compress_jsonl_to_ben( + encode_jsonl_to_ben( tmp_path / "missing.jsonl", tmp_path / "out.ben", overwrite=True, @@ -490,7 +490,7 @@ def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: bad_json = tmp_path / "bad.jsonl" bad_json.write_text("not json\n", encoding="utf-8") with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): - compress_jsonl_to_ben( + encode_jsonl_to_ben( bad_json, tmp_path / "bad.ben", overwrite=True, @@ -500,7 +500,7 @@ def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: bad_assign = tmp_path / "bad_assign.jsonl" bad_assign.write_text('{"assignment":"bad","sample":1}\n', encoding="utf-8") with pytest.raises(OSError, match="Failed to convert JSONL to XBEN"): - compress_jsonl_to_xben( + encode_jsonl_to_xben( bad_assign, tmp_path / "bad.xben", overwrite=True, @@ -510,7 +510,7 @@ def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: ) with pytest.raises(OSError, match="Failed to create"): - compress_jsonl_to_ben( + encode_jsonl_to_ben( src, tmp_path / "missing-dir" / "out.ben", overwrite=True, @@ -518,11 +518,11 @@ def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: ) -def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and_existing_output( +def test_encode_ben_to_xben_rejects_same_path_missing_input_invalid_header_and_existing_output( tmp_path: Path, ) -> None: with pytest.raises(OSError, match="does not exist"): - compress_ben_to_xben( + encode_ben_to_xben( tmp_path / "missing.ben", tmp_path / "out.xben", overwrite=True, @@ -534,7 +534,7 @@ def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="must differ"): - compress_ben_to_xben( + encode_ben_to_xben( bad_ben, bad_ben, overwrite=True, @@ -543,7 +543,7 @@ def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and ) with pytest.raises(OSError, match="Failed to convert BEN to XBEN"): - compress_ben_to_xben( + encode_ben_to_xben( bad_ben, tmp_path / "out.xben", overwrite=True, @@ -554,11 +554,11 @@ def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and src = tmp_path / "src.jsonl" write_jsonl([[1, 2, 3]], src) ben = tmp_path / "good.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") out = tmp_path / "exists.xben" out.write_bytes(b"exists") with pytest.raises(OSError, match="already exists"): - compress_ben_to_xben( + encode_ben_to_xben( ben, out, overwrite=False, @@ -569,21 +569,21 @@ def test_compress_ben_to_xben_rejects_same_path_missing_input_invalid_header_and def test_decoder_constructor_and_mode_errors(tmp_path: Path) -> None: with pytest.raises(Exception, match="Unknown mode"): - PyBenDecoder(tmp_path / "missing.ben", mode="weird") + BenDecoder(tmp_path / "missing.ben", mode="weird") with pytest.raises(OSError, match="Failed to open"): - PyBenDecoder(tmp_path / "missing.ben", mode="ben") + BenDecoder(tmp_path / "missing.ben", mode="ben") bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(Exception, match="Failed to create BenDecoder"): - PyBenDecoder(bad_ben, mode="ben") + BenDecoder(bad_ben, mode="ben") bad_xben = tmp_path / "bad.xben" bad_xben.write_bytes(b"garbage") with pytest.warns(UserWarning, match="XBEN may take a second"): with pytest.raises(Exception, match="Failed to create XBenDecoder"): - PyBenDecoder(bad_xben, mode="xben") + BenDecoder(bad_xben, mode="xben") def test_decoder_len_and_count_samples_are_lazy_and_cached(tmp_path: Path) -> None: @@ -592,14 +592,14 @@ def test_decoder_len_and_count_samples_are_lazy_and_cached(tmp_path: Path) -> No write_jsonl(samples, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") assert len(dec) == len(samples) assert dec.count_samples() == len(samples) assert list(dec) == samples - gone = PyBenDecoder(ben, mode="ben") + gone = BenDecoder(ben, mode="ben") assert len(gone) == len(samples) ben.unlink() with pytest.raises(Exception, match="Failed to create frame iterator"): @@ -612,12 +612,12 @@ def test_decoder_xben_len_count_and_warning(tmp_path: Path) -> None: write_jsonl(samples, src) xben = tmp_path / "out.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) with pytest.warns(UserWarning, match="XBEN may take a second"): - dec = PyBenDecoder(xben, mode="xben") + dec = BenDecoder(xben, mode="xben") assert len(dec) == len(samples) assert dec.count_samples() == len(samples) assert list(dec) == samples @@ -629,38 +629,38 @@ def test_decoder_subsample_validations_and_warning_paths(tmp_path: Path) -> None write_jsonl(samples, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") with pytest.warns(UserWarning, match="sorted and unique"): - got = list(PyBenDecoder(ben, mode="ben").subsample_indices([5, 1, 1, 3])) + got = list(BenDecoder(ben, mode="ben").subsample_indices([5, 1, 1, 3])) assert got == [samples[0], samples[2], samples[4]] with pytest.raises(Exception, match="indices must be 1-based"): - PyBenDecoder(ben, mode="ben").subsample_indices([0, 1]) + BenDecoder(ben, mode="ben").subsample_indices([0, 1]) with pytest.raises(Exception): - PyBenDecoder(ben, mode="ben").subsample_indices([-1]) + BenDecoder(ben, mode="ben").subsample_indices([-1]) with pytest.raises(Exception, match="indices must be <="): - PyBenDecoder(ben, mode="ben").subsample_indices([6]) + BenDecoder(ben, mode="ben").subsample_indices([6]) with pytest.raises(Exception, match="range must be 1-based"): - PyBenDecoder(ben, mode="ben").subsample_range(0, 2) + BenDecoder(ben, mode="ben").subsample_range(0, 2) with pytest.raises(Exception): - PyBenDecoder(ben, mode="ben").subsample_range(-1, 2) + BenDecoder(ben, mode="ben").subsample_range(-1, 2) with pytest.raises(Exception, match="end must be <="): - PyBenDecoder(ben, mode="ben").subsample_range(1, 99) + BenDecoder(ben, mode="ben").subsample_range(1, 99) with pytest.raises(Exception, match="step and offset must be >= 1"): - PyBenDecoder(ben, mode="ben").subsample_every(0, 1) + BenDecoder(ben, mode="ben").subsample_every(0, 1) with pytest.raises(Exception, match="offset must be <="): - PyBenDecoder(ben, mode="ben").subsample_every(2, 99) + BenDecoder(ben, mode="ben").subsample_every(2, 99) - assert list(PyBenDecoder(ben, mode="ben").subsample_range(2, 4)) == samples[1:4] - assert list(PyBenDecoder(ben, mode="ben").subsample_every(2, 2)) == samples[1::2] + assert list(BenDecoder(ben, mode="ben").subsample_range(2, 4)) == samples[1:4] + assert list(BenDecoder(ben, mode="ben").subsample_every(2, 2)) == samples[1::2] def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears(tmp_path: Path) -> None: @@ -668,9 +668,9 @@ def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears(tmp_path: write_jsonl([[1], [2], [3]], src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") ben.unlink() with pytest.raises(Exception, match="Failed to count samples"): @@ -682,19 +682,19 @@ def test_decoder_reports_zero_count_and_bad_frame_errors(tmp_path: Path) -> None write_jsonl([[1, 1, 2]], src) mkv_ben = tmp_path / "mkv.ben" - compress_jsonl_to_ben(src, mkv_ben, overwrite=True, variant="mkv_chain") + encode_jsonl_to_ben(src, mkv_ben, overwrite=True, variant="mkv_chain") data = bytearray(mkv_ben.read_bytes()) data[-2:] = b"\x00\x00" mkv_ben.write_bytes(data) with pytest.raises(Exception, match="count must be greater than zero"): - next(iter(PyBenDecoder(mkv_ben, mode="ben"))) + next(iter(BenDecoder(mkv_ben, mode="ben"))) standard_ben = tmp_path / "standard.ben" - compress_jsonl_to_ben(src, standard_ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, standard_ben, overwrite=True, variant="standard") truncated = standard_ben.read_bytes()[:-1] bad_ben = tmp_path / "truncated.ben" bad_ben.write_bytes(truncated) - dec = PyBenDecoder(bad_ben, mode="ben") + dec = BenDecoder(bad_ben, mode="ben") with pytest.raises(Exception, match="Error decoding next item"): next(iter(dec)) @@ -703,7 +703,7 @@ def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_inv tmp_path: Path, ) -> None: with pytest.raises(OSError, match="does not exist"): - decompress_ben_to_jsonl( + decode_ben_to_jsonl( tmp_path / "missing.ben", tmp_path / "out.jsonl", overwrite=True, @@ -712,7 +712,7 @@ def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_inv bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): - decompress_ben_to_jsonl( + decode_ben_to_jsonl( bad_ben, tmp_path / "out.jsonl", overwrite=True, @@ -721,14 +721,14 @@ def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_inv bad_xben = tmp_path / "bad.xben" bad_xben.write_bytes(b"garbage") with pytest.raises(OSError, match="Failed to convert XBEN to BEN"): - decompress_xben_to_ben( + decode_xben_to_ben( bad_xben, tmp_path / "out.ben", overwrite=True, ) with pytest.raises(OSError, match="must differ"): - decompress_xben_to_jsonl( + decode_xben_to_jsonl( bad_xben, bad_xben, overwrite=True, @@ -738,17 +738,17 @@ def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_inv write_jsonl([[1, 2, 3]], src) ben = tmp_path / "good.ben" xben = tmp_path / "good.xben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - compress_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) out = tmp_path / "exists.jsonl" out.write_text("exists\n", encoding="utf-8") with pytest.raises(OSError, match="already exists"): - decompress_ben_to_jsonl(ben, out, overwrite=False) + decode_ben_to_jsonl(ben, out, overwrite=False) # --------------------------------------------------------------------------- -# Bundle inspection via PyBenDecoder +# Bundle inspection via BenDecoder # --------------------------------------------------------------------------- @@ -756,11 +756,11 @@ def test_decoder_bundle_round_trip_all_methods(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] graph = {"nodes": [{"id": 0}, {"id": 1}], "links": [{"source": 0, "target": 1}]} path = tmp_path / "full.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() assert dec.is_complete() assert dec.count_samples() == len(samples) @@ -797,37 +797,37 @@ def test_decoder_bundle_round_trip_all_methods(tmp_path: Path) -> None: def test_decoder_bundle_extract_stream_and_decode(tmp_path: Path) -> None: samples = [[10, 20], [30, 40]] path = tmp_path / "extract.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) out = tmp_path / "extracted.ben" dec.extract_stream(out) - assert list(PyBenDecoder(out, mode="ben")) == samples + assert list(BenDecoder(out, mode="ben")) == samples def test_decoder_bundle_extract_stream_overwrite_and_refuse(tmp_path: Path) -> None: samples = [[1]] path = tmp_path / "ow.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: enc.write(samples[0]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) out = tmp_path / "out.ben" dec.extract_stream(out) with pytest.raises(OSError, match="already exists"): dec.extract_stream(out, overwrite=False) dec.extract_stream(out, overwrite=True) - assert list(PyBenDecoder(out, mode="ben")) == samples + assert list(BenDecoder(out, mode="ben")) == samples def test_decoder_bundle_missing_asset_raises_keyerror(tmp_path: Path) -> None: path = tmp_path / "no_asset.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: enc.write([1, 2]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(KeyError, match="nope"): dec.read_asset_bytes("nope") with pytest.raises(KeyError, match="nope"): @@ -835,18 +835,18 @@ def test_decoder_bundle_missing_asset_raises_keyerror(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -# PyBenEncoder bundle-mode coverage +# BenEncoder bundle-mode coverage # --------------------------------------------------------------------------- def test_pybenencoder_bundle_without_graph(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "no_graph.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() assert dec.assignment_format() == "ben" assert dec.read_graph() is None @@ -856,33 +856,33 @@ def test_pybenencoder_bundle_without_graph(tmp_path: Path) -> None: def test_pybenencoder_bundle_graph_from_dict(tmp_path: Path) -> None: graph = {"test": True} path = tmp_path / "dict_graph.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: enc.write([1]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.read_graph() == graph def test_pybenencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: graph = {"test": "bytes"} path = tmp_path / "bytes_graph.bendl" - with PyBenEncoder( + with BenEncoder( path, overwrite=True, variant="standard", graph=json.dumps(graph).encode() ) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: graph = {"test": "bytearray"} path = tmp_path / "ba_graph.bendl" - with PyBenEncoder( + with BenEncoder( path, overwrite=True, variant="standard", graph=bytearray(json.dumps(graph).encode()), ) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: @@ -890,9 +890,9 @@ def test_pybenencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: gpath = tmp_path / "g.json" gpath.write_text(json.dumps(graph), encoding="utf-8") path = tmp_path / "path_graph.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard", graph=gpath) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=gpath) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: @@ -900,42 +900,42 @@ def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: gpath = tmp_path / "g2.json" gpath.write_text(json.dumps(graph), encoding="utf-8") path = tmp_path / "str_path_graph.bendl" - with PyBenEncoder( + with BenEncoder( path, overwrite=True, variant="standard", graph=str(gpath) ) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: graph = {"test": "bytesio"} path = tmp_path / "bio_graph.bendl" - with PyBenEncoder( + with BenEncoder( path, overwrite=True, variant="standard", graph=io.BytesIO(json.dumps(graph).encode()), ) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: graph = {"test": "stringio"} path = tmp_path / "sio_graph.bendl" - with PyBenEncoder( + with BenEncoder( path, overwrite=True, variant="standard", graph=io.StringIO(json.dumps(graph)), ) as enc: enc.write([1]) - assert PyBenDecoder(path).read_graph() == graph + assert BenDecoder(path).read_graph() == graph def test_pybenencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> None: with pytest.raises(ValueError, match="graph.*cannot be combined"): - PyBenEncoder( + BenEncoder( tmp_path / "bad.ben", overwrite=True, variant="standard", @@ -946,7 +946,7 @@ def test_pybenencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: with pytest.raises(ValueError, match="graph must be"): - PyBenEncoder( + BenEncoder( tmp_path / "bad.bendl", overwrite=True, variant="standard", @@ -956,16 +956,16 @@ def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: path = tmp_path / "idempotent.bendl" - enc = PyBenEncoder(path, overwrite=True, variant="standard") + enc = BenEncoder(path, overwrite=True, variant="standard") enc.write([1, 2]) enc.close() enc.close() - assert list(PyBenDecoder(path)) == [[1, 2]] + assert list(BenDecoder(path)) == [[1, 2]] def test_pybenencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: path = tmp_path / "closed.bendl" - enc = PyBenEncoder(path, overwrite=True, variant="standard") + enc = BenEncoder(path, overwrite=True, variant="standard") enc.write([1]) enc.close() with pytest.raises(OSError, match="already been closed"): @@ -973,17 +973,17 @@ def test_pybenencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -# PyBenDecoder bundle-path coverage +# BenDecoder bundle-path coverage # --------------------------------------------------------------------------- def test_pybendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] path = tmp_path / "auto.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() assert list(dec) == samples @@ -991,10 +991,10 @@ def test_pybendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: def test_pybendecoder_bundle_toc_methods(tmp_path: Path) -> None: graph = {"g": 1} path = tmp_path / "toc.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: enc.write([1, 2, 3]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.is_bundle() assert dec.assignment_format() == "ben" v = dec.version() @@ -1025,19 +1025,19 @@ def test_pybendecoder_bundle_toc_methods(tmp_path: Path) -> None: def test_pybendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: samples = [[i] for i in range(1, 11)] path = tmp_path / "subsample.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 5) assert list(dec) == samples[1:5] - dec2 = PyBenDecoder(path) + dec2 = BenDecoder(path) dec2.subsample_indices([1, 3, 10]) assert list(dec2) == [samples[0], samples[2], samples[9]] - dec3 = PyBenDecoder(path) + dec3 = BenDecoder(path) dec3.subsample_every(3, 2) assert list(dec3) == [samples[1], samples[4], samples[7]] @@ -1045,11 +1045,11 @@ def test_pybendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: def test_pybendecoder_bundle_len_and_count(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "len.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert len(dec) == len(samples) assert dec.count_samples() == len(samples) assert list(dec) == samples @@ -1058,11 +1058,11 @@ def test_pybendecoder_bundle_len_and_count(tmp_path: Path) -> None: def test_pybendecoder_bundle_iteration_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "restart.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert list(dec) == samples assert list(dec) == samples @@ -1070,11 +1070,11 @@ def test_pybendecoder_bundle_iteration_restart(tmp_path: Path) -> None: def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] path = tmp_path / "re_sub.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 5) expected = samples[1:5] assert list(dec) == expected @@ -1083,10 +1083,10 @@ def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> N def test_pybendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: path = tmp_path / "plain.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: enc.write([1, 2]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert not dec.is_bundle() assert dec.assignment_format() == "ben" @@ -1110,11 +1110,11 @@ def test_pybendecoder_bundle_count_samples_preserves_subsample_len( ) -> None: samples = [[i] for i in range(1, 9)] path = tmp_path / "count_sub.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 5) assert len(dec) == 4 assert dec.count_samples() == len(samples) @@ -1122,7 +1122,7 @@ def test_pybendecoder_bundle_count_samples_preserves_subsample_len( # --------------------------------------------------------------------------- -# PyBenDecoder XBEN bundle coverage +# BenDecoder XBEN bundle coverage # --------------------------------------------------------------------------- @@ -1132,17 +1132,17 @@ def test_pybendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: write_jsonl(samples, src) xben_path = tmp_path / "samples.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben_path, overwrite=True, variant="standard", n_threads=1, compression_level=1, ) bendl_path = tmp_path / "xben_bundle.bendl" - with PyBenEncoder(bendl_path, overwrite=True, variant="standard") as enc: + with BenEncoder(bendl_path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(bendl_path) + dec = BenDecoder(bendl_path) assert dec.is_bundle() assert list(dec) == samples @@ -1153,30 +1153,30 @@ def test_pybendecoder_xben_plain_stream(tmp_path: Path) -> None: write_jsonl(samples, src) xben_path = tmp_path / "plain.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben_path, overwrite=True, variant="standard", n_threads=1, compression_level=1, ) - dec = PyBenDecoder(xben_path, mode="xben") + dec = BenDecoder(xben_path, mode="xben") assert not dec.is_bundle() assert dec.assignment_format() == "xben" assert list(dec) == samples # --------------------------------------------------------------------------- -# PyBenDecoder subsample validation errors +# BenDecoder subsample validation errors # --------------------------------------------------------------------------- def test_pybendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "empty_idx.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_indices([]) @@ -1184,11 +1184,11 @@ def test_pybendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: def test_pybendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_idx.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_indices([0, 1, 2]) @@ -1196,11 +1196,11 @@ def test_pybendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: def test_pybendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_start.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_range(0, 2) @@ -1208,11 +1208,11 @@ def test_pybendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: def test_pybendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "bad_range.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_range(5, 2) @@ -1220,11 +1220,11 @@ def test_pybendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> Non def test_pybendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_step.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_every(0) @@ -1232,28 +1232,28 @@ def test_pybendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: def test_pybendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_off.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard") as enc: + with BenEncoder(path, overwrite=True, variant="standard") as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) with pytest.raises(Exception): dec.subsample_every(1, offset=0) # --------------------------------------------------------------------------- -# PyBenDecoder subsample on plain streams +# BenDecoder subsample on plain streams # --------------------------------------------------------------------------- def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_indices([1, 3, 5]) assert list(dec) == [[1], [3], [5]] @@ -1261,11 +1261,11 @@ def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_range.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 4) assert list(dec) == [[2], [3], [4]] @@ -1273,28 +1273,28 @@ def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5], [6]] path = tmp_path / "plain_every.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_every(2, offset=1) assert list(dec) == [[1], [3], [5]] # --------------------------------------------------------------------------- -# PyBenDecoder len/count on plain streams +# BenDecoder len/count on plain streams # --------------------------------------------------------------------------- def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: samples = [[1], [2], [3]] path = tmp_path / "plain_len.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.count_samples() == 3 assert len(dec) == 3 @@ -1302,11 +1302,11 @@ def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub_len.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_range(2, 4) assert len(dec) == 3 assert dec.count_samples() == 5 @@ -1314,18 +1314,18 @@ def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -# PyBenDecoder multiple iteration passes +# BenDecoder multiple iteration passes # --------------------------------------------------------------------------- def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "multi_iter.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert list(dec) == samples assert list(dec) == samples assert list(dec) == samples @@ -1334,11 +1334,11 @@ def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] path = tmp_path / "plain_re_sub.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path) + dec = BenDecoder(path) dec.subsample_every(2, offset=1) expected = [[1], [3], [5], [7]] assert list(dec) == expected @@ -1346,18 +1346,18 @@ def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> No # --------------------------------------------------------------------------- -# PyBenEncoder ben_file_only mode coverage +# BenEncoder ben_file_only mode coverage # --------------------------------------------------------------------------- def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: samples = [[10, 20, 30], [40, 50, 60]] path = tmp_path / "ben_only.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path, mode="ben") + dec = BenDecoder(path, mode="ben") assert not dec.is_bundle() assert list(dec) == samples @@ -1365,27 +1365,27 @@ def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: def test_pybenencoder_ben_file_only_mkv(tmp_path: Path) -> None: samples = [[1, 2], [1, 2], [3, 4]] path = tmp_path / "ben_mkv.ben" - with PyBenEncoder(path, overwrite=True, variant="mkv_chain", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="mkv_chain", ben_file_only=True) as enc: for a in samples: enc.write(a) - dec = PyBenDecoder(path, mode="ben") + dec = BenDecoder(path, mode="ben") assert list(dec) == samples def test_pybenencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: samples = [[5, 6]] path = tmp_path / "close_reopen.ben" - enc = PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) + enc = BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) enc.write(samples[0]) enc.close() - dec = PyBenDecoder(path, mode="ben") + dec = BenDecoder(path, mode="ben") assert list(dec) == samples # --------------------------------------------------------------------------- -# PyBenEncoder bundle with metadata +# BenEncoder bundle with metadata # --------------------------------------------------------------------------- @@ -1393,92 +1393,92 @@ def test_pybenencoder_bundle_with_metadata(tmp_path: Path) -> None: samples = [[1, 2]] graph = {"nodes": [{"id": 0}], "adjacency": [[]]} path = tmp_path / "with_meta.bendl" - with PyBenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: enc.write(samples[0]) - dec = PyBenDecoder(path) + dec = BenDecoder(path) assert dec.read_graph() == graph assert list(dec) == samples # --------------------------------------------------------------------------- -# PyBenDecoder extract_stream on plain stream raises +# BenDecoder extract_stream on plain stream raises # --------------------------------------------------------------------------- def test_pybendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: path = tmp_path / "plain_extract.ben" - with PyBenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: enc.write([1, 2]) - dec = PyBenDecoder(path, mode="ben") + dec = BenDecoder(path, mode="ben") with pytest.raises(Exception, match="only available on .bendl"): dec.extract_stream(tmp_path / "out.ben") # --------------------------------------------------------------------------- -# decompress_ben_to_jsonl and decompress_xben_to_jsonl coverage +# decode_ben_to_jsonl and decode_xben_to_jsonl coverage # --------------------------------------------------------------------------- -def test_decompress_ben_to_jsonl_roundtrip(tmp_path: Path) -> None: +def test_decode_ben_to_jsonl_roundtrip(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") out = tmp_path / "round.jsonl" - decompress_ben_to_jsonl(ben, out, overwrite=True) + decode_ben_to_jsonl(ben, out, overwrite=True) restored = read_jsonl_assignments(out) assert restored == samples -def test_decompress_xben_to_jsonl_roundtrip(tmp_path: Path) -> None: +def test_decode_xben_to_jsonl_roundtrip(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) xben = tmp_path / "out.xben" - compress_jsonl_to_xben( + encode_jsonl_to_xben( src, xben, overwrite=True, variant="standard", n_threads=1, compression_level=1, ) out = tmp_path / "round.jsonl" - decompress_xben_to_jsonl(xben, out, overwrite=True) + decode_xben_to_jsonl(xben, out, overwrite=True) restored = read_jsonl_assignments(out) assert restored == samples # --------------------------------------------------------------------------- -# compress_ben_to_xben coverage +# encode_ben_to_xben coverage # --------------------------------------------------------------------------- -def test_compress_ben_to_xben_roundtrip(tmp_path: Path) -> None: +def test_encode_ben_to_xben_roundtrip(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) ben = tmp_path / "out.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") xben = tmp_path / "from_ben.xben" - compress_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) out = tmp_path / "round.jsonl" - decompress_xben_to_jsonl(xben, out, overwrite=True) + decode_xben_to_jsonl(xben, out, overwrite=True) restored = read_jsonl_assignments(out) assert restored == samples # --------------------------------------------------------------------------- -# PyBenDecoder unknown mode error +# BenDecoder unknown mode error # --------------------------------------------------------------------------- @@ -1486,11 +1486,11 @@ def test_pybendecoder_unknown_mode_raises(tmp_path: Path) -> None: path = tmp_path / "dummy.ben" path.write_bytes(b"\x00" * 100) with pytest.raises(Exception): - PyBenDecoder(path, mode="bogus") + BenDecoder(path, mode="bogus") # --------------------------------------------------------------------------- -# PyBenDecoder MkvChain plain stream +# BenDecoder MkvChain plain stream # --------------------------------------------------------------------------- @@ -1500,8 +1500,8 @@ def test_pybendecoder_mkv_plain_stream(tmp_path: Path) -> None: write_jsonl(samples, src) ben = tmp_path / "mkv.ben" - compress_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - dec = PyBenDecoder(ben, mode="ben") + dec = BenDecoder(ben, mode="ben") assert list(dec) == samples assert dec.count_samples() == 3 diff --git a/pyben/uv.lock b/ben-py/uv.lock similarity index 100% rename from pyben/uv.lock rename to ben-py/uv.lock diff --git a/pyben/binary_ensemble/__init__.py b/pyben/binary_ensemble/__init__.py deleted file mode 100644 index fe15f10..0000000 --- a/pyben/binary_ensemble/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from ._core import ( - PyBenDecoder, - PyBenEncoder, - compress_jsonl_to_ben, - compress_ben_to_xben, - compress_jsonl_to_xben, - decompress_ben_to_jsonl, - decompress_xben_to_jsonl, - decompress_xben_to_ben, -) - -__all__ = [ - "PyBenDecoder", - "PyBenEncoder", - "compress_jsonl_to_ben", - "compress_ben_to_xben", - "compress_jsonl_to_xben", - "decompress_ben_to_jsonl", - "decompress_xben_to_jsonl", - "decompress_xben_to_ben", -] diff --git a/pyben/src/lib.rs b/pyben/src/lib.rs deleted file mode 100755 index ed5a056..0000000 --- a/pyben/src/lib.rs +++ /dev/null @@ -1,23 +0,0 @@ -use pyo3::prelude::*; -use pyo3::wrap_pyfunction; - -pub mod common; -pub mod decode; -pub mod encode; - -#[pymodule] -fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_function(wrap_pyfunction!(crate::decode::decompress_ben_to_jsonl, m)?)?; - m.add_function(wrap_pyfunction!(crate::decode::decompress_xben_to_ben, m)?)?; - m.add_function(wrap_pyfunction!( - crate::decode::decompress_xben_to_jsonl, - m - )?)?; - m.add_function(wrap_pyfunction!(crate::encode::compress_jsonl_to_ben, m)?)?; - m.add_function(wrap_pyfunction!(crate::encode::compress_jsonl_to_xben, m)?)?; - m.add_function(wrap_pyfunction!(crate::encode::compress_ben_to_xben, m)?)?; - - Ok(()) -} From d9b5a3bae5d36338a3d5f872e25344c2d9039aee Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 5 May 2026 11:31:38 -0600 Subject: [PATCH 083/221] change progress to spinner --- ben/src/cli/ben/args.rs | 3 + ben/src/cli/ben/mod.rs | 3 +- ben/src/cli/bendl/args.rs | 4 ++ ben/src/cli/bendl/mod.rs | 3 +- ben/src/cli/common/mod.rs | 28 ++++++++ ben/src/cli/pcben/mod.rs | 6 +- ben/src/cli/reben/args.rs | 3 + ben/src/cli/reben/mod.rs | 3 +- ben/src/codec/decode/jsonl.rs | 15 ++--- ben/src/codec/decode/xz.rs | 10 +-- ben/src/codec/encode/jsonl.rs | 18 +++-- ben/src/codec/translate/mod.rs | 10 +-- ben/src/io/reader/assignment_reader.rs | 16 ++++- ben/src/io/reader/xz_assignment_reader.rs | 8 ++- ben/src/io/writer/xz_assignment_writer.rs | 10 +-- ben/src/lib.rs | 16 +---- ben/src/logging.rs | 17 ----- ben/src/ops/relabel/mod.rs | 18 +++-- ben/src/progress/mod.rs | 82 +++++++++++++++++++++++ 19 files changed, 190 insertions(+), 83 deletions(-) create mode 100644 ben/src/progress/mod.rs diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index e6629ae..528095d 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -114,6 +114,9 @@ pub(super) struct Args { /// Enables verbose printing for the CLI. Optional. #[arg(short, long)] pub verbose: bool, + /// Suppress in-place progress spinners. Trace logging is unaffected. + #[arg(short = 'q', long)] + pub quiet: bool, /// When running x-encoder, this flag will determine the number of cpus to use on the /// system. By default, all available cpus will be used. #[arg(short = 'c', long)] diff --git a/ben/src/cli/ben/mod.rs b/ben/src/cli/ben/mod.rs index b9d1edc..abb532a 100644 --- a/ben/src/cli/ben/mod.rs +++ b/ben/src/cli/ben/mod.rs @@ -10,13 +10,14 @@ mod tests; use args::{Args, Mode}; -use crate::cli::common::{set_verbose, CliError, CliResult}; +use crate::cli::common::{set_quiet, set_verbose, CliError, CliResult}; use clap::Parser; /// Parse CLI arguments and dispatch to the per-mode handler in [`modes`]. pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); + set_quiet(args.quiet); // --graph is only meaningful for the stream-producing modes. if args.graph.is_some() && args.mode != Mode::Encode && args.mode != Mode::XEncode { diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs index 998772f..3d4aa85 100644 --- a/ben/src/cli/bendl/args.rs +++ b/ben/src/cli/bendl/args.rs @@ -36,6 +36,10 @@ pub(super) struct Args { #[arg(short, long, global = true)] pub verbose: bool, + /// Suppress in-place progress spinners. Trace logging is unaffected. + #[arg(short = 'q', long, global = true)] + pub quiet: bool, + #[command(subcommand)] pub command: Command, } diff --git a/ben/src/cli/bendl/mod.rs b/ben/src/cli/bendl/mod.rs index bb59494..f75a08e 100644 --- a/ben/src/cli/bendl/mod.rs +++ b/ben/src/cli/bendl/mod.rs @@ -26,13 +26,14 @@ use create::run_create; use extract::run_extract; use inspect::run_inspect; -use crate::cli::common::{set_verbose, CliError, CliResult}; +use crate::cli::common::{set_quiet, set_verbose, CliError, CliResult}; use clap::Parser; /// Parse CLI arguments and execute the selected subcommand. pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); + set_quiet(args.quiet); match args.command { Command::Create(a) => run_create(a), diff --git a/ben/src/cli/common/mod.rs b/ben/src/cli/common/mod.rs index 60afb38..6f07b0e 100644 --- a/ben/src/cli/common/mod.rs +++ b/ben/src/cli/common/mod.rs @@ -3,6 +3,9 @@ pub use error::{CliError, CliResult}; use std::io::{self, Result}; use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; + +static QUIET: AtomicBool = AtomicBool::new(false); /// Configure tracing for CLI execution. /// @@ -24,6 +27,31 @@ pub fn set_verbose(verbose: bool) { crate::logging::init_logging(); } +/// Suppress in-place progress spinners for this process. +/// +/// Independent of [`set_verbose`]: trace logging is gated by `RUST_LOG`, +/// while spinners are gated by this flag plus stderr TTY detection. +/// +/// # Arguments +/// +/// * `quiet` - When `true`, [`crate::progress::Spinner`] becomes a no-op. +/// +/// # Returns +/// +/// This function does not return a value. +pub fn set_quiet(quiet: bool) { + QUIET.store(quiet, Ordering::Relaxed); +} + +/// Whether progress spinners have been globally suppressed. +/// +/// # Returns +/// +/// Returns `true` when [`set_quiet`] was last called with `true`. +pub fn is_quiet() -> bool { + QUIET.load(Ordering::Relaxed) +} + /// Decide whether overwriting an output path should proceed, given the /// state observed by the caller. /// diff --git a/ben/src/cli/pcben/mod.rs b/ben/src/cli/pcben/mod.rs index 183398d..7414502 100644 --- a/ben/src/cli/pcben/mod.rs +++ b/ben/src/cli/pcben/mod.rs @@ -1,4 +1,4 @@ -use crate::cli::common::{check_overwrite, set_verbose, CliError, CliResult}; +use crate::cli::common::{check_overwrite, set_quiet, set_verbose, CliError, CliResult}; use crate::io::reader::AssignmentReader; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; @@ -49,12 +49,16 @@ struct Args { /// Enables verbose printing for the CLI. Optional. #[arg(short, long)] verbose: bool, + /// Suppress in-place progress spinners. Trace logging is unaffected. + #[arg(short = 'q', long)] + quiet: bool, } /// Parse CLI arguments and execute the selected `pcben` conversion. pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); + set_quiet(args.quiet); match args.mode { Mode::BenToPc => { diff --git a/ben/src/cli/reben/args.rs b/ben/src/cli/reben/args.rs index 5bde2a0..56a13fe 100644 --- a/ben/src/cli/reben/args.rs +++ b/ben/src/cli/reben/args.rs @@ -80,4 +80,7 @@ pub(super) struct Args { /// Verbosity level for the program. #[arg(short, long)] pub verbose: bool, + /// Suppress in-place progress spinners. Trace logging is unaffected. + #[arg(short = 'q', long)] + pub quiet: bool, } diff --git a/ben/src/cli/reben/mod.rs b/ben/src/cli/reben/mod.rs index 9ff88e3..86813cf 100644 --- a/ben/src/cli/reben/mod.rs +++ b/ben/src/cli/reben/mod.rs @@ -12,13 +12,14 @@ use args::{Args, Mode}; use ben_mode::run_ben_mode; use json_mode::run_json_mode; -use crate::cli::common::{set_verbose, CliError, CliResult}; +use crate::cli::common::{set_quiet, set_verbose, CliError, CliResult}; use clap::Parser; /// Parse CLI arguments and execute the selected `reben` mode. pub fn run() -> CliResult { let args = Args::parse(); set_verbose(args.verbose); + set_quiet(args.quiet); run_with_args(args).map_err(CliError::from) } diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index 4dbd7e3..44a2bfc 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -2,7 +2,8 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::{AssignmentReader, XZAssignmentReader}; -use crate::{progress, BenVariant}; +use crate::progress::Spinner; +use crate::BenVariant; use serde_json::json; use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; @@ -55,10 +56,11 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i BenVariant::TwoDelta, ); let mut sample_number = 1usize; + let spinner = Spinner::new("Decoding sample"); for record in &mut xben { let (assignment, count) = record?; for _ in 0..count { - progress!("Decoding sample: {}\r", sample_number); + spinner.set_count(sample_number as u64); let line = json!({ "assignment": assignment, "sample": sample_number, @@ -69,8 +71,6 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i sample_number += 1; } } - tracing::trace!(""); - tracing::trace!("Done!"); return Ok(()); } None => { @@ -85,6 +85,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let mut line_count: usize = 0; let mut starting_sample: usize = 0; + let spinner = Spinner::new("Decoding sample"); loop { let count = decoder.read(&mut buffer)?; if count == 0 { @@ -101,7 +102,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i if overflow[i - 3..=i] == [0, 0, 0, 0] { last_valid_assignment = i + 1; line_count += 1; - progress!("Decoding sample: {}\r", line_count); + spinner.set_count(line_count as u64); } } } else { @@ -111,7 +112,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let lines = &overflow[i + 1..i + 3]; let n_lines = u16::from_be_bytes([lines[0], lines[1]]); line_count += n_lines as usize; - progress!("Decoding sample: {}\r", line_count); + spinner.set_count(line_count as u64); } } } @@ -129,8 +130,6 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i overflow.drain(..last_valid_assignment); starting_sample = line_count; } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index d497f0f..921e5df 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -3,7 +3,8 @@ use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN use crate::format::FormatError; use crate::io::reader::XZAssignmentReader; use crate::io::writer::AssignmentWriter; -use crate::{progress, BenVariant}; +use crate::progress::Spinner; +use crate::BenVariant; use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; @@ -64,6 +65,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let mut overflow: Vec = Vec::new(); let mut line_count: usize = 0; + let spinner = Spinner::new("Decoding sample"); loop { let count = decoder.read(&mut buffer)?; if count == 0 { @@ -80,7 +82,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: if overflow[i - 3..=i] == [0, 0, 0, 0] { last_valid_assignment = i + 1; line_count += 1; - progress!("Decoding sample: {}\r", line_count); + spinner.set_count(line_count as u64); } } } else { @@ -90,7 +92,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let lines = &overflow[i + 1..i + 3]; let n_lines = u16::from_be_bytes([lines[0], lines[1]]); line_count += n_lines as usize; - progress!("Decoding sample: {}\r", line_count); + spinner.set_count(line_count as u64); } } } @@ -102,8 +104,6 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: ben32_to_ben_lines(&overflow[0..last_valid_assignment], &mut writer, variant)?; overflow = overflow[last_valid_assignment..].to_vec(); } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index d3f7a3b..52c2e80 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,6 +1,7 @@ use crate::codec::encode::errors::EncodeError; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; -use crate::{progress, BenVariant}; +use crate::progress::Spinner; +use crate::BenVariant; use serde_json::Value; use std::io::{self, BufRead, Result, Write}; use xz2::stream::MtStreamBuilder; @@ -55,10 +56,11 @@ pub fn encode_jsonl_to_xben( ben_encoder = ben_encoder.with_chunk_size(cs); } - let mut line_num = 1; + let mut line_num = 1u64; + let spinner = Spinner::new("Encoding line"); for line_result in reader.lines() { - progress!("Encoding line: {}\r", line_num); + spinner.set_count(line_num); line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).map_err(|e| { @@ -71,9 +73,6 @@ pub fn encode_jsonl_to_xben( ben_encoder.write_json_value(data)?; } - tracing::trace!(""); - tracing::trace!("Done!"); - Ok(()) } @@ -97,10 +96,11 @@ pub fn encode_jsonl_to_ben( writer: W, variant: BenVariant, ) -> Result<()> { - let mut line_num = 1; + let mut line_num = 1u64; + let spinner = Spinner::new("Encoding line"); let mut ben_encoder = AssignmentWriter::new(writer, variant)?; for line_result in reader.lines() { - progress!("Encoding line: {}\r", line_num); + spinner.set_count(line_num); line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).map_err(|e| { @@ -112,7 +112,5 @@ pub fn encode_jsonl_to_ben( ben_encoder.write_json_value(data)?; } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 31a4c71..6540414 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -15,7 +15,8 @@ use std::io::{self, Read, Write}; use crate::codec::decode::decode_ben_line; use crate::codec::BenEncodeFrame; -use crate::{progress, BenVariant}; +use crate::progress::Spinner; +use crate::BenVariant; /// Convert a single ben32 frame into a BEN frame payload. /// @@ -179,7 +180,8 @@ pub fn ben_to_ben32_lines( mut writer: W, variant: BenVariant, ) -> io::Result<()> { - let mut sample_number = 1; + let mut sample_number = 1usize; + let spinner = Spinner::new("Encoding line"); 'outer: loop { let mut tmp_buffer = [0u8]; let max_val_bits = match reader.read_exact(&mut tmp_buffer) { @@ -195,7 +197,7 @@ pub fn ben_to_ben32_lines( let max_len_bits = reader.read_u8()?; let n_bytes = reader.read_u32::()?; - progress!("Encoding line: {}\r", sample_number); + spinner.set_count(sample_number as u64); match variant { BenVariant::Standard => { @@ -219,8 +221,6 @@ pub fn ben_to_ben32_lines( } } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs index 6dd4cae..9650017 100644 --- a/ben/src/io/reader/assignment_reader.rs +++ b/ben/src/io/reader/assignment_reader.rs @@ -4,8 +4,9 @@ use crate::codec::{ BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, TwoDeltaDecodeFrame, }; use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::progress::Spinner; use crate::util::rle::rle_to_vec; -use crate::{progress, BenVariant}; +use crate::BenVariant; use serde_json::json; use std::io::{self, Cursor, Read, Write}; @@ -17,6 +18,7 @@ pub struct AssignmentReader { previous_assignment: Option>, twodelta_consumed_first_frame: bool, silent: bool, + spinner: Option, } /// Internal frame representation, one variant per BEN encoding type. @@ -74,6 +76,7 @@ impl AssignmentReader { previous_assignment: None, twodelta_consumed_first_frame: false, silent: false, + spinner: None, }), None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), } @@ -82,6 +85,9 @@ impl AssignmentReader { /// Suppress progress output from this decoder's iterator. pub fn silent(mut self, silent: bool) -> Self { self.silent = silent; + if silent { + self.spinner = None; + } self } @@ -201,7 +207,9 @@ impl AssignmentReader { self.previous_assignment = Some(assignment); self.sample_count += count as usize; if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); + self.spinner + .get_or_insert_with(|| Spinner::new("Decoding sample")) + .set_count(self.sample_count as u64); } if !keep_going { return Ok(()); @@ -270,7 +278,9 @@ impl Iterator for AssignmentReader { self.previous_assignment = Some(assignment.clone()); self.sample_count += count as usize; if !self.silent { - progress!("Decoding sample: {}\r", self.sample_count); + self.spinner + .get_or_insert_with(|| Spinner::new("Decoding sample")) + .set_count(self.sample_count as u64); } Some(Ok((assignment, count))) } diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs index 222ecc1..e85c5cf 100644 --- a/ben/src/io/reader/xz_assignment_reader.rs +++ b/ben/src/io/reader/xz_assignment_reader.rs @@ -4,8 +4,9 @@ use super::twodelta::{XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; use crate::codec::encode::encode_ben32_assignments; use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::progress::Spinner; use crate::util::rle::rle_to_vec; -use crate::{progress, BenVariant}; +use crate::BenVariant; use serde_json::json; use std::io::{self, BufReader, Cursor, Read, Write}; use xz2::read::XzDecoder; @@ -321,12 +322,13 @@ impl XZAssignmentReader { F: FnMut(&[u16], u16) -> io::Result, { let mut sample_count = 0usize; + let spinner = (!self.silent).then(|| Spinner::new("Decoding sample")); loop { match self.next() { Some(Ok((assignment, count))) => { sample_count += count as usize; - if !self.silent { - progress!("Decoding sample: {}\r", sample_count); + if let Some(spinner) = &spinner { + spinner.set_count(sample_count as u64); } let keep_going = f(&assignment, count)?; if !keep_going { diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index 04b7645..e11e6fd 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -8,7 +8,8 @@ use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_ use crate::codec::translate::ben_to_ben32_lines; use crate::codec::TwoDeltaEncodeFrame; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::{progress, BenVariant}; +use crate::progress::Spinner; +use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; use serde_json::Value; use std::collections::HashMap; @@ -302,7 +303,8 @@ impl XZAssignmentWriter { self.encoder.write_all(&first_count.to_be_bytes())?; let mut sample_count = first_count as usize; - progress!("Encoding line: {}\r", sample_count); + let spinner = Spinner::new("Encoding line"); + spinner.set_count(sample_count as u64); // Delta frames: unpack bitpacked run lengths and buffer into chunks. loop { @@ -339,13 +341,11 @@ impl XZAssignmentWriter { } sample_count += count as usize; - progress!("Encoding line: {}\r", sample_count); + spinner.set_count(sample_count as u64); } self.flush_chunk()?; - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 8e25c94..c36bf1c 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -33,28 +33,18 @@ pub mod format; pub mod io; /// JSON graph utilities used by relabeling workflows. pub mod json; -/// Logging and progress-output helpers used by the CLI and library. +/// Logging helpers used by the CLI and library. pub mod logging; /// Higher-level operations such as extraction and relabeling. pub mod ops; +/// In-place progress spinners for streaming operations. +pub mod progress; /// Miscellaneous utilities that do not fit into the other modules. pub mod util; #[doc(hidden)] pub mod test_utils; -/// Print an in-place progress update when trace logging is enabled. -/// -/// This is intentionally separate from normal structured logging because many -/// callsites want carriage-return based terminal updates instead of line-based -/// log records. -#[macro_export] -macro_rules! progress { - ($($arg:tt)*) => {{ - $crate::logging::trace_progress(format_args!($($arg)*)); - }} -} - #[derive(Debug, Clone, Copy, PartialEq)] /// The BEN/XBEN variant used when encoding or decoding a stream. pub enum BenVariant { diff --git a/ben/src/logging.rs b/ben/src/logging.rs index d128d70..37e6e57 100644 --- a/ben/src/logging.rs +++ b/ben/src/logging.rs @@ -32,20 +32,3 @@ pub fn init_logging() { let _ = tracing::subscriber::set_global_default(subscriber); }); } - -/// Emit a progress update to stderr when trace logging is enabled. -/// -/// This helper exists for progress-style output such as `"Encoding line: 42\r"` -/// that should redraw the current terminal line instead of creating a normal -/// structured log event. -/// -/// # Arguments -/// -/// * `args` - The formatted progress message to emit. -/// -/// # Returns -/// -/// This function does not return a value. -pub fn trace_progress(args: std::fmt::Arguments<'_>) { - tracing::trace!("{args}"); -} diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index d225845..f8f2eac 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -9,8 +9,9 @@ use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::AssignmentReader; use crate::io::writer::AssignmentWriter; +use crate::progress::Spinner; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; -use crate::{progress, BenVariant}; +use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; use std::collections::HashMap; use std::io::{self, Cursor, Read, Write}; @@ -134,6 +135,7 @@ where let mut decoder = AssignmentReader::new(reader)?.silent(true); let mut encoder = AssignmentWriter::new(writer, variant)?; let mut sample_number = 0usize; + let spinner = Spinner::new("Relabeling line"); decoder.for_each_assignment(|assignment, count| { if max_samples.is_some_and(|limit| sample_number >= limit) { @@ -151,12 +153,10 @@ where encoder.write_assignment(relabeled)?; sample_number += out_count; - progress!("Relabelling line: {}\r", sample_number); + spinner.set_count(sample_number as u64); Ok(true) })?; - tracing::trace!(""); - tracing::trace!("Done!"); encoder.finish()?; Ok(()) } @@ -319,6 +319,7 @@ fn relabel_ben_lines_impl( ) -> io::Result<()> { let mut sample_number = 0; let mut label_map = HashMap::new(); + let spinner = Spinner::new("Relabeling line"); loop { if max_samples.is_some_and(|limit| sample_number >= limit) { break; @@ -372,10 +373,8 @@ fn relabel_ben_lines_impl( sample_number += count_occurrences as usize; - progress!("Relabeling line: {}\r", sample_number); + spinner.set_count(sample_number as u64); } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } @@ -545,6 +544,7 @@ fn relabel_ben_lines_with_map_impl( let mut assignment_vec = Vec::new(); let mut new_assignment_vec = vec![0u16; permutation.len()]; let mut new_rle = Vec::new(); + let spinner = Spinner::new("Relabeling line"); loop { if max_samples.is_some_and(|limit| sample_number >= limit) { break; @@ -602,10 +602,8 @@ fn relabel_ben_lines_with_map_impl( } sample_number += count_occurrences as usize; - progress!("Relabeling line: {}\r", sample_number); + spinner.set_count(sample_number as u64); } - tracing::trace!(""); - tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/progress/mod.rs b/ben/src/progress/mod.rs new file mode 100644 index 0000000..ca55d15 --- /dev/null +++ b/ben/src/progress/mod.rs @@ -0,0 +1,82 @@ +//! In-place progress spinners for streaming encode/decode/relabel loops. +//! +//! Streaming operations have no upfront totals (BEN/JSONL inputs are read +//! frame-by-frame), so a percentage bar is not possible — this module +//! provides a running-counter spinner instead. The spinner writes directly +//! to stderr via [`indicatif`], bypassing `tracing` (whose fmt subscriber +//! appends `\n` and would defeat carriage-return redraws). +//! +//! Visibility is gated by two checks performed at construction time: +//! 1. `cli::common::is_quiet()` — the `--quiet` CLI flag. +//! 2. `std::io::stderr().is_terminal()` — auto-disable when stderr is +//! redirected, so logs and pipelines stay clean. +//! +//! Both checks happen once in [`Spinner::new`]; the resulting [`Spinner`] +//! is either a live indicatif bar or a no-op stub. + +use std::io::IsTerminal; +use std::time::Duration; + +use indicatif::{ProgressBar, ProgressStyle}; + +/// A scope-bound progress spinner backed by [`indicatif::ProgressBar`]. +/// +/// The spinner animates on a steady tick and exposes a single counter via +/// [`Spinner::set_count`]. On drop, the spinner clears its line so that +/// subsequent stderr writes start fresh. +pub struct Spinner { + bar: Option, +} + +impl Spinner { + /// Build a spinner for a streaming operation. + /// + /// Returns a no-op spinner when `--quiet` is set or when stderr is not + /// a TTY. + /// + /// # Arguments + /// + /// * `prefix` - The label shown before the running counter, e.g. + /// `"Encoding line"`. + /// + /// # Returns + /// + /// A [`Spinner`] that may or may not have an active indicatif bar. + pub fn new(prefix: &'static str) -> Self { + if crate::cli::common::is_quiet() || !std::io::stderr().is_terminal() { + return Self { bar: None }; + } + + let template = format!("{{spinner}} {prefix}: {{pos}}"); + let style = ProgressStyle::with_template(&template) + .unwrap_or_else(|_| ProgressStyle::default_spinner()); + + let bar = ProgressBar::new_spinner().with_style(style); + bar.enable_steady_tick(Duration::from_millis(80)); + + Self { bar: Some(bar) } + } + + /// Update the running counter. No-op when the spinner is disabled. + /// + /// # Arguments + /// + /// * `n` - The new counter value to display. + /// + /// # Returns + /// + /// This function does not return a value. + pub fn set_count(&self, n: u64) { + if let Some(bar) = &self.bar { + bar.set_position(n); + } + } +} + +impl Drop for Spinner { + fn drop(&mut self) { + if let Some(bar) = self.bar.take() { + bar.finish_and_clear(); + } + } +} From 12aabce38abcb2f8cde5078ae227a7fe36e7f7e9 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 5 May 2026 23:01:26 -0600 Subject: [PATCH 084/221] get rid of frame duplicates --- ben/src/codec/decode/tests/twodelta.rs | 10 +- ben/src/codec/decode/twodelta.rs | 25 +- ben/src/codec/encode/tests.rs | 27 +- ben/src/codec/encode/twodelta.rs | 26 +- ben/src/codec/frames/ben_decode.rs | 85 -- ben/src/codec/frames/ben_encode.rs | 90 --- ben/src/codec/frames/decode.rs | 289 +++++++ ben/src/codec/frames/encode.rs | 387 +++++++++ ben/src/codec/frames/mkv_decode.rs | 94 --- ben/src/codec/frames/mkv_encode.rs | 100 --- ben/src/codec/frames/mod.rs | 59 +- ben/src/codec/frames/tests.rs | 939 +++++++++++----------- ben/src/codec/frames/twodelta_decode.rs | 59 -- ben/src/codec/frames/twodelta_encode.rs | 202 ----- ben/src/codec/mod.rs | 5 +- ben/src/codec/translate/mod.rs | 21 +- ben/src/codec/translate/tests.rs | 4 +- ben/src/io/reader/assignment_reader.rs | 199 ++--- ben/src/io/reader/subsample.rs | 7 +- ben/src/io/writer/assignment_writer.rs | 22 +- ben/src/io/writer/xz_assignment_writer.rs | 33 +- ben/src/ops/relabel/mod.rs | 14 +- ben/src/ops/relabel/tests.rs | 19 +- ben/tests/test_assignment_reader.rs | 21 +- ben/tests/test_coverage.rs | 78 +- ben/tests/test_impls_pipeline.rs | 6 +- ben/tests/test_stress_edges.rs | 11 +- 27 files changed, 1406 insertions(+), 1426 deletions(-) delete mode 100644 ben/src/codec/frames/ben_decode.rs delete mode 100644 ben/src/codec/frames/ben_encode.rs create mode 100644 ben/src/codec/frames/decode.rs create mode 100644 ben/src/codec/frames/encode.rs delete mode 100644 ben/src/codec/frames/mkv_decode.rs delete mode 100644 ben/src/codec/frames/mkv_encode.rs delete mode 100644 ben/src/codec/frames/twodelta_decode.rs delete mode 100644 ben/src/codec/frames/twodelta_encode.rs diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index 047adbc..ec8ad56 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -3,7 +3,7 @@ use crate::codec::decode::{ decode_xben_to_jsonl, }; use crate::codec::encode::{encode_ben_to_xben, encode_twodelta_frame}; -use crate::codec::frames::TwoDeltaEncodeFrame; +use crate::codec::frames::BenEncodeFrame; use crate::io::writer::AssignmentWriter; use crate::util::rle::rle_to_vec; use crate::BenVariant; @@ -83,7 +83,7 @@ fn apply_runs_alternating_single_positions() { #[test] fn decode_twodelta_frame_basic() { - let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); let prev = vec![1u16, 2, 1, 2]; let result = decode_twodelta_frame(prev, &frame).unwrap(); assert_eq!(result, vec![1, 1, 2, 2]); @@ -93,7 +93,7 @@ fn decode_twodelta_frame_basic() { fn decode_twodelta_frame_full_swap() { // pair=(2,1) means run starts with value 2; run_lengths=[2,2] // prev [1,2,1,2]: pair positions 0,1,2,3 → [2,2,1,1] - let frame = TwoDeltaEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None); + let frame = BenEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None); let prev = vec![1u16, 2, 1, 2]; let result = decode_twodelta_frame(prev, &frame).unwrap(); assert_eq!(result, vec![2, 2, 1, 1]); @@ -103,8 +103,8 @@ fn decode_twodelta_frame_full_swap() { fn decode_twodelta_frame_chain_returns_to_original() { // Frame 1: (1,2) run=[2,2] applied to [1,2,1,2] → [1,1,2,2] // Frame 2: (1,2) run=[1,1,1,1] applied to [1,1,2,2] → [1,2,1,2] - let f1 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); - let f2 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None); + let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let f2 = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None); let initial = vec![1u16, 2, 1, 2]; let after_f1 = decode_twodelta_frame(initial.clone(), &f1).unwrap(); assert_eq!(after_f1, vec![1, 1, 2, 2]); diff --git a/ben/src/codec/decode/twodelta.rs b/ben/src/codec/decode/twodelta.rs index 2003c09..8441a27 100644 --- a/ben/src/codec/decode/twodelta.rs +++ b/ben/src/codec/decode/twodelta.rs @@ -1,5 +1,5 @@ use super::errors::DecodeError; -use crate::codec::TwoDeltaEncodeFrame; +use crate::codec::BenEncodeFrame; use std::io; /// Apply decoded TwoDelta run lengths to produce a new assignment vector. @@ -66,14 +66,29 @@ pub(crate) fn apply_twodelta_runs_to_assignment( /// # Arguments /// /// * `previous` - The assignment vector from the preceding frame. -/// * `frame` - The TwoDelta frame containing the pair and run-length vector. +/// * `frame` - A TwoDelta-arm [`BenEncodeFrame`] containing the pair and +/// run-length vector. /// /// # Returns /// -/// Returns the updated assignment vector. +/// Returns the updated assignment vector, or an error if `frame` is not the +/// `TwoDelta` arm. pub fn decode_twodelta_frame( previous: Vec, - frame: &TwoDeltaEncodeFrame, + frame: &BenEncodeFrame, ) -> io::Result> { - apply_twodelta_runs_to_assignment(previous, frame.pair, &frame.run_length_vector) + match frame { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => apply_twodelta_runs_to_assignment(previous, *pair, run_length_vector), + other => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "decode_twodelta_frame called with non-TwoDelta variant: {:?}", + other.variant() + ), + )), + } } diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 9363642..a94dc93 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::codec::frames::{BenConstruct, BenEncodeFrame}; +use crate::codec::frames::BenEncodeFrame; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; @@ -262,8 +262,13 @@ fn test_encode_jsonl_to_ben_len_65535() { #[test] fn test_encode_ben_vec_from_assign_matches_rle_entrypoint() { let assign_vec = vec![4u16, 4, 4, 1, 1, 3, 3, 3, 2]; - let direct = BenEncodeFrame::from_assignment(assign_vec.clone(), None); - let via_rle = BenEncodeFrame::from_rle(crate::util::rle::assign_to_rle(assign_vec), None); + let direct = + BenEncodeFrame::from_assignment(assign_vec.clone(), BenVariant::Standard, None); + let via_rle = BenEncodeFrame::from_rle( + crate::util::rle::assign_to_rle(assign_vec), + BenVariant::Standard, + None, + ); assert_eq!(direct, via_rle); } @@ -802,8 +807,8 @@ fn twodelta_encode_with_pair_and_mask_hints() { let frame = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) .unwrap(); - assert_eq!(frame.pair, (2, 1)); - assert!(!frame.run_length_vector.is_empty()); + assert_eq!(frame.pair().unwrap(), (2, 1)); + assert!(!frame.run_length_vector().unwrap().is_empty()); // Verify masks were updated assert_eq!(masks[&2], vec![0, 2]); assert_eq!(masks[&1], vec![1, 3]); @@ -822,7 +827,7 @@ fn twodelta_encode_with_mask_hint_only() { let frame = encode_twodelta_frame_with_hint(&prev, &curr, None, Some(&mut masks), None).unwrap(); - assert_eq!(frame.pair, (2, 1)); + assert_eq!(frame.pair().unwrap(), (2, 1)); } #[test] @@ -986,8 +991,8 @@ fn twodelta_encode_with_count() { let prev = vec![1u16, 1, 2, 2]; let next = vec![2u16, 1, 2, 1]; let frame = encode_twodelta_frame(&prev, &next, Some(5)).unwrap(); - // Verify the count is embedded in the raw_bytes tail - let raw = &frame.raw_bytes; + // Verify the count is embedded in the serialized frame's tail + let raw = frame.as_slice(); let count = u16::from_be_bytes([raw[raw.len() - 2], raw[raw.len() - 1]]); assert_eq!(count, 5); } @@ -1004,8 +1009,8 @@ fn twodelta_encode_run_lengths_correct() { let prev = vec![1u16, 1, 2, 2]; let next = vec![2u16, 1, 2, 1]; let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); - assert_eq!(frame.pair, (2, 1)); - assert_eq!(frame.run_length_vector, vec![1, 1, 1, 1]); + assert_eq!(frame.pair().unwrap(), (2, 1)); + assert_eq!(frame.run_length_vector().unwrap(), vec![1, 1, 1, 1]); } #[test] @@ -1017,7 +1022,7 @@ fn twodelta_encode_run_lengths_with_non_pair_gaps() { let prev = vec![1u16, 3, 2, 3, 1]; let next = vec![2u16, 3, 1, 3, 2]; let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); - assert_eq!(frame.run_length_vector, vec![1, 1, 1]); + assert_eq!(frame.run_length_vector().unwrap(), vec![1, 1, 1]); } // ── TwoDelta encode→decode roundtrip ──────────────────────────────────────── diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 17f595d..a3127ae 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -1,5 +1,5 @@ use super::errors::EncodeError; -use crate::codec::frames::TwoDeltaEncodeFrame; +use crate::codec::BenEncodeFrame; use std::collections::HashMap; use std::io::{Error, ErrorKind, Result}; @@ -18,7 +18,7 @@ use std::io::{Error, ErrorKind, Result}; /// /// # Returns /// -/// A `TwoDeltaEncodeFrame` describing the transition from `previous_assignment` to +/// A `BenEncodeFrame` describing the transition from `previous_assignment` to /// `new_assignment`. /// /// # TwoDelta encoding @@ -67,7 +67,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( delta_pair: Option<(u16, u16)>, previous_masks: Option<&mut HashMap>>, count: Option, -) -> Result { +) -> Result { let previous_assignment = previous_assignment.as_ref(); let new_assignment = new_assignment.as_ref(); @@ -107,7 +107,7 @@ pub(crate) fn encode_twodelta_frame_with_hint( _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment, count), } - // Ok(TwoDeltaEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) + // Ok(BenEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) } /// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return @@ -181,7 +181,7 @@ fn validate_masks_and_order_pairs_for_twodelta( /// /// # Returns /// -/// A `TwoDeltaEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if no +/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if no /// position actually changed value (signalling the frame can be deduplicated), or /// another error if a mask entry is inconsistent with the assignment data. fn construct_twodelta_frame_from_pair_and_mask_hints( @@ -190,7 +190,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( delta_pair: (u16, u16), previous_masks: &mut HashMap>, count: Option, -) -> Result { +) -> Result { let pair = match validate_masks_and_order_pairs_for_twodelta(delta_pair, previous_masks, current) { Ok(pair) => pair, @@ -289,7 +289,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous_masks.insert(pair.0, new_mask_a); previous_masks.insert(pair.1, new_mask_b); - Ok(TwoDeltaEncodeFrame::from_run_lengths( + Ok(BenEncodeFrame::from_run_lengths( pair, run_lengths, count, @@ -313,14 +313,14 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( /// /// # Returns /// -/// A `TwoDeltaEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if the +/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if the /// two assignments are identical. fn construct_twodelta_frame_from_mask_hint( previous: &[u16], current: &[u16], previous_masks: &mut HashMap>, count: Option, -) -> Result { +) -> Result { for (&assign0, &assign1) in previous.iter().zip(current.iter()) { if assign0 != assign1 { return construct_twodelta_frame_from_pair_and_mask_hints( @@ -352,13 +352,13 @@ fn construct_twodelta_frame_from_mask_hint( /// /// # Returns /// -/// A `TwoDeltaEncodeFrame` for the transition, or an error if more than two distinct ids +/// A `BenEncodeFrame` for the transition, or an error if more than two distinct ids /// appear across all changed positions. fn construct_twodelta_frame_from_scratch( previous: &[u16], current: &[u16], count: Option, -) -> Result { +) -> Result { // Find the pair at the first changed position. let first_change = previous .iter() @@ -406,7 +406,7 @@ fn construct_twodelta_frame_from_scratch( } run_lengths.push(run_count); - Ok(TwoDeltaEncodeFrame::from_run_lengths( + Ok(BenEncodeFrame::from_run_lengths( enc_pair, run_lengths, count, @@ -436,6 +436,6 @@ pub fn encode_twodelta_frame( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, count: Option, -) -> Result { +) -> Result { encode_twodelta_frame_with_hint(previous_assignment, new_assignment, None, None, count) } diff --git a/ben/src/codec/frames/ben_decode.rs b/ben/src/codec/frames/ben_decode.rs deleted file mode 100644 index 07e0b2c..0000000 --- a/ben/src/codec/frames/ben_decode.rs +++ /dev/null @@ -1,85 +0,0 @@ -use super::BenDecode; -use byteorder::{BigEndian, ReadBytesExt}; -use std::io; - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BenDecodeFrame { - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized BEN frame bytes, including the header and payload. - pub raw_bytes: Vec, -} - -impl BenDecodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for BenDecodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for BenDecodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for BenDecodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &BenDecodeFrame) -> bool { - *self == other.raw_bytes - } -} - -impl BenDecode for BenDecodeFrame { - /// Read the next Standard BEN frame from the stream. - /// - /// Standard BEN frames have no trailing count; `count` is always set to 1. - /// Returns `Ok(None)` on a clean EOF at a frame boundary. - fn from_reader(reader: &mut impl io::Read) -> io::Result> { - let max_val_bit_count = match reader.read_u8() { - Ok(v) => v, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e), - }; - - let max_len_bit_count = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - - let mut raw_bytes = vec![0u8; n_bytes as usize]; - reader.read_exact(&mut raw_bytes)?; - - Ok(Some(BenDecodeFrame { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - })) - } -} diff --git a/ben/src/codec/frames/ben_encode.rs b/ben/src/codec/frames/ben_encode.rs deleted file mode 100644 index 20fbf6c..0000000 --- a/ben/src/codec/frames/ben_encode.rs +++ /dev/null @@ -1,90 +0,0 @@ -use super::{compress_rle_to_ben_bytes, BenConstruct}; - -/// Canonical representation of a BEN frame. -/// -/// The frame stores the semantic RLE runs together with the derived header -/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN -/// frame, including the two one-byte bit-width fields and the four-byte payload -/// length. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BenEncodeFrame { - // The RLE runs that were encoded into this frame, stored here for reference - pub runs: Vec<(u16, u16)>, - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized BEN frame bytes, including the header and payload. - pub raw_bytes: Vec, -} - -impl BenEncodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for BenEncodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for BenEncodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for BenEncodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &BenEncodeFrame) -> bool { - *self == other.raw_bytes - } -} - -impl BenConstruct for BenEncodeFrame { - /// Build a frame from an RLE run vector. - fn from_rle(runs: Vec<(u16, u16)>, _count: Option) -> Self { - let (max_val, max_len) = runs - .iter() - .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { - (max_val.max(val), max_len.max(len)) - }); - let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - let raw_bytes = - compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); - - Self { - runs, - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - } - } -} diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs new file mode 100644 index 0000000..cd7eb88 --- /dev/null +++ b/ben/src/codec/frames/decode.rs @@ -0,0 +1,289 @@ +use super::encode::BenEncodeFrame; +use crate::BenVariant; +use byteorder::{BigEndian, ReadBytesExt}; +use std::io::{self, Read}; + +/// One sample's encoded bytes at the frame layer, freshly read from a wire +/// stream. +/// +/// `Standard` and `MkvChain` carry **opaque** bit-packed payload bytes — the +/// runs are not expanded until a caller asks for them. This is what makes +/// frame-level subsampling cheap: the iterator can pull frames at byte level +/// and only the kept frames pay the bit-unpacking cost. +/// +/// `TwoDelta` is the exception: applying a delta to the previous assignment +/// requires the run-length vector, so the decoder unpacks it eagerly at parse +/// time. This is not a regression; the bytes would have been needed +/// immediately on use anyway. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BenDecodeFrame { + /// A `Standard`-variant frame with no trailing repetition count. + Standard { + /// The number of bits used to encode the maximum district id. + max_val_bit_count: u8, + /// The number of bits used to encode the maximum run length. + max_len_bit_count: u8, + /// The number of bytes in the packed payload. + n_bytes: u32, + /// The bit-packed payload bytes — opaque until `expand` is called. + raw_bytes: Vec, + }, + /// An `MkvChain`-variant frame carrying its repetition count. + MkvChain { + /// The number of bits used to encode the maximum district id. + max_val_bit_count: u8, + /// The number of bits used to encode the maximum run length. + max_len_bit_count: u8, + /// The number of bytes in the packed payload. + n_bytes: u32, + /// The bit-packed payload bytes — opaque until `expand` is called. + raw_bytes: Vec, + /// The number of times this frame repeats. + count: u16, + }, + /// A `TwoDelta`-variant delta frame. Run lengths are eagerly decoded at + /// parse time because applying the delta needs them. + TwoDelta { + /// The pair of district ids encoded in this frame. + pair: (u16, u16), + /// The unpacked alternating run lengths over the positions occupied + /// by the pair. + run_lengths: Vec, + /// The number of times this delta repeats. + count: u16, + }, +} + +impl BenDecodeFrame { + /// Read the next frame in the wire format dictated by `variant`. + /// + /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` + /// on success, and `Err` on any I/O or format error. + /// + /// Note: in a `TwoDelta` *stream*, the first frame is encoded in + /// `MkvChain` wire format. The caller (e.g. [`AssignmentReader`]) tracks + /// that state and passes [`BenVariant::MkvChain`] for the first frame and + /// [`BenVariant::TwoDelta`] for the rest. + /// + /// [`AssignmentReader`]: crate::io::reader::AssignmentReader + pub fn from_reader( + reader: &mut impl Read, + variant: BenVariant, + ) -> io::Result> { + match variant { + BenVariant::Standard => Self::read_standard(reader), + BenVariant::MkvChain => Self::read_mkv_chain(reader), + BenVariant::TwoDelta => Self::read_twodelta(reader), + } + } + + fn read_standard(reader: &mut impl Read) -> io::Result> { + let max_val_bit_count = match reader.read_u8() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let max_len_bit_count = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut raw_bytes = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut raw_bytes)?; + + Ok(Some(Self::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + })) + } + + fn read_mkv_chain(reader: &mut impl Read) -> io::Result> { + let max_val_bit_count = match reader.read_u8() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let max_len_bit_count = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + + let mut raw_bytes = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut raw_bytes)?; + + let count = reader.read_u16::()?; + + Ok(Some(Self::MkvChain { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + })) + } + + fn read_twodelta(reader: &mut impl Read) -> io::Result> { + let pair_a = match reader.read_u16::() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e), + }; + + let pair_b = reader.read_u16::()?; + let max_len_bits = reader.read_u8()?; + if max_len_bits == 0 || max_len_bits > 16 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("invalid TwoDelta run-length bit width: {max_len_bits}"), + )); + } + let n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; n_bytes as usize]; + reader.read_exact(&mut payload)?; + + let count = reader.read_u16::()?; + + // Reuse the encode-side bit unpacker so the unpack logic lives in one + // place; we then drop the resulting BenEncodeFrame's raw_bytes since + // the decode-side TwoDelta arm doesn't keep them. + let pair = (pair_a, pair_b); + let encode_frame = BenEncodeFrame::from_parts(pair, max_len_bits, payload, count); + let run_lengths = match encode_frame { + BenEncodeFrame::TwoDelta { + run_length_vector, .. + } => run_length_vector, + _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), + }; + + Ok(Some(Self::TwoDelta { + pair, + run_lengths, + count, + })) + } + + /// The frame's repetition count (`1` for `Standard`). + pub fn count(&self) -> u16 { + match self { + Self::Standard { .. } => 1, + Self::MkvChain { count, .. } | Self::TwoDelta { count, .. } => *count, + } + } + + /// The variant tag corresponding to this frame's arm. + pub fn variant(&self) -> BenVariant { + match self { + Self::Standard { .. } => BenVariant::Standard, + Self::MkvChain { .. } => BenVariant::MkvChain, + Self::TwoDelta { .. } => BenVariant::TwoDelta, + } + } + + /// Borrow the bit-packed payload bytes for `Standard`/`MkvChain` arms. + /// Returns `None` for `TwoDelta` (which doesn't keep raw bytes after + /// parsing). + pub fn raw_bytes(&self) -> Option<&[u8]> { + match self { + Self::Standard { raw_bytes, .. } | Self::MkvChain { raw_bytes, .. } => { + Some(raw_bytes) + } + Self::TwoDelta { .. } => None, + } + } + + /// The bit width of the largest district id in this frame, or `None` for + /// `TwoDelta` (which doesn't carry one). + pub fn max_val_bit_count(&self) -> Option { + match self { + Self::Standard { + max_val_bit_count, .. + } + | Self::MkvChain { + max_val_bit_count, .. + } => Some(*max_val_bit_count), + Self::TwoDelta { .. } => None, + } + } + + /// The bit width of the largest run length, or `None` for `TwoDelta` + /// (whose width sat in the wire format but is not retained on decode). + pub fn max_len_bit_count(&self) -> Option { + match self { + Self::Standard { + max_len_bit_count, .. + } + | Self::MkvChain { + max_len_bit_count, .. + } => Some(*max_len_bit_count), + Self::TwoDelta { .. } => None, + } + } + + /// The number of payload bytes for `Standard`/`MkvChain`, or `None` for + /// `TwoDelta`. + pub fn n_bytes(&self) -> Option { + match self { + Self::Standard { n_bytes, .. } | Self::MkvChain { n_bytes, .. } => Some(*n_bytes), + Self::TwoDelta { .. } => None, + } + } + + /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for + /// the snapshot arms. + pub fn pair(&self) -> Option<(u16, u16)> { + match self { + Self::TwoDelta { pair, .. } => Some(*pair), + _ => None, + } + } + + /// Borrow the alternating run-length vector for a `TwoDelta` frame, or + /// `None` for the snapshot arms. + pub fn run_lengths(&self) -> Option<&[u16]> { + match self { + Self::TwoDelta { run_lengths, .. } => Some(run_lengths), + _ => None, + } + } + + /// Materialize the frame as a full assignment vector. + /// + /// `Standard` and `MkvChain` ignore `prev` (any owned vector is dropped). + /// `TwoDelta` consumes `prev` in place to apply the delta and returns an + /// error if `prev` is `None`. + pub fn expand(&self, prev: Option>) -> io::Result> { + use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError}; + use crate::util::rle::rle_to_vec; + use std::io::Cursor; + + match self { + Self::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + } + | Self::MkvChain { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + .. + } => decode_ben_line( + Cursor::new(raw_bytes), + *max_val_bit_count, + *max_len_bit_count, + *n_bytes, + ) + .map(rle_to_vec), + Self::TwoDelta { + pair, run_lengths, .. + } => { + let prev = + prev.ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?; + apply_twodelta_runs_to_assignment(prev, *pair, run_lengths) + } + } + } +} diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs new file mode 100644 index 0000000..5836af4 --- /dev/null +++ b/ben/src/codec/frames/encode.rs @@ -0,0 +1,387 @@ +use super::compress_rle_to_ben_bytes; +use crate::util::rle::assign_to_rle; +use crate::BenVariant; + +/// One sample's encoded bytes at the frame layer. +/// +/// Variants mirror [`BenVariant`]: a stream's variant tag dictates which arm +/// each frame in the stream uses. Encode-side arms carry the source RLE runs +/// (or run-length vector for `TwoDelta`) alongside the serialized `raw_bytes`, +/// because frames on this side are built *from* runs. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BenEncodeFrame { + /// A `Standard`-variant frame. No trailing repetition count on the wire. + Standard { + /// The RLE runs that were encoded into this frame. + runs: Vec<(u16, u16)>, + /// The number of bits used to encode the maximum district id. + max_val_bit_count: u8, + /// The number of bits used to encode the maximum run length. + max_len_bit_count: u8, + /// The number of bytes in the packed payload. + n_bytes: u32, + /// The full serialized frame bytes (frame header + payload). + raw_bytes: Vec, + }, + /// An `MkvChain`-variant frame. Carries a trailing `u16` repetition count. + MkvChain { + /// The RLE runs that were encoded into this frame. + runs: Vec<(u16, u16)>, + /// The number of bits used to encode the maximum district id. + max_val_bit_count: u8, + /// The number of bits used to encode the maximum run length. + max_len_bit_count: u8, + /// The number of bytes in the packed payload. + n_bytes: u32, + /// The full serialized frame bytes (frame header + payload + count). + raw_bytes: Vec, + /// The number of times this frame repeats. + count: u16, + }, + /// A `TwoDelta`-variant frame: a delta over `pair` with alternating run + /// lengths. Carries a trailing `u16` repetition count. + TwoDelta { + /// The pair of district ids encoded in this frame. + /// `pair.0` corresponds to the first run. + pair: (u16, u16), + /// The number of bits used to encode the maximum run length. + max_len_bit_count: u8, + /// The number of bytes in the packed payload. + n_bytes: u32, + /// The alternating run-length vector over the positions occupied by + /// the pair. + run_length_vector: Vec, + /// The full serialized TwoDelta frame bytes (header + payload + count). + raw_bytes: Vec, + /// The number of times this frame repeats. + count: u16, + }, +} + +impl BenEncodeFrame { + /// Build a `Standard` or `MkvChain` frame from RLE runs. + /// + /// `count` is ignored for `Standard` and defaults to `1` for `MkvChain`. + /// + /// # Panics + /// + /// Panics if `variant` is [`BenVariant::TwoDelta`]; use + /// [`BenEncodeFrame::from_run_lengths`] for that. + pub fn from_rle(runs: Vec<(u16, u16)>, variant: BenVariant, count: Option) -> Self { + let (max_val, max_len) = runs + .iter() + .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { + (max_val.max(val), max_len.max(len)) + }); + let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; + let payload_bits = assign_bits * runs.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + let mut raw_bytes = + compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); + + match variant { + BenVariant::Standard => Self::Standard { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + }, + BenVariant::MkvChain => { + let count = count.unwrap_or(1); + raw_bytes.extend(count.to_be_bytes()); + Self::MkvChain { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + } + } + BenVariant::TwoDelta => panic!( + "BenEncodeFrame::from_rle does not support TwoDelta; \ + use BenEncodeFrame::from_run_lengths instead", + ), + } + } + + /// Build a `Standard` or `MkvChain` frame from an assignment vector. + /// + /// # Panics + /// + /// Panics if `variant` is [`BenVariant::TwoDelta`]; TwoDelta frames cannot + /// be derived from a single assignment vector. + pub fn from_assignment( + assignment: impl AsRef<[u16]>, + variant: BenVariant, + count: Option, + ) -> Self { + Self::from_rle(assign_to_rle(assignment), variant, count) + } + + /// Build a `TwoDelta` frame from a pair and pre-computed run lengths. + /// + /// `count` defaults to `1` if `None`. + pub fn from_run_lengths( + pair: (u16, u16), + run_length_vector: Vec, + count: Option, + ) -> Self { + let count = count.unwrap_or(1); + + let max_len = run_length_vector.iter().copied().max().unwrap_or(0); + let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); + + let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; + let n_bytes = payload_bits.div_ceil(8); + + // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) + count (2) + let mut raw_bytes = Vec::with_capacity((n_bytes + 11) as usize); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + + let mut remainder: u32 = 0; + let mut remainder_bits: u8 = 0; + + for &item in &run_length_vector { + let mut packed = (remainder << max_len_bit_count) | item as u32; + let mut bits_left = remainder_bits + max_len_bit_count; + + while bits_left >= 8 { + bits_left -= 8; + raw_bytes.push((packed >> bits_left) as u8); + packed &= !((u32::MAX) << bits_left); + } + + remainder = packed; + remainder_bits = bits_left; + } + + if remainder_bits > 0 { + raw_bytes.push((remainder << (8 - remainder_bits)) as u8); + } + + raw_bytes.extend(count.to_be_bytes()); + + Self::TwoDelta { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + count, + } + } + + /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a + /// raw payload. + /// + /// This is the inverse of [`BenEncodeFrame::from_run_lengths`]: it + /// re-assembles the serialized bytes and decodes the bit-packed payload + /// back into the run-length vector so that both representations are + /// available on the resulting frame. + pub fn from_parts( + pair: (u16, u16), + max_len_bit_count: u8, + payload: Vec, + count: u16, + ) -> Self { + let n_bytes = payload.len() as u32; + let mut raw_bytes = Vec::with_capacity(9 + payload.len() + 2); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); + raw_bytes.extend_from_slice(&payload); + raw_bytes.extend_from_slice(&count.to_be_bytes()); + + let mut run_length_vector = Vec::new(); + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + + for &byte in payload[..n_bytes as usize].iter() { + buffer |= (byte as u32).to_be() >> n_bits_in_buff; + n_bits_in_buff += 8; + + while n_bits_in_buff >= max_len_bit_count as u16 { + let item = (buffer >> (32 - max_len_bit_count)) as u16; + buffer <<= max_len_bit_count; + n_bits_in_buff -= max_len_bit_count as u16; + if item > 0 { + run_length_vector.push(item); + } + } + } + + Self::TwoDelta { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + count, + } + } + + /// Borrow the serialized frame bytes. + pub fn as_slice(&self) -> &[u8] { + match self { + Self::Standard { raw_bytes, .. } => raw_bytes, + Self::MkvChain { raw_bytes, .. } => raw_bytes, + Self::TwoDelta { raw_bytes, .. } => raw_bytes, + } + } + + /// Clone out the serialized frame bytes. + pub fn to_bytes(&self) -> Vec { + self.as_slice().to_vec() + } + + /// Consume the frame and return the serialized frame bytes without cloning. + pub fn into_bytes(self) -> Vec { + match self { + Self::Standard { raw_bytes, .. } => raw_bytes, + Self::MkvChain { raw_bytes, .. } => raw_bytes, + Self::TwoDelta { raw_bytes, .. } => raw_bytes, + } + } + + /// Borrow just the packed payload bytes (the variant-specific region + /// between the frame header and any trailing count). + /// + /// Returns the payload slice for any well-formed frame. + pub fn payload(&self) -> &[u8] { + match self { + Self::Standard { + n_bytes, + raw_bytes, + .. + } + | Self::MkvChain { + n_bytes, + raw_bytes, + .. + } => &raw_bytes[6..6 + *n_bytes as usize], + Self::TwoDelta { + n_bytes, + raw_bytes, + .. + } => &raw_bytes[9..9 + *n_bytes as usize], + } + } + + /// The frame's repetition count (`1` for `Standard`). + pub fn count(&self) -> u16 { + match self { + Self::Standard { .. } => 1, + Self::MkvChain { count, .. } | Self::TwoDelta { count, .. } => *count, + } + } + + /// The variant tag corresponding to this frame's arm. + pub fn variant(&self) -> BenVariant { + match self { + Self::Standard { .. } => BenVariant::Standard, + Self::MkvChain { .. } => BenVariant::MkvChain, + Self::TwoDelta { .. } => BenVariant::TwoDelta, + } + } + + /// The bit width of the largest district id in this frame, or `None` for + /// `TwoDelta` (which doesn't carry one). + pub fn max_val_bit_count(&self) -> Option { + match self { + Self::Standard { + max_val_bit_count, .. + } + | Self::MkvChain { + max_val_bit_count, .. + } => Some(*max_val_bit_count), + Self::TwoDelta { .. } => None, + } + } + + /// The bit width of the largest run length in this frame. + pub fn max_len_bit_count(&self) -> u8 { + match self { + Self::Standard { + max_len_bit_count, .. + } + | Self::MkvChain { + max_len_bit_count, .. + } + | Self::TwoDelta { + max_len_bit_count, .. + } => *max_len_bit_count, + } + } + + /// The number of bytes in the packed payload region. + pub fn n_bytes(&self) -> u32 { + match self { + Self::Standard { n_bytes, .. } + | Self::MkvChain { n_bytes, .. } + | Self::TwoDelta { n_bytes, .. } => *n_bytes, + } + } + + /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for + /// the snapshot arms. + pub fn pair(&self) -> Option<(u16, u16)> { + match self { + Self::TwoDelta { pair, .. } => Some(*pair), + _ => None, + } + } + + /// Borrow the source RLE runs for `Standard` and `MkvChain`, or `None` + /// for `TwoDelta` (which carries `run_length_vector` instead). + pub fn runs(&self) -> Option<&[(u16, u16)]> { + match self { + Self::Standard { runs, .. } | Self::MkvChain { runs, .. } => Some(runs), + Self::TwoDelta { .. } => None, + } + } + + /// Borrow the alternating run-length vector for a `TwoDelta` frame, or + /// `None` for the snapshot arms. + pub fn run_length_vector(&self) -> Option<&[u16]> { + match self { + Self::TwoDelta { + run_length_vector, .. + } => Some(run_length_vector), + _ => None, + } + } +} + +impl AsRef<[u8]> for BenEncodeFrame { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl std::ops::Deref for BenEncodeFrame { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq> for BenEncodeFrame { + fn eq(&self, other: &Vec) -> bool { + self.as_slice() == other.as_slice() + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &BenEncodeFrame) -> bool { + self.as_slice() == other.as_slice() + } +} diff --git a/ben/src/codec/frames/mkv_decode.rs b/ben/src/codec/frames/mkv_decode.rs deleted file mode 100644 index f9022a2..0000000 --- a/ben/src/codec/frames/mkv_decode.rs +++ /dev/null @@ -1,94 +0,0 @@ -use super::BenDecode; -use byteorder::{BigEndian, ReadBytesExt}; -use std::io::{self, Read}; - -/// A decoded MkvChain BEN frame, including its repetition count. -/// -/// Symmetric to `MkvBenEncodeFrame` but stores only the decoded payload bytes -/// and header fields rather than the original RLE runs. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct MkvBenDecodeFrame { - /// The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - /// The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - /// The number of bytes in the packed payload. - pub n_bytes: u32, - /// The packed payload bytes (not including the 6-byte header or count). - pub raw_bytes: Vec, - /// The number of times this assignment repeats. - pub count: u16, -} - -impl MkvBenDecodeFrame { - /// Borrow the packed payload bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the packed payload bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the packed payload bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl BenDecode for MkvBenDecodeFrame { - /// Read the next MkvChain BEN frame from the stream. - /// - /// MkvChain frames carry a trailing `u16` repetition count. - /// Returns `Ok(None)` on a clean EOF at a frame boundary. - fn from_reader(reader: &mut impl Read) -> io::Result> { - let max_val_bit_count = match reader.read_u8() { - Ok(v) => v, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e), - }; - - let max_len_bit_count = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - - let mut raw_bytes = vec![0u8; n_bytes as usize]; - reader.read_exact(&mut raw_bytes)?; - - let count = reader.read_u16::()?; - - Ok(Some(MkvBenDecodeFrame { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - count, - })) - } -} - -impl AsRef<[u8]> for MkvBenDecodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for MkvBenDecodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for MkvBenDecodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &MkvBenDecodeFrame) -> bool { - *self == other.raw_bytes - } -} diff --git a/ben/src/codec/frames/mkv_encode.rs b/ben/src/codec/frames/mkv_encode.rs deleted file mode 100644 index 311a909..0000000 --- a/ben/src/codec/frames/mkv_encode.rs +++ /dev/null @@ -1,100 +0,0 @@ -use super::{compress_rle_to_ben_bytes, BenConstruct}; - -/// Canonical representation of a BEN frame. -/// -/// The frame stores the semantic RLE runs together with the derived header -/// fields and the serialized frame bytes. `to_bytes()` returns the full BEN -/// frame, including the two one-byte bit-width fields and the four-byte payload -/// length. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct MkvBenEncodeFrame { - // The RLE runs that were encoded into this frame, stored here for reference - pub runs: Vec<(u16, u16)>, - // The number of bits used to encode the maximum label value in this frame. - pub max_val_bit_count: u8, - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The full serialized MKVBEN frame bytes, including the header and payload. - pub raw_bytes: Vec, - // The number of times that this frame was repeated - pub count: u16, -} - -impl MkvBenEncodeFrame { - /// Borrow the serialized BEN frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized BEN frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized BEN bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } -} - -impl AsRef<[u8]> for MkvBenEncodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for MkvBenEncodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl PartialEq> for MkvBenEncodeFrame { - fn eq(&self, other: &Vec) -> bool { - self.raw_bytes == *other - } -} - -impl PartialEq for Vec { - fn eq(&self, other: &MkvBenEncodeFrame) -> bool { - *self == other.raw_bytes - } -} - -impl BenConstruct for MkvBenEncodeFrame { - /// Build a frame from an RLE run vector. - fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self { - let count = match count { - Some(v) => v, - None => 1, - }; - - let (max_val, max_len) = runs - .iter() - .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { - (max_val.max(val), max_len.max(len)) - }); - let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - let mut raw_bytes = - compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); - - raw_bytes.extend(count.to_be_bytes()); - - Self { - runs, - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - count, - } - } -} diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index 2088f8d..489cbe6 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -1,43 +1,32 @@ -mod ben_decode; -mod ben_encode; -mod mkv_decode; -mod mkv_encode; -mod twodelta_decode; -mod twodelta_encode; +//! Frame-layer types — one sample's encoded bytes. +//! +//! See `docs/glossary.md` for the encoding-stack layering. This module owns +//! layer 2 (frame). Each direction is a single enum whose arms mirror +//! [`crate::BenVariant`]: +//! +//! - [`BenEncodeFrame`] is built **from** RLE runs (or a pair + run-length +//! vector for the `TwoDelta` arm) and carries the source representation +//! alongside the serialized bytes. +//! - [`BenDecodeFrame`] is built **from** wire bytes and keeps the bit-packed +//! payload opaque on `Standard`/`MkvChain` arms so frame-level subsampling +//! stays cheap (no eager bit-unpacking). + +mod decode; +mod encode; #[cfg(test)] mod tests; -pub use ben_decode::BenDecodeFrame; -pub use ben_encode::BenEncodeFrame; -pub use mkv_decode::MkvBenDecodeFrame; -pub use mkv_encode::MkvBenEncodeFrame; -pub use twodelta_decode::TwoDeltaDecodeFrame; -pub use twodelta_encode::TwoDeltaEncodeFrame; - -use crate::util::rle::assign_to_rle; -use std::io; - -pub trait BenConstruct { - fn from_rle(runs: Vec<(u16, u16)>, count: Option) -> Self; - - fn from_assignment(assignments: impl AsRef<[u16]>, count: Option) -> Self - where - Self: Sized, - { - Self::from_rle(assign_to_rle(assignments), count) - } -} - -pub trait BenDecode: Sized { - /// Read the next frame from a byte stream. - /// - /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` - /// on success, and `Err` on any IO or format error. - fn from_reader(reader: &mut impl io::Read) -> io::Result>; -} +pub use decode::BenDecodeFrame; +pub use encode::BenEncodeFrame; -/// Compresses a run-length encoded vector into BEN payload bytes. +/// Bit-pack an RLE run vector into a serialized BEN frame payload. +/// +/// Output layout: +/// +/// ```text +/// [max_val_bit_count: u8][max_len_bit_count: u8][n_bytes: u32 BE][packed payload...] +/// ``` pub(super) fn compress_rle_to_ben_bytes( max_val_bit_count: u8, max_len_bit_count: u8, diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index b0b9ff9..a80645b 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -1,6 +1,9 @@ use super::*; +use crate::BenVariant; use std::io::{self, Read}; +// ── Helpers ───────────────────────────────────────────────────────────────── + /// A reader that returns one successful byte then an I/O error. struct ErrorAfterOneByte; @@ -14,644 +17,628 @@ impl Read for ErrorAfterOneByte { } } -// ── BenDecodeFrame ────────────────────────────────────────────────────────── +fn unwrap_standard(frame: BenDecodeFrame) -> (u8, u8, u32, Vec) { + match frame { + BenDecodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + } => (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes), + other => panic!("expected Standard, got {:?}", other), + } +} + +fn unwrap_mkv(frame: BenDecodeFrame) -> (u8, u8, u32, Vec, u16) { + match frame { + BenDecodeFrame::MkvChain { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + } => ( + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + ), + other => panic!("expected MkvChain, got {:?}", other), + } +} + +fn unwrap_twodelta(frame: BenDecodeFrame) -> ((u16, u16), Vec, u16) { + match frame { + BenDecodeFrame::TwoDelta { + pair, + run_lengths, + count, + } => (pair, run_lengths, count), + other => panic!("expected TwoDelta, got {:?}", other), + } +} + +fn unwrap_encode_standard(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u32, Vec) { + match frame { + BenEncodeFrame::Standard { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + } => (runs, max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes), + other => panic!("expected Standard encode arm, got {:?}", other), + } +} + +fn unwrap_encode_mkv(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u32, Vec, u16) { + match frame { + BenEncodeFrame::MkvChain { + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + } => ( + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + count, + ), + other => panic!("expected MkvChain encode arm, got {:?}", other), + } +} + +fn unwrap_encode_twodelta( + frame: BenEncodeFrame, +) -> ((u16, u16), u8, u32, Vec, Vec, u16) { + match frame { + BenEncodeFrame::TwoDelta { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + count, + } => ( + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + count, + ), + other => panic!("expected TwoDelta encode arm, got {:?}", other), + } +} + +// ── BenDecodeFrame::from_reader (Standard) ────────────────────────────────── #[test] -fn ben_decode_frame_from_reader_standard_frame() { - // Header: max_val_bits=2, max_len_bits=3, n_bytes=2 - // Payload: 2 bytes +fn ben_decode_standard_from_reader() { + // Header: max_val_bits=2, max_len_bits=3, n_bytes=2; payload 2 bytes. let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; let mut cursor = io::Cursor::new(data); - let frame = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - assert_eq!(frame.max_val_bit_count, 2); - assert_eq!(frame.max_len_bit_count, 3); - assert_eq!(frame.n_bytes, 2); - assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + let (mvb, mlb, n, payload) = unwrap_standard(frame); + assert_eq!(mvb, 2); + assert_eq!(mlb, 3); + assert_eq!(n, 2); + assert_eq!(payload, vec![0xAB, 0xCD]); } #[test] -fn ben_decode_frame_from_reader_eof_returns_none() { - let data: Vec = vec![]; - let mut cursor = io::Cursor::new(data); - let result = BenDecodeFrame::from_reader(&mut cursor).unwrap(); +fn ben_decode_standard_eof_returns_none() { + let mut cursor = io::Cursor::new(Vec::::new()); + let result = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard).unwrap(); assert!(result.is_none()); } #[test] -fn ben_decode_frame_from_reader_truncated_header_errors() { - // Only 1 byte — too short for a full header - let data: Vec = vec![2]; - let mut cursor = io::Cursor::new(data); - let err = BenDecodeFrame::from_reader(&mut cursor).unwrap_err(); +fn ben_decode_standard_truncated_header_errors() { + let mut cursor = io::Cursor::new(vec![2u8]); // only one header byte + let err = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } #[test] -fn ben_decode_frame_to_bytes() { - let frame = BenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 2, - raw_bytes: vec![0xAB, 0xCD], - }; - let bytes = frame.to_bytes(); - assert_eq!(bytes, vec![0xAB, 0xCD]); - // Original frame still usable (not consumed) - assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); -} - -#[test] -fn ben_decode_frame_into_bytes() { - let frame = BenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 2, - raw_bytes: vec![0xAB, 0xCD], - }; - let bytes = frame.into_bytes(); - assert_eq!(bytes, vec![0xAB, 0xCD]); -} - -#[test] -fn ben_decode_frame_as_ref() { - let frame = BenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 2, - raw_bytes: vec![0xAB, 0xCD], - }; - let slice: &[u8] = frame.as_ref(); - assert_eq!(slice, &[0xAB, 0xCD]); -} - -#[test] -fn ben_decode_frame_deref() { - let frame = BenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 2, - raw_bytes: vec![0xAB, 0xCD], - }; - // Deref lets us call slice methods directly - assert_eq!(frame.len(), 2); - assert_eq!(frame[0], 0xAB); - assert_eq!(frame[1], 0xCD); -} - -#[test] -fn ben_decode_frame_partial_eq_vec() { - let frame = BenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 2, - raw_bytes: vec![0xAB, 0xCD], - }; - let v = vec![0xAB, 0xCD]; - // Both directions - assert_eq!(frame, v); - assert_eq!(v, frame); - // Inequality - let v2 = vec![0xFF]; - assert_ne!(frame, v2); - assert_ne!(v2, frame); -} - -// ── MkvBenDecodeFrame ─────────────────────────────────────────────────────── - -#[test] -fn mkv_decode_frame_from_reader() { - // Header: max_val_bits=2, max_len_bits=3, n_bytes=2 - // Payload: 2 bytes - // Count: u16 BE = 5 - let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD, 0, 5]; +fn ben_decode_standard_non_eof_read_error_propagates() { + let err = + BenDecodeFrame::from_reader(&mut ErrorAfterOneByte, BenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +} + +// ── BenDecodeFrame::from_reader (MkvChain) ────────────────────────────────── + +#[test] +fn ben_decode_mkv_from_reader() { + // Header (6) + payload (2) + count (2) = 10 bytes. + let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD, 0x00, 0x07]; let mut cursor = io::Cursor::new(data); - let frame = MkvBenDecodeFrame::from_reader(&mut cursor) + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); - assert_eq!(frame.max_val_bit_count, 2); - assert_eq!(frame.max_len_bit_count, 3); - assert_eq!(frame.n_bytes, 2); - assert_eq!(frame.raw_bytes, vec![0xAB, 0xCD]); - assert_eq!(frame.count, 5); + let (mvb, mlb, n, payload, count) = unwrap_mkv(frame); + assert_eq!(mvb, 2); + assert_eq!(mlb, 3); + assert_eq!(n, 2); + assert_eq!(payload, vec![0xAB, 0xCD]); + assert_eq!(count, 7); } #[test] -fn mkv_decode_frame_from_reader_eof_returns_none() { - let data: Vec = vec![]; - let mut cursor = io::Cursor::new(data); - let result = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap(); +fn ben_decode_mkv_eof_returns_none() { + let mut cursor = io::Cursor::new(Vec::::new()); + let result = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain).unwrap(); assert!(result.is_none()); } #[test] -fn mkv_decode_frame_from_reader_truncated_count_errors() { - // Valid header + payload, but missing count bytes - let data: Vec = vec![2, 3, 0, 0, 0, 1, 0xFF]; +fn ben_decode_mkv_truncated_count_errors() { + // Header + payload but no count. + let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; let mut cursor = io::Cursor::new(data); - let err = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap_err(); + let err = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } #[test] -fn mkv_decode_frame_to_bytes() { - let frame = MkvBenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 1, - raw_bytes: vec![0xFF], - count: 3, - }; - let bytes = frame.to_bytes(); - assert_eq!(bytes, vec![0xFF]); - assert_eq!(frame.raw_bytes, vec![0xFF]); +fn ben_decode_mkv_count_max_u16() { + let data: Vec = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD, 0xFF, 0xFF]; + let mut cursor = io::Cursor::new(data); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) + .unwrap() + .unwrap(); + let (_, _, _, _, count) = unwrap_mkv(frame); + assert_eq!(count, u16::MAX); } #[test] -fn mkv_decode_frame_into_bytes() { - let frame = MkvBenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 1, - raw_bytes: vec![0xFF], - count: 3, - }; - let bytes = frame.into_bytes(); - assert_eq!(bytes, vec![0xFF]); +fn ben_decode_mkv_non_eof_read_error_propagates() { + let err = + BenDecodeFrame::from_reader(&mut ErrorAfterOneByte, BenVariant::MkvChain).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } +// ── BenDecodeFrame::from_reader (TwoDelta) ────────────────────────────────── + #[test] -fn mkv_decode_frame_as_ref() { - let frame = MkvBenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 1, - raw_bytes: vec![0xFF], - count: 3, - }; - let slice: &[u8] = frame.as_ref(); - assert_eq!(slice, &[0xFF]); +fn ben_decode_twodelta_from_reader() { + // Build a TwoDelta encode frame, then read it back as a decode frame. + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(5)); + let bytes = encoded.into_bytes(); + + let mut cursor = io::Cursor::new(bytes); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) + .unwrap() + .unwrap(); + let (pair, run_lengths, count) = unwrap_twodelta(frame); + assert_eq!(pair, (1, 2)); + assert_eq!(run_lengths, vec![2, 2]); + assert_eq!(count, 5); } #[test] -fn mkv_decode_frame_deref() { - let frame = MkvBenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 1, - raw_bytes: vec![0xFF], - count: 3, - }; - assert_eq!(frame.len(), 1); - assert_eq!(frame[0], 0xFF); +fn ben_decode_twodelta_eof_returns_none() { + let mut cursor = io::Cursor::new(Vec::::new()); + let result = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta).unwrap(); + assert!(result.is_none()); } #[test] -fn mkv_decode_frame_partial_eq_vec() { - let frame = MkvBenDecodeFrame { - max_val_bit_count: 2, - max_len_bit_count: 3, - n_bytes: 1, - raw_bytes: vec![0xFF], - count: 3, - }; - let v = vec![0xFF]; - assert_eq!(frame, v); - assert_eq!(v, frame); - let v2 = vec![0x00]; - assert_ne!(frame, v2); - assert_ne!(v2, frame); +fn ben_decode_twodelta_truncated_errors() { + // Only the pair bytes; no max_len_bits, n_bytes, payload, or count. + let data: Vec = vec![0, 1, 0, 2]; + let mut cursor = io::Cursor::new(data); + let err = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } -// ── MkvBenEncodeFrame ─────────────────────────────────────────────────────── +#[test] +fn ben_decode_twodelta_invalid_max_len_bits_zero_errors() { + // pair (4) + max_len_bits=0 (1) + n_bytes=0 (4) + count (2) + let data: Vec = vec![0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1]; + let mut cursor = io::Cursor::new(data); + let err = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); +} #[test] -fn mkv_encode_frame_from_rle_count_none_defaults_to_1() { - let runs = vec![(1u16, 4u16), (2, 1)]; - let frame = MkvBenEncodeFrame::from_rle(runs.clone(), None); - assert_eq!(frame.count, 1); - assert_eq!(frame.runs, runs); +fn ben_decode_twodelta_count_max_u16() { + let encoded = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1], Some(u16::MAX)); + let bytes = encoded.into_bytes(); + let mut cursor = io::Cursor::new(bytes); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) + .unwrap() + .unwrap(); + let (_, _, count) = unwrap_twodelta(frame); + assert_eq!(count, u16::MAX); } +// ── BenEncodeFrame::from_rle ──────────────────────────────────────────────── + #[test] -fn mkv_encode_frame_from_rle_with_count() { - let runs = vec![(1u16, 4u16), (2, 1)]; - let frame = MkvBenEncodeFrame::from_rle(runs.clone(), Some(7)); - assert_eq!(frame.count, 7); +fn encode_from_rle_standard_carries_runs_and_bytes() { + let runs = vec![(1u16, 2u16), (2, 3), (3, 1)]; + let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let (got_runs, mvb, mlb, n, raw) = unwrap_encode_standard(frame); + assert_eq!(got_runs, runs); + assert_eq!(mvb, 2); // max value 3 fits in 2 bits + assert_eq!(mlb, 2); // max length 3 fits in 2 bits + assert!(n > 0); + assert_eq!(raw[0], mvb); + assert_eq!(raw[1], mlb); + assert_eq!(&raw[2..6], n.to_be_bytes().as_slice()); } #[test] -fn mkv_encode_frame_to_bytes() { - let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); - let bytes = frame.to_bytes(); - assert_eq!(bytes, frame.raw_bytes); - // Frame still usable - assert!(!frame.raw_bytes.is_empty()); +fn encode_from_rle_mkv_count_none_defaults_to_one() { + let runs = vec![(1u16, 2u16), (2, 3)]; + let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, None); + let (_, _, _, _, raw, count) = unwrap_encode_mkv(frame); + assert_eq!(count, 1); + let trailing = &raw[raw.len() - 2..]; + assert_eq!(trailing, 1u16.to_be_bytes()); } #[test] -fn mkv_encode_frame_into_bytes() { - let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); - let expected = frame.raw_bytes.clone(); - let bytes = frame.into_bytes(); - assert_eq!(bytes, expected); +fn encode_from_rle_mkv_with_count() { + let runs = vec![(1u16, 2u16)]; + let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(7)); + let (_, _, _, _, raw, count) = unwrap_encode_mkv(frame); + assert_eq!(count, 7); + let trailing = &raw[raw.len() - 2..]; + assert_eq!(trailing, 7u16.to_be_bytes()); } #[test] -fn mkv_encode_frame_as_ref() { - let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); - let slice: &[u8] = frame.as_ref(); - assert_eq!(slice, &frame.raw_bytes); +#[should_panic(expected = "TwoDelta")] +fn encode_from_rle_twodelta_panics() { + let runs = vec![(1u16, 2u16)]; + let _ = BenEncodeFrame::from_rle(runs, BenVariant::TwoDelta, None); } #[test] -fn mkv_encode_frame_deref() { - let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); - assert_eq!(frame.len(), frame.raw_bytes.len()); +fn encode_single_run_frame() { + let runs = vec![(5u16, 1u16)]; + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + let (_, mvb, mlb, _, _) = unwrap_encode_standard(frame); + assert_eq!(mvb, 3); // 5 fits in 3 bits + assert_eq!(mlb, 1); // 1 fits in 1 bit } #[test] -fn mkv_encode_frame_partial_eq_vec() { - let frame = MkvBenEncodeFrame::from_rle(vec![(1u16, 2u16)], Some(1)); - let v = frame.raw_bytes.clone(); - assert_eq!(frame, v); - assert_eq!(v, frame); - let v2 = vec![0xFF, 0xFF, 0xFF]; - assert_ne!(frame, v2); - assert_ne!(v2, frame); +fn encode_large_values_near_u16_max() { + let runs = vec![(u16::MAX, u16::MAX)]; + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + let (_, mvb, mlb, _, _) = unwrap_encode_standard(frame); + assert_eq!(mvb, 16); + assert_eq!(mlb, 16); } -// ── TwoDeltaDecodeFrame ───────────────────────────────────────────────────── +// ── BenEncodeFrame::from_assignment ───────────────────────────────────────── #[test] -fn twodelta_decode_frame_from_reader() { - // pair: (0, 2), (0, 1), max_len_bits: 1, n_bytes: 0,0,0,1, payload: 0xC0, count: 0,1 - let data: Vec = vec![0, 2, 0, 1, 1, 0, 0, 0, 1, 0xC0, 0, 1]; - let mut cursor = io::Cursor::new(data); - let frame = TwoDeltaDecodeFrame::from_reader(&mut cursor) - .unwrap() - .unwrap(); - assert_eq!(frame.pair, (2, 1)); - assert_eq!(frame.count, 1); - assert!(!frame.run_lengths.is_empty()); +fn encode_from_assignment_standard() { + let assignment = vec![1u16, 1, 2, 2, 3]; + let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); + let (runs, _, _, _, _) = unwrap_encode_standard(frame); + assert_eq!(runs, vec![(1, 2), (2, 2), (3, 1)]); } #[test] -fn twodelta_decode_frame_from_reader_eof_returns_none() { - let data: Vec = vec![]; - let mut cursor = io::Cursor::new(data); - let result = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap(); - assert!(result.is_none()); +fn encode_from_assignment_mkv_carries_count() { + let assignment = vec![1u16, 1, 2, 2]; + let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::MkvChain, Some(9)); + let (_, _, _, _, _, count) = unwrap_encode_mkv(frame); + assert_eq!(count, 9); } +// ── BenEncodeFrame::from_run_lengths / from_parts (TwoDelta) ──────────────── + #[test] -fn twodelta_decode_frame_from_reader_truncated_errors() { - // Only pair_a, missing pair_b - let data: Vec = vec![0, 2]; - let mut cursor = io::Cursor::new(data); - let err = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +fn twodelta_from_run_lengths_count_none_defaults_to_one() { + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let (pair, _, _, runs, _, count) = unwrap_encode_twodelta(frame); + assert_eq!(pair, (1, 2)); + assert_eq!(runs, vec![2, 2]); + assert_eq!(count, 1); } -// ── Encode→Decode Roundtrips ──────────────────────────────────────────────── +#[test] +fn twodelta_from_run_lengths_then_from_parts_roundtrip() { + let original = BenEncodeFrame::from_run_lengths((3, 4), vec![5, 5, 5], Some(2)); + let bytes = original.as_slice().to_vec(); + let (pair, max_len_bits, n_bytes, _, _, count) = + unwrap_encode_twodelta(original.clone()); + let payload_slice = &bytes[9..9 + n_bytes as usize]; + let rebuilt = + BenEncodeFrame::from_parts(pair, max_len_bits, payload_slice.to_vec(), count); + let (rb_pair, _, _, rb_runs, _, rb_count) = unwrap_encode_twodelta(rebuilt); + assert_eq!(rb_pair, pair); + assert_eq!(rb_runs, vec![5, 5, 5]); + assert_eq!(rb_count, count); +} #[test] -fn ben_encode_decode_roundtrip_standard() { - use crate::codec::decode::decode_ben_line; - // Encode a Standard frame, then decode it via BenDecodeFrame::from_reader - let runs = vec![(1u16, 4), (2, 1), (3, 3)]; - let encode_frame = BenEncodeFrame::from_rle(runs.clone(), None); +fn twodelta_from_parts_preserves_nontrivial_count() { + let original = BenEncodeFrame::from_run_lengths((1, 9), vec![3, 3], Some(42)); + let bytes = original.as_slice().to_vec(); + let (_, max_len_bits, n_bytes, _, _, _) = unwrap_encode_twodelta(original); + let payload = bytes[9..9 + n_bytes as usize].to_vec(); + let rebuilt = BenEncodeFrame::from_parts((1, 9), max_len_bits, payload, 42); + let (_, _, _, _, _, count) = unwrap_encode_twodelta(rebuilt); + assert_eq!(count, 42); +} - // from_reader expects just the header+payload (no banner) - let mut cursor = io::Cursor::new(encode_frame.as_slice()); - let decode_frame = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); +#[test] +fn twodelta_from_run_lengths_single_run() { + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![5], Some(3)); + let (pair, _, _, runs, _, count) = unwrap_encode_twodelta(frame); + assert_eq!(pair, (1, 2)); + assert_eq!(runs, vec![5]); + assert_eq!(count, 3); +} - assert_eq!( - decode_frame.max_val_bit_count, - encode_frame.max_val_bit_count - ); - assert_eq!( - decode_frame.max_len_bit_count, - encode_frame.max_len_bit_count - ); - assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); +// ── Encode/decode roundtrips ──────────────────────────────────────────────── + +#[test] +fn standard_encode_decode_roundtrip() { + let runs = vec![(1u16, 4u16), (2, 3), (3, 1)]; + let encoded = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let bytes = encoded.into_bytes(); - // Verify the payload decodes back to the original RLE runs - let decoded_runs = decode_ben_line( - io::Cursor::new(&decode_frame.raw_bytes), - decode_frame.max_val_bit_count, - decode_frame.max_len_bit_count, - decode_frame.n_bytes, - ) - .unwrap(); - assert_eq!(decoded_runs, runs); + let mut cursor = io::Cursor::new(bytes); + let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + let (mvb, mlb, n_bytes, raw) = unwrap_standard(decoded); + assert_eq!(mvb, 2); + assert_eq!(mlb, 3); + assert!(n_bytes > 0); + assert!(!raw.is_empty()); } #[test] fn mkv_encode_decode_roundtrip() { - use crate::codec::decode::decode_ben_line; - let runs = vec![(1u16, 4), (2, 1), (3, 3)]; - let encode_frame = MkvBenEncodeFrame::from_rle(runs.clone(), Some(42)); + let runs = vec![(1u16, 4u16), (2, 3)]; + let encoded = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(11)); + let bytes = encoded.into_bytes(); - let mut cursor = io::Cursor::new(encode_frame.as_slice()); - let decode_frame = MkvBenDecodeFrame::from_reader(&mut cursor) + let mut cursor = io::Cursor::new(bytes); + let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); - - assert_eq!( - decode_frame.max_val_bit_count, - encode_frame.max_val_bit_count - ); - assert_eq!( - decode_frame.max_len_bit_count, - encode_frame.max_len_bit_count - ); - assert_eq!(decode_frame.n_bytes, encode_frame.n_bytes); - assert_eq!(decode_frame.count, 42); - - let decoded_runs = decode_ben_line( - io::Cursor::new(&decode_frame.raw_bytes), - decode_frame.max_val_bit_count, - decode_frame.max_len_bit_count, - decode_frame.n_bytes, - ) - .unwrap(); - assert_eq!(decoded_runs, runs); + let (_, _, _, _, count) = unwrap_mkv(decoded); + assert_eq!(count, 11); } #[test] fn twodelta_encode_decode_roundtrip() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let run_lengths = vec![3u16, 2, 1, 4]; - let encode_frame = TwoDeltaEncodeFrame::from_run_lengths((5, 10), run_lengths.clone(), Some(7)); + let encoded = BenEncodeFrame::from_run_lengths((4, 7), vec![3, 3, 3], Some(8)); + let bytes = encoded.into_bytes(); - // Write the raw_bytes (which include pair, max_len_bits, n_bytes, payload, count) - let mut cursor = io::Cursor::new(encode_frame.as_slice()); - let decode_frame = TwoDeltaDecodeFrame::from_reader(&mut cursor) + let mut cursor = io::Cursor::new(bytes); + let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() .unwrap(); - - assert_eq!(decode_frame.pair, (5, 10)); - assert_eq!(decode_frame.count, 7); - assert_eq!(decode_frame.run_lengths, run_lengths); + let (pair, runs, count) = unwrap_twodelta(decoded); + assert_eq!(pair, (4, 7)); + assert_eq!(runs, vec![3, 3, 3]); + assert_eq!(count, 8); } -// ── Back-to-back frame reads ──────────────────────────────────────────────── +// ── Back-to-back parsing ──────────────────────────────────────────────────── #[test] -fn ben_decode_two_frames_back_to_back() { - let f1 = BenEncodeFrame::from_rle(vec![(1u16, 2), (3, 4)], None); - let f2 = BenEncodeFrame::from_rle(vec![(7u16, 1), (8, 1), (9, 1)], None); - - let mut data = Vec::new(); - data.extend_from_slice(f1.as_slice()); - data.extend_from_slice(f2.as_slice()); - - let mut cursor = io::Cursor::new(data); - let d1 = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - let d2 = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); - let d3 = BenDecodeFrame::from_reader(&mut cursor).unwrap(); +fn standard_decode_two_frames_back_to_back() { + let f1 = BenEncodeFrame::from_rle(vec![(1, 2), (2, 1)], BenVariant::Standard, None); + let f2 = BenEncodeFrame::from_rle(vec![(3, 1), (4, 2)], BenVariant::Standard, None); + let mut bytes = f1.into_bytes(); + bytes.extend(f2.into_bytes()); - assert_eq!(d1.max_val_bit_count, f1.max_val_bit_count); - assert_eq!(d2.max_val_bit_count, f2.max_val_bit_count); - assert!(d3.is_none()); // clean EOF + let mut cursor = io::Cursor::new(bytes); + let _ = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + let _ = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + let none = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard).unwrap(); + assert!(none.is_none()); } #[test] fn mkv_decode_two_frames_back_to_back() { - let f1 = MkvBenEncodeFrame::from_rle(vec![(1u16, 2)], Some(10)); - let f2 = MkvBenEncodeFrame::from_rle(vec![(5u16, 5)], Some(20)); - - let mut data = Vec::new(); - data.extend_from_slice(f1.as_slice()); - data.extend_from_slice(f2.as_slice()); + let f1 = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::MkvChain, Some(3)); + let f2 = BenEncodeFrame::from_rle(vec![(2, 4)], BenVariant::MkvChain, Some(5)); + let mut bytes = f1.into_bytes(); + bytes.extend(f2.into_bytes()); - let mut cursor = io::Cursor::new(data); - let d1 = MkvBenDecodeFrame::from_reader(&mut cursor) + let mut cursor = io::Cursor::new(bytes); + let d1 = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); - let d2 = MkvBenDecodeFrame::from_reader(&mut cursor) + let d2 = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); - let d3 = MkvBenDecodeFrame::from_reader(&mut cursor).unwrap(); - - assert_eq!(d1.count, 10); - assert_eq!(d2.count, 20); - assert!(d3.is_none()); + assert_eq!(d1.count(), 3); + assert_eq!(d2.count(), 5); } #[test] fn twodelta_decode_two_frames_back_to_back() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let f1 = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], Some(1)); - let f2 = TwoDeltaEncodeFrame::from_run_lengths((3, 4), vec![1, 1, 1], Some(5)); + let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)); + let f2 = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1, 1, 1], Some(1)); + let mut bytes = f1.into_bytes(); + bytes.extend(f2.into_bytes()); - let mut data = Vec::new(); - data.extend_from_slice(f1.as_slice()); - data.extend_from_slice(f2.as_slice()); - - let mut cursor = io::Cursor::new(data); - let d1 = TwoDeltaDecodeFrame::from_reader(&mut cursor) + let mut cursor = io::Cursor::new(bytes); + let d1 = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() .unwrap(); - let d2 = TwoDeltaDecodeFrame::from_reader(&mut cursor) + let d2 = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() .unwrap(); - let d3 = TwoDeltaDecodeFrame::from_reader(&mut cursor).unwrap(); - - assert_eq!(d1.pair, (1, 2)); - assert_eq!(d1.run_lengths, vec![3, 2]); - assert_eq!(d1.count, 1); - assert_eq!(d2.pair, (3, 4)); - assert_eq!(d2.run_lengths, vec![1, 1, 1]); - assert_eq!(d2.count, 5); - assert!(d3.is_none()); + assert_eq!(unwrap_twodelta(d1).0, (1, 2)); + assert_eq!(unwrap_twodelta(d2).0, (3, 4)); } -// ── Boundary values ───────────────────────────────────────────────────────── +// ── Inspector methods (count, variant, raw_bytes) ─────────────────────────── #[test] -fn mkv_decode_frame_count_max_u16() { - let f = MkvBenEncodeFrame::from_rle(vec![(1u16, 1)], Some(u16::MAX)); - let mut cursor = io::Cursor::new(f.as_slice()); - let d = MkvBenDecodeFrame::from_reader(&mut cursor) +fn decode_count_returns_one_for_standard() { + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() .unwrap(); - assert_eq!(d.count, u16::MAX); + assert_eq!(frame.count(), 1); } #[test] -fn twodelta_decode_frame_count_max_u16() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let f = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(u16::MAX)); - let mut cursor = io::Cursor::new(f.as_slice()); - let d = TwoDeltaDecodeFrame::from_reader(&mut cursor) +fn decode_variant_method() { + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() .unwrap(); - assert_eq!(d.count, u16::MAX); -} + assert_eq!(frame.variant(), BenVariant::Standard); -#[test] -fn ben_encode_single_run_frame() { - use crate::codec::decode::decode_ben_line; - let runs = vec![(1u16, 1)]; - let frame = BenEncodeFrame::from_rle(runs.clone(), None); - - let mut cursor = io::Cursor::new(frame.as_slice()); - let decoded = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(2)); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) + .unwrap() + .unwrap(); + assert_eq!(frame.variant(), BenVariant::MkvChain); - let decoded_runs = decode_ben_line( - io::Cursor::new(&decoded.raw_bytes), - decoded.max_val_bit_count, - decoded.max_len_bit_count, - decoded.n_bytes, - ) - .unwrap(); - assert_eq!(decoded_runs, runs); + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) + .unwrap() + .unwrap(); + assert_eq!(frame.variant(), BenVariant::TwoDelta); } #[test] -fn ben_encode_large_values_near_u16_max() { - use crate::codec::decode::decode_ben_line; - let runs = vec![(65534u16, 65535u16), (1, 1)]; - let frame = BenEncodeFrame::from_rle(runs.clone(), None); +fn decode_raw_bytes_returns_some_for_snapshot_arms_none_for_twodelta() { + let std_encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let mut cursor = io::Cursor::new(std_encoded.into_bytes()); + let std_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + assert!(std_frame.raw_bytes().is_some()); - let mut cursor = io::Cursor::new(frame.as_slice()); - let decoded = BenDecodeFrame::from_reader(&mut cursor).unwrap().unwrap(); + let mkv_encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(1)); + let mut cursor = io::Cursor::new(mkv_encoded.into_bytes()); + let mkv_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) + .unwrap() + .unwrap(); + assert!(mkv_frame.raw_bytes().is_some()); - let decoded_runs = decode_ben_line( - io::Cursor::new(&decoded.raw_bytes), - decoded.max_val_bit_count, - decoded.max_len_bit_count, - decoded.n_bytes, - ) - .unwrap(); - assert_eq!(decoded_runs, runs); + let td_encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let mut cursor = io::Cursor::new(td_encoded.into_bytes()); + let td_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) + .unwrap() + .unwrap(); + assert!(td_frame.raw_bytes().is_none()); } -#[test] -fn twodelta_from_run_lengths_then_from_parts_roundtrip() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - // Verify that packing via from_run_lengths then unpacking via from_parts - // reproduces the same run_length_vector - let run_lengths = vec![5u16, 3, 7, 1, 2]; - let encoded = TwoDeltaEncodeFrame::from_run_lengths((10, 20), run_lengths.clone(), None); - - let reconstructed = TwoDeltaEncodeFrame::from_parts( - encoded.pair, - encoded.max_len_bit_count, - encoded.payload().to_vec(), - encoded.count, - ); - assert_eq!(reconstructed.run_length_vector, run_lengths); - assert_eq!(reconstructed.pair, (10, 20)); - assert_eq!(reconstructed.count, encoded.count); - assert_eq!(reconstructed.raw_bytes, encoded.raw_bytes); -} +// ── Encode-side inspectors and conversions ────────────────────────────────── #[test] -fn twodelta_from_parts_preserves_nontrivial_count() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - // Regression: from_parts previously hardcoded count = 1 in raw_bytes, - // so reconstructed frames silently emitted the wrong trailing count - // bytes. Verify count > 1 now round-trips through from_parts. - let run_lengths = vec![5u16, 3, 7, 1, 2]; - let encoded = TwoDeltaEncodeFrame::from_run_lengths((10, 20), run_lengths.clone(), Some(42)); - - let reconstructed = TwoDeltaEncodeFrame::from_parts( - encoded.pair, - encoded.max_len_bit_count, - encoded.payload().to_vec(), - 42, - ); - - assert_eq!(reconstructed.count, 42); - let trailing = &reconstructed.raw_bytes[reconstructed.raw_bytes.len() - 2..]; - assert_eq!(trailing, &42u16.to_be_bytes()); - assert_eq!(reconstructed.raw_bytes, encoded.raw_bytes); +fn encode_as_slice_to_bytes_into_bytes_agree() { + let encoded = BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None); + let s = encoded.as_slice().to_vec(); + let t = encoded.to_bytes(); + let i = encoded.into_bytes(); + assert_eq!(s, t); + assert_eq!(s, i); } #[test] -fn twodelta_from_run_lengths_single_run() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let run_lengths = vec![100u16]; - let encoded = TwoDeltaEncodeFrame::from_run_lengths((1, 2), run_lengths.clone(), None); - - let mut cursor = io::Cursor::new(encoded.as_slice()); - let decoded = TwoDeltaDecodeFrame::from_reader(&mut cursor) - .unwrap() - .unwrap(); - assert_eq!(decoded.run_lengths, run_lengths); -} +fn encode_count_method() { + let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + assert_eq!(std_frame.count(), 1); -// ── BenEncodeFrame trait impls ────────────────────────────────────────────── + let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(7)); + assert_eq!(mkv_frame.count(), 7); -#[test] -fn ben_encode_frame_partial_eq_vec_both_directions() { - let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); - let v = frame.raw_bytes.clone(); - assert_eq!(frame, v); - assert_eq!(v, frame); - let v2 = vec![0xFF, 0xFF, 0xFF]; - assert_ne!(frame, v2); - assert_ne!(v2, frame); + let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(13)); + assert_eq!(td_frame.count(), 13); } #[test] -fn ben_encode_frame_as_ref_and_deref() { - let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); - let slice: &[u8] = frame.as_ref(); - assert_eq!(slice, &frame.raw_bytes[..]); - assert_eq!(frame.len(), frame.raw_bytes.len()); -} +fn encode_variant_method() { + let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + assert_eq!(std_frame.variant(), BenVariant::Standard); -#[test] -fn ben_encode_frame_to_bytes_and_into_bytes() { - let frame = BenEncodeFrame::from_rle(vec![(1u16, 2)], None); - let to = frame.to_bytes(); - let expected = frame.raw_bytes.clone(); - assert_eq!(to, expected); - let into = frame.into_bytes(); - assert_eq!(into, expected); -} + let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, None); + assert_eq!(mkv_frame.variant(), BenVariant::MkvChain); -// ── TwoDeltaEncodeFrame trait impls ───────────────────────────────────────── + let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], None); + assert_eq!(td_frame.variant(), BenVariant::TwoDelta); +} #[test] -fn twodelta_encode_frame_as_ref_and_deref() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], None); - let slice: &[u8] = frame.as_ref(); - assert_eq!(slice, &frame.raw_bytes[..]); - assert_eq!(frame.len(), frame.raw_bytes.len()); +fn encode_payload_returns_packed_payload_region() { + let frame = BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None); + let bytes = frame.as_slice().to_vec(); + let payload = frame.payload().to_vec(); + // For Standard, payload is bytes[6..6+n_bytes]. + let (_, _, _, n_bytes, _) = unwrap_encode_standard(frame); + assert_eq!(payload, bytes[6..6 + n_bytes as usize]); } #[test] -fn twodelta_encode_frame_to_bytes_and_into_bytes() { - use crate::codec::frames::twodelta_encode::TwoDeltaEncodeFrame; - let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![3, 2], None); - let to = frame.to_bytes(); - let expected = frame.raw_bytes.clone(); - assert_eq!(to, expected); - let into = frame.into_bytes(); - assert_eq!(into, expected); +fn encode_as_ref_and_deref_match_as_slice() { + let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None); + let s = frame.as_slice(); + let r: &[u8] = frame.as_ref(); + assert_eq!(s, r); + // Deref makes slice methods callable directly. + assert_eq!(frame.len(), s.len()); } -// ── Non-EOF read errors propagate from frame decoders ─────────────────────── - #[test] -fn ben_decode_frame_non_eof_read_error_propagates() { - let err = BenDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +fn encode_partial_eq_vec_both_directions() { + let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None); + let bytes: Vec = frame.as_slice().to_vec(); + assert_eq!(frame, bytes); + assert_eq!(bytes, frame); } +// ── BenDecodeFrame::expand ────────────────────────────────────────────────── + #[test] -fn mkv_decode_frame_non_eof_read_error_propagates() { - let err = MkvBenDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +fn decode_expand_standard_assignment() { + // An assignment of [1, 1, 2, 2, 3] becomes RLE [(1,2),(2,2),(3,1)]. + let encoded = BenEncodeFrame::from_assignment( + &[1u16, 1, 2, 2, 3], + BenVariant::Standard, + None, + ); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) + .unwrap() + .unwrap(); + let assignment = decoded.expand(None).unwrap(); + assert_eq!(assignment, vec![1, 1, 2, 2, 3]); } #[test] -fn twodelta_decode_frame_non_eof_read_error_propagates() { - let err = TwoDeltaDecodeFrame::from_reader(&mut ErrorAfterOneByte).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); +fn decode_expand_twodelta_requires_prev() { + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)); + let mut cursor = io::Cursor::new(encoded.into_bytes()); + let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) + .unwrap() + .unwrap(); + let err = decoded.expand(None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); } diff --git a/ben/src/codec/frames/twodelta_decode.rs b/ben/src/codec/frames/twodelta_decode.rs deleted file mode 100644 index 82330dd..0000000 --- a/ben/src/codec/frames/twodelta_decode.rs +++ /dev/null @@ -1,59 +0,0 @@ -use super::twodelta_encode::TwoDeltaEncodeFrame; -use super::BenDecode; -use byteorder::{BigEndian, ReadBytesExt}; -use std::io::{self, Read}; - -/// A decoded TwoDelta delta frame, containing only what's needed to apply the delta. -/// -/// Unlike `TwoDeltaEncodeFrame`, this type does not retain raw bytes or -/// bit-packing metadata. It delegates bit-unpacking of the run lengths to -/// `TwoDeltaEncodeFrame::from_parts` and then discards everything except -/// `pair`, `run_lengths`, and `count`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TwoDeltaDecodeFrame { - /// The ordered pair of district ids involved in the delta. - pub pair: (u16, u16), - /// The unpacked run-length vector over the positions occupied by the pair. - pub run_lengths: Vec, - /// The number of times this delta repeats. - pub count: u16, -} - -impl BenDecode for TwoDeltaDecodeFrame { - /// Read the next TwoDelta delta frame from the stream. - /// - /// Reads pair, max_len_bits, n_bytes, payload, and count, then delegates - /// bit-unpacking to `TwoDeltaEncodeFrame::from_parts`. - /// Returns `Ok(None)` on a clean EOF at a frame boundary. - fn from_reader(reader: &mut impl Read) -> io::Result> { - let pair_a = match reader.read_u16::() { - Ok(v) => v, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e), - }; - - let pair_b = reader.read_u16::()?; - let max_len_bits = reader.read_u8()?; - if max_len_bits == 0 || max_len_bits > 16 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("invalid TwoDelta run-length bit width: {max_len_bits}"), - )); - } - let n_bytes = reader.read_u32::()?; - - let mut payload = vec![0u8; n_bytes as usize]; - reader.read_exact(&mut payload)?; - - let count = reader.read_u16::()?; - - let encode_frame = - TwoDeltaEncodeFrame::from_parts((pair_a, pair_b), max_len_bits, payload, count); - - Ok(Some(TwoDeltaDecodeFrame { - pair: encode_frame.pair, - run_lengths: encode_frame.run_length_vector, - count, - })) - } -} diff --git a/ben/src/codec/frames/twodelta_encode.rs b/ben/src/codec/frames/twodelta_encode.rs deleted file mode 100644 index 2663666..0000000 --- a/ben/src/codec/frames/twodelta_encode.rs +++ /dev/null @@ -1,202 +0,0 @@ -/// Canonical representation of a TwoDelta frame. -/// -/// A TwoDelta frame stores the two district ids that may change relative to -/// the previous sample and then encodes the lengths of alternating runs over -/// just those two ids. The first run always corresponds to `pair.0`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TwoDeltaEncodeFrame { - // The pair of district ids that are encoded in this frame, stored here for reference. - // Canonically, `pair.0` is the id for the first run in the run-length vector and `pair.1` - // is the id for the second run. - pub pair: (u16, u16), - // The number of bits used to encode the maximum run length in this frame. - pub max_len_bit_count: u8, - // The number of bytes in the packed payload. - pub n_bytes: u32, - // The run-length vector that was encoded into this frame, stored here for reference. - pub run_length_vector: Vec, - // The full serialized TwoDelta frame bytes, including the header, payload, and count. - pub raw_bytes: Vec, - // The number of times this frame is repeated. Mirrors the trailing u16 in `raw_bytes`. - pub count: u16, -} - -impl TwoDeltaEncodeFrame { - /// Borrow just the packed payload bytes. - pub fn payload(&self) -> &[u8] { - &self.raw_bytes[9..9 + self.n_bytes as usize] - } - - /// Borrow the serialized TwoDelta frame bytes. - pub fn as_slice(&self) -> &[u8] { - &self.raw_bytes - } - - /// Clone out the serialized TwoDelta frame bytes. - pub fn to_bytes(&self) -> Vec { - self.raw_bytes.clone() - } - - /// Consume the frame and return the serialized bytes without cloning. - pub fn into_bytes(self) -> Vec { - self.raw_bytes - } - - /// Build a TwoDelta frame by packing a run-length vector into the binary format. - /// - /// Run lengths are packed at `max_len_bit_count` bits per value (the minimum - /// bit width needed to represent the largest run length), MSB-first with no - /// padding between values. If the total bit count is not a multiple of 8, the - /// final byte is zero-padded on the right. - /// - /// The serialized layout is: - /// ```text - /// [pair.0: u16 BE][pair.1: u16 BE][max_len_bit_count: u8][n_bytes: u32 BE][payload...][count: u16 BE] - /// ``` - /// where the payload is the bit-packed run lengths. - /// - /// # Arguments - /// - /// * `pair` - The ordered pair of district ids. `pair.0` corresponds to the first run. - /// * `run_length_vector` - The lengths of alternating runs of `pair.0` and `pair.1` - /// over the positions occupied by the pair, in position order. - /// - /// # Returns - /// - /// A fully serialized `TwoDeltaEncodeFrame` with both the packed `raw_bytes` and the - /// original `run_length_vector` stored on the struct. - pub fn from_run_lengths( - pair: (u16, u16), - run_length_vector: Vec, - count: Option, - ) -> Self { - let count = match count { - Some(v) => v, - None => 1, - }; - - let max_len = run_length_vector.iter().copied().max().unwrap_or(0); - let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - - let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; - let n_bytes = payload_bits.div_ceil(8); - - // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) - let mut raw_bytes = Vec::with_capacity((n_bytes + 9) as usize); - raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); - raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); - raw_bytes.push(max_len_bit_count); - raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); - - let mut remainder: u32 = 0; - let mut remainder_bits: u8 = 0; - - for &item in &run_length_vector { - let mut packed = (remainder << max_len_bit_count) | item as u32; - let mut bits_left = remainder_bits + max_len_bit_count; - - while bits_left >= 8 { - bits_left -= 8; - raw_bytes.push((packed >> bits_left) as u8); - packed &= !((u32::MAX) << bits_left); - } - - remainder = packed; - remainder_bits = bits_left; - } - - if remainder_bits > 0 { - raw_bytes.push((remainder << (8 - remainder_bits)) as u8); - } - - raw_bytes.extend(count.to_be_bytes()); - - Self { - pair, - max_len_bit_count, - n_bytes, - run_length_vector, - raw_bytes, - count, - } - } - - /// Reconstruct a TwoDelta frame from already-parsed header fields and a raw payload. - /// - /// This is the inverse of `from_run_lengths`: it re-assembles the serialized bytes - /// and decodes the bit-packed payload back into the run-length vector so that both - /// representations are available on the resulting frame. - /// - /// The decoding reads `max_len_bit_count` bits at a time from the payload, MSB-first, - /// and discards any trailing zero-valued items produced by right-padding in the final byte. - /// - /// # Arguments - /// - /// * `pair` - The ordered pair of district ids as read from the frame header. - /// * `max_len_bit_count` - The bit width of each packed run length, as read from the - /// frame header. - /// * `payload` - The raw packed payload bytes, not including the 9-byte header. - /// * `count` - The repetition count for the frame, as read from the trailing `u16` - /// in the wire format. - /// - /// # Returns - /// - /// A `TwoDeltaEncodeFrame` with `raw_bytes` (header + payload + count), the decoded - /// `run_length_vector`, and `count` populated. - pub fn from_parts( - pair: (u16, u16), - max_len_bit_count: u8, - payload: Vec, - count: u16, - ) -> Self { - let n_bytes = payload.len() as u32; - let mut raw_bytes = Vec::with_capacity(9 + payload.len() + 2); - raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); - raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); - raw_bytes.push(max_len_bit_count); - raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); - raw_bytes.extend_from_slice(&payload); - raw_bytes.extend_from_slice(&count.to_be_bytes()); - - let mut run_length_vector = Vec::new(); - let mut buffer: u32 = 0; - let mut n_bits_in_buff: u16 = 0; - - for &byte in payload[..n_bytes as usize].iter() { - buffer |= (byte as u32).to_be() >> n_bits_in_buff; - n_bits_in_buff += 8; - - while n_bits_in_buff >= max_len_bit_count as u16 { - let item = (buffer >> (32 - max_len_bit_count)) as u16; - buffer <<= max_len_bit_count; - n_bits_in_buff -= max_len_bit_count as u16; - if item > 0 { - run_length_vector.push(item); - } - } - } - - Self { - pair, - max_len_bit_count, - n_bytes, - run_length_vector, - raw_bytes, - count, - } - } -} - -impl AsRef<[u8]> for TwoDeltaEncodeFrame { - fn as_ref(&self) -> &[u8] { - self.as_slice() - } -} - -impl std::ops::Deref for TwoDeltaEncodeFrame { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} diff --git a/ben/src/codec/mod.rs b/ben/src/codec/mod.rs index 5033978..e939599 100644 --- a/ben/src/codec/mod.rs +++ b/ben/src/codec/mod.rs @@ -10,7 +10,4 @@ pub mod encode; pub mod frames; pub mod translate; -pub use frames::{ - BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, MkvBenEncodeFrame, - TwoDeltaDecodeFrame, TwoDeltaEncodeFrame, -}; +pub use frames::{BenDecodeFrame, BenEncodeFrame}; diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 6540414..3d717c5 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -9,7 +9,6 @@ mod errors; use errors::TranslateError; -use crate::codec::BenConstruct; use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Read, Write}; @@ -23,11 +22,18 @@ use crate::BenVariant; /// # Arguments /// /// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator. +/// * `variant` - The BEN variant. Determines whether the resulting BEN frame +/// embeds a trailing repetition count. +/// * `count` - The repetition count for `MkvChain`. Ignored for `Standard`. /// /// # Returns /// /// Returns the encoded BEN frame payload and header. -fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { +fn ben32_to_ben_line( + ben32_vec: Vec, + variant: BenVariant, + count: u16, +) -> io::Result> { let mut buffer = [0u8; 4]; let mut ben32_rle: Vec<(u16, u16)> = Vec::new(); @@ -44,9 +50,9 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { let encoded = u32::from_be_bytes(buffer); let value = (encoded >> 16) as u16; - let count = (encoded & 0xFFFF) as u16; + let len = (encoded & 0xFFFF) as u16; - ben32_rle.push((value, count)); + ben32_rle.push((value, len)); } let eol_offset = ben32_vec.len(); @@ -58,7 +64,7 @@ fn ben32_to_ben_line(ben32_vec: Vec) -> io::Result> { })); } - Ok(BenEncodeFrame::from_rle(ben32_rle, None).into_bytes()) + Ok(BenEncodeFrame::from_rle(ben32_rle, variant, Some(count)).into_bytes()) } /// Translate a stream of ben32 frames into BEN frames. @@ -112,11 +118,8 @@ pub fn ben32_to_ben_lines( } } - let ben_vec = ben32_to_ben_line(ben32_vec)?; + let ben_vec = ben32_to_ben_line(ben32_vec, variant, n_reps)?; writer.write_all(&ben_vec)?; - if variant == BenVariant::MkvChain { - writer.write_all(&n_reps.to_be_bytes())?; - } } Ok(()) diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 259bcaa..aca135c 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -280,7 +280,7 @@ fn test_ben_to_ben32_lines_non_eof_error_on_frame_boundary() { #[test] fn test_ben32_to_ben_line_rejects_invalid_length() { - let err = ben32_to_ben_line(vec![1, 2, 3]).unwrap_err(); + let err = ben32_to_ben_line(vec![1, 2, 3], BenVariant::Standard, 0).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), @@ -290,7 +290,7 @@ fn test_ben32_to_ben_line_rejects_invalid_length() { #[test] fn test_ben32_to_ben_line_rejects_missing_terminator() { - let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1]).unwrap_err(); + let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1], BenVariant::Standard, 0).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs index 9650017..c7a70ba 100644 --- a/ben/src/io/reader/assignment_reader.rs +++ b/ben/src/io/reader/assignment_reader.rs @@ -1,14 +1,10 @@ use super::errors::DecoderInitError; -use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError}; -use crate::codec::{ - BenConstruct, BenDecode, BenDecodeFrame, BenEncodeFrame, MkvBenDecodeFrame, TwoDeltaDecodeFrame, -}; +use crate::codec::{BenDecodeFrame, BenEncodeFrame}; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::progress::Spinner; -use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::json; -use std::io::{self, Cursor, Read, Write}; +use std::io::{self, Read, Write}; /// Iterator over decoded assignments in an uncompressed BEN stream. pub struct AssignmentReader { @@ -21,26 +17,6 @@ pub struct AssignmentReader { spinner: Option, } -/// Internal frame representation, one variant per BEN encoding type. -enum StoredBenFrame { - /// A Standard BEN frame (count is always 1). - Standard(BenDecodeFrame), - /// An MkvChain BEN frame carrying its repetition count. - MkvChain(MkvBenDecodeFrame), - /// A TwoDelta delta frame carrying its pair, run lengths, and count. - TwoDelta(TwoDeltaDecodeFrame), -} - -impl StoredBenFrame { - fn count(&self) -> u16 { - match self { - Self::Standard(_) => 1, - Self::MkvChain(f) => f.count, - Self::TwoDelta(f) => f.count, - } - } -} - fn zero_count_frame_error() -> io::Error { io::Error::new( io::ErrorKind::InvalidData, @@ -117,35 +93,25 @@ impl AssignmentReader { }) } - /// Read and return the next stored frame from the underlying BEN stream. + /// Read the next frame from the underlying BEN stream. /// - /// Delegates to the appropriate `BenDecode::from_reader` implementation - /// based on the variant and whether the first TwoDelta frame has been read. + /// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire + /// format; this method tracks that state so the frame module stays + /// variant-clean. /// /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read /// failure, or `None` at a clean end of stream. - fn pop_frame_from_reader(&mut self) -> Option> { - match self.variant { - BenVariant::Standard => BenDecodeFrame::from_reader(&mut self.reader) - .transpose() - .map(|r| r.map(StoredBenFrame::Standard)), - BenVariant::MkvChain => MkvBenDecodeFrame::from_reader(&mut self.reader) - .transpose() - .map(|r| r.map(StoredBenFrame::MkvChain)), - BenVariant::TwoDelta => { - if !self.twodelta_consumed_first_frame { - // First TwoDelta frame is encoded in MkvChain format. - self.twodelta_consumed_first_frame = true; - MkvBenDecodeFrame::from_reader(&mut self.reader) - .transpose() - .map(|r| r.map(StoredBenFrame::MkvChain)) - } else { - TwoDeltaDecodeFrame::from_reader(&mut self.reader) - .transpose() - .map(|r| r.map(StoredBenFrame::TwoDelta)) - } - } - } + fn pop_frame_from_reader(&mut self) -> Option> { + let read_variant = if self.variant == BenVariant::TwoDelta + && !self.twodelta_consumed_first_frame + { + self.twodelta_consumed_first_frame = true; + BenVariant::MkvChain + } else { + self.variant + }; + + BenDecodeFrame::from_reader(&mut self.reader, read_variant).transpose() } /// Consume this decoder and iterate over raw BEN frames instead of @@ -191,17 +157,7 @@ impl AssignmentReader { return Err(zero_count_frame_error()); } - let assignment = match frame { - StoredBenFrame::Standard(f) => decode_ben_frame_to_assignment(&f)?, - StoredBenFrame::MkvChain(f) => decode_mkv_frame_to_assignment(&f)?, - StoredBenFrame::TwoDelta(f) => { - let prev = self - .previous_assignment - .take() - .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?; - apply_twodelta_runs_to_assignment(prev, f.pair, &f.run_lengths)? - } - }; + let assignment = frame.expand(self.previous_assignment.take())?; let keep_going = f(&assignment, count)?; self.previous_assignment = Some(assignment); @@ -218,45 +174,6 @@ impl AssignmentReader { } } -/// Decode a raw Standard BEN frame into a full assignment vector. -pub(super) fn decode_ben_frame_to_assignment(frame: &BenDecodeFrame) -> io::Result> { - decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bit_count, - frame.max_len_bit_count, - frame.n_bytes, - ) - .map(rle_to_vec) -} - -/// Decode a raw MkvChain BEN frame into a full assignment vector. -pub(super) fn decode_mkv_frame_to_assignment(frame: &MkvBenDecodeFrame) -> io::Result> { - decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bit_count, - frame.max_len_bit_count, - frame.n_bytes, - ) - .map(rle_to_vec) -} - -/// Decode a stored BEN frame into a full assignment vector. -fn decode_stored_frame_to_assignment( - previous_assignment: &mut Option>, - frame: &StoredBenFrame, -) -> io::Result> { - match frame { - StoredBenFrame::Standard(f) => decode_ben_frame_to_assignment(f), - StoredBenFrame::MkvChain(f) => decode_mkv_frame_to_assignment(f), - StoredBenFrame::TwoDelta(f) => { - let prev = previous_assignment - .take() - .ok_or_else(|| io::Error::from(DecodeError::TwoDeltaNoAnchorFrame))?; - apply_twodelta_runs_to_assignment(prev, f.pair, &f.run_lengths) - } - } -} - impl Iterator for AssignmentReader { type Item = io::Result; @@ -270,11 +187,10 @@ impl Iterator for AssignmentReader { if count == 0 { return Some(Err(zero_count_frame_error())); } - let assignment = - match decode_stored_frame_to_assignment(&mut self.previous_assignment, &frame) { - Ok(assgn) => assgn, - Err(e) => return Some(Err(e)), - }; + let assignment = match frame.expand(self.previous_assignment.take()) { + Ok(a) => a, + Err(e) => return Some(Err(e)), + }; self.previous_assignment = Some(assignment.clone()); self.sample_count += count as usize; if !self.silent { @@ -303,44 +219,57 @@ impl AssignmentFrameReader { impl Iterator for AssignmentFrameReader { type Item = io::Result<(BenDecodeFrame, u16)>; - /// Return the next raw BEN frame from the input stream. + /// Return the next raw BEN frame from the input stream paired with its + /// repetition count. + /// + /// For `Standard` and `MkvChain` streams, returns the frame as read off + /// the wire (with `count` taken from the frame for `MkvChain`, or `1` + /// for `Standard`). /// - /// For Standard and MkvChain streams, returns the raw decoded frame paired - /// with its repetition count. - /// For TwoDelta streams, materializes each assignment and re-encodes it. + /// For `TwoDelta` streams, materializes each assignment via `expand` + /// and re-encodes it as a Standard-shaped decode frame so downstream + /// subsampling consumers always see self-contained frames. fn next(&mut self) -> Option { match self.inner.variant { - BenVariant::Standard => BenDecodeFrame::from_reader(&mut self.inner.reader) - .transpose() - .map(|r| r.map(|frame| (frame, 1))), - BenVariant::MkvChain => { - MkvBenDecodeFrame::from_reader(&mut self.inner.reader) - .transpose() - .map(|r| r.and_then(|frame| { - let count = frame.count; + BenVariant::Standard | BenVariant::MkvChain => { + match self.inner.pop_frame_from_reader() { + Some(Ok(frame)) => { + let count = frame.count(); if count == 0 { - return Err(zero_count_frame_error()); + return Some(Err(zero_count_frame_error())); } - Ok(( - BenDecodeFrame { - max_val_bit_count: frame.max_val_bit_count, - max_len_bit_count: frame.max_len_bit_count, - n_bytes: frame.n_bytes, - raw_bytes: frame.raw_bytes, - }, - count, - )) - })) + Some(Ok((frame, count))) + } + Some(Err(e)) => Some(Err(e)), + None => None, + } } BenVariant::TwoDelta => match self.inner.next() { Some(Ok((assignment, count))) => { - let encoded = BenEncodeFrame::from_assignment(&assignment, None); + let encoded = + BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); + let (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes) = match encoded { + BenEncodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + .. + } => (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes), + _ => unreachable!( + "BenEncodeFrame::from_assignment(Standard) always returns Standard" + ), + }; + // Strip the 6-byte frame header so the emitted decode-side + // frame's raw_bytes matches the historical payload-only + // shape that BenDecodeFrame::Standard carries. + let payload_only = raw_bytes[6..].to_vec(); Some(Ok(( - BenDecodeFrame { - max_val_bit_count: encoded.max_val_bit_count, - max_len_bit_count: encoded.max_len_bit_count, - n_bytes: encoded.n_bytes, - raw_bytes: encoded.raw_bytes[6..].to_vec(), + BenDecodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes: payload_only, }, count, ))) diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index b0b59b5..62e1532 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -1,4 +1,3 @@ -use super::assignment_reader::decode_ben_frame_to_assignment; use super::assignment_reader::AssignmentFrameReader; use super::errors::DecoderInitError; use super::xz_assignment_reader::decode_xben_frame_to_assignment; @@ -45,9 +44,13 @@ pub enum Selection { /// # Returns /// /// Returns the expanded assignment vector. +/// +/// `AssignmentFrameReader` rewrites TwoDelta BEN frames into self-contained +/// Standard frames before they reach this path, so `Ben(...)` is always a +/// `Standard` or `MkvChain` arm and `expand(None)` is always sufficient here. pub(super) fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { match frame { - DecodeFrame::Ben(f) => decode_ben_frame_to_assignment(f), + DecodeFrame::Ben(f) => f.expand(None), DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), } } diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs index ecb2107..b6b09a3 100644 --- a/ben/src/io/writer/assignment_writer.rs +++ b/ben/src/io/writer/assignment_writer.rs @@ -1,6 +1,6 @@ use super::utils::parse_json_assignment; use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::{BenConstruct, BenEncodeFrame, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; +use crate::codec::BenEncodeFrame; use crate::format::banners::banner_for_variant; use crate::BenVariant; use serde_json::Value; @@ -60,24 +60,30 @@ impl AssignmentWriter { match self.variant { BenVariant::Standard => { - let frame = BenEncodeFrame::from_assignment(&pending_sample, None); + let frame = + BenEncodeFrame::from_assignment(&pending_sample, BenVariant::Standard, None); for _ in 0..self.sample_count { self.writer.write_all(frame.as_slice())?; } } BenVariant::MkvChain => { - let frame = - MkvBenEncodeFrame::from_assignment(&pending_sample, Some(self.sample_count)); + let frame = BenEncodeFrame::from_assignment( + &pending_sample, + BenVariant::MkvChain, + Some(self.sample_count), + ); self.writer.write_all(frame.as_slice())?; } BenVariant::TwoDelta => { if self.previous_sample.is_empty() { - // First frame: encode as MkvBen and build the initial masks. + // First frame: encode in MkvChain wire format and build + // the initial position masks. for (idx, &val) in pending_sample.iter().enumerate() { self.previous_masks.entry(val).or_default().push(idx); } - let frame = MkvBenEncodeFrame::from_assignment( + let frame = BenEncodeFrame::from_assignment( &pending_sample, + BenVariant::MkvChain, Some(self.sample_count), ); self.writer.write_all(frame.as_slice())?; @@ -165,7 +171,7 @@ impl AssignmentWriter { pub(super) fn twodelta_repeat_frame( assignment: &[u16], count: u16, -) -> io::Result { +) -> io::Result { let first = assignment.first().copied().unwrap_or(0); let second = assignment .iter() @@ -201,7 +207,7 @@ pub(super) fn twodelta_repeat_frame( run_lengths.push(run_len); } - Ok(TwoDeltaEncodeFrame::from_run_lengths( + Ok(BenEncodeFrame::from_run_lengths( (first, second), run_lengths, Some(count), diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index e11e6fd..3bd90be 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -6,7 +6,7 @@ use super::utils::{encode_xben_twodelta_full_frame, parse_json_assignment}; use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; -use crate::codec::TwoDeltaEncodeFrame; +use crate::codec::BenEncodeFrame; use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; use crate::progress::Spinner; use crate::BenVariant; @@ -227,9 +227,19 @@ impl XZAssignmentWriter { Some(&mut self.previous_masks), None, )?; + let (pair, run_lengths) = match frame { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!( + "encode_twodelta_frame_with_hint always returns the TwoDelta arm" + ), + }; self.chunk_buffer.push(BufferedDeltaFrame { - pair: frame.pair, - run_lengths: frame.run_length_vector, + pair, + run_lengths, count: 1, }); self.previous_assignment = assign_vec; @@ -321,17 +331,24 @@ impl XZAssignmentWriter { reader.read_exact(&mut payload)?; let count = reader.read_u16::()?; - // Unpack bitpacked run lengths. - let frame = TwoDeltaEncodeFrame::from_parts( + // Unpack bitpacked run lengths via the frame layer's TwoDelta + // constructor, then peel out the fields we need for buffering. + let (pair, run_lengths) = match BenEncodeFrame::from_parts( (pair_a, pair_b), delta_max_len_bits, payload, count, - ); - let run_lengths = frame.run_length_vector; + ) { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), + }; self.chunk_buffer.push(BufferedDeltaFrame { - pair: frame.pair, + pair, run_lengths, count, }); diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index f8f2eac..108f095 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -4,7 +4,7 @@ mod errors; use errors::RelabelError; use crate::codec::decode::decode_ben_line; -use crate::codec::{BenConstruct, BenEncodeFrame}; +use crate::codec::BenEncodeFrame; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::AssignmentReader; @@ -365,11 +365,9 @@ fn relabel_ben_lines_impl( 1 }; - let relabeled = BenEncodeFrame::from_rle(ben_line, None); + let relabeled = + BenEncodeFrame::from_rle(ben_line, variant, Some(count_occurrences)); writer.write_all(relabeled.as_slice())?; - if variant == BenVariant::MkvChain { - writer.write_all(&count_occurrences.to_be_bytes())?; - } sample_number += count_occurrences as usize; @@ -595,11 +593,9 @@ fn relabel_ben_lines_with_map_impl( 1 }; - let relabeled = BenEncodeFrame::from_rle(new_rle.clone(), None); + let relabeled = + BenEncodeFrame::from_rle(new_rle.clone(), variant, Some(count_occurrences)); writer.write_all(relabeled.as_slice())?; - if variant == BenVariant::MkvChain { - writer.write_all(&count_occurrences.to_be_bytes())?; - } sample_number += count_occurrences as usize; spinner.set_count(sample_number as u64); diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index d14ec7b..30d8ce8 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -1,8 +1,9 @@ use super::*; use crate::codec::decode::decode_ben_to_jsonl; use crate::codec::encode::encode_jsonl_to_ben; -use crate::codec::{BenConstruct, BenEncodeFrame}; +use crate::codec::BenEncodeFrame; use crate::util::rle::assign_to_rle; +use crate::BenVariant; use rand::seq::SliceRandom; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; @@ -44,10 +45,10 @@ where fn test_relabel_ben_line_simple() { let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; - let input = BenEncodeFrame::from_rle(in_rle, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; - let expected = BenEncodeFrame::from_rle(out_rle, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); let mut buf = Vec::new(); relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); @@ -218,11 +219,11 @@ fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; let in_rle = assign_to_rle(in_assign); - let input = BenEncodeFrame::from_rle(in_rle, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); let mut new_to_old_map = HashMap::new(); new_to_old_map.insert(0, 2); @@ -253,11 +254,11 @@ fn test_relabel_ben_line_with_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign); - let input = BenEncodeFrame::from_rle(in_rle, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); let mut buf = Vec::new(); relabel_ben_lines_with_map( @@ -284,11 +285,11 @@ fn test_relabel_ben_line_with_large_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign.to_vec()); - let input = BenEncodeFrame::from_rle(in_rle, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); let mut buf = Vec::new(); relabel_ben_lines_with_map( diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index 26e8d85..fe5dae7 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -4,12 +4,11 @@ //! equivalent depth for the two more complex variants. The helpers intentionally //! mirror those in `test_coverage.rs` so that the two suites are easy to compare. -use binary_ensemble::codec::decode::{decode_ben_line, decode_ben_to_jsonl}; +use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::codec::encode::encode_jsonl_to_ben; use binary_ensemble::format::banners::{MKVCHAIN_BEN_BANNER, TWODELTA_BEN_BANNER}; use binary_ensemble::io::reader::{AssignmentFrameReader, AssignmentReader}; use binary_ensemble::io::writer::AssignmentWriter; -use binary_ensemble::util::rle::rle_to_vec; use binary_ensemble::BenVariant; use std::io::{self, Cursor}; @@ -432,14 +431,7 @@ mod mkvchain { .unwrap() .unwrap(); - let decoded = decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bit_count, - frame.max_len_bit_count, - frame.n_bytes, - ) - .map(rle_to_vec) - .unwrap(); + let decoded = frame.expand(None).unwrap(); assert_eq!(decoded, assignment); } @@ -1027,14 +1019,7 @@ mod twodelta { assert_eq!(frames.len(), 3); for (i, (frame, _count)) in frames.iter().enumerate() { - let decoded = decode_ben_line( - Cursor::new(&frame.raw_bytes), - frame.max_val_bit_count, - frame.max_len_bit_count, - frame.n_bytes, - ) - .map(rle_to_vec) - .unwrap(); + let decoded = frame.expand(None).unwrap(); assert_eq!(decoded, input[i], "frame {i} decoded incorrectly"); } } diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index bef2563..4471697 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -9,7 +9,7 @@ use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, encode_twodelta_frame, }; -use binary_ensemble::codec::{BenConstruct, BenEncodeFrame, TwoDeltaEncodeFrame}; +use binary_ensemble::codec::BenEncodeFrame; use binary_ensemble::format::banners::{ banner_for_variant, has_known_banner_prefix, variant_from_banner, BANNER_LEN, MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, @@ -700,7 +700,7 @@ fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { #[test] fn encode_ben_vec_from_rle_empty_rle() { // Empty RLE produces a minimal frame with zero payload bytes. - let frame = BenEncodeFrame::from_rle(vec![], None); + let frame = BenEncodeFrame::from_rle(vec![], BenVariant::Standard, None); // 1 byte max_val_bits + 1 byte max_len_bits + 4 bytes n_bytes = 6 bytes assert_eq!(frame.as_slice().len(), 6); } @@ -709,21 +709,21 @@ fn encode_ben_vec_from_rle_empty_rle() { fn encode_ben_vec_from_assign_and_rle_are_equivalent() { let assign = vec![3u16, 3, 3, 1, 2, 2]; let rle = assign_to_rle(&assign); - let via_assign = BenEncodeFrame::from_assignment(&assign, None); - let via_rle = BenEncodeFrame::from_rle(rle, None); + let via_assign = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None); + let via_rle = BenEncodeFrame::from_rle(rle, BenVariant::Standard, None); assert_eq!(via_assign.as_slice(), via_rle.as_slice()); } #[test] fn encode_ben_vec_from_assign_single_element() { - let frame = BenEncodeFrame::from_assignment(&[42u16], None); + let frame = BenEncodeFrame::from_assignment(&[42u16], BenVariant::Standard, None); assert!(!frame.as_slice().is_empty()); } #[test] fn encode_ben_vec_from_assign_all_same() { let assign = vec![7u16; 500]; - let frame = BenEncodeFrame::from_assignment(&assign, None); + let frame = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None); // Should encode efficiently — the payload compresses a single run. assert!(!frame.as_slice().is_empty()); } @@ -1522,7 +1522,7 @@ fn encode_twodelta_frame_valid_two_value_transition() { let next = vec![2u16, 2, 1, 1]; let frame = encode_twodelta_frame(&prev, &next, Some(1)).unwrap(); // All 4 positions belong to the pair, and all flip - assert_eq!(frame.n_bytes as usize, frame.payload().len()); + assert_eq!(frame.n_bytes() as usize, frame.payload().len()); } #[test] @@ -1531,7 +1531,7 @@ fn encode_twodelta_frame_single_value_swap() { let prev = vec![1u16, 1, 1, 2]; let next = vec![1u16, 1, 1, 1]; let frame = encode_twodelta_frame(&prev, &next, Some(1)).unwrap(); - assert_eq!(frame.pair, (1, 2)); + assert_eq!(frame.pair().unwrap(), (1, 2)); } // ────────────────────────────────────────────────────────────────────────────── @@ -1542,8 +1542,8 @@ fn encode_twodelta_frame_single_value_swap() { fn twodelta_frame_pair_accessor() { let pair = (3u16, 7u16); let run_lengths = vec![2u16, 3, 1]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.pair, pair); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); + assert_eq!(frame.pair().unwrap(), pair); } #[test] @@ -1551,23 +1551,23 @@ fn twodelta_frame_max_len_bits_accessor() { // max run length = 4 = 0b100 → 3 bits let pair = (1u16, 2u16); let run_lengths = vec![4u16, 4]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.max_len_bit_count, 3); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); + assert_eq!(frame.max_len_bit_count(), 3); } #[test] fn twodelta_frame_n_bytes_and_payload_consistent() { let pair = (5u16, 10u16); let run_lengths = vec![1u16, 2, 3]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.n_bytes as usize, frame.payload().len()); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); + assert_eq!(frame.n_bytes() as usize, frame.payload().len()); } #[test] fn twodelta_frame_to_bytes_and_as_slice_same() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); assert_eq!(frame.to_bytes(), frame.as_slice()); } @@ -1575,7 +1575,7 @@ fn twodelta_frame_to_bytes_and_as_slice_same() { fn twodelta_frame_into_bytes_consumes() { let pair = (1u16, 2u16); let run_lengths = vec![3u16, 2]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); let expected = frame.to_bytes(); let actual = frame.into_bytes(); assert_eq!(actual, expected); @@ -1585,25 +1585,25 @@ fn twodelta_frame_into_bytes_consumes() { fn twodelta_frame_from_parts_round_trip() { let pair = (10u16, 20u16); let run_lengths = vec![2u16, 5, 1]; - let original = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); - let reconstructed = TwoDeltaEncodeFrame::from_parts( + let original = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); + let reconstructed = BenEncodeFrame::from_parts( pair, - original.max_len_bit_count, + original.max_len_bit_count(), original.payload().to_vec(), - original.count, + original.count(), ); assert_eq!(original.as_slice(), reconstructed.as_slice()); - assert_eq!(original.pair, reconstructed.pair); - assert_eq!(original.max_len_bit_count, reconstructed.max_len_bit_count); - assert_eq!(original.n_bytes, reconstructed.n_bytes); - assert_eq!(original.count, reconstructed.count); + assert_eq!(original.pair().unwrap(), reconstructed.pair().unwrap()); + assert_eq!(original.max_len_bit_count(), reconstructed.max_len_bit_count()); + assert_eq!(original.n_bytes(), reconstructed.n_bytes()); + assert_eq!(original.count(), reconstructed.count()); } #[test] fn twodelta_frame_asref_and_deref() { let pair = (1u16, 2u16); let run_lengths = vec![3u16]; - let frame = TwoDeltaEncodeFrame::from_run_lengths(pair, run_lengths, None); + let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); let as_ref: &[u8] = frame.as_ref(); let deref: &[u8] = &*frame; assert_eq!(as_ref, deref); @@ -1617,45 +1617,45 @@ fn twodelta_frame_asref_and_deref() { #[test] fn encode_ben_frame_from_rle_runs_accessor() { let runs = vec![(3u16, 2u16), (5u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs.clone(), None); - assert_eq!(frame.runs.as_slice(), runs.as_slice()); + let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + assert_eq!(frame.runs().unwrap().as_slice(), runs.as_slice()); } #[test] fn encode_ben_frame_max_val_bits() { // max value = 5 = 0b101 → 3 bits let runs = vec![(1u16, 3u16), (5u16, 2u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); - assert_eq!(frame.max_val_bit_count, 3); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + assert_eq!(frame.max_val_bit_count(), Some(3)); } #[test] fn encode_ben_frame_max_len_bits() { // max run length = 7 = 0b111 → 3 bits let runs = vec![(1u16, 7u16), (2u16, 1u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); - assert_eq!(frame.max_len_bit_count, 3); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + assert_eq!(frame.max_len_bit_count(), 3); } #[test] fn encode_ben_frame_n_bytes_consistent() { // Frame layout: 1 byte (max_val_bits) + 1 byte (max_len_bits) + 4 bytes (n_bytes header) + n_bytes payload let runs = vec![(1u16, 5u16), (2u16, 3u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); - assert_eq!(frame.n_bytes as usize + 6, frame.as_slice().len()); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + assert_eq!(frame.n_bytes() as usize + 6, frame.as_slice().len()); } #[test] fn encode_ben_frame_to_bytes_and_as_slice_same() { let runs = vec![(1u16, 2u16), (3u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); assert_eq!(frame.to_bytes(), frame.as_slice()); } #[test] fn encode_ben_frame_into_bytes_consumes() { let runs = vec![(1u16, 2u16), (3u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); let expected = frame.to_bytes(); let actual = frame.into_bytes(); assert_eq!(actual, expected); @@ -1664,7 +1664,7 @@ fn encode_ben_frame_into_bytes_consumes() { #[test] fn encode_ben_frame_eq_with_vec_u8() { let runs = vec![(1u16, 2u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); let bytes = frame.to_bytes(); assert!(frame == bytes); assert!(bytes == frame); @@ -1673,7 +1673,7 @@ fn encode_ben_frame_eq_with_vec_u8() { #[test] fn encode_ben_frame_asref_and_deref() { let runs = vec![(1u16, 1u16)]; - let frame = BenEncodeFrame::from_rle(runs, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); let as_ref: &[u8] = frame.as_ref(); let deref: &[u8] = &*frame; assert_eq!(as_ref, deref); @@ -1683,9 +1683,9 @@ fn encode_ben_frame_asref_and_deref() { #[test] fn encode_ben_frame_from_assignment() { let assignment = vec![1u16, 1, 2, 2, 3]; - let frame = BenEncodeFrame::from_assignment(&assignment, None); + let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); // Frame from assignment should produce runs - let runs = &frame.runs[..]; + let runs = &frame.runs().unwrap()[..]; assert_eq!(runs, &[(1u16, 2u16), (2u16, 2u16), (3u16, 1u16)]); } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index ce83284..667769c 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -6,7 +6,7 @@ use binary_ensemble::codec::decode::{ use binary_ensemble::codec::encode::{ encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben, xz_compress, }; -use binary_ensemble::codec::{BenConstruct, BenEncodeFrame}; +use binary_ensemble::codec::BenEncodeFrame; use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, AssignmentReader, DecodeFrame, DecoderInitError, SubsampleFrameDecoder, XZAssignmentReader, @@ -764,7 +764,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { fn encode_decode_ben32_odd_bit_packing_roundtrip() { // values up to 3 (2 bits), lengths big to make non-byte boundary let rle = vec![(1u16, 3u16), (2, 5), (3, 7)]; - let ben_frame = BenEncodeFrame::from_rle(rle.clone(), None); + let ben_frame = BenEncodeFrame::from_rle(rle.clone(), BenVariant::Standard, None); let ben = ben_frame.as_slice(); // ben layout: [max_val_bits, max_len_bits, n_bytes, payload...] let max_val_bits = ben[0]; @@ -1378,7 +1378,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { encoder.finish().unwrap(); } - let expected_first = BenEncodeFrame::from_assignment(&first, None); + let expected_first = BenEncodeFrame::from_assignment(&first, BenVariant::Standard, None); assert_eq!(&ben[..17], b"TWODELTA BEN FILE"); assert_eq!( &ben[17..17 + expected_first.as_slice().len()], diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index c3d961b..38f66d4 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -3,7 +3,8 @@ use binary_ensemble::codec::decode::{ xz_decompress, }; use binary_ensemble::codec::encode::{encode_jsonl_to_xben, xz_compress}; -use binary_ensemble::codec::{BenConstruct, MkvBenEncodeFrame, TwoDeltaEncodeFrame}; +use binary_ensemble::codec::BenEncodeFrame; +use binary_ensemble::BenVariant; use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; @@ -19,7 +20,6 @@ use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::{AssignmentReader, XZAssignmentReader}; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::ops::relabel::relabel_ben_file_with_map; -use binary_ensemble::BenVariant; use std::cell::RefCell; use std::collections::HashMap; use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; @@ -316,7 +316,8 @@ fn malformed_ben_bit_widths_return_invalid_data() { #[test] fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { - let anchor = MkvBenEncodeFrame::from_assignment(vec![1u16, 2], Some(1)); + let anchor = + BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)); let mut ben = TWODELTA_BEN_BANNER.to_vec(); ben.extend_from_slice(anchor.as_slice()); ben.extend_from_slice(&[0, 1, 0, 2, 0, 0, 0, 0, 0, 1]); @@ -326,7 +327,7 @@ fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { let err = reader.next().unwrap().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); - let frame = TwoDeltaEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); let err = decode_twodelta_frame(vec![1u16], &frame).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } @@ -466,7 +467,7 @@ fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { #[test] fn zero_count_frames_are_rejected() { - let frame = MkvBenEncodeFrame::from_assignment(vec![1u16], Some(0)); + let frame = BenEncodeFrame::from_assignment(vec![1u16], BenVariant::MkvChain, Some(0)); let mut ben = MKVCHAIN_BEN_BANNER.to_vec(); ben.extend_from_slice(frame.as_slice()); let err = AssignmentReader::new(ben.as_slice()) From 6c1318dfb8c0f37dfc3bd8bed6479b3651262611 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 6 May 2026 07:33:53 -0600 Subject: [PATCH 085/221] Fix python tests --- ben-py/src/decode/decoder.rs | 43 ++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 418cfa6..1674ea6 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -21,7 +21,10 @@ pub struct PyBenDecoder { path: PathBuf, mode: DecoderMode, backend: DecoderBackend, - iter: DynIter, + /// Lazily-constructed frame iterator. We defer construction so opening + /// a bundle whose stream is empty or truncated still succeeds — only + /// methods that actually walk the stream need a live iterator. + iter: Option, current_assignment: Option>, remaining_count: u16, base_len: Option, @@ -89,12 +92,16 @@ impl PyBenDecoder { warn_xben_startup(py)?; } - let iter = build_bundle_iter(&file_path, &state, derived_mode)?; + // Iter construction is deferred: opening a bundle with an + // empty or truncated stream is legal (incomplete or zero-sample + // finalized bundles), and metadata methods like + // `count_samples`, `asset_names`, and `extract_stream` don't + // need a live iterator. Iteration paths build it on demand. Ok(Self { path: file_path, mode: derived_mode, backend: DecoderBackend::Bundle(state), - iter, + iter: None, current_assignment: None, remaining_count: 0, base_len: None, @@ -105,12 +112,16 @@ impl PyBenDecoder { if matches!(parsed_mode, DecoderMode::XBen) { warn_xben_startup(py)?; } + // For plain streams, opening the file as a BEN/XBEN reader is + // the only way to learn the variant — keep eager construction + // so we surface a malformed-banner error at open time, matching + // the documented behaviour of `BenDecoder("…", mode="ben")`. let iter = build_plain_iter(&file_path, parsed_mode)?; Ok(Self { path: file_path, mode: parsed_mode, backend: DecoderBackend::Plain, - iter, + iter: Some(iter), current_assignment: None, remaining_count: 0, base_len: None, @@ -148,7 +159,7 @@ impl PyBenDecoder { } }; - slf.iter = new_iter; + slf.iter = Some(new_iter); Ok(slf.into()) } @@ -158,7 +169,25 @@ impl PyBenDecoder { let a = slf.current_assignment.as_ref().unwrap().clone(); return Ok(Some(a)); } - match slf.iter.next() { + // Build the iterator on first use (e.g. when iteration begins + // without an explicit `__iter__` call). For bundle backends with + // empty/truncated streams this is where the BEN-banner-required + // error surfaces, instead of at `BenDecoder(...)` construction. + if slf.iter.is_none() { + let path = slf.path.clone(); + let mode = slf.mode; + let new_iter: DynIter = match &slf.backend { + DecoderBackend::Plain => build_plain_iter(&path, mode)?, + DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, + }; + slf.iter = Some(new_iter); + } + let next = slf + .iter + .as_mut() + .expect("iter populated by the lazy-init branch above") + .next(); + match next { Some(Ok((assignment, count))) => { if count == 0 { return Err(PyException::new_err( @@ -533,7 +562,7 @@ fn reset_with_selection( ) -> PyResult<()> { let frames = build_frames_for_subsample(&decoder.path, decoder.mode, &decoder.backend)?; let frame_decoder = SubsampleFrameDecoder::new(frames, selection); - decoder.iter = Box::new(frame_decoder); + decoder.iter = Some(Box::new(frame_decoder)); decoder.current_assignment = None; decoder.remaining_count = 0; decoder.len_hint = Some(len_hint); From 143bf91419dba97b372b5bcc9e387d8b9b91d562 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 6 May 2026 15:22:29 -0600 Subject: [PATCH 086/221] Increase size of xz block to improve ability to parallelize --- ben-py/src/encode/py_funcs.rs | 44 ++++++++---- ben/src/cli/ben/args.rs | 8 +++ ben/src/cli/ben/bundle.rs | 3 + ben/src/cli/ben/modes/xencode.rs | 3 + ben/src/cli/ben/modes/xz_compress.rs | 8 ++- ben/src/codec/decode/jsonl.rs | 2 +- ben/src/codec/decode/tests/mkvchain.rs | 4 +- ben/src/codec/decode/tests/mod.rs | 19 +++--- ben/src/codec/decode/tests/standard.rs | 6 +- ben/src/codec/decode/tests/twodelta.rs | 3 + ben/src/codec/encode/jsonl.rs | 21 ++++-- ben/src/codec/encode/mod.rs | 2 +- ben/src/codec/encode/path.rs | 29 ++++++-- ben/src/codec/encode/tests.rs | 7 ++ ben/src/codec/encode/xz.rs | 95 +++++++++++++++++--------- ben/src/io/reader/tests.rs | 8 +-- ben/src/ops/extract/tests.rs | 1 + ben/tests/test_coverage.rs | 6 +- ben/tests/test_impls_pipeline.rs | 26 ++++++- ben/tests/test_pipeline.rs | 3 + ben/tests/test_stress_edges.rs | 8 ++- 21 files changed, 231 insertions(+), 75 deletions(-) diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index 2b93965..7ab9fae 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -9,9 +9,9 @@ use pyo3::prelude::*; use std::path::PathBuf; #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, n_threads = None, compression_level = None))] +#[pyo3(signature = (in_file, out_file, overwrite=false, n_threads=None, compression_level=None, xz_block_size=None))] #[pyo3( - text_signature = "(in_file, out_file, overwrite=false, n_threads=None, compression_level=None)" + text_signature = "(in_file, out_file, overwrite=False, n_threads=None, compression_level=None, xz_block_size=None)" )] pub fn encode_ben_to_xben( in_file: PathBuf, @@ -19,12 +19,21 @@ pub fn encode_ben_to_xben( overwrite: bool, n_threads: Option, compression_level: Option, + xz_block_size: Option, ) -> PyResult<()> { validate_input_output_paths(&in_file, &out_file)?; let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - core_encode_ben_to_xben(reader, writer, n_threads, compression_level, None).map_err(|e| { + core_encode_ben_to_xben( + reader, + writer, + n_threads, + compression_level, + None, + xz_block_size, + ) + .map_err(|e| { PyIOError::new_err(format!( "Failed to convert BEN to XBEN from {} to {}: {e}", in_file.display(), @@ -60,9 +69,9 @@ pub fn encode_jsonl_to_ben( } #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None))] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None, xz_block_size=None))] #[pyo3( - text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain', n_threads=None, compression_level=None)" + text_signature = "(in_file, out_file, overwrite=False, variant='mkv_chain', n_threads=None, compression_level=None, xz_block_size=None)" )] pub fn encode_jsonl_to_xben( in_file: PathBuf, @@ -71,19 +80,28 @@ pub fn encode_jsonl_to_xben( variant: &str, n_threads: Option, compression_level: Option, + xz_block_size: Option, ) -> PyResult<()> { let ben_var = parse_variant(Some(variant))?; validate_input_output_paths(&in_file, &out_file)?; let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; - core_encode_jsonl_to_xben(reader, writer, ben_var, n_threads, compression_level, None) - .map_err(|e| { - PyIOError::new_err(format!( - "Failed to convert JSONL to XBEN from {} to {}: {e}", - in_file.display(), - out_file.display() - )) - })?; + core_encode_jsonl_to_xben( + reader, + writer, + ben_var, + n_threads, + compression_level, + None, + xz_block_size, + ) + .map_err(|e| { + PyIOError::new_err(format!( + "Failed to convert JSONL to XBEN from {} to {}: {e}", + in_file.display(), + out_file.display() + )) + })?; Ok(()) } diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index 528095d..85a55d1 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -131,6 +131,14 @@ pub(super) struct Args { /// Default is 10,000. #[arg(long)] pub chunk_size: Option, + /// Per-block size in bytes for the multithreaded XZ encoder. + /// liblzma needs a non-zero block size to actually fan compression + /// out across worker threads; smaller blocks scale parallelism better + /// at a slight compression-ratio cost. Defaults to 16 MiB when + /// `--n-cpus > 1`, or 0 (liblzma auto, ~192 MiB at preset 9) for + /// single-thread runs. + #[arg(long)] + pub xz_block_size: Option, /// Embed a graph JSON asset alongside the assignment stream and emit /// the result as a `.bendl` bundle. The graph is added after the /// assignment stream has been fully written. Only applies to the diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index e674eaa..eb8af39 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -93,6 +93,7 @@ pub(super) fn run_xencode_bundle_with_graph( n_threads: Option, compression_level: Option, chunk_size: Option, + block_size: Option, graph_path: &Path, ) -> Result<()> { std::fs::metadata(graph_path).map_err(|e| { @@ -123,6 +124,7 @@ pub(super) fn run_xencode_bundle_with_graph( n_threads, compression_level, chunk_size, + block_size, )?; } else { encode_jsonl_to_xben( @@ -132,6 +134,7 @@ pub(super) fn run_xencode_bundle_with_graph( n_threads, compression_level, chunk_size, + block_size, )?; } handle diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs index 1f67eb0..a550aaa 100644 --- a/ben/src/cli/ben/modes/xencode.rs +++ b/ben/src/cli/ben/modes/xencode.rs @@ -51,6 +51,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { args.n_cpus, args.compression_level, args.chunk_size, + args.xz_block_size, graph_path, )?; return Ok(()); @@ -78,6 +79,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { args.n_cpus, args.compression_level, args.chunk_size, + args.xz_block_size, )?; Ok(()) } else if jsonl_and_xben { @@ -89,6 +91,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { args.n_cpus, args.compression_level, args.chunk_size, + args.xz_block_size, )?; Ok(()) } else { diff --git a/ben/src/cli/ben/modes/xz_compress.rs b/ben/src/cli/ben/modes/xz_compress.rs index b80a019..c7c4b62 100644 --- a/ben/src/cli/ben/modes/xz_compress.rs +++ b/ben/src/cli/ben/modes/xz_compress.rs @@ -24,7 +24,13 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { check_overwrite(&out_file_name, args.overwrite)?; let writer = BufWriter::new(File::create(out_file_name)?); - xz_compress(reader, writer, args.n_cpus, args.compression_level)?; + xz_compress( + reader, + writer, + args.n_cpus, + args.compression_level, + args.xz_block_size, + )?; tracing::trace!("Done!"); Ok(()) } diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index 44a2bfc..dc0c342 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -146,7 +146,7 @@ mod tests { // Build a valid Standard XBEN stream. let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; let mut xben = Vec::new(); - encode_jsonl_to_xben(jsonl.as_slice(), &mut xben, BenVariant::Standard, Some(1), Some(1), None) + encode_jsonl_to_xben(jsonl.as_slice(), &mut xben, BenVariant::Standard, Some(1), Some(1), None, None) .unwrap(); // Use a read-only File as the writer — writing to it fails with a diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs index 739d2b6..e320a34 100644 --- a/ben/src/codec/decode/tests/mkvchain.rs +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -180,6 +180,7 @@ fn decode_xben_to_ben_mkvchain_roundtrip() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -205,6 +206,7 @@ fn decode_xben_to_jsonl_mkvchain_count_expands() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -236,7 +238,7 @@ fn decode_xben_to_jsonl_rejects_mkvchain_partial_overflow() { let mut xz = Vec::new(); let mut inner = b"MKVCHAIN BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 7cb6dfd..3df836f 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -40,6 +40,7 @@ fn decode_xben_to_ben_twodelta_roundtrip() { Some(1), Some(1), None, + None, ) .unwrap(); @@ -82,6 +83,7 @@ fn decode_xben_to_jsonl_twodelta() { Some(1), Some(1), None, + None, ) .unwrap(); @@ -105,7 +107,7 @@ fn decode_xben_to_jsonl_rejects_invalid_banner() { let mut bad_data = b"GARBAGE BANNER!!!".to_vec(); bad_data.extend_from_slice(&[0u8; 20]); let mut xz = Vec::new(); - xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1)).unwrap(); + xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1), None).unwrap(); let mut output = Vec::new(); let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut output).unwrap_err(); @@ -128,7 +130,7 @@ fn encode_ben_to_xben_roundtrip() { // BEN → XBEN let mut xben = Vec::new(); - encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None).unwrap(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None, None).unwrap(); // XBEN → BEN let mut ben2 = Vec::new(); @@ -152,7 +154,7 @@ fn encode_ben_to_xben_with_chunk_size() { encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); let mut xben = Vec::new(); - encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), Some(1)).unwrap(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), Some(1), None).unwrap(); assert!(!xben.is_empty()); // Verify content roundtrips correctly @@ -185,7 +187,7 @@ fn encode_ben_to_xben_mkvchain_roundtrip() { encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); let mut xben = Vec::new(); - encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None).unwrap(); + encode_ben_to_xben(ben.as_slice(), &mut xben, Some(1), Some(1), None, None).unwrap(); let mut ben2 = Vec::new(); decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); @@ -246,6 +248,7 @@ fn decode_xben_to_ben_twodelta_with_repeated_assignments() { Some(1), Some(1), None, + None, ) .unwrap(); @@ -271,7 +274,7 @@ fn xz_decompress_roundtrip() { let original = b"hello world, this is a test of xz_decompress"; let mut compressed = Vec::new(); - xz_compress(original.as_slice(), &mut compressed, Some(1), Some(1)).unwrap(); + xz_compress(original.as_slice(), &mut compressed, Some(1), Some(1), None).unwrap(); let mut decompressed = Vec::new(); xz_decompress(BufReader::new(compressed.as_slice()), &mut decompressed).unwrap(); @@ -284,7 +287,7 @@ fn xz_compress_direct_test() { let data = b"compress me please with xz"; let mut out = Vec::new(); - xz_compress(data.as_slice(), &mut out, None, None).unwrap(); + xz_compress(data.as_slice(), &mut out, None, None, None).unwrap(); assert!(!out.is_empty()); let mut decompressed = Vec::new(); @@ -302,7 +305,7 @@ fn encode_ben_to_xben_rejects_invalid_banner() { let garbage = b"GARBAGE BANNER!!!extra_padding"; let mut out = Vec::new(); - let err = encode_ben_to_xben(garbage.as_slice(), &mut out, Some(1), Some(1), None).unwrap_err(); + let err = encode_ben_to_xben(garbage.as_slice(), &mut out, Some(1), Some(1), None, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -315,7 +318,7 @@ fn decode_xben_to_ben_rejects_invalid_banner() { let mut bad_data = b"GARBAGE BANNER!!!".to_vec(); bad_data.extend_from_slice(&[0u8; 20]); let mut xz = Vec::new(); - xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1)).unwrap(); + xz_compress(bad_data.as_slice(), &mut xz, Some(1), Some(1), None).unwrap(); let mut output = Vec::new(); let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut output).unwrap_err(); diff --git a/ben/src/codec/decode/tests/standard.rs b/ben/src/codec/decode/tests/standard.rs index bb74deb..1b139b3 100644 --- a/ben/src/codec/decode/tests/standard.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -299,6 +299,7 @@ fn test_decode_xben_to_ben_rejects_invalid_inner_header() { &mut xz, Some(1), Some(0), + None, ) .unwrap(); @@ -314,6 +315,7 @@ fn test_decode_xben_to_jsonl_rejects_invalid_inner_header() { &mut xz, Some(1), Some(0), + None, ) .unwrap(); @@ -326,7 +328,7 @@ fn test_decode_xben_to_ben_handles_partial_overflow_without_frame() { let mut xz = Vec::new(); let mut inner = b"STANDARD BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); let mut out = Vec::new(); decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap(); @@ -338,7 +340,7 @@ fn test_decode_xben_to_jsonl_handles_partial_overflow_without_frame() { let mut xz = Vec::new(); let mut inner = b"STANDARD BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index ec8ad56..f3f99ed 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -493,6 +493,7 @@ fn decode_xben_to_jsonl_twodelta_anchor_only() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -517,6 +518,7 @@ fn decode_xben_to_jsonl_twodelta_chain_roundtrip() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -540,6 +542,7 @@ fn decode_xben_to_jsonl_twodelta_with_repetitions() { Some(1), Some(0), None, + None, ) .unwrap(); diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 52c2e80..f7a1837 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,4 +1,5 @@ -use crate::codec::encode::errors::EncodeError; +use crate::codec::encode::xz::XZ_DEFAULT_MT_BLOCK_SIZE; +use crate::codec::encode::EncodeError; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::progress::Spinner; use crate::BenVariant; @@ -21,6 +22,11 @@ use xz2::write::XzEncoder; /// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe /// default is chosen. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for +/// Standard and MkvChain variants. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. +/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or +/// `0` (liblzma auto) for single-thread runs. /// /// # Returns /// @@ -32,9 +38,10 @@ pub fn encode_jsonl_to_xben( n_threads: Option, compression_level: Option, chunk_size: Option, + block_size: Option, ) -> Result<()> { - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus + let n_cpus: u32 = n_threads + .unwrap_or(1) .min( std::thread::available_parallelism() .map(|n| n.get()) @@ -44,10 +51,16 @@ pub fn encode_jsonl_to_xben( let level = compression_level.unwrap_or(9).clamp(0, 9); + let resolved_block_size = match block_size { + Some(n) => n, + None if n_cpus > 1 => XZ_DEFAULT_MT_BLOCK_SIZE, + None => 0, + }; + let mt = MtStreamBuilder::new() .threads(n_cpus) .preset(level) - .block_size(0) + .block_size(resolved_block_size) .encoder() .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; let encoder = XzEncoder::new_stream(writer, mt); diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 6c80e91..09c00c5 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -18,7 +18,7 @@ pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use path::{ encode_ben_to_xben_path, encode_jsonl_to_ben_path, encode_jsonl_to_xben_path, xz_compress_path, }; -pub use xz::{encode_ben_to_xben, xz_compress}; +pub use xz::{encode_ben_to_xben, xz_compress, XZ_DEFAULT_MT_BLOCK_SIZE}; #[cfg(test)] mod tests; diff --git a/ben/src/codec/encode/path.rs b/ben/src/codec/encode/path.rs index 3ae9849..dd1059f 100644 --- a/ben/src/codec/encode/path.rs +++ b/ben/src/codec/encode/path.rs @@ -28,10 +28,19 @@ pub fn encode_jsonl_to_xben_path( n_threads: Option, compression_level: Option, chunk_size: Option, + block_size: Option, ) -> Result<()> { let reader = BufReader::new(File::open(input)?); let writer = BufWriter::new(File::create(output)?); - encode_jsonl_to_xben(reader, writer, variant, n_threads, compression_level, chunk_size) + encode_jsonl_to_xben( + reader, + writer, + variant, + n_threads, + compression_level, + chunk_size, + block_size, + ) } /// Encode a BEN file at `input` into an XBEN file at `output`. @@ -41,10 +50,18 @@ pub fn encode_ben_to_xben_path( n_threads: Option, compression_level: Option, chunk_size: Option, + block_size: Option, ) -> Result<()> { let reader = BufReader::new(File::open(input)?); let writer = BufWriter::new(File::create(output)?); - encode_ben_to_xben(reader, writer, n_threads, compression_level, chunk_size) + encode_ben_to_xben( + reader, + writer, + n_threads, + compression_level, + chunk_size, + block_size, + ) } /// Compress an arbitrary file at `input` into an `.xz` file at `output`. @@ -53,10 +70,11 @@ pub fn xz_compress_path( output: &Path, n_threads: Option, compression_level: Option, + block_size: Option, ) -> Result<()> { let reader = BufReader::new(File::open(input)?); let writer = BufWriter::new(File::create(output)?); - xz_compress(reader, writer, n_threads, compression_level) + xz_compress(reader, writer, n_threads, compression_level, block_size) } #[cfg(test)] @@ -111,6 +129,7 @@ mod tests { Some(1), Some(1), None, + None, ) .unwrap(); decode_xben_to_jsonl_path(&xben_out, &jsonl_back).unwrap(); @@ -138,7 +157,7 @@ mod tests { ) .unwrap(); encode_jsonl_to_ben_path(&jsonl_in, &ben, BenVariant::Standard).unwrap(); - encode_ben_to_xben_path(&ben, &xben, Some(1), Some(1), None).unwrap(); + encode_ben_to_xben_path(&ben, &xben, Some(1), Some(1), None, None).unwrap(); decode_xben_to_ben_path(&xben, &ben_back).unwrap(); // Round trip: ben_back should be byte-equivalent to ben (same banner, same content). @@ -158,7 +177,7 @@ mod tests { let plain_back = unique_path("path-xz-back.txt"); std::fs::write(&plain, b"hello world\n").unwrap(); - xz_compress_path(&plain, &xz_out, Some(1), Some(1)).unwrap(); + xz_compress_path(&plain, &xz_out, Some(1), Some(1), None).unwrap(); xz_decompress_path(&xz_out, &plain_back).unwrap(); assert_eq!(std::fs::read(&plain_back).unwrap(), b"hello world\n"); diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index a94dc93..ce7e07c 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -736,6 +736,7 @@ fn encode_jsonl_to_xben_roundtrip() { Some(1), Some(1), None, + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -754,6 +755,7 @@ fn encode_jsonl_to_xben_with_chunk_size() { Some(1), Some(1), Some(2), + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -770,6 +772,7 @@ fn encode_jsonl_to_xben_invalid_json_errors() { Some(1), Some(1), None, + None, ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -789,6 +792,7 @@ fn encode_jsonl_to_xben_mkv_variant() { Some(1), Some(1), None, + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -934,6 +938,7 @@ fn encode_jsonl_to_xben_roundtrip_verifies_content() { Some(1), Some(1), None, + None, ) .unwrap(); @@ -966,6 +971,7 @@ fn encode_jsonl_to_xben_mkv_verifies_content() { Some(1), Some(1), None, + None, ) .unwrap(); @@ -1167,6 +1173,7 @@ fn encode_jsonl_to_xben_twodelta_roundtrip() { Some(1), Some(1), None, + None, ) .unwrap(); diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 8501691..d958419 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -3,9 +3,57 @@ use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::writer::XZAssignmentWriter; use std::io::{self, BufRead, Cursor, Read, Result, Write}; -use xz2::stream::MtStreamBuilder; +use xz2::stream::{MtStreamBuilder, Stream}; use xz2::write::XzEncoder; +/// Default per-block size used by the multithreaded XZ encoder when the +/// caller does not pass an explicit `block_size`. +/// +/// liblzma's `block_size = 0` means "auto" (`3 × dict_size`), which at +/// preset 9 is ~192 MiB — far too coarse for streaming inputs to fan out +/// across worker threads. 16 MiB strikes a balance between scaling +/// thread utilization on medium ensembles and keeping per-block +/// dictionary reuse mostly intact. +pub const XZ_DEFAULT_MT_BLOCK_SIZE: u64 = 16 * 1024 * 1024; + +/// Resolve `n_threads` against the host's available parallelism. +fn resolve_threads(n_threads: Option) -> u32 { + n_threads + .unwrap_or(1) + .min( + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) as u32, + ) + .max(1) +} + +/// Build a multithreaded XZ encoder stream with the project's default +/// `block_size` policy applied. +/// +/// When `block_size` is `Some(n)`, that exact byte count is passed to +/// liblzma. When it is `None`, we default to [`XZ_DEFAULT_MT_BLOCK_SIZE`] +/// for `n_threads > 1` and to `0` (liblzma's "auto") for the single-thread +/// case so single-thread encoding does not pay any block-overhead cost. +fn build_mt_stream( + n_threads: u32, + level: u32, + block_size: Option, +) -> Result { + let resolved_block_size = match block_size { + Some(n) => n, + None if n_threads > 1 => XZ_DEFAULT_MT_BLOCK_SIZE, + None => 0, + }; + + MtStreamBuilder::new() + .threads(n_threads) + .preset(level) + .block_size(resolved_block_size) + .encoder() + .map_err(|e| io::Error::from(EncodeError::XzInit(e))) +} + /// Compress an arbitrary byte stream with XZ/LZMA2. /// /// This is a general-purpose helper used by the XBEN tooling, but it can also @@ -18,6 +66,10 @@ use xz2::write::XzEncoder; /// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe /// default is chosen. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. +/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or +/// `0` (liblzma auto) for single-thread runs. Smaller blocks improve +/// thread fan-out at a slight compression-ratio cost. /// /// # Returns /// @@ -27,26 +79,14 @@ pub fn xz_compress( writer: W, n_threads: Option, compression_level: Option, + block_size: Option, ) -> Result<()> { let mut buff = [0; 4096]; - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - + let n_cpus = resolve_threads(n_threads); let level = compression_level.unwrap_or(9).clamp(0, 9); - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(0) - .encoder() - .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; + let mt = build_mt_stream(n_cpus, level, block_size)?; let mut encoder = XzEncoder::new_stream(writer, mt); loop { @@ -72,6 +112,11 @@ pub fn xz_compress( /// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe /// default is chosen. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. +/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for +/// Standard and MkvChain variants. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. +/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or +/// `0` (liblzma auto) for single-thread runs. /// /// # Returns /// @@ -82,27 +127,15 @@ pub fn encode_ben_to_xben( n_threads: Option, compression_level: Option, chunk_size: Option, + block_size: Option, ) -> Result<()> { let mut check_buffer = [0u8; BANNER_LEN]; reader.read_exact(&mut check_buffer)?; - let mut n_cpus: u32 = n_threads.unwrap_or(1); - n_cpus = n_cpus - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - + let n_cpus = resolve_threads(n_threads); let level = compression_level.unwrap_or(9).clamp(0, 9); - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(0) - .encoder() - .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; + let mt = build_mt_stream(n_cpus, level, block_size)?; let encoder = XzEncoder::new_stream(writer, mt); let variant = variant_from_banner(&check_buffer).ok_or_else(|| { diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 8f975b5..32efd05 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -10,7 +10,7 @@ use xz2::write::XzEncoder; /// Build a minimal XBEN stream from JSONL input for testing. fn make_xben(jsonl: &str, variant: BenVariant) -> Vec { let mut xben = Vec::new(); - encode_jsonl_to_xben(jsonl.as_bytes(), &mut xben, variant, Some(1), Some(1), None).unwrap(); + encode_jsonl_to_xben(jsonl.as_bytes(), &mut xben, variant, Some(1), Some(1), None, None).unwrap(); xben } @@ -843,7 +843,7 @@ fn translate_ben_twodelta_to_xben_roundtrip() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); let mut jsonl = Vec::new(); decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); @@ -889,7 +889,7 @@ fn translate_ben_twodelta_to_xben_with_repetitions() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); @@ -918,7 +918,7 @@ fn translate_ben_twodelta_to_xben_many_deltas() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None).unwrap(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index a12d25f..873a404 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -144,6 +144,7 @@ fn test_extract_assignment_xben_roundtrip_and_errors() { Some(1), Some(0), None, + None, ) .unwrap(); diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 4471697..4472e57 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -67,6 +67,7 @@ fn encode_xben(assignments: &[Vec], variant: BenVariant) -> Vec { Some(1), Some(1), None, + None, ) .unwrap(); xben @@ -497,6 +498,7 @@ fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { Some(1), Some(1), None, + None, ) .unwrap(); xben @@ -738,7 +740,7 @@ fn encode_ben_to_xben_and_back_standard() { let ben = encode_standard_ben(&assignments); let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, None, None, None).unwrap(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, None, None, None, None).unwrap(); let mut ben2 = Vec::new(); decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); @@ -1247,6 +1249,7 @@ fn xben_round_trip_with_level_0_compression() { Some(1), Some(0), // compression level 0 None, + None, ) .unwrap(); @@ -1279,6 +1282,7 @@ fn xben_mkvchain_round_trip_preserves_all_samples() { Some(1), Some(1), None, + None, ) .unwrap(); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 667769c..5f1ed56 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -226,6 +226,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); // Decode XBEN -> BEN -> JSONL @@ -252,6 +253,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let mut ben = Vec::new(); @@ -277,6 +279,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let mut ben = Vec::new(); @@ -302,6 +305,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); // Path A: direct to JSONL @@ -331,6 +335,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); @@ -359,6 +364,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); @@ -417,6 +423,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); // Choose some indices to keep (1-based). We derive from seq length. @@ -458,6 +465,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let n = seq.len(); @@ -494,6 +502,7 @@ proptest! { Some(threads), Some(level), None, + None, ).unwrap(); let n = seq.len(); @@ -553,7 +562,7 @@ proptest! { let (threads, level) = params; let mut out = Vec::new(); - xz_compress(BufReader::new(bytes.as_slice()), &mut out, Some(threads), Some(level)).unwrap(); + xz_compress(BufReader::new(bytes.as_slice()), &mut out, Some(threads), Some(level), None).unwrap(); let mut recovered = Vec::new(); xz_decompress(BufReader::new(out.as_slice()), &mut recovered).unwrap(); @@ -586,7 +595,7 @@ fn xben_decoder_rejects_bad_banner() { let mut inner = Vec::new(); inner.extend_from_slice(b"BAD BAD BAD BAD!!"); // 17 bytes let mut xz = Vec::new(); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0)).unwrap(); + xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); let err = XZAssignmentReader::new(xz.as_slice()) .err() @@ -610,6 +619,7 @@ fn subsample_every_respects_offset() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -669,6 +679,7 @@ fn xbenencoder_drop_flushes_tail_group() { Some(1), Some(0), None, + None, ) .unwrap(); out @@ -691,6 +702,7 @@ fn ben_new_invalid_header_detects_xz() { &mut xz, Some(1), Some(0), + None, ) .unwrap(); @@ -718,6 +730,7 @@ fn xben_new_invalid_banner() { &mut wrong, Some(1), Some(0), + None, ) .unwrap(); let err = XZAssignmentReader::new(wrong.as_slice()) @@ -743,6 +756,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -830,6 +844,7 @@ fn subsample_by_indices_sorts_and_dedups() { Some(1), Some(0), None, + None, ) .unwrap(); let xb = XZAssignmentReader::new(xz.as_slice()).unwrap(); @@ -875,6 +890,7 @@ fn ben_encode_xben_respects_existing_ben_header() { Some(1), Some(0), None, + None, ) .expect("ben->xben failed"); @@ -899,6 +915,7 @@ fn xz_mt_params_are_capped_and_safe() { Some(10_000), Some(42), None, + None, ) .unwrap(); let mut ben = Vec::new(); @@ -970,6 +987,7 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -1151,6 +1169,7 @@ fn decoder_init_error_display_source_and_conversion_paths() { &mut buf, Some(1), Some(0), + None, ) .unwrap(); buf @@ -1199,6 +1218,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { Some(1), Some(0), None, + None, ) .unwrap(); assert_eq!( @@ -1221,6 +1241,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { Some(1), Some(0), None, + None, ) .unwrap(); assert_eq!( @@ -1257,6 +1278,7 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { Some(1), Some(0), None, + None, ) .unwrap(); let xben_path = unique_temp_path("sample.xben"); diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index e4d283a..933bfdd 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -266,6 +266,7 @@ fn test_xben_pipeline() { Some(1), Some(1), None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -336,6 +337,7 @@ fn test_xmkvben_pipeline() { Some(1), Some(1), None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -431,6 +433,7 @@ fn test_xtwodeltaben_pipeline() { Some(1), Some(1), None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 38f66d4..2fc564b 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -286,6 +286,7 @@ fn xben_mkvchain_splits_repetition_count_longer_than_u16_max() { Some(1), Some(0), None, + None, ) .unwrap(); @@ -343,6 +344,7 @@ fn direct_xben_helpers_propagate_corrupt_xz_errors() { Some(1), Some(0), None, + None, ) .unwrap(); xben.truncate(xben.len() - 1); @@ -367,7 +369,7 @@ fn xz_compress_propagates_input_reader_errors() { fn consume(&mut self, _amt: usize) {} } - let err = xz_compress(FailingReader, Vec::new(), Some(1), Some(0)).unwrap_err(); + let err = xz_compress(FailingReader, Vec::new(), Some(1), Some(0), None).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); } @@ -457,6 +459,7 @@ fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { &mut xben, Some(1), Some(0), + None, ) .unwrap(); @@ -487,6 +490,7 @@ fn zero_count_frames_are_rejected() { &mut xben, Some(1), Some(0), + None, ) .unwrap(); let err = XZAssignmentReader::new(xben.as_slice()) @@ -563,6 +567,7 @@ fn seeded_malformed_xben_bytes_do_not_panic() { Some(1), Some(0), Some(32), + None, ) .unwrap(); seeds.push(xben); @@ -588,6 +593,7 @@ fn seeded_malformed_xben_bytes_do_not_panic() { &mut unknown_tag_xben, Some(1), Some(0), + None, ) .unwrap(); assert_xben_bytes_do_not_panic(unknown_tag_xben); From e71975b73bbbece5a44ba7b9fee10210e3a423ac Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 6 May 2026 16:46:07 -0600 Subject: [PATCH 087/221] Allow --n-cpus -1 to mean all cores --- ben-py/src/encode/py_funcs.rs | 10 ++--- ben/src/cli/ben/args.rs | 13 ++++-- ben/src/cli/ben/modes/xencode.rs | 8 ++-- ben/src/cli/ben/modes/xz_compress.rs | 4 +- ben/src/cli/ben/tests.rs | 5 +++ ben/src/codec/encode/jsonl.rs | 5 ++- ben/src/codec/encode/mod.rs | 2 +- ben/src/codec/encode/xz.rs | 66 ++++++++++++++++++++++++---- 8 files changed, 86 insertions(+), 27 deletions(-) diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index 7ab9fae..6117c12 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -1,6 +1,6 @@ use crate::common::{open_input, open_output, parse_variant, validate_input_output_paths}; use binary_ensemble::codec::encode::{ - encode_ben_to_xben as core_encode_ben_to_xben, + cpus_from_signed, encode_ben_to_xben as core_encode_ben_to_xben, encode_jsonl_to_ben as core_encode_jsonl_to_ben, encode_jsonl_to_xben as core_encode_jsonl_to_xben, }; @@ -17,7 +17,7 @@ pub fn encode_ben_to_xben( in_file: PathBuf, out_file: PathBuf, overwrite: bool, - n_threads: Option, + n_threads: Option, compression_level: Option, xz_block_size: Option, ) -> PyResult<()> { @@ -28,7 +28,7 @@ pub fn encode_ben_to_xben( core_encode_ben_to_xben( reader, writer, - n_threads, + n_threads.map(cpus_from_signed), compression_level, None, xz_block_size, @@ -78,7 +78,7 @@ pub fn encode_jsonl_to_xben( out_file: PathBuf, overwrite: bool, variant: &str, - n_threads: Option, + n_threads: Option, compression_level: Option, xz_block_size: Option, ) -> PyResult<()> { @@ -91,7 +91,7 @@ pub fn encode_jsonl_to_xben( reader, writer, ben_var, - n_threads, + n_threads.map(cpus_from_signed), compression_level, None, xz_block_size, diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index 85a55d1..d02c625 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -117,10 +117,15 @@ pub(super) struct Args { /// Suppress in-place progress spinners. Trace logging is unaffected. #[arg(short = 'q', long)] pub quiet: bool, - /// When running x-encoder, this flag will determine the number of cpus to use on the - /// system. By default, all available cpus will be used. - #[arg(short = 'c', long)] - pub n_cpus: Option, + /// Number of threads the XZ encoder may use during x-encode and + /// xz-compress. Defaults to 1 (single-threaded). Pass an explicit + /// value to fan compression out across worker threads; values larger + /// than the host's available parallelism are silently clamped down. + /// `-1` is a sentinel meaning "use every available core" (sklearn + /// convention). See also `--xz-block-size`, which controls how much + /// input each thread gets before it can start compressing. + #[arg(short = 'c', long, allow_hyphen_values = true)] + pub n_cpus: Option, /// When running x-encoder, this flag will deterimine the level of compression to use. /// By default, the highest level of compression will be used. /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs index a550aaa..e99fa5f 100644 --- a/ben/src/cli/ben/modes/xencode.rs +++ b/ben/src/cli/ben/modes/xencode.rs @@ -5,7 +5,7 @@ use super::super::bundle::run_xencode_bundle_with_graph; use super::super::paths::{encode_setup, open_derived_writer, open_reader, open_writer}; use crate::cli::common::{CliError, CliResult}; -use crate::codec::encode::{encode_ben_to_xben, encode_jsonl_to_xben}; +use crate::codec::encode::{cpus_from_signed, encode_ben_to_xben, encode_jsonl_to_xben}; use std::path::Path; /// Execute the `x-encode` sub-mode. @@ -48,7 +48,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { &out_path, variant, ben_and_xben, - args.n_cpus, + args.n_cpus.map(cpus_from_signed), args.compression_level, args.chunk_size, args.xz_block_size, @@ -76,7 +76,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { encode_ben_to_xben( reader, writer, - args.n_cpus, + args.n_cpus.map(cpus_from_signed), args.compression_level, args.chunk_size, args.xz_block_size, @@ -88,7 +88,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { reader, writer, variant, - args.n_cpus, + args.n_cpus.map(cpus_from_signed), args.compression_level, args.chunk_size, args.xz_block_size, diff --git a/ben/src/cli/ben/modes/xz_compress.rs b/ben/src/cli/ben/modes/xz_compress.rs index c7c4b62..aa9c39a 100644 --- a/ben/src/cli/ben/modes/xz_compress.rs +++ b/ben/src/cli/ben/modes/xz_compress.rs @@ -3,7 +3,7 @@ use super::super::args::Args; use crate::cli::common::{check_overwrite, CliError, CliResult}; -use crate::codec::encode::xz_compress; +use crate::codec::encode::{cpus_from_signed, xz_compress}; use std::fs::File; use std::io::{BufReader, BufWriter}; @@ -27,7 +27,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { xz_compress( reader, writer, - args.n_cpus, + args.n_cpus.map(cpus_from_signed), args.compression_level, args.xz_block_size, )?; diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index 586ac99..285f748 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -388,6 +388,7 @@ fn run_xencode_bundle_with_graph_from_jsonl_creates_bendl() { None, None, None, + None, &graph, ) .unwrap(); @@ -426,6 +427,7 @@ fn run_xencode_bundle_with_graph_from_ben_creates_bendl() { None, None, None, + None, &graph, ) .unwrap(); @@ -495,6 +497,7 @@ fn run_xencode_bundle_with_graph_errors_on_missing_graph() { None, None, None, + None, &nonexistent, ) .unwrap_err(); @@ -554,6 +557,7 @@ fn run_xencode_bundle_with_graph_errors_on_invalid_jsonl() { None, None, None, + None, &graph, ) .unwrap_err(); @@ -580,6 +584,7 @@ fn run_xencode_bundle_with_graph_errors_on_invalid_ben() { None, None, None, + None, &graph, ) .unwrap_err(); diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index f7a1837..923608a 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -19,8 +19,9 @@ use xz2::write::XzEncoder; /// * `reader` - A JSONL input stream with one assignment record per line. /// * `writer` - The destination for the compressed XBEN bytes. /// * `variant` - The BEN variant to use inside the XBEN payload. -/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe -/// default is chosen. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` +/// (single-threaded) when `None`. Values larger than the host's +/// available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. /// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for /// Standard and MkvChain variants. diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 09c00c5..71dde76 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -18,7 +18,7 @@ pub use jsonl::{encode_jsonl_to_ben, encode_jsonl_to_xben}; pub use path::{ encode_ben_to_xben_path, encode_jsonl_to_ben_path, encode_jsonl_to_xben_path, xz_compress_path, }; -pub use xz::{encode_ben_to_xben, xz_compress, XZ_DEFAULT_MT_BLOCK_SIZE}; +pub use xz::{cpus_from_signed, encode_ben_to_xben, xz_compress, XZ_DEFAULT_MT_BLOCK_SIZE}; #[cfg(test)] mod tests; diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index d958419..fee0a80 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -20,14 +20,36 @@ pub const XZ_DEFAULT_MT_BLOCK_SIZE: u64 = 16 * 1024 * 1024; fn resolve_threads(n_threads: Option) -> u32 { n_threads .unwrap_or(1) - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) + .min(host_parallelism()) .max(1) } +/// Number of cores reported by `std::thread::available_parallelism`, +/// or `1` if the platform cannot answer. +fn host_parallelism() -> u32 { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) as u32 +} + +/// Convert a user-supplied signed thread count into the unsigned count +/// the encoder expects. +/// +/// CLI and Python users want sklearn-style sentinel semantics: +/// +/// - `n < 0` (typically `-1`) means "use every available core". +/// - `n == 0` is treated as `1` to avoid silently disabling work. +/// - `n > 0` is the literal request, clamped to host parallelism. +/// +/// Pass the result on to library APIs that take `Option`. +pub fn cpus_from_signed(n: i32) -> u32 { + if n < 0 { + host_parallelism() + } else { + (n as u32).max(1).min(host_parallelism()) + } +} + /// Build a multithreaded XZ encoder stream with the project's default /// `block_size` policy applied. /// @@ -63,8 +85,9 @@ fn build_mt_stream( /// /// * `reader` - The input byte stream to compress. /// * `writer` - The destination for the compressed XZ bytes. -/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe -/// default is chosen. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` +/// (single-threaded) when `None`. Values larger than the host's +/// available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. /// * `block_size` - Optional per-block size in bytes for the MT encoder. /// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or @@ -109,8 +132,9 @@ pub fn xz_compress( /// /// * `reader` - The input BEN stream, including its banner. /// * `writer` - The destination for the compressed XBEN bytes. -/// * `n_threads` - Optional XZ encoder thread count. When omitted, a safe -/// default is chosen. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` +/// (single-threaded) when `None`. Values larger than the host's +/// available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. /// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for /// Standard and MkvChain variants. @@ -152,3 +176,27 @@ pub fn encode_ben_to_xben( Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cpus_from_signed_negative_means_all_cores() { + let host = host_parallelism(); + assert_eq!(cpus_from_signed(-1), host); + assert_eq!(cpus_from_signed(-100), host); + } + + #[test] + fn cpus_from_signed_zero_clamps_to_one() { + assert_eq!(cpus_from_signed(0), 1); + } + + #[test] + fn cpus_from_signed_positive_clamps_to_host() { + let host = host_parallelism(); + assert_eq!(cpus_from_signed(1), 1); + assert_eq!(cpus_from_signed(i32::MAX), host); + } +} From 860012f6737e2b6894d071cc35a30d6de6fdc8a8 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 7 May 2026 14:40:18 -0600 Subject: [PATCH 088/221] Unify stream readers --- ben-py/src/decode/decoder.rs | 4 +- ben-py/src/decode/helpers.rs | 57 +- ben-py/src/decode/types.rs | 9 +- ben/src/cli/ben/bundle.rs | 3 +- ben/src/cli/bendl/create.rs | 5 +- ben/src/cli/bendl/helpers.rs | 8 - ben/src/cli/bendl/tests.rs | 7 +- ben/src/cli/pcben/mod.rs | 4 +- ben/src/cli/pcben/tests.rs | 2 +- ben/src/codec/decode/jsonl.rs | 6 +- ben/src/codec/decode/xz.rs | 4 +- ben/src/codec/frames/decode.rs | 4 +- ben/src/codec/translate/errors.rs | 2 +- ben/src/codec/translate/mod.rs | 2 +- ben/src/io/bundle/mod.rs | 4 +- ben/src/io/bundle/reader.rs | 65 +-- ben/src/io/bundle/tests/writer.rs | 43 +- ben/src/io/reader/assignment_reader.rs | 325 ----------- ben/src/io/reader/mod.rs | 6 +- ben/src/io/reader/stream_reader/ben.rs | 122 +++++ ben/src/io/reader/stream_reader/frames.rs | 180 +++++++ ben/src/io/reader/stream_reader/mod.rs | 315 +++++++++++ ben/src/io/reader/stream_reader/xben.rs | 296 ++++++++++ ben/src/io/reader/subsample.rs | 160 ++---- ben/src/io/reader/tests.rs | 243 ++++----- ben/src/io/reader/twodelta.rs | 4 +- ben/src/io/reader/xz_assignment_reader.rs | 627 ---------------------- ben/src/io/writer/tests.rs | 32 +- ben/src/ops/extract/mod.rs | 26 +- ben/src/ops/relabel/mod.rs | 4 +- ben/tests/test_assignment_reader.rs | 126 ++--- ben/tests/test_coverage.rs | 87 +-- ben/tests/test_impls_pipeline.rs | 104 ++-- ben/tests/test_stress_edges.rs | 24 +- 34 files changed, 1368 insertions(+), 1542 deletions(-) delete mode 100644 ben/src/io/reader/assignment_reader.rs create mode 100644 ben/src/io/reader/stream_reader/ben.rs create mode 100644 ben/src/io/reader/stream_reader/frames.rs create mode 100644 ben/src/io/reader/stream_reader/mod.rs create mode 100644 ben/src/io/reader/stream_reader/xben.rs delete mode 100644 ben/src/io/reader/xz_assignment_reader.rs diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 1674ea6..76cecac 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -577,8 +577,8 @@ fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { let path = decoder.path.clone(); - let mode = decoder.mode.as_str().to_string(); - py.detach(|| count_samples_from_file(&path, &mode)) + let format = decoder.mode.wire_format(); + py.detach(|| count_samples_from_file(&path, format)) .map_err(|e| { PyException::new_err(format!( "Failed to count samples in {}: {e}", diff --git a/ben-py/src/decode/helpers.rs b/ben-py/src/decode/helpers.rs index edcc6ca..df9696d 100644 --- a/ben-py/src/decode/helpers.rs +++ b/ben-py/src/decode/helpers.rs @@ -3,7 +3,7 @@ use crate::common::open_input; use binary_ensemble::io::bundle::format::BENDL_MAGIC; use binary_ensemble::io::reader::{ build_frame_iter, build_frame_iter_from_reader, count_samples_from_frame_iter, - AssignmentReader, XZAssignmentReader, + BenStreamReader, BenWireFormat, }; use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; @@ -40,23 +40,30 @@ pub(super) fn detect_is_bundle(path: &Path) -> io::Result { } } -/// Build a plain-stream iterator from `path` using `mode`. -pub(super) fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { - let reader = open_input(&path.to_path_buf())?; - match mode { - DecoderMode::Ben => { - let ben = AssignmentReader::new(reader) +fn open_stream_reader(reader: R, format: BenWireFormat) -> PyResult +where + R: Read + Send + 'static, +{ + match format { + BenWireFormat::Ben => { + let ben = BenStreamReader::from_ben(reader) .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; Ok(Box::new(ben)) } - DecoderMode::XBen => { - let xben = XZAssignmentReader::new(reader) + BenWireFormat::XBen => { + let xben = BenStreamReader::from_xben(reader) .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; Ok(Box::new(xben)) } } } +/// Build a plain-stream iterator from `path` using `mode`. +pub(super) fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { + let reader = open_input(&path.to_path_buf())?; + open_stream_reader(reader, mode.wire_format()) +} + /// Open a second file handle on the bundle path, seek to the stream /// region, and wrap it in the appropriate assignment reader so the /// decoder iterator only walks the embedded stream. @@ -66,18 +73,7 @@ pub(super) fn build_bundle_iter( mode: DecoderMode, ) -> PyResult { let reader = open_bundle_stream_reader(path, state)?; - match mode { - DecoderMode::Ben => { - let ben = AssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create BenDecoder: {e}")))?; - Ok(Box::new(ben)) - } - DecoderMode::XBen => { - let xben = XZAssignmentReader::new(reader) - .map_err(|e| PyException::new_err(format!("Failed to create XBenDecoder: {e}")))?; - Ok(Box::new(xben)) - } - } + open_stream_reader(reader, mode.wire_format()) } /// Create a `Read`-only handle bounded to the bundle's assignment stream @@ -99,16 +95,19 @@ pub(super) fn build_frames_for_subsample( mode: DecoderMode, backend: &DecoderBackend, ) -> PyResult { + let format = mode.wire_format(); match backend { - DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), mode.as_str()).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - path.display() - )) - }), + DecoderBackend::Plain => { + build_frame_iter(&path.to_path_buf(), format).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + path.display() + )) + }) + } DecoderBackend::Bundle(state) => { let reader = open_bundle_stream_reader(path, state)?; - build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + build_frame_iter_from_reader(reader, format).map_err(|e| { PyException::new_err(format!( "Failed to create frame iterator from bundle {}: {e}", path.display() @@ -124,7 +123,7 @@ pub(super) fn scan_bundle_samples( mode: DecoderMode, ) -> PyResult { let reader = open_bundle_stream_reader(path, state)?; - let iter = build_frame_iter_from_reader(reader, mode.as_str()).map_err(|e| { + let iter = build_frame_iter_from_reader(reader, mode.wire_format()).map_err(|e| { PyException::new_err(format!( "Failed to open bundle stream for sample count: {e}" )) diff --git a/ben-py/src/decode/types.rs b/ben-py/src/decode/types.rs index 94cc0e7..e826fc3 100644 --- a/ben-py/src/decode/types.rs +++ b/ben-py/src/decode/types.rs @@ -1,6 +1,6 @@ use binary_ensemble::io::bundle::format::AssignmentFormat; use binary_ensemble::io::bundle::BendlReader; -use binary_ensemble::io::reader::{MkvRecord, Selection}; +use binary_ensemble::io::reader::{BenWireFormat, MkvRecord, Selection}; use pyo3::exceptions::PyException; use pyo3::prelude::*; use std::fs::File; @@ -32,6 +32,13 @@ impl DecoderMode { } } + pub(super) fn wire_format(&self) -> BenWireFormat { + match self { + Self::Ben => BenWireFormat::Ben, + Self::XBen => BenWireFormat::XBen, + } + } + pub(super) fn from_assignment_format(fmt: AssignmentFormat) -> Self { match fmt { AssignmentFormat::Ben => Self::Ben, diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index eb8af39..b63d827 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -4,6 +4,7 @@ use crate::io::bundle::format::{AssignmentFormat, ASSET_TYPE_GRAPH, STANDARDIZED use crate::io::bundle::writer::BendlAppender; use crate::io::bundle::{AddAssetOptions, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; +use crate::io::reader::BenWireFormat; use crate::BenVariant; use std::fs::{File, OpenOptions}; use std::io::{self, BufReader, Result}; @@ -104,7 +105,7 @@ pub(super) fn run_xencode_bundle_with_graph( })?; let sample_count: i64 = if from_ben { - count_samples_from_file(input_path, "ben")? as i64 + count_samples_from_file(input_path, BenWireFormat::Ben)? as i64 } else { count_jsonl_lines(input_path)? }; diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs index 9262fca..094306b 100644 --- a/ben/src/cli/bendl/create.rs +++ b/ben/src/cli/bendl/create.rs @@ -1,11 +1,12 @@ use super::args::{CreateArgs, NamedAsset}; -use super::helpers::{add_file_asset, format_from_path, mode_str}; +use super::helpers::{add_file_asset, format_from_path}; use crate::cli::common::check_overwrite; use crate::io::bundle::format::{ ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, }; use crate::io::bundle::{AddAssetOptions, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; +use crate::io::reader::BenWireFormat; use std::fs::File; use std::io::{self, BufReader}; @@ -19,7 +20,7 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { // Count samples up front so we can patch the header at finalize time. // This pre-scan is O(stream size); the second pass streams bytes directly. - let sample_count: i64 = count_samples_from_file(&args.input, mode_str(format)) + let sample_count: i64 = count_samples_from_file(&args.input, BenWireFormat::from(format)) .map_err(|e| format!("failed to count samples in {:?}: {e}", args.input))? as i64; diff --git a/ben/src/cli/bendl/helpers.rs b/ben/src/cli/bendl/helpers.rs index b4b2577..88762c2 100644 --- a/ben/src/cli/bendl/helpers.rs +++ b/ben/src/cli/bendl/helpers.rs @@ -16,14 +16,6 @@ pub(super) fn format_from_path(path: &Path) -> Result } } -/// `mode` argument expected by `count_samples_from_file`. -pub(super) fn mode_str(format: AssignmentFormat) -> &'static str { - match format { - AssignmentFormat::Ben => "ben", - AssignmentFormat::Xben => "xben", - } -} - pub(super) fn add_file_asset( writer: &mut BendlWriter, asset_type: u16, diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index 2a2c849..1925508 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -2,7 +2,7 @@ use super::append::run_append; use super::args::{AppendArgs, CreateArgs, ExtractArgs, InspectArgs, NamedAsset}; use super::create::run_create; use super::extract::run_extract; -use super::helpers::{format_from_path, mode_str}; +use super::helpers::format_from_path; use super::inspect::run_inspect; use crate::codec::encode::encode_jsonl_to_ben; use crate::io::bundle::format::AssignmentFormat; @@ -48,11 +48,6 @@ fn format_from_path_rejects_unknown_extension() { assert!(err.contains("expected .ben or .xben")); } -#[test] -fn mode_str_returns_xben_for_xben() { - assert_eq!(mode_str(AssignmentFormat::Xben), "xben"); -} - #[test] fn run_create_with_relabel_map_and_custom_asset() { let ben = { diff --git a/ben/src/cli/pcben/mod.rs b/ben/src/cli/pcben/mod.rs index 7414502..e7e9f9a 100644 --- a/ben/src/cli/pcben/mod.rs +++ b/ben/src/cli/pcben/mod.rs @@ -1,5 +1,5 @@ use crate::cli::common::{check_overwrite, set_quiet, set_verbose, CliError, CliResult}; -use crate::io::reader::AssignmentReader; +use crate::io::reader::BenStreamReader; use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; use crate::BenVariant; use clap::{Parser, ValueEnum}; @@ -194,7 +194,7 @@ fn derive_output_path(mode: Mode, input_file: &str) -> String { /// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { - let ben_reader = AssignmentReader::new(&mut reader)?; + let ben_reader = BenStreamReader::from_ben(&mut reader)?; let mut line = String::new(); for result in ben_reader { diff --git a/ben/src/cli/pcben/tests.rs b/ben/src/cli/pcben/tests.rs index 9f0e0cb..1b75321 100644 --- a/ben/src/cli/pcben/tests.rs +++ b/ben/src/cli/pcben/tests.rs @@ -113,7 +113,7 @@ fn assignment_encode_xben_offsets_values_and_writes_xben() { #[test] fn assignment_decode_ben_iterator_error_propagates() { - // Provides a valid BEN banner so AssignmentReader::new succeeds, + // Provides a valid BEN banner so BenStreamReader::from_ben succeeds, // then returns a non-EOF error on the next read so the iterator // fires the Err(e) => return Err(e) arm (line 204). use std::io::Read; diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index dc0c342..eab31bf 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -1,7 +1,7 @@ use crate::codec::decode::jsonl_decode_ben32; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::reader::{AssignmentReader, XZAssignmentReader}; +use crate::io::reader::BenStreamReader; use crate::progress::Spinner; use crate::BenVariant; use serde_json::json; @@ -23,7 +23,7 @@ use xz2::read::XzDecoder; /// /// Returns `Ok(())` after the stream has been fully decoded and written. pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Result<()> { - let mut ben_decoder = AssignmentReader::new(reader)?; + let mut ben_decoder = BenStreamReader::from_ben(reader)?; ben_decoder.write_all_jsonl(writer) } @@ -51,7 +51,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i Some(BenVariant::Standard) => BenVariant::Standard, Some(BenVariant::MkvChain) => BenVariant::MkvChain, Some(BenVariant::TwoDelta) => { - let mut xben = XZAssignmentReader::from_decompressed_stream( + let mut xben = BenStreamReader::from_xben_decompressed( BufReader::new(decoder), BenVariant::TwoDelta, ); diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 921e5df..0e356f7 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,7 +1,7 @@ use crate::codec::translate::ben32_to_ben_lines; use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::reader::XZAssignmentReader; +use crate::io::reader::BenStreamReader; use crate::io::writer::AssignmentWriter; use crate::progress::Spinner; use crate::BenVariant; @@ -40,7 +40,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BenVariant::MkvChain } Some(BenVariant::TwoDelta) => { - let mut xben = XZAssignmentReader::from_decompressed_stream( + let mut xben = BenStreamReader::from_xben_decompressed( BufReader::new(decoder), BenVariant::TwoDelta, ); diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index cd7eb88..eb1bcff 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -61,11 +61,11 @@ impl BenDecodeFrame { /// on success, and `Err` on any I/O or format error. /// /// Note: in a `TwoDelta` *stream*, the first frame is encoded in - /// `MkvChain` wire format. The caller (e.g. [`AssignmentReader`]) tracks + /// `MkvChain` wire format. The caller (e.g. [`BenStreamReader`]) tracks /// that state and passes [`BenVariant::MkvChain`] for the first frame and /// [`BenVariant::TwoDelta`] for the rest. /// - /// [`AssignmentReader`]: crate::io::reader::AssignmentReader + /// [`BenStreamReader`]: crate::io::reader::BenStreamReader pub fn from_reader( reader: &mut impl Read, variant: BenVariant, diff --git a/ben/src/codec/translate/errors.rs b/ben/src/codec/translate/errors.rs index 785eb37..0958d8e 100644 --- a/ben/src/codec/translate/errors.rs +++ b/ben/src/codec/translate/errors.rs @@ -15,7 +15,7 @@ pub enum TranslateError { #[error( "TwoDelta BEN streams cannot be translated to ben32; \ - use XZAssignmentWriter/XZAssignmentReader for TwoDelta compressed I/O" + use XZAssignmentWriter/BenStreamReader for TwoDelta compressed I/O" )] TwoDeltaUnsupported, diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 3d717c5..3550e35 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -3,7 +3,7 @@ //! The ben32 intermediate format is used only by the Standard and MkvChain //! variants. TwoDelta streams use a separate columnar layout and bypass //! ben32 entirely — see [`XZAssignmentWriter`](crate::io::writer::XZAssignmentWriter) -//! and [`XZAssignmentReader`](crate::io::reader::XZAssignmentReader) for the +//! and [`BenStreamReader`](crate::io::reader::BenStreamReader) for the //! TwoDelta compressed-I/O path. mod errors; diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index c0b2824..b536eb4 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -21,9 +21,7 @@ pub mod writer; #[cfg(test)] mod tests; -pub use reader::{ - BendlReader, BundleAssignmentReader, BundleAssignmentReaderError, BundleValidationError, -}; +pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; pub use writer::{ AddAssetOptions, BendlStreamHandle, BendlWriteError, BendlWriter, BundleAssignmentSink, BundleAssignmentStreamCtx, diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index b2baf4d..0fb34f1 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -4,9 +4,9 @@ //! trailing directory table. It does not read any asset payload bytes //! until the caller explicitly requests them via [`BendlReader::asset_bytes`] //! or [`BendlReader::asset_reader`]. The assignment stream region is -//! likewise exposed as a byte range the caller can plumb into the -//! existing `AssignmentReader` / `XZAssignmentReader` without this module -//! reinterpreting any BEN/XBEN internals. +//! likewise exposed as a byte range the caller can plumb into a +//! [`BenStreamReader`] without this module reinterpreting any BEN/XBEN +//! internals. use std::io::{self, Read, Seek, SeekFrom, Take}; @@ -16,6 +16,16 @@ use super::format::{ standardized_name_for, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_XZ, }; +use crate::io::reader::{BenStreamReader, BenWireFormat}; + +impl From for BenWireFormat { + fn from(format: AssignmentFormat) -> Self { + match format { + AssignmentFormat::Ben => BenWireFormat::Ben, + AssignmentFormat::Xben => BenWireFormat::XBen, + } + } +} /// Reader for a single `.bendl` file. pub struct BendlReader { @@ -121,8 +131,9 @@ impl BendlReader { /// Return a `Take` reader positioned at the start of the assignment /// stream and limited to its declared length. The caller is expected - /// to wrap the returned reader in an `AssignmentReader` or - /// `XZAssignmentReader` as appropriate for `assignment_format()`. + /// to wrap the returned reader in a [`BenStreamReader`] (via + /// [`BendlReader::open_assignment_reader`] or directly) as + /// appropriate for [`BendlReader::assignment_format`]. pub fn assignment_stream_reader(&mut self) -> io::Result> { let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; @@ -130,33 +141,24 @@ impl BendlReader { } /// Construct the appropriate assignment decoder for the bundle's - /// declared `assignment_format` and return it as a - /// [`BundleAssignmentReader`] enum. - /// - /// - `AssignmentFormat::Ben` produces a - /// [`crate::io::reader::AssignmentReader`] over a `Take<&mut R>`. - /// - `AssignmentFormat::Xben` produces a - /// [`crate::io::reader::XZAssignmentReader`] over a `Take<&mut R>`. + /// declared `assignment_format` and return it as a [`BenStreamReader`] + /// over the bundle's bounded stream region. /// /// Returns an error if the header's `assignment_format` field is /// unrecognized or the embedded banner is malformed. pub fn open_assignment_reader( &mut self, - ) -> Result>, BundleAssignmentReaderError> { + ) -> Result>, BundleAssignmentReaderError> { let format = self.assignment_format().ok_or( BundleAssignmentReaderError::UnknownAssignmentFormat(self.header.assignment_format), )?; let stream = self.assignment_stream_reader()?; match format { AssignmentFormat::Ben => { - let inner = crate::io::reader::AssignmentReader::new(stream) - .map_err(BundleAssignmentReaderError::Decoder)?; - Ok(BundleAssignmentReader::Ben(inner)) + BenStreamReader::from_ben(stream).map_err(BundleAssignmentReaderError::Decoder) } AssignmentFormat::Xben => { - let inner = crate::io::reader::XZAssignmentReader::new(stream) - .map_err(BundleAssignmentReaderError::Decoder)?; - Ok(BundleAssignmentReader::Xben(inner)) + BenStreamReader::from_xben(stream).map_err(BundleAssignmentReaderError::Decoder) } } } @@ -227,31 +229,6 @@ pub(crate) fn validate_directory_entries( Ok(()) } -/// Either a BEN or an XBEN assignment decoder over a bundle's embedded -/// stream region. -/// -/// Both variants hold a `Take<&mut R>` reader limited to the stream -/// window declared in the bundle header, so they cannot accidentally -/// read into the trailing directory table. -pub enum BundleAssignmentReader { - /// The bundle carries an uncompressed BEN stream. - Ben(crate::io::reader::AssignmentReader), - /// The bundle carries an xz-compressed XBEN stream. - Xben(crate::io::reader::XZAssignmentReader), -} - -impl BundleAssignmentReader { - /// True when the reader is backed by a BEN stream. - pub fn is_ben(&self) -> bool { - matches!(self, BundleAssignmentReader::Ben(_)) - } - - /// True when the reader is backed by an XBEN stream. - pub fn is_xben(&self) -> bool { - matches!(self, BundleAssignmentReader::Xben(_)) - } -} - /// Errors raised by [`BendlReader::open_assignment_reader`]. #[derive(Debug, thiserror::Error)] pub enum BundleAssignmentReaderError { diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index aeb12f9..e449192 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -5,7 +5,8 @@ use crate::io::bundle::format::{ ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, }; -use crate::io::bundle::reader::{BendlReader, BundleAssignmentReader}; +use crate::io::bundle::reader::BendlReader; +use crate::io::reader::BenWireFormat; use crate::io::bundle::writer::{ AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter, }; @@ -522,11 +523,8 @@ fn write_ben_stream_round_trips_through_assignment_reader() { assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Ben)); let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner + assert_eq!(decoder.wire_format(), BenWireFormat::Ben); + let decoded: Vec> = decoder .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); @@ -564,11 +562,8 @@ fn write_xben_stream_round_trips_through_assignment_reader() { assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Xben(r) => r, - BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), - }; - let decoded: Vec> = inner + assert_eq!(decoder.wire_format(), BenWireFormat::XBen); + let decoded: Vec> = decoder .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); @@ -613,11 +608,8 @@ fn write_ben_stream_alongside_front_loaded_asset() { // Assignment stream is still intact after pulling asset bytes. let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner + assert_eq!(decoder.wire_format(), BenWireFormat::Ben); + let decoded: Vec> = decoder .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); @@ -641,9 +633,8 @@ fn open_assignment_reader_rejects_mismatched_format() { let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); - let decoder: BundleAssignmentReader<_> = reader.open_assignment_reader().unwrap(); - assert!(decoder.is_ben()); - assert!(!decoder.is_xben()); + let decoder = reader.open_assignment_reader().unwrap(); + assert_eq!(decoder.wire_format(), BenWireFormat::Ben); } // ----------------------------------------------------------------------- @@ -1221,11 +1212,8 @@ fn write_ben_stream_json_value_and_sample_count() { let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.sample_count(), Some(2)); let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Ben(r) => r, - BundleAssignmentReader::Xben(_) => panic!("expected Ben reader"), - }; - let decoded: Vec> = inner.silent(true).map(|r| r.unwrap().0).collect(); + assert_eq!(decoder.wire_format(), BenWireFormat::Ben); + let decoded: Vec> = decoder.silent(true).map(|r| r.unwrap().0).collect(); assert_eq!(decoded, vec![vec![1, 2, 3], vec![4, 5, 6]]); } @@ -1247,11 +1235,8 @@ fn write_xben_stream_json_value() { let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.sample_count(), Some(2)); let decoder = reader.open_assignment_reader().unwrap(); - let inner = match decoder { - BundleAssignmentReader::Xben(r) => r, - BundleAssignmentReader::Ben(_) => panic!("expected Xben reader"), - }; - let decoded: Vec> = inner.silent(true).map(|r| r.unwrap().0).collect(); + assert_eq!(decoder.wire_format(), BenWireFormat::XBen); + let decoded: Vec> = decoder.silent(true).map(|r| r.unwrap().0).collect(); assert_eq!(decoded, vec![vec![10, 20], vec![30, 40]]); } diff --git a/ben/src/io/reader/assignment_reader.rs b/ben/src/io/reader/assignment_reader.rs deleted file mode 100644 index c7a70ba..0000000 --- a/ben/src/io/reader/assignment_reader.rs +++ /dev/null @@ -1,325 +0,0 @@ -use super::errors::DecoderInitError; -use crate::codec::{BenDecodeFrame, BenEncodeFrame}; -use crate::format::banners::{variant_from_banner, BANNER_LEN}; -use crate::progress::Spinner; -use crate::BenVariant; -use serde_json::json; -use std::io::{self, Read, Write}; - -/// Iterator over decoded assignments in an uncompressed BEN stream. -pub struct AssignmentReader { - reader: R, - sample_count: usize, - variant: BenVariant, - previous_assignment: Option>, - twodelta_consumed_first_frame: bool, - silent: bool, - spinner: Option, -} - -fn zero_count_frame_error() -> io::Error { - io::Error::new( - io::ErrorKind::InvalidData, - "BEN frame count must be greater than zero", - ) -} - -impl AssignmentReader { - /// Create a decoder for an uncompressed BEN stream. - /// - /// The reader must begin with one of the BEN banners such as - /// `STANDARD BEN FILE` or `MKVCHAIN BEN FILE`. - /// - /// # Arguments - /// - /// * `reader` - The input BEN stream, including its 17-byte banner. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first BEN frame. - pub fn new(mut reader: R) -> Result { - let mut check_buffer = [0u8; BANNER_LEN]; - - if let Err(e) = reader.read_exact(&mut check_buffer) { - return Err(DecoderInitError::Io(e)); - } - - match variant_from_banner(&check_buffer) { - Some(variant) => Ok(AssignmentReader { - reader, - sample_count: 0, - variant, - previous_assignment: None, - twodelta_consumed_first_frame: false, - silent: false, - spinner: None, - }), - None => Err(DecoderInitError::InvalidFileFormat(check_buffer.to_vec())), - } - } - - /// Suppress progress output from this decoder's iterator. - pub fn silent(mut self, silent: bool) -> Self { - self.silent = silent; - if silent { - self.spinner = None; - } - self - } - - /// Return the BEN variant detected from the stream banner. - pub fn variant(&self) -> BenVariant { - self.variant - } - - /// Decode the remaining BEN stream and write it as JSONL. - /// - /// Each decoded sample is written as a JSON object containing an - /// `assignment` vector and a 1-based `sample` index. - pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - let mut sample_number = 0usize; - self.for_each_assignment(|assignment, count| { - for _ in 0..count { - sample_number += 1; - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - } - Ok(true) - }) - } - - /// Read the next frame from the underlying BEN stream. - /// - /// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire - /// format; this method tracks that state so the frame module stays - /// variant-clean. - /// - /// Returns `Some(Ok(...))` for the next frame, `Some(Err(...))` for a read - /// failure, or `None` at a clean end of stream. - fn pop_frame_from_reader(&mut self) -> Option> { - let read_variant = if self.variant == BenVariant::TwoDelta - && !self.twodelta_consumed_first_frame - { - self.twodelta_consumed_first_frame = true; - BenVariant::MkvChain - } else { - self.variant - }; - - BenDecodeFrame::from_reader(&mut self.reader, read_variant).transpose() - } - - /// Consume this decoder and iterate over raw BEN frames instead of - /// materialized assignments. - pub fn into_frames(self) -> AssignmentFrameReader { - AssignmentFrameReader { inner: self } - } - - /// Count the number of samples remaining in the BEN stream. - /// - /// Walks frame boundaries rather than expanding every assignment. - pub fn count_samples(self) -> io::Result { - let mut this = self; - let mut total = 0usize; - while let Some(frame_res) = this.pop_frame_from_reader() { - let count = frame_res?.count(); - if count == 0 { - return Err(zero_count_frame_error()); - } - total += count as usize; - } - Ok(total) - } - - /// Decode assignments and pass each one to a callback by reference. - /// - /// Unlike the `Iterator` implementation, this avoids cloning the assignment - /// buffer on every frame. The callback receives a borrowed slice and its - /// repetition count. Return `true` to continue or `false` to stop early. - pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> - where - F: FnMut(&[u16], u16) -> io::Result, - { - loop { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Err(e), - None => return Ok(()), - }; - - let count = frame.count(); - if count == 0 { - return Err(zero_count_frame_error()); - } - - let assignment = frame.expand(self.previous_assignment.take())?; - - let keep_going = f(&assignment, count)?; - self.previous_assignment = Some(assignment); - self.sample_count += count as usize; - if !self.silent { - self.spinner - .get_or_insert_with(|| Spinner::new("Decoding sample")) - .set_count(self.sample_count as u64); - } - if !keep_going { - return Ok(()); - } - } - } -} - -impl Iterator for AssignmentReader { - type Item = io::Result; - - fn next(&mut self) -> Option> { - let frame = match self.pop_frame_from_reader() { - Some(Ok(frame)) => frame, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; - let count = frame.count(); - if count == 0 { - return Some(Err(zero_count_frame_error())); - } - let assignment = match frame.expand(self.previous_assignment.take()) { - Ok(a) => a, - Err(e) => return Some(Err(e)), - }; - self.previous_assignment = Some(assignment.clone()); - self.sample_count += count as usize; - if !self.silent { - self.spinner - .get_or_insert_with(|| Spinner::new("Decoding sample")) - .set_count(self.sample_count as u64); - } - Some(Ok((assignment, count))) - } -} - -/// Iterator over raw BEN frames. -pub struct AssignmentFrameReader { - pub(super) inner: AssignmentReader, -} - -impl AssignmentFrameReader { - /// Create a raw BEN frame iterator from a reader. - pub fn new(reader: R) -> Result { - Ok(Self { - inner: AssignmentReader::new(reader)?, - }) - } -} - -impl Iterator for AssignmentFrameReader { - type Item = io::Result<(BenDecodeFrame, u16)>; - - /// Return the next raw BEN frame from the input stream paired with its - /// repetition count. - /// - /// For `Standard` and `MkvChain` streams, returns the frame as read off - /// the wire (with `count` taken from the frame for `MkvChain`, or `1` - /// for `Standard`). - /// - /// For `TwoDelta` streams, materializes each assignment via `expand` - /// and re-encodes it as a Standard-shaped decode frame so downstream - /// subsampling consumers always see self-contained frames. - fn next(&mut self) -> Option { - match self.inner.variant { - BenVariant::Standard | BenVariant::MkvChain => { - match self.inner.pop_frame_from_reader() { - Some(Ok(frame)) => { - let count = frame.count(); - if count == 0 { - return Some(Err(zero_count_frame_error())); - } - Some(Ok((frame, count))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } - BenVariant::TwoDelta => match self.inner.next() { - Some(Ok((assignment, count))) => { - let encoded = - BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); - let (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes) = match encoded { - BenEncodeFrame::Standard { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes, - .. - } => (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes), - _ => unreachable!( - "BenEncodeFrame::from_assignment(Standard) always returns Standard" - ), - }; - // Strip the 6-byte frame header so the emitted decode-side - // frame's raw_bytes matches the historical payload-only - // shape that BenDecodeFrame::Standard carries. - let payload_only = raw_bytes[6..].to_vec(); - Some(Ok(( - BenDecodeFrame::Standard { - max_val_bit_count, - max_len_bit_count, - n_bytes, - raw_bytes: payload_only, - }, - count, - ))) - } - Some(Err(err)) => Some(Err(err)), - None => None, - }, - } - } -} - -impl AssignmentReader { - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> super::subsample::SubsampleFrameDecoder< - impl Iterator> + Send, - > - where - T: IntoIterator, - { - let frames = self - .into_frames() - .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); - super::subsample::SubsampleFrameDecoder::by_indices(frames, indices) - } - - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> super::subsample::SubsampleFrameDecoder< - impl Iterator> + Send, - > { - let frames = self - .into_frames() - .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); - super::subsample::SubsampleFrameDecoder::by_range(frames, start, end) - } - - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> super::subsample::SubsampleFrameDecoder< - impl Iterator> + Send, - > { - let frames = self - .into_frames() - .map(|res| res.map(|(f, cnt)| (super::subsample::DecodeFrame::Ben(f), cnt))); - super::subsample::SubsampleFrameDecoder::every(frames, step, offset) - } -} diff --git a/ben/src/io/reader/mod.rs b/ben/src/io/reader/mod.rs index 25c70cc..89c92e5 100644 --- a/ben/src/io/reader/mod.rs +++ b/ben/src/io/reader/mod.rs @@ -1,16 +1,14 @@ -pub mod assignment_reader; pub mod errors; +mod stream_reader; pub mod subsample; #[cfg(test)] mod tests; pub(crate) mod twodelta; -pub mod xz_assignment_reader; -pub use assignment_reader::{AssignmentFrameReader, AssignmentReader}; pub use errors::DecoderInitError; +pub use stream_reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat}; pub use subsample::{ build_frame_iter, build_frame_iter_from_reader, count_samples_from_file, count_samples_from_frame_iter, Ben32Frame, DecodeFrame, FrameIter, MkvRecord, Selection, SubsampleFrameDecoder, }; -pub use xz_assignment_reader::{XZAssignmentFrameReader, XZAssignmentReader}; diff --git a/ben/src/io/reader/stream_reader/ben.rs b/ben/src/io/reader/stream_reader/ben.rs new file mode 100644 index 0000000..8cf95ac --- /dev/null +++ b/ben/src/io/reader/stream_reader/ben.rs @@ -0,0 +1,122 @@ +//! Plain-BEN iteration logic for the unified stream reader. + +use std::io::{self, Read}; + +use super::zero_count_frame_error; +use crate::codec::BenDecodeFrame; +use crate::io::reader::subsample::MkvRecord; +use crate::progress::Spinner; +use crate::BenVariant; + +/// Read the next frame from the underlying BEN stream. +/// +/// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire +/// format; this helper tracks that state so the frame module stays +/// variant-clean. +pub(super) fn pop_frame_from_reader( + reader: &mut R, + variant: BenVariant, + twodelta_consumed_first_frame: &mut bool, +) -> Option> { + let read_variant = if variant == BenVariant::TwoDelta && !*twodelta_consumed_first_frame { + *twodelta_consumed_first_frame = true; + BenVariant::MkvChain + } else { + variant + }; + + BenDecodeFrame::from_reader(reader, read_variant).transpose() +} + +#[allow(clippy::too_many_arguments)] +pub(super) fn for_each_assignment_ben( + reader: &mut R, + variant: BenVariant, + previous_assignment: &mut Option>, + twodelta_consumed_first_frame: &mut bool, + sample_count: &mut usize, + spinner: &mut Option, + silent: bool, + mut f: F, +) -> io::Result<()> +where + F: FnMut(&[u16], u16) -> io::Result, +{ + loop { + let frame = match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + + let count = frame.count(); + if count == 0 { + return Err(zero_count_frame_error("BEN")); + } + + let assignment = frame.expand(previous_assignment.take())?; + + let keep_going = f(&assignment, count)?; + *previous_assignment = Some(assignment); + *sample_count += count as usize; + if !silent { + spinner + .get_or_insert_with(|| Spinner::new("Decoding sample")) + .set_count(*sample_count as u64); + } + if !keep_going { + return Ok(()); + } + } +} + +#[allow(clippy::too_many_arguments)] +pub(super) fn next_record_ben( + reader: &mut R, + variant: BenVariant, + previous_assignment: &mut Option>, + twodelta_consumed_first_frame: &mut bool, + sample_count: &mut usize, + spinner: &mut Option, + silent: bool, +) -> Option> { + let frame = match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + Some(Ok(frame)) => frame, + Some(Err(e)) => return Some(Err(e)), + None => return None, + }; + let count = frame.count(); + if count == 0 { + return Some(Err(zero_count_frame_error("BEN"))); + } + let assignment = match frame.expand(previous_assignment.take()) { + Ok(a) => a, + Err(e) => return Some(Err(e)), + }; + *previous_assignment = Some(assignment.clone()); + *sample_count += count as usize; + if !silent { + spinner + .get_or_insert_with(|| Spinner::new("Decoding sample")) + .set_count(*sample_count as u64); + } + Some(Ok((assignment, count))) +} + +pub(super) fn count_samples_ben( + mut reader: R, + variant: BenVariant, +) -> io::Result { + let mut twodelta_consumed_first_frame = false; + let mut total = 0usize; + while let Some(frame_res) = + pop_frame_from_reader(&mut reader, variant, &mut twodelta_consumed_first_frame) + { + let count = frame_res?.count(); + if count == 0 { + return Err(zero_count_frame_error("BEN")); + } + total += count as usize; + } + Ok(total) +} diff --git a/ben/src/io/reader/stream_reader/frames.rs b/ben/src/io/reader/stream_reader/frames.rs new file mode 100644 index 0000000..fdc2c49 --- /dev/null +++ b/ben/src/io/reader/stream_reader/frames.rs @@ -0,0 +1,180 @@ +//! Raw-frame iterator surface over the unified stream reader. + +use std::io::{self, Read}; + +use super::ben::pop_frame_from_reader; +use super::xben::pop_frame_from_overflow; +use super::{zero_count_frame_error, BenStreamInner, BenStreamReader, XBenInner}; +use crate::codec::encode::encode_ben32_assignments; +use crate::codec::{BenDecodeFrame, BenEncodeFrame}; +use crate::io::reader::errors::DecoderInitError; +use crate::io::reader::subsample::DecodeFrame; +use crate::BenVariant; + +/// Iterator over raw frames from a [`BenStreamReader`]. +/// +/// In the BEN arm: `Standard` and `MkvChain` frames are yielded as read off +/// the wire; `TwoDelta` frames are materialized as assignments and re-encoded +/// as `Standard` decode frames so downstream subsample consumers always see +/// self-contained frames. +/// +/// In the XBEN arm: `Standard` and `MkvChain` frames are yielded as raw +/// ben32 byte slices with their repetition count; `TwoDelta` chunks are +/// materialized to assignments and re-encoded as ben32 frames. +pub struct BenStreamFrameReader { + inner: BenStreamReader, +} + +impl BenStreamFrameReader { + /// Create a raw frame iterator from a plain BEN stream. + pub fn from_ben(reader: R) -> Result { + Ok(Self { + inner: BenStreamReader::from_ben(reader)?, + }) + } + + /// Create a raw frame iterator from an XBEN stream. + pub fn from_xben(reader: R) -> Result { + Ok(Self { + inner: BenStreamReader::from_xben(reader)?, + }) + } + + pub(super) fn from_stream(inner: BenStreamReader) -> Self { + Self { inner } + } + + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.inner.variant() + } + + /// Return the wire format of the underlying stream. + pub fn wire_format(&self) -> super::BenWireFormat { + self.inner.wire_format() + } +} + +impl Iterator for BenStreamFrameReader { + type Item = io::Result<(DecodeFrame, u16)>; + + fn next(&mut self) -> Option { + let variant = self.inner.variant(); + let silent = self.inner.is_silent(); + match self.inner.inner_mut() { + BenStreamInner::Ben { + reader, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + } => match variant { + BenVariant::Standard | BenVariant::MkvChain => { + match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + Some(Ok(frame)) => { + let count = frame.count(); + if count == 0 { + return Some(Err(zero_count_frame_error("BEN"))); + } + Some(Ok((DecodeFrame::Ben(frame), count))) + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } + BenVariant::TwoDelta => { + match super::ben::next_record_ben( + reader, + variant, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + silent, + ) { + Some(Ok((assignment, count))) => { + let encoded = BenEncodeFrame::from_assignment( + &assignment, + BenVariant::Standard, + None, + ); + let (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes) = + match encoded { + BenEncodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + .. + } => { + (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes) + } + _ => unreachable!( + "BenEncodeFrame::from_assignment(Standard) always returns Standard" + ), + }; + // Strip the 6-byte frame header so the emitted decode-side + // frame's raw_bytes matches the historical payload-only + // shape that BenDecodeFrame::Standard carries. + let payload_only = raw_bytes[6..].to_vec(); + Some(Ok(( + DecodeFrame::Ben(BenDecodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes: payload_only, + }), + count, + ))) + } + Some(Err(err)) => Some(Err(err)), + None => None, + } + } + }, + BenStreamInner::XBen(inner) => next_frame_xben(inner, variant) + .map(|res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))), + } + } +} + +/// Pull the next raw ben32 frame from an XBEN inner state. +/// +/// For TwoDelta streams the underlying chunk is materialized via the record +/// iterator and re-encoded as a self-contained ben32 frame. +pub(super) fn next_frame_xben( + inner: &mut XBenInner, + variant: BenVariant, +) -> Option, u16)>> { + if variant == BenVariant::TwoDelta { + return super::xben::next_record_xben(inner, variant).map(|res| { + res.and_then(|(assignment, count)| Ok((encode_ben32_assignments(&assignment)?, count))) + }); + } + + use crate::codec::decode::DecodeError; + loop { + if let Some((frame, consumed, count)) = pop_frame_from_overflow(variant, &inner.overflow) { + if count == 0 { + inner.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error("XBEN"))); + } + let out = frame.to_vec(); + inner.overflow.drain(..consumed); + return Some(Ok((out, count))); + } + + let read = match inner.xz.read(&mut inner.buf) { + Ok(0) => { + if inner.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + inner.overflow.extend_from_slice(&inner.buf[..read]); + } +} diff --git a/ben/src/io/reader/stream_reader/mod.rs b/ben/src/io/reader/stream_reader/mod.rs new file mode 100644 index 0000000..ab9a943 --- /dev/null +++ b/ben/src/io/reader/stream_reader/mod.rs @@ -0,0 +1,315 @@ +//! Unified reader for the BEN-stack stream layer (layer 3 — see +//! `docs/glossary.md`). +//! +//! Hides the wire-format choice (BEN bit-packed vs ben32 columnar) and the +//! transport choice (plain vs xz-compressed) behind one type. The decode-side +//! laziness invariant is preserved on both wire formats: frame payload bytes +//! stay opaque until [`crate::codec::BenDecodeFrame::expand`] (frame-level +//! decode), not to be confused with +//! [`crate::io::reader::DecodeFrame::expand_self_contained`] (subsample-level). + +mod ben; +mod frames; +mod xben; + +use std::collections::VecDeque; +use std::io::{self, BufReader, Read, Write}; + +use serde_json::json; +use xz2::read::XzDecoder; + +use super::errors::DecoderInitError; +use super::subsample::{MkvRecord, SubsampleFrameDecoder}; +use crate::format::banners::{variant_from_banner, BANNER_LEN}; +use crate::progress::Spinner; +use crate::BenVariant; + +pub use frames::BenStreamFrameReader; + +/// Wire format of a BEN-stack stream. +/// +/// The Rust representation of the BEN/XBEN stream choice. This is the seam +/// the public reader API uses to dispatch on wire format; the bundle layer +/// owns its own conversion from `AssignmentFormat`. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum BenWireFormat { + /// Plain BEN bit-packed wire format over an unwrapped byte stream. + Ben, + /// BEN32 columnar / TwoDelta wire format over an xz-compressed byte stream. + XBen, +} + +/// Reader for an encoded BEN-stack stream of samples. +/// +/// Construct with [`BenStreamReader::from_ben`] or +/// [`BenStreamReader::from_xben`]. Both arms expose the same downstream +/// surface for assignment iteration, JSONL writing, sample counting, and +/// subsampling. +pub struct BenStreamReader { + inner: BenStreamInner, + variant: BenVariant, + silent: bool, +} + +/// Wire-format split: the `Ben` arm carries inline state, the `XBen` arm is +/// boxed so the enum's static size stays close to the smaller plain-BEN +/// footprint instead of being dictated by the larger xz state. +pub(crate) enum BenStreamInner { + Ben { + reader: R, + previous_assignment: Option>, + twodelta_consumed_first_frame: bool, + sample_count: usize, + spinner: Option, + }, + XBen(Box>), +} + +/// Decompressed-stream state for the `XBen` arm. +pub(crate) struct XBenInner { + pub(crate) xz: BufReader>, + pub(crate) overflow: Vec, + pub(crate) buf: Box<[u8]>, + pub(crate) previous_assignment: Option>, + pub(crate) chunk_queue: VecDeque<((u16, u16), Vec, u16)>, +} + +pub(super) fn zero_count_frame_error(label: &'static str) -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + format!("{label} frame count must be greater than zero"), + ) +} + +impl BenStreamReader { + /// Open a plain BEN stream. The reader must begin with a 17-byte BEN banner. + pub fn from_ben(mut reader: R) -> Result { + let mut check_buffer = [0u8; BANNER_LEN]; + if let Err(e) = reader.read_exact(&mut check_buffer) { + return Err(DecoderInitError::Io(e)); + } + let variant = variant_from_banner(&check_buffer) + .ok_or_else(|| DecoderInitError::InvalidFileFormat(check_buffer.to_vec()))?; + Ok(Self { + inner: BenStreamInner::Ben { + reader, + previous_assignment: None, + twodelta_consumed_first_frame: false, + sample_count: 0, + spinner: None, + }, + variant, + silent: false, + }) + } + + /// Open an XBEN stream. The reader must produce, after xz decompression, + /// a 17-byte BEN banner followed by ben32 columnar frames. + pub fn from_xben(reader: R) -> Result { + let xz = XzDecoder::new(reader); + let mut xz = BufReader::with_capacity(1 << 20, xz); + + let mut first = [0u8; BANNER_LEN]; + if let Err(e) = xz.read_exact(&mut first) { + return Err(DecoderInitError::Io(e)); + } + let variant = variant_from_banner(&first) + .ok_or_else(|| DecoderInitError::InvalidFileFormat(first.to_vec()))?; + + Ok(Self::from_xben_decompressed(xz, variant)) + } + + /// Build from a decompressed XBEN stream already positioned past the + /// 17-byte BEN banner. + pub(crate) fn from_xben_decompressed( + xz: BufReader>, + variant: BenVariant, + ) -> Self { + Self { + inner: BenStreamInner::XBen(Box::new(XBenInner { + xz, + overflow: Vec::with_capacity(1 << 20), + buf: vec![0u8; 1 << 20].into_boxed_slice(), + previous_assignment: None, + chunk_queue: VecDeque::new(), + })), + variant, + silent: false, + } + } + + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.variant + } + + /// Return the wire format (BEN vs XBEN) of this stream. + pub fn wire_format(&self) -> BenWireFormat { + match &self.inner { + BenStreamInner::Ben { .. } => BenWireFormat::Ben, + BenStreamInner::XBen(_) => BenWireFormat::XBen, + } + } + + /// Suppress progress output from this decoder's iteration paths. + /// + /// In the `Ben` arm, this clears any active spinner. In the `XBen` arm, + /// `for_each_assignment` consults `silent` before creating its local + /// spinner. + pub fn silent(mut self, silent: bool) -> Self { + self.silent = silent; + if let BenStreamInner::Ben { + spinner, .. + } = &mut self.inner + { + if silent { + *spinner = None; + } + } + self + } + + /// Whether this reader is in silent mode. + pub(crate) fn is_silent(&self) -> bool { + self.silent + } + + pub(crate) fn inner_mut(&mut self) -> &mut BenStreamInner { + &mut self.inner + } + + /// Consume this decoder and iterate over raw BEN/ben32 frames instead of + /// materialized assignments. + pub fn into_frames(self) -> BenStreamFrameReader { + BenStreamFrameReader::from_stream(self) + } + + /// Count the number of samples remaining in the stream. + /// + /// Walks frame boundaries rather than expanding every assignment. + pub fn count_samples(self) -> io::Result { + let variant = self.variant; + match self.inner { + BenStreamInner::Ben { reader, .. } => { + ben::count_samples_ben(reader, variant) + } + BenStreamInner::XBen(inner) => xben::count_samples_xben(*inner, variant), + } + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// Unlike [`Iterator`], this avoids cloning the assignment buffer on every + /// frame. The callback receives a borrowed slice and its repetition + /// count. Return `true` to continue or `false` to stop early. + pub fn for_each_assignment(&mut self, f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + let silent = self.silent; + let variant = self.variant; + match &mut self.inner { + BenStreamInner::Ben { + reader, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + } => ben::for_each_assignment_ben( + reader, + variant, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + silent, + f, + ), + BenStreamInner::XBen(inner) => { + xben::for_each_assignment_xben(inner, variant, silent, f) + } + } + } + + /// Decode the remaining stream and write it as JSONL. + /// + /// Each decoded sample is written as a JSON object containing an + /// `assignment` vector and a 1-based `sample` index. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + Ok(true) + }) + } +} + +impl Iterator for BenStreamReader { + type Item = io::Result; + + fn next(&mut self) -> Option { + let silent = self.silent; + let variant = self.variant; + match &mut self.inner { + BenStreamInner::Ben { + reader, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + } => ben::next_record_ben( + reader, + variant, + previous_assignment, + twodelta_consumed_first_frame, + sample_count, + spinner, + silent, + ), + BenStreamInner::XBen(inner) => xben::next_record_xben(inner, variant), + } + } +} + +impl BenStreamReader { + /// Convert this decoder into a subsampling iterator over explicit 1-based + /// indices. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder> + where + T: IntoIterator, + { + SubsampleFrameDecoder::by_indices(self.into_frames(), indices) + } + + /// Convert this decoder into a subsampling iterator over the inclusive + /// 1-based range `[start, end]`. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder> { + SubsampleFrameDecoder::by_range(self.into_frames(), start, end) + } + + /// Convert this decoder into a subsampling iterator that selects every + /// `step` samples from the 1-based `offset`. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder> { + SubsampleFrameDecoder::every(self.into_frames(), step, offset) + } +} diff --git a/ben/src/io/reader/stream_reader/xben.rs b/ben/src/io/reader/stream_reader/xben.rs new file mode 100644 index 0000000..55fbf70 --- /dev/null +++ b/ben/src/io/reader/stream_reader/xben.rs @@ -0,0 +1,296 @@ +//! XBEN iteration logic for the unified stream reader. + +use std::io::{self, Cursor, Read}; + +use super::{zero_count_frame_error, XBenInner}; +use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; +use crate::io::reader::subsample::MkvRecord; +use crate::io::reader::twodelta::{XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; +use crate::progress::Spinner; +use crate::util::rle::rle_to_vec; +use crate::BenVariant; + +/// Try to extract one complete ben32 frame from the buffered overflow. +/// +/// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 +/// frame and, for MkvChain streams, reads the trailing repetition count. +pub(super) fn pop_frame_from_overflow<'a>( + variant: BenVariant, + overflow: &'a [u8], +) -> Option<(&'a [u8], usize, u16)> { + if variant == BenVariant::Standard { + if overflow.len() < 4 { + return None; + } + for i in (3..overflow.len()).step_by(4) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let end = i + 1; + let frame = &overflow[..end]; + return Some((frame, end, 1)); + } + } + None + } else { + if overflow.len() < 6 { + return None; + } + for i in (3..overflow.len().saturating_sub(2)).step_by(2) { + if overflow[i - 3..=i] == [0, 0, 0, 0] { + let count_hi = overflow[i + 1]; + let count_lo = overflow[i + 2]; + let count = u16::from_be_bytes([count_hi, count_lo]); + let end = i + 3; + let frame = &overflow[..end]; + return Some((frame, end, count)); + } + } + None + } +} + +/// Try to extract one complete TwoDelta frame from the buffered overflow. +fn pop_twodelta_frame_from_overflow( + overflow: &[u8], +) -> Option, usize, u16)>> { + let tag = *overflow.first()?; + match tag { + XBEN_TWODELTA_FULL_TAG => { + if overflow.len() < 7 { + return None; + } + let run_count = + u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) as usize; + let payload_len = run_count * 4; + let total_len = 1 + 4 + payload_len + 2; + if overflow.len() < total_len { + return None; + } + + let mut runs = Vec::with_capacity(run_count); + let mut cursor = 5usize; + for _ in 0..run_count { + let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); + runs.push((value, len)); + cursor += 4; + } + let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); + Some(Ok((runs, total_len, count))) + } + XBEN_TWODELTA_CHUNK_TAG => None, + _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { + tag, + }))), + } +} + +/// Try to parse a columnar TwoDelta chunk from the overflow buffer. +/// +/// If the overflow starts with the chunk tag and contains enough bytes for +/// the full chunk, all frames are decoded and pushed onto `chunk_queue`. +/// Returns `true` on success, `false` when the overflow is incomplete. +fn try_parse_twodelta_chunk(inner: &mut XBenInner) -> bool { + if inner.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { + return false; + } + if inner.overflow.len() < 5 { + return false; + } + + let n_frames = u32::from_be_bytes([ + inner.overflow[1], + inner.overflow[2], + inner.overflow[3], + inner.overflow[4], + ]) as usize; + + let header_len: usize = 5; + let pairs_len = n_frames * 4; + let counts_len = n_frames * 2; + let run_counts_len = n_frames * 4; + let fixed_len = header_len + pairs_len + counts_len + run_counts_len; + + if inner.overflow.len() < fixed_len { + return false; + } + + let run_counts_start = header_len + pairs_len + counts_len; + let mut total_runs = 0usize; + let mut run_counts = Vec::with_capacity(n_frames); + for i in 0..n_frames { + let offset = run_counts_start + i * 4; + let rc = u32::from_be_bytes([ + inner.overflow[offset], + inner.overflow[offset + 1], + inner.overflow[offset + 2], + inner.overflow[offset + 3], + ]) as usize; + run_counts.push(rc); + total_runs += rc; + } + + let run_data_len = total_runs * 2; + let total_len = fixed_len + run_data_len; + if inner.overflow.len() < total_len { + return false; + } + + let pairs_start = header_len; + let counts_start = pairs_start + pairs_len; + let run_data_start = run_counts_start + run_counts_len; + + let mut run_cursor = run_data_start; + for i in 0..n_frames { + let po = pairs_start + i * 4; + let pair = ( + u16::from_be_bytes([inner.overflow[po], inner.overflow[po + 1]]), + u16::from_be_bytes([inner.overflow[po + 2], inner.overflow[po + 3]]), + ); + let co = counts_start + i * 2; + let count = u16::from_be_bytes([inner.overflow[co], inner.overflow[co + 1]]); + + let rc = run_counts[i]; + let mut run_lengths = Vec::with_capacity(rc); + for _ in 0..rc { + run_lengths.push(u16::from_be_bytes([ + inner.overflow[run_cursor], + inner.overflow[run_cursor + 1], + ])); + run_cursor += 2; + } + + inner.chunk_queue.push_back((pair, run_lengths, count)); + } + + inner.overflow.drain(..total_len); + true +} + +/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. +fn decode_xben_frame_to_assignment(frame_bytes: &[u8], variant: BenVariant) -> io::Result> { + let (assignment, _) = decode_ben32_line(Cursor::new(frame_bytes), variant)?; + Ok(assignment) +} + +pub(super) fn next_record_xben( + inner: &mut XBenInner, + variant: BenVariant, +) -> Option> { + loop { + match variant { + BenVariant::Standard | BenVariant::MkvChain => { + if let Some((frame_bytes, consumed, count)) = + pop_frame_from_overflow(variant, &inner.overflow) + { + if count == 0 { + inner.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error("XBEN"))); + } + let assignment = decode_xben_frame_to_assignment(frame_bytes, variant) + .expect("complete frame from pop_frame_from_overflow"); + inner.previous_assignment = Some(assignment.clone()); + inner.overflow.drain(..consumed); + return Some(Ok((assignment, count))); + } + } + BenVariant::TwoDelta => { + if let Some((pair, run_lengths, count)) = inner.chunk_queue.pop_front() { + if count == 0 { + return Some(Err(zero_count_frame_error("XBEN"))); + } + let assignment = match inner.previous_assignment.take() { + Some(prev) => apply_twodelta_runs_to_assignment(prev, pair, &run_lengths), + None => Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)), + }; + return Some(match assignment { + Ok(a) => { + inner.previous_assignment = Some(a.clone()); + Ok((a, count)) + } + Err(e) => Err(e), + }); + } + + if try_parse_twodelta_chunk(inner) { + continue; + } + + if let Some(parsed) = pop_twodelta_frame_from_overflow(&inner.overflow) { + let res = match parsed { + Ok((runs, consumed, count)) => { + if count == 0 { + inner.overflow.drain(..consumed); + return Some(Err(zero_count_frame_error("XBEN"))); + } + let assignment = rle_to_vec(runs); + inner.previous_assignment = Some(assignment.clone()); + inner.overflow.drain(..consumed); + Ok((assignment, count)) + } + Err(err) => { + inner.overflow.clear(); + Err(err) + } + }; + return Some(res); + } + } + } + + let read = match inner.xz.read(&mut inner.buf) { + Ok(0) => { + if inner.overflow.is_empty() { + return None; + } else { + return Some(Err(io::Error::from(DecodeError::XBenTruncated))); + } + } + Ok(n) => n, + Err(e) => return Some(Err(e)), + }; + inner.overflow.extend_from_slice(&inner.buf[..read]); + } +} + +pub(super) fn for_each_assignment_xben( + inner: &mut XBenInner, + variant: BenVariant, + silent: bool, + mut f: F, +) -> io::Result<()> +where + F: FnMut(&[u16], u16) -> io::Result, +{ + let mut sample_count = 0usize; + let spinner = (!silent).then(|| Spinner::new("Decoding sample")); + loop { + match next_record_xben(inner, variant) { + Some(Ok((assignment, count))) => { + sample_count += count as usize; + if let Some(spinner) = &spinner { + spinner.set_count(sample_count as u64); + } + let keep_going = f(&assignment, count)?; + if !keep_going { + return Ok(()); + } + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + } + } +} + +pub(super) fn count_samples_xben( + inner: XBenInner, + variant: BenVariant, +) -> io::Result { + use super::frames::next_frame_xben; + let mut inner = inner; + let mut total = 0usize; + while let Some(item) = next_frame_xben(&mut inner, variant) { + let (_bytes, cnt) = item?; + total += cnt as usize; + } + Ok(total) +} diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index 62e1532..a1de190 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -1,11 +1,9 @@ -use super::assignment_reader::AssignmentFrameReader; -use super::errors::DecoderInitError; -use super::xz_assignment_reader::decode_xben_frame_to_assignment; -use super::xz_assignment_reader::XZAssignmentReader; +use super::stream_reader::{BenStreamFrameReader, BenWireFormat}; +use crate::codec::decode::decode_ben32_line; use crate::codec::BenDecodeFrame; use crate::BenVariant; use std::fs::File; -use std::io::{self, BufReader, Read}; +use std::io::{self, BufReader, Cursor, Read}; use std::iter::Peekable; use std::path::{Path, PathBuf}; @@ -16,7 +14,7 @@ pub type Ben32Frame = (Vec, u16); /// A boxed iterator over generic BEN/XBEN frames used by subsampling helpers. pub type FrameIter = Box> + Send>; -#[derive(Clone)] +#[derive(Clone, Debug)] /// A generalized frame type used by the subsampling machinery. pub enum DecodeFrame { /// A raw BEN frame. @@ -25,6 +23,26 @@ pub enum DecodeFrame { XBen(Vec, BenVariant), } +impl DecodeFrame { + /// Expand a self-contained subsample frame into an assignment vector. + /// + /// Distinct from [`BenDecodeFrame::expand`] (which takes a previous + /// assignment for delta variants); the frame readers guarantee frames + /// reaching the subsample path are self-contained, so no `prev` is + /// needed: plain-BEN TwoDelta is materialized and re-encoded as + /// `Standard`, and XBEN TwoDelta is materialized and re-encoded as + /// ben32. + pub fn expand_self_contained(&self) -> io::Result> { + match self { + DecodeFrame::Ben(f) => f.expand(None), + DecodeFrame::XBen(bytes, variant) => { + let (assignment, _) = decode_ben32_line(Cursor::new(bytes), *variant)?; + Ok(assignment) + } + } + } +} + /// A selection strategy for extracting only part of a frame stream. pub enum Selection { /// Select explicit 1-based indices. @@ -35,26 +53,6 @@ pub enum Selection { Range { start: usize, end: usize }, } -/// Decode a generic frame into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame` - Either a BEN frame or an XBEN ben32 frame. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -/// -/// `AssignmentFrameReader` rewrites TwoDelta BEN frames into self-contained -/// Standard frames before they reach this path, so `Ben(...)` is always a -/// `Standard` or `MkvChain` arm and `expand(None)` is always sufficient here. -pub(super) fn decode_frame_to_assignment(frame: &DecodeFrame) -> io::Result> { - match frame { - DecodeFrame::Ben(f) => f.expand(None), - DecodeFrame::XBen(bytes, variant) => decode_xben_frame_to_assignment(bytes, *variant), - } -} - /// Iterator adaptor that decodes only selected samples from a frame stream. pub struct SubsampleFrameDecoder where @@ -70,15 +68,6 @@ where I: Iterator>, { /// Create a subsampling iterator from a lower-level frame iterator. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `selection` - The sample-selection rule to apply. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. pub fn new(inner: I, selection: Selection) -> Self { Self { inner, @@ -90,15 +79,6 @@ where /// Select a set of 1-based sample indices. /// /// Indices are sorted and deduplicated before iteration begins. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. pub fn by_indices(inner: I, indices: T) -> Self where T: IntoIterator, @@ -110,16 +90,6 @@ where } /// Select the inclusive 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. pub fn by_range(inner: I, start: usize, end: usize) -> Self { assert!( start >= 1 && end >= start, @@ -129,31 +99,12 @@ where } /// Select every `step` samples beginning from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `inner` - The source iterator yielding frames and repetition counts. - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. pub fn every(inner: I, step: usize, offset: usize) -> Self { assert!(step >= 1 && offset >= 1, "step and offset must be >= 1"); Self::new(inner, Selection::Every { step, offset }) } /// Count how many selected samples fall within an inclusive sample interval. - /// - /// # Arguments - /// - /// * `lo` - The first 1-based sample index covered by the current frame. - /// * `hi` - The last 1-based sample index covered by the current frame. - /// - /// # Returns - /// - /// Returns the number of selected samples represented by the frame. fn count_selected_in(&mut self, lo: usize, hi: usize) -> u16 { match &mut self.selection { Selection::Indices(iter) => { @@ -235,7 +186,7 @@ where self.sample = hi; if selected > 0 { - match decode_frame_to_assignment(&frame) { + match frame.expand_self_contained() { Ok(assignment) => return Some(Ok((assignment, selected))), Err(e) => return Some(Err(e)), } @@ -248,19 +199,10 @@ where /// /// Frame iteration is useful for subsampling and counting because it avoids /// decoding every sample into a full assignment vector. -/// -/// # Arguments -/// -/// * `file_path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns a boxed iterator over generic frames and their repetition counts. -pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result { +pub fn build_frame_iter(file_path: &PathBuf, format: BenWireFormat) -> io::Result { let file = File::options().read(true).open(file_path)?; let reader = BufReader::new(file); - build_frame_iter_from_reader(reader, mode) + build_frame_iter_from_reader(reader, format) } /// Build a generic frame iterator from an already-opened reader. @@ -270,37 +212,19 @@ pub fn build_frame_iter(file_path: &PathBuf, mode: &str) -> io::Result( reader: R, - mode: &str, + format: BenWireFormat, ) -> io::Result { - match mode { - "ben" => { - let frames = AssignmentFrameReader::new(reader)?; - let mapped = frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt))); - Ok(Box::new(mapped)) + match format { + BenWireFormat::Ben => { + let frames = BenStreamFrameReader::from_ben(reader).map_err(io::Error::from)?; + Ok(Box::new(frames)) } - "xben" => { - let x = XZAssignmentReader::new(reader)?; - let variant = x.variant(); - let frames = x.into_frames(); - let mapped = frames - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - Ok(Box::new(mapped)) + BenWireFormat::XBen => { + let frames = BenStreamFrameReader::from_xben(reader).map_err(io::Error::from)?; + Ok(Box::new(frames)) } - _ => Err(io::Error::from(DecoderInitError::UnknownMode { - mode: mode.to_string(), - })), } } @@ -308,17 +232,8 @@ pub fn build_frame_iter_from_reader( /// /// The file is walked frame-by-frame, so this is linear in file size but avoids /// materializing full assignment vectors. -/// -/// # Arguments -/// -/// * `path` - Path to a `.ben` or `.xben` file. -/// * `mode` - Either `"ben"` or `"xben"`. -/// -/// # Returns -/// -/// Returns the number of samples in the file. -pub fn count_samples_from_file(path: &Path, mode: &str) -> io::Result { - let iter = build_frame_iter(&path.to_path_buf(), mode)?; +pub fn count_samples_from_file(path: &Path, format: BenWireFormat) -> io::Result { + let iter = build_frame_iter(&path.to_path_buf(), format)?; count_samples_from_frame_iter(iter) } @@ -336,3 +251,4 @@ pub fn count_samples_from_frame_iter(iter: FrameIter) -> io::Result { } Ok(total) } + diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 32efd05..1dfc156 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -1,7 +1,7 @@ use crate::codec::encode::encode_jsonl_to_xben; use crate::io::reader::errors::DecoderInitError; use crate::io::reader::subsample::{DecodeFrame, Selection, SubsampleFrameDecoder}; -use crate::io::reader::{XZAssignmentFrameReader, XZAssignmentReader}; +use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat}; use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; use std::io::{self, Cursor, Write}; @@ -27,7 +27,7 @@ fn make_xben_from_assignments(assignments: &[Vec], variant: BenVariant) -> xben } -// ── XZAssignmentReader ────────────────────────────────────────────────────── +// ── BenStreamReader ────────────────────────────────────────────────────── #[test] fn xz_reader_standard_iterator() { @@ -35,7 +35,7 @@ fn xz_reader_standard_iterator() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.variant(), BenVariant::Standard); let results: Vec<_> = reader.collect(); assert_eq!(results.len(), 2); @@ -51,7 +51,7 @@ fn xz_reader_mkv_iterator() { {"assignment":[2,2,1,1],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::MkvChain); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.variant(), BenVariant::MkvChain); let results: Vec<_> = reader.collect(); // MkvChain collapses identical consecutive assignments @@ -64,7 +64,7 @@ fn xz_reader_mkv_iterator() { fn xz_reader_twodelta_iterator() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.variant(), BenVariant::TwoDelta); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); @@ -77,7 +77,7 @@ fn xz_reader_count_samples_standard() { {"assignment":[1,2,1,2],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 3); } @@ -88,7 +88,7 @@ fn xz_reader_count_samples_mkv() { {"assignment":[2,2,1,1],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::MkvChain); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 3); } @@ -97,7 +97,7 @@ fn xz_reader_silent_suppresses_output() { let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)) + let reader = BenStreamReader::from_xben(Cursor::new(xben)) .unwrap() .silent(true); let results: Vec<_> = reader.collect(); @@ -110,7 +110,7 @@ fn xz_reader_for_each_assignment() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut collected = Vec::new(); reader .for_each_assignment(|assignment, count| { @@ -130,7 +130,7 @@ fn xz_reader_for_each_assignment_early_stop() { {"assignment":[3,3,3,3],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut collected = Vec::new(); reader .for_each_assignment(|assignment, _count| { @@ -147,7 +147,7 @@ fn xz_reader_write_all_jsonl() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -165,7 +165,7 @@ fn xz_reader_write_all_jsonl_mkv_expands_counts() { {"assignment":[2,2,1,1],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::MkvChain); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -179,11 +179,15 @@ fn xz_reader_into_frames_standard() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.into_frames().collect(); assert_eq!(frames.len(), 2); for f in &frames { - let (bytes, count) = f.as_ref().unwrap(); + let (frame, count) = f.as_ref().unwrap(); + let bytes = match frame { + DecodeFrame::XBen(b, _) => b, + DecodeFrame::Ben(_) => panic!("xben frame iterator yielded BEN arm"), + }; assert!(!bytes.is_empty()); assert_eq!(*count, 1); } @@ -193,7 +197,7 @@ fn xz_reader_into_frames_standard() { fn xz_reader_into_frames_twodelta() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.into_frames().collect(); assert_eq!(frames.len(), 2); } @@ -203,7 +207,7 @@ fn xz_frame_reader_new() { let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamFrameReader::from_xben(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.collect(); assert_eq!(frames.len(), 1); } @@ -211,11 +215,11 @@ fn xz_frame_reader_new() { #[test] fn xz_reader_new_rejects_invalid_data() { let garbage = vec![0u8; 100]; - let result = XZAssignmentReader::new(Cursor::new(garbage)); + let result = BenStreamReader::from_xben(Cursor::new(garbage)); assert!(result.is_err()); } -// ── XZAssignmentReader subsample ──────────────────────────────────────────── +// ── BenStreamReader subsample ──────────────────────────────────────────── #[test] fn xz_reader_subsample_by_indices() { @@ -224,7 +228,7 @@ fn xz_reader_subsample_by_indices() { {"assignment":[3,3,3,3],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_indices(vec![1, 3]) .map(|r| r.unwrap().0) @@ -241,7 +245,7 @@ fn xz_reader_subsample_by_range() { {"assignment":[3,3,3,3],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_range(2, 3) .map(|r| r.unwrap().0) @@ -259,7 +263,7 @@ fn xz_reader_subsample_every() { {"assignment":[4,4,4,4],"sample":4} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_every(2, 1) // samples 1, 3 .map(|r| r.unwrap().0) @@ -269,7 +273,7 @@ fn xz_reader_subsample_every() { assert_eq!(results[1], vec![3, 3, 3, 3]); } -// ── XZAssignmentReader for_each_assignment with silent ────────────────────── +// ── BenStreamReader for_each_assignment with silent ────────────────────── #[test] fn xz_reader_for_each_assignment_silent() { @@ -277,7 +281,7 @@ fn xz_reader_for_each_assignment_silent() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)) + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)) .unwrap() .silent(true); let mut count = 0usize; @@ -290,13 +294,13 @@ fn xz_reader_for_each_assignment_silent() { assert_eq!(count, 2); } -// ── XZAssignmentReader TwoDelta write_all_jsonl ───────────────────────────── +// ── BenStreamReader TwoDelta write_all_jsonl ───────────────────────────── #[test] fn xz_reader_write_all_jsonl_twodelta() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -304,13 +308,13 @@ fn xz_reader_write_all_jsonl_twodelta() { assert_eq!(lines.len(), 3); } -// ── XZAssignmentReader TwoDelta count_samples ─────────────────────────────── +// ── BenStreamReader TwoDelta count_samples ─────────────────────────────── #[test] fn xz_reader_count_samples_twodelta() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 2); } @@ -322,12 +326,16 @@ fn xz_reader_into_frames_standard_content() { {"assignment":[3,3,4,4],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.into_frames().collect(); assert_eq!(frames.len(), 2); // Verify frame bytes can be decoded back for f in &frames { - let (bytes, count) = f.as_ref().unwrap(); + let (frame, count) = f.as_ref().unwrap(); + let bytes = match frame { + DecodeFrame::XBen(b, _) => b, + DecodeFrame::Ben(_) => panic!("xben frame iterator yielded BEN arm"), + }; assert!(!bytes.is_empty()); assert_eq!(*count, 1); } @@ -339,7 +347,7 @@ fn xz_reader_write_all_jsonl_standard_content_verified() { {"assignment":[8,9,10],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -360,7 +368,7 @@ fn xz_reader_write_all_jsonl_mkv_content_verified() { {"assignment":[4,5,6],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::MkvChain); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -381,7 +389,7 @@ fn xz_reader_single_sample_standard() { let jsonl = r#"{"assignment":[42],"sample":1} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.collect(); assert_eq!(results.len(), 1); assert_eq!(results[0].as_ref().unwrap().0, vec![42]); @@ -391,7 +399,7 @@ fn xz_reader_single_sample_standard() { #[test] fn xz_reader_single_sample_twodelta() { let xben = make_xben_from_assignments(&[vec![1u16, 2, 3]], BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, vec![vec![1, 2, 3]]); } @@ -405,7 +413,7 @@ fn xz_reader_subsample_by_indices_deduplicates_and_sorts() { {"assignment":[3,3,3,3],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Pass unsorted duplicates: [3,1,3,1] → sorted+deduped [1,3] let results: Vec<_> = reader .into_subsample_by_indices(vec![3, 1, 3, 1]) @@ -422,7 +430,7 @@ fn xz_reader_subsample_by_indices_beyond_stream() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Index 5 is beyond the stream (only 2 samples) let results: Vec<_> = reader .into_subsample_by_indices(vec![5]) @@ -438,7 +446,7 @@ fn xz_reader_subsample_by_range_single_element() { {"assignment":[3,3,3,3],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_range(2, 2) // only sample 2 .map(|r| r.unwrap().0) @@ -453,7 +461,7 @@ fn xz_reader_subsample_every_offset_beyond_stream() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Offset 10 is beyond the stream let results: Vec<_> = reader .into_subsample_every(1, 10) @@ -470,7 +478,7 @@ fn xz_reader_subsample_mkv_with_count_gt_1() { {"assignment":[4,5,6],"sample":4} "#; let xben = make_xben(jsonl, BenVariant::MkvChain); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Select sample 2 (middle of the count=3 frame) and sample 4 let results: Vec<_> = reader .into_subsample_by_indices(vec![2, 4]) @@ -485,7 +493,7 @@ fn xz_reader_subsample_mkv_with_count_gt_1() { fn xz_reader_subsample_twodelta() { let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_indices(vec![1, 3]) .map(|r| r.unwrap().0) @@ -500,9 +508,9 @@ fn xz_reader_subsample_twodelta() { #[test] fn decoder_init_error_xz_header_detected() { // Feed XZ-compressed data to a reader that expects uncompressed BEN - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let xz_magic = b"\xFD\x37\x7A\x58\x5A\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - let result = AssignmentReader::new(xz_magic.as_slice()); + let result = BenStreamReader::from_ben(xz_magic.as_slice()); assert!(result.is_err()); let io_err: std::io::Error = result.err().unwrap().into(); assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData); @@ -511,9 +519,9 @@ fn decoder_init_error_xz_header_detected() { #[test] fn decoder_init_error_unknown_banner() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let bad_banner = b"THIS IS NOT BEN!!"; - let result = AssignmentReader::new(bad_banner.as_slice()); + let result = BenStreamReader::from_ben(bad_banner.as_slice()); assert!(result.is_err()); let io_err: std::io::Error = result.err().unwrap().into(); assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData); @@ -522,7 +530,7 @@ fn decoder_init_error_unknown_banner() { #[test] fn decoder_init_error_io() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; struct FailReader; impl std::io::Read for FailReader { fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { @@ -532,7 +540,7 @@ fn decoder_init_error_io() { )) } } - let result = AssignmentReader::new(FailReader); + let result = BenStreamReader::from_ben(FailReader); assert!(result.is_err()); let io_err: std::io::Error = result.err().unwrap().into(); assert_eq!(io_err.kind(), std::io::ErrorKind::BrokenPipe); @@ -556,7 +564,7 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader .for_each_assignment(|_assignment, _count| { Err(std::io::Error::new( @@ -575,32 +583,19 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { fn xz_reader_large_assignment_roundtrip() { let big_assign: Vec = (1..=1000).collect(); let xben = make_xben_from_assignments(&[big_assign.clone()], BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results.len(), 1); assert_eq!(results[0], big_assign); } -// ── build_frame_iter_from_reader unknown mode ───────────────────────── - -#[test] -fn build_frame_iter_from_reader_unknown_mode_errors() { - use crate::io::reader::subsample::build_frame_iter_from_reader; - let data = Cursor::new(b"dummy data for unknown mode test".to_vec()); - let result = build_frame_iter_from_reader(data, "bogus"); - assert!(result.is_err()); - let err = result.err().unwrap(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("bogus")); -} - // ── SubsampleFrameDecoder stress tests ──────────────────────────────── #[test] fn subsample_every_start_beyond_hi_returns_zero() { let assignments = vec![vec![1u16, 2, 3], vec![4, 5, 6]]; let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_every(1, 100) .map(|r| r.unwrap().0) @@ -612,7 +607,7 @@ fn subsample_every_start_beyond_hi_returns_zero() { fn subsample_range_non_overlapping_returns_empty() { let assignments = vec![vec![1u16, 2], vec![3, 4], vec![5, 6]]; let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_range(10, 20) .map(|r| r.unwrap().0) @@ -624,7 +619,7 @@ fn subsample_range_non_overlapping_returns_empty() { fn subsample_indices_mixed_before_and_after() { let assignments: Vec> = (1..=5).map(|i| vec![i; 3]).collect(); let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_indices(vec![2, 4, 100]) .map(|r| r.unwrap().0) @@ -638,7 +633,7 @@ fn subsample_indices_mixed_before_and_after() { fn subsample_every_step_larger_than_stream() { let assignments = vec![vec![1u16, 2], vec![3, 4]]; let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_every(100, 1) .map(|r| r.unwrap().0) @@ -651,7 +646,7 @@ fn subsample_every_step_larger_than_stream() { fn subsample_indices_empty_yields_nothing() { let assignments = vec![vec![1u16, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_indices(Vec::::new()) .map(|r| r.unwrap().0) @@ -667,7 +662,7 @@ fn subsample_twodelta_by_range() { vec![2, 2, 2, 2], ]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_range(2, 3) .map(|r| r.unwrap().0) @@ -686,7 +681,7 @@ fn subsample_twodelta_every() { vec![2, 1, 2, 1], ]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_every(2, 1) .map(|r| r.unwrap().0) @@ -703,7 +698,7 @@ fn xz_twodelta_many_identical_assignments_roundtrip() { let assign = vec![1u16, 2, 1, 2]; let assignments: Vec<_> = (0..100).map(|_| assign.clone()).collect(); let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total_samples: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total_samples, 100); @@ -717,7 +712,7 @@ fn xz_twodelta_all_identical_single_value_roundtrip() { let assign = vec![5u16; 10]; let assignments: Vec<_> = (0..10).map(|_| assign.clone()).collect(); let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, 10); @@ -732,7 +727,7 @@ fn xz_twodelta_alternating_assignments_roundtrip() { let b = vec![2u16, 2, 1, 1]; let assignments: Vec<_> = (0..50).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results.len(), 50); for (i, r) in results.iter().enumerate() { @@ -751,7 +746,7 @@ fn xz_twodelta_large_assignment_roundtrip() { let a2: Vec = (0..n).map(|i| if i < n / 2 { 2 } else { 1 }).collect(); let assignments = vec![a1.clone(), a2.clone(), a1.clone()]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -777,7 +772,7 @@ fn xz_twodelta_chunk_boundary_roundtrip() { } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results.len(), 21); assert_eq!(results[0], anchor); @@ -810,7 +805,7 @@ fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { writer.write_assignment(delta.clone()).unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, 4); @@ -891,7 +886,7 @@ fn translate_ben_twodelta_to_xben_with_repetitions() { let mut xben = Vec::new(); encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, 5); @@ -920,7 +915,7 @@ fn translate_ben_twodelta_to_xben_many_deltas() { let mut xben = Vec::new(); encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -939,7 +934,7 @@ fn count_samples_from_frame_iter_basic() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); - let iter = build_frame_iter_from_reader(Cursor::new(ben), "ben").unwrap(); + let iter = build_frame_iter_from_reader(Cursor::new(ben), BenWireFormat::Ben).unwrap(); assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 3); } @@ -951,7 +946,7 @@ fn count_samples_from_frame_iter_xben() { {"assignment":[2,2,1,1],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let iter = build_frame_iter_from_reader(Cursor::new(xben), "xben").unwrap(); + let iter = build_frame_iter_from_reader(Cursor::new(xben), BenWireFormat::XBen).unwrap(); assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 2); } @@ -967,16 +962,16 @@ fn count_samples_from_frame_iter_mkv() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let iter = build_frame_iter_from_reader(Cursor::new(ben), "ben").unwrap(); + let iter = build_frame_iter_from_reader(Cursor::new(ben), BenWireFormat::Ben).unwrap(); assert_eq!(count_samples_from_frame_iter(iter).unwrap(), 3); } -// ── AssignmentReader tests ───────────────────────────────────────────────── +// ── BenStreamReader tests ───────────────────────────────────────────────── #[test] fn assignment_reader_standard_roundtrip() { use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[3,3,4,4],"sample":2} @@ -984,7 +979,7 @@ fn assignment_reader_standard_roundtrip() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); - let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(reader.variant(), BenVariant::Standard); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, vec![vec![1, 1, 2, 2], vec![3, 3, 4, 4]]); @@ -993,7 +988,7 @@ fn assignment_reader_standard_roundtrip() { #[test] fn assignment_reader_mkv_roundtrip() { use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let jsonl = r#"{"assignment":[1,2,3],"sample":1} {"assignment":[1,2,3],"sample":2} @@ -1002,7 +997,7 @@ fn assignment_reader_mkv_roundtrip() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(reader.variant(), BenVariant::MkvChain); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); // MkvChain collapses: first frame count=2, second count=1 @@ -1015,7 +1010,7 @@ fn assignment_reader_mkv_roundtrip() { #[test] fn assignment_reader_twodelta_roundtrip() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; use crate::io::writer::AssignmentWriter; let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; @@ -1028,7 +1023,7 @@ fn assignment_reader_twodelta_roundtrip() { } } - let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(reader.variant(), BenVariant::TwoDelta); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); @@ -1037,7 +1032,7 @@ fn assignment_reader_twodelta_roundtrip() { #[test] fn assignment_reader_count_samples() { use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let jsonl = r#"{"assignment":[1,2],"sample":1} {"assignment":[3,4],"sample":2} @@ -1046,14 +1041,14 @@ fn assignment_reader_count_samples() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); - let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(reader.count_samples().unwrap(), 3); } #[test] fn assignment_reader_write_all_jsonl() { use crate::codec::encode::encode_jsonl_to_ben; - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let jsonl = r#"{"assignment":[10,20],"sample":1} {"assignment":[30,40],"sample":2} @@ -1061,7 +1056,7 @@ fn assignment_reader_write_all_jsonl() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); - let mut reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let output_str = String::from_utf8(output).unwrap(); @@ -1073,7 +1068,7 @@ fn assignment_reader_write_all_jsonl() { assert_eq!(v2["assignment"], serde_json::json!([30, 40])); } -// ── Zero-count frame errors in XZAssignmentReader ────────────────────────── +// ── Zero-count frame errors in BenStreamReader ────────────────────────── #[test] fn xz_reader_standard_zero_count_frame_errors() { @@ -1113,7 +1108,7 @@ fn xz_reader_standard_zero_count_frame_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben_mkv)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben_mkv)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!(err.to_string().contains("zero")); } @@ -1133,7 +1128,7 @@ fn xz_reader_twodelta_unknown_frame_tag_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!(err.to_string().contains("0xff") || err.to_string().contains("unknown")); } @@ -1153,7 +1148,7 @@ fn xz_reader_truncated_stream_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!(err.to_string().contains("truncated") || err.to_string().contains("Truncated")); } @@ -1171,7 +1166,7 @@ fn subsample_every_first_past_hi() { "{\"assignment\":[7,8],\"sample\":4}\n", ); let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let sub = reader.into_subsample_every(10, 5); let results: Vec<_> = sub.map(|r| r.unwrap()).collect(); assert!(results.is_empty()); @@ -1223,7 +1218,7 @@ fn xz_reader_twodelta_full_frame_zero_count_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!(err.to_string().contains("zero")); } @@ -1262,7 +1257,7 @@ fn xz_reader_twodelta_chunk_zero_count_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.collect(); assert_eq!(results.len(), 2); // anchor + chunk frame assert!(results[0].is_ok()); @@ -1286,7 +1281,7 @@ fn subsample_indices_skip_past_lo() { "{\"assignment\":[4,5,6],\"sample\":8}\n", ); let xben = make_xben(jsonl, BenVariant::MkvChain); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader .into_subsample_by_indices(vec![7, 8]) .map(|r| r.unwrap()) @@ -1302,7 +1297,7 @@ fn subsample_indices_skip_past_lo() { fn subsample_indices_with_zero_skips_past_lo() { let assignments = vec![vec![1u16, 2], vec![3, 4], vec![5, 6]]; let xben = make_xben_from_assignments(&assignments, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Index 0 is below the 1-based lo boundary, exercises the `next < lo` skip. let results: Vec<_> = reader .into_subsample_by_indices(vec![0, 2]) @@ -1312,7 +1307,7 @@ fn subsample_indices_with_zero_skips_past_lo() { assert_eq!(results[0], vec![3, 4]); } -// ── XZAssignmentFrameReader for MkvChain zero-count ───────────────── +// ── BenStreamFrameReader for MkvChain zero-count ───────────────── #[test] fn xz_frame_reader_mkv_zero_count_errors() { @@ -1331,12 +1326,12 @@ fn xz_frame_reader_mkv_zero_count_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamFrameReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!(err.to_string().contains("zero")); } -// ── XZAssignmentReader TwoDelta truncated stream ───────────────────── +// ── BenStreamReader TwoDelta truncated stream ───────────────────── #[test] fn xz_reader_twodelta_truncated_stream_errors() { @@ -1351,7 +1346,7 @@ fn xz_reader_twodelta_truncated_stream_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!( err.to_string().contains("truncated") || err.to_string().contains("Truncated"), @@ -1386,7 +1381,7 @@ fn xz_reader_twodelta_tag1_rejected_as_unknown() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut iter = reader.into_iter(); let _first = iter.next().unwrap().unwrap(); // consume the valid full frame let err = iter.next().unwrap().unwrap_err(); @@ -1421,7 +1416,7 @@ fn xz_reader_twodelta_chunk_delta_without_anchor_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!( err.to_string().contains("full-assignment") || err.to_string().contains("anchor"), @@ -1454,7 +1449,7 @@ fn xz_reader_for_each_assignment_stream_error() { encoder.finish().unwrap(); } - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut count = 0usize; let result = reader.for_each_assignment(|_assignment, _cnt| { count += 1; @@ -1465,7 +1460,7 @@ fn xz_reader_for_each_assignment_stream_error() { assert!(result.is_err()); } -// ── XZAssignmentFrameReader truncated TwoDelta ────────────────────── +// ── BenStreamFrameReader truncated TwoDelta ────────────────────── #[test] fn xz_frame_reader_twodelta_truncated_errors() { @@ -1480,7 +1475,7 @@ fn xz_frame_reader_twodelta_truncated_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamFrameReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader.into_iter().next().unwrap().unwrap_err(); assert!( err.to_string().contains("truncated") || err.to_string().contains("Truncated"), @@ -1508,7 +1503,7 @@ fn xz_reader_standard_corrupt_frame_errors() { encoder.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.collect(); // An empty frame (no RLE pairs before terminator) yields an empty assignment assert_eq!(results.len(), 1); @@ -1535,7 +1530,7 @@ fn subsample_decoder_zero_count_frame_errors() { assert!(err.to_string().contains("zero"), "got: {}", err); } -// ── XZAssignmentFrameReader: TwoDelta into_frames ─────────────────── +// ── BenStreamFrameReader: TwoDelta into_frames ─────────────────── #[test] fn xz_frame_reader_twodelta_into_frames() { @@ -1546,7 +1541,7 @@ fn xz_frame_reader_twodelta_into_frames() { {"assignment":[2,1,2,2],"sample":2} "#; let xben = make_xben(jsonl, BenVariant::TwoDelta); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let frames: Vec<_> = reader.into_frames().map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); // Each frame is (ben32_bytes, count); counts should be 1 @@ -1554,7 +1549,7 @@ fn xz_frame_reader_twodelta_into_frames() { assert_eq!(frames[1].1, 1); } -// ── XZAssignmentReader: count_samples helper ──────────────────────── +// ── BenStreamReader: count_samples helper ──────────────────────── #[test] fn xz_reader_count_samples() { @@ -1563,11 +1558,11 @@ fn xz_reader_count_samples() { {"assignment":[7,8,9],"sample":3} "#; let xben = make_xben(jsonl, BenVariant::Standard); - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(reader.count_samples().unwrap(), 3); } -// ── XZAssignmentReader: write_all_jsonl ───────────────────────────── +// ── BenStreamReader: write_all_jsonl ───────────────────────────── #[test] fn xz_reader_write_all_jsonl_standard_roundtrip() { @@ -1575,7 +1570,7 @@ fn xz_reader_write_all_jsonl_standard_roundtrip() { {"assignment":[4,5,6],"sample":2} "#; let xben = make_xben(jsonl_in, BenVariant::Standard); - let mut reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let mut output = Vec::new(); reader.write_all_jsonl(&mut output).unwrap(); let text = String::from_utf8(output).unwrap(); @@ -1584,11 +1579,11 @@ fn xz_reader_write_all_jsonl_standard_roundtrip() { assert!(text.contains("\"assignment\":[4,5,6]")); } -// ── AssignmentReader: TwoDelta error propagation in RawBenFrameIter ────────── +// ── BenStreamReader: TwoDelta error propagation in RawBenFrameIter ────────── #[test] fn raw_frame_iter_propagates_twodelta_decode_error() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; use crate::io::writer::AssignmentWriter; // Build a minimal TwoDelta BEN file with two samples. @@ -1610,14 +1605,14 @@ fn raw_frame_iter_propagates_twodelta_decode_error() { // Set max_len_bits to 0, which triggers InvalidData during decoding. ben[anchor_end + 4] = 0; - let reader = AssignmentReader::new(Cursor::new(ben)).unwrap(); + let reader = BenStreamReader::from_ben(Cursor::new(ben)).unwrap(); let mut iter = reader.into_frames(); iter.next().unwrap().unwrap(); // anchor frame OK let err = iter.next().unwrap().unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } -// ── AssignmentReader: zero-count frame errors ──────────────────────────────── +// ── BenStreamReader: zero-count frame errors ──────────────────────────────── /// Build a minimal MkvChain BEN stream whose first frame has count == 0. fn make_mkvchain_zero_count_frame() -> Vec { @@ -1633,18 +1628,18 @@ fn make_mkvchain_zero_count_frame() -> Vec { #[test] fn assignment_reader_count_samples_rejects_zero_count_frame() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let data = make_mkvchain_zero_count_frame(); - let reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let reader = BenStreamReader::from_ben(Cursor::new(data)).unwrap(); let err = reader.count_samples().unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } #[test] fn assignment_reader_for_each_rejects_zero_count_frame() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let data = make_mkvchain_zero_count_frame(); - let mut reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let mut reader = BenStreamReader::from_ben(Cursor::new(data)).unwrap(); let err = reader .for_each_assignment(|_, _| Ok(true)) .unwrap_err(); @@ -1653,9 +1648,9 @@ fn assignment_reader_for_each_rejects_zero_count_frame() { #[test] fn raw_frame_iter_rejects_zero_count_mkv_frame() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; let data = make_mkvchain_zero_count_frame(); - let reader = AssignmentReader::new(Cursor::new(data)).unwrap(); + let reader = BenStreamReader::from_ben(Cursor::new(data)).unwrap(); let err = reader .into_frames() .next() diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs index 8143919..adc43f1 100644 --- a/ben/src/io/reader/twodelta.rs +++ b/ben/src/io/reader/twodelta.rs @@ -1,2 +1,2 @@ -pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; -pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; +pub(crate) const XBEN_TWODELTA_FULL_TAG: u8 = 0; +pub(crate) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; diff --git a/ben/src/io/reader/xz_assignment_reader.rs b/ben/src/io/reader/xz_assignment_reader.rs deleted file mode 100644 index e85c5cf..0000000 --- a/ben/src/io/reader/xz_assignment_reader.rs +++ /dev/null @@ -1,627 +0,0 @@ -use super::errors::DecoderInitError; -use super::subsample::{Ben32Frame, DecodeFrame, MkvRecord, SubsampleFrameDecoder}; -use super::twodelta::{XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG}; -use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben32_line, DecodeError}; -use crate::codec::encode::encode_ben32_assignments; -use crate::format::banners::{variant_from_banner, BANNER_LEN}; -use crate::progress::Spinner; -use crate::util::rle::rle_to_vec; -use crate::BenVariant; -use serde_json::json; -use std::io::{self, BufReader, Cursor, Read, Write}; -use xz2::read::XzDecoder; - -/// Iterator over decoded assignments in an XBEN stream. -pub struct XZAssignmentReader { - xz: BufReader>, - /// Variant encoded in the XBEN banner (private; use `.variant()` accessor). - inner_variant: BenVariant, - overflow: Vec, - buf: Box<[u8]>, - previous_assignment: Option>, - chunk_queue: std::collections::VecDeque<((u16, u16), Vec, u16)>, - silent: bool, -} - -impl XZAssignmentReader { - /// Create an XBEN decoder from an already-opened decompressed stream. - /// - /// # Arguments - /// - /// * `xz` - A buffered XZ decompression reader positioned past the banner. - /// * `variant` - The BEN variant indicated by the banner. - /// - /// # Returns - /// - /// Returns a new decoder ready to yield frames from the stream. - pub(crate) fn from_decompressed_stream( - xz: BufReader>, - variant: BenVariant, - ) -> Self { - Self { - xz, - inner_variant: variant, - overflow: Vec::with_capacity(1 << 20), - buf: vec![0u8; 1 << 20].into_boxed_slice(), - previous_assignment: None, - chunk_queue: std::collections::VecDeque::new(), - silent: false, - } - } - - /// Create a decoder for an XBEN stream. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns a new decoder positioned at the first ben32 frame in the - /// decompressed payload. - pub fn new(reader: R) -> Result { - let xz = XzDecoder::new(reader); - let mut xz = BufReader::with_capacity(1 << 20, xz); - - let mut first = [0u8; BANNER_LEN]; - if let Err(e) = xz.read_exact(&mut first) { - return Err(DecoderInitError::Io(e)); - } - let variant = match variant_from_banner(&first) { - Some(v) => v, - None => return Err(DecoderInitError::InvalidFileFormat(first.to_vec())), - }; - - Ok(Self::from_decompressed_stream(xz, variant)) - } - - /// Return the BEN variant detected from the stream banner. - pub fn variant(&self) -> BenVariant { - self.inner_variant - } - - /// Suppress progress output from this decoder's iterator. - /// - /// # Arguments - /// - /// * `silent` - When `true`, the decoder will not emit progress messages. - /// - /// # Returns - /// - /// Returns `self` for method chaining. - pub fn silent(mut self, silent: bool) -> Self { - self.silent = silent; - self - } - - /// Try to extract one complete ben32 frame from the buffered overflow. - /// - /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 - /// frame and, for MkvChain streams, reads the trailing repetition count. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain one or more - /// complete ben32 frames. - /// - /// # Returns - /// - /// Returns the frame bytes, the number of consumed bytes, and the decoded - /// repetition count when a complete frame is available. - fn pop_frame_from_overflow<'a>(&self, overflow: &'a [u8]) -> Option<(&'a [u8], usize, u16)> { - // TwoDelta callers use pop_twodelta_frame_from_overflow; this method - // is only reached for Standard and MkvChain variants. - if self.inner_variant == BenVariant::Standard { - if overflow.len() < 4 { - return None; - } - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let end = i + 1; - let frame = &overflow[..end]; - return Some((frame, end, 1)); - } - } - None - } else { - if overflow.len() < 6 { - return None; - } - for i in (3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - let count_hi = overflow[i + 1]; - let count_lo = overflow[i + 2]; - let count = u16::from_be_bytes([count_hi, count_lo]); - let end = i + 3; - let frame = &overflow[..end]; - return Some((frame, end, count)); - } - } - None - } - } - - /// Try to extract one complete TwoDelta frame from the buffered overflow. - /// - /// Inspects the leading tag byte to determine whether the frame is a full - /// RLE frame or a delta frame, then reads the corresponding payload. - /// - /// # Arguments - /// - /// * `overflow` - Buffered decompressed bytes that may contain a complete - /// TwoDelta frame. - /// - /// # Returns - /// - /// Returns the parsed frame, the number of consumed bytes, and the - /// repetition count when a complete frame is available. - fn pop_twodelta_frame_from_overflow( - &self, - overflow: &[u8], - ) -> Option, usize, u16)>> { - let tag = *overflow.first()?; - match tag { - XBEN_TWODELTA_FULL_TAG => { - if overflow.len() < 7 { - return None; - } - let run_count = - u32::from_be_bytes([overflow[1], overflow[2], overflow[3], overflow[4]]) - as usize; - let payload_len = run_count * 4; - let total_len = 1 + 4 + payload_len + 2; - if overflow.len() < total_len { - return None; - } - - let mut runs = Vec::with_capacity(run_count); - let mut cursor = 5usize; - for _ in 0..run_count { - let value = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - let len = u16::from_be_bytes([overflow[cursor + 2], overflow[cursor + 3]]); - runs.push((value, len)); - cursor += 4; - } - let count = u16::from_be_bytes([overflow[cursor], overflow[cursor + 1]]); - Some(Ok((runs, total_len, count))) - } - XBEN_TWODELTA_CHUNK_TAG => None, // Handled by try_parse_twodelta_chunk. - _ => Some(Err(io::Error::from(DecodeError::XBenUnknownFrameTag { - tag, - }))), - } - } - - /// Try to parse a columnar TwoDelta chunk from the overflow buffer. - /// - /// If the overflow starts with the chunk tag and contains enough bytes for - /// the full chunk, all frames are decoded and pushed onto `chunk_queue`. - /// Returns `true` on success, `false` when the overflow is incomplete. - fn try_parse_twodelta_chunk(&mut self) -> bool { - if self.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { - return false; - } - if self.overflow.len() < 5 { - return false; - } - - let n_frames = u32::from_be_bytes([ - self.overflow[1], - self.overflow[2], - self.overflow[3], - self.overflow[4], - ]) as usize; - - // Calculate total chunk size: tag(1) + n_frames(4) - // + pairs(n*4) + counts(n*2) + run_counts(n*4) + run_data(variable) - let header_len: usize = 5; - let pairs_len = n_frames * 4; - let counts_len = n_frames * 2; - let run_counts_len = n_frames * 4; - let fixed_len = header_len + pairs_len + counts_len + run_counts_len; - - if self.overflow.len() < fixed_len { - return false; - } - - // Read run-length counts to determine total run data size. - let run_counts_start = header_len + pairs_len + counts_len; - let mut total_runs = 0usize; - let mut run_counts = Vec::with_capacity(n_frames); - for i in 0..n_frames { - let offset = run_counts_start + i * 4; - let rc = u32::from_be_bytes([ - self.overflow[offset], - self.overflow[offset + 1], - self.overflow[offset + 2], - self.overflow[offset + 3], - ]) as usize; - run_counts.push(rc); - total_runs += rc; - } - - let run_data_len = total_runs * 2; - let total_len = fixed_len + run_data_len; - if self.overflow.len() < total_len { - return false; - } - - // Parse pairs channel. - let pairs_start = header_len; - // Parse counts channel. - let counts_start = pairs_start + pairs_len; - // Run data starts after run counts. - let run_data_start = run_counts_start + run_counts_len; - - let mut run_cursor = run_data_start; - for i in 0..n_frames { - let po = pairs_start + i * 4; - let pair = ( - u16::from_be_bytes([self.overflow[po], self.overflow[po + 1]]), - u16::from_be_bytes([self.overflow[po + 2], self.overflow[po + 3]]), - ); - let co = counts_start + i * 2; - let count = u16::from_be_bytes([self.overflow[co], self.overflow[co + 1]]); - - let rc = run_counts[i]; - let mut run_lengths = Vec::with_capacity(rc); - for _ in 0..rc { - run_lengths.push(u16::from_be_bytes([ - self.overflow[run_cursor], - self.overflow[run_cursor + 1], - ])); - run_cursor += 2; - } - - self.chunk_queue.push_back((pair, run_lengths, count)); - } - - self.overflow.drain(..total_len); - true - } - - /// Consume this decoder and iterate over raw ben32 frames instead of - /// materialized assignments. - /// - /// # Returns - /// - /// Returns an iterator that yields raw ben32 frames from the remaining - /// input. - pub fn into_frames(self) -> XZAssignmentFrameReader { - XZAssignmentFrameReader { inner: self } - } - - /// Count the number of samples remaining in the XBEN stream. - /// - /// # Returns - /// - /// Returns the number of remaining samples in the stream. - pub fn count_samples(self) -> io::Result { - let mut total = 0usize; - for frame_res in self.into_frames() { - let (_bytes, cnt) = frame_res?; - total += cnt as usize; - } - Ok(total) - } - - /// Decode assignments and pass each one to a callback by reference. - /// - /// The callback receives a borrowed assignment slice and its repetition - /// count. Return `true` to continue decoding or `false` to stop early. - /// - /// # Arguments - /// - /// * `f` - A callback invoked once per unique frame with `(&[u16], u16)`. - /// - /// # Returns - /// - /// Returns `Ok(())` after the stream is exhausted or the callback signals stop. - pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> - where - F: FnMut(&[u16], u16) -> io::Result, - { - let mut sample_count = 0usize; - let spinner = (!self.silent).then(|| Spinner::new("Decoding sample")); - loop { - match self.next() { - Some(Ok((assignment, count))) => { - sample_count += count as usize; - if let Some(spinner) = &spinner { - spinner.set_count(sample_count as u64); - } - let keep_going = f(&assignment, count)?; - if !keep_going { - return Ok(()); - } - } - Some(Err(e)) => return Err(e), - None => return Ok(()), - } - } - } - - /// Decode the remaining XBEN stream and write it as JSONL. - /// - /// # Arguments - /// - /// * `writer` - The destination that will receive one JSON object per - /// decoded sample. - /// - /// # Returns - /// - /// Returns `Ok(())` after the remaining stream has been fully decoded. - pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - let mut sample_number = 0usize; - self.for_each_assignment(|assignment, count| { - for _ in 0..count { - sample_number += 1; - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - } - Ok(true) - }) - } -} - -fn zero_count_frame_error() -> io::Error { - io::Error::new( - io::ErrorKind::InvalidData, - "XBEN frame count must be greater than zero", - ) -} - -/// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. -/// -/// # Arguments -/// -/// * `frame_bytes` - The ben32 frame bytes. -/// * `variant` - The BEN variant used to interpret the frame tail. -/// -/// # Returns -/// -/// Returns the expanded assignment vector. -pub(super) fn decode_xben_frame_to_assignment( - frame_bytes: &[u8], - variant: BenVariant, -) -> io::Result> { - let (assignment, _) = decode_ben32_line(Cursor::new(frame_bytes), variant)?; - Ok(assignment) -} - -impl Iterator for XZAssignmentReader { - type Item = io::Result; - - /// Decode and return the next assignment from the XBEN stream. - fn next(&mut self) -> Option { - loop { - match self.inner_variant { - BenVariant::Standard | BenVariant::MkvChain => { - if let Some((frame_bytes, consumed, count)) = - self.pop_frame_from_overflow(&self.overflow) - { - if count == 0 { - self.overflow.drain(..consumed); - return Some(Err(zero_count_frame_error())); - } - // pop_frame_from_overflow guarantees a complete - // zero-sentinel-terminated frame, so this never fails. - let assignment = decode_xben_frame_to_assignment( - frame_bytes, - self.inner_variant, - ) - .expect("complete frame from pop_frame_from_overflow"); - self.previous_assignment = Some(assignment.clone()); - self.overflow.drain(..consumed); - return Some(Ok((assignment, count))); - } - } - BenVariant::TwoDelta => { - // Drain frames from a previously parsed chunk first. - // Chunks only contain Delta frames. - if let Some((pair, run_lengths, count)) = self.chunk_queue.pop_front() { - if count == 0 { - return Some(Err(zero_count_frame_error())); - } - let assignment = match self.previous_assignment.take() { - Some(prev) => { - apply_twodelta_runs_to_assignment(prev, pair, &run_lengths) - } - None => Err(io::Error::from(DecodeError::TwoDeltaNoAnchorFrame)), - }; - return Some(match assignment { - Ok(a) => { - self.previous_assignment = Some(a.clone()); - Ok((a, count)) - } - Err(e) => Err(e), - }); - } - - // Try to parse a columnar chunk. - if self.try_parse_twodelta_chunk() { - continue; // Loop to drain chunk_queue. - } - - // Try a single frame from overflow (only Full/tag-0 frames - // or errors — tag-1 is no longer supported). - if let Some(parsed) = self.pop_twodelta_frame_from_overflow(&self.overflow) { - let res = match parsed { - Ok((runs, consumed, count)) => { - if count == 0 { - self.overflow.drain(..consumed); - return Some(Err(zero_count_frame_error())); - } - let assignment = rle_to_vec(runs); - self.previous_assignment = Some(assignment.clone()); - self.overflow.drain(..consumed); - Ok((assignment, count)) - } - Err(err) => { - self.overflow.clear(); - Err(err) - } - }; - return Some(res); - } - } - } - - let read = match self.xz.read(&mut self.buf) { - Ok(0) => { - if self.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::from(DecodeError::XBenTruncated))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.overflow.extend_from_slice(&self.buf[..read]); - } - } -} - -/// Iterator over raw ben32 frames inside an XBEN stream. -pub struct XZAssignmentFrameReader { - pub(super) inner: XZAssignmentReader, -} - -impl XZAssignmentFrameReader { - /// Create a raw XBEN frame iterator from a reader. - /// - /// # Arguments - /// - /// * `reader` - The compressed XBEN input stream. - /// - /// # Returns - /// - /// Returns an iterator over raw ben32 frames. - pub fn new(reader: R) -> Result { - Ok(Self { - inner: XZAssignmentReader::new(reader)?, - }) - } -} - -impl Iterator for XZAssignmentFrameReader { - type Item = io::Result; - - /// Return the next raw ben32 frame from the input stream. - fn next(&mut self) -> Option { - if self.inner.inner_variant == BenVariant::TwoDelta { - return self.inner.next().map(|result| { - result.and_then(|(assignment, count)| { - Ok((encode_ben32_assignments(&assignment)?, count)) - }) - }); - } - - loop { - if let Some((frame, consumed, count)) = - self.inner.pop_frame_from_overflow(&self.inner.overflow) - { - if count == 0 { - self.inner.overflow.drain(..consumed); - return Some(Err(zero_count_frame_error())); - } - let out = frame.to_vec(); - self.inner.overflow.drain(..consumed); - return Some(Ok((out, count))); - } - - let read = match self.inner.xz.read(&mut self.inner.buf) { - Ok(0) => { - if self.inner.overflow.is_empty() { - return None; - } else { - return Some(Err(io::Error::from(DecodeError::XBenTruncated))); - } - } - Ok(n) => n, - Err(e) => return Some(Err(e)), - }; - self.inner - .overflow - .extend_from_slice(&self.inner.buf[..read]); - } - } -} - -impl XZAssignmentReader { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. - /// - /// # Arguments - /// - /// * `indices` - A collection of 1-based sample indices. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder> + Send> - where - T: IntoIterator, - { - let variant = self.inner_variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_indices(Box::new(frames), indices) - } - - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. - /// - /// # Arguments - /// - /// * `start` - The first 1-based sample index to include. - /// * `end` - The last 1-based sample index to include. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.inner_variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::by_range(Box::new(frames), start, end) - } - - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. - /// - /// # Arguments - /// - /// * `step` - The stride between selected samples. - /// * `offset` - The 1-based index of the first selected sample. - /// - /// # Returns - /// - /// Returns a decoder that yields only the selected samples. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder> + Send> { - let variant = self.inner_variant; - let frames = self - .into_frames() - .map(move |res| res.map(|(bytes, cnt)| (DecodeFrame::XBen(bytes, variant), cnt))); - SubsampleFrameDecoder::every(Box::new(frames), step, offset) - } -} diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index bac4962..d4052a0 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -1,4 +1,4 @@ -use crate::io::reader::XZAssignmentReader; +use crate::io::reader::BenStreamReader; use crate::io::writer::XZAssignmentWriter; use crate::BenVariant; use std::io::Cursor; @@ -13,7 +13,7 @@ fn roundtrip_xben(assignments: &[Vec], variant: BenVariant) -> Vec writer.write_assignment(a.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); reader.map(|r| r.unwrap().0).collect() } @@ -26,7 +26,7 @@ fn roundtrip_xben_counts(assignments: &[Vec], variant: BenVariant) -> Vec<( writer.write_assignment(a.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); reader.map(|r| r.unwrap()).collect() } @@ -129,7 +129,7 @@ fn writer_twodelta_chunk_size_1() { writer.write_assignment(a.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -150,7 +150,7 @@ fn writer_twodelta_chunk_size_larger_than_stream() { writer.write_assignment(a.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -185,7 +185,7 @@ fn writer_twodelta_u16_max_value_in_assignment() { #[test] fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { - use crate::io::reader::AssignmentReader; + use crate::io::reader::BenStreamReader; use crate::io::writer::AssignmentWriter; // Assignment with 3 distinct values exercises the `continue` skip path @@ -201,7 +201,7 @@ fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { } } - let reader = AssignmentReader::new(ben.as_slice()).unwrap(); + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); assert_eq!(total, n); } @@ -223,7 +223,7 @@ fn writer_twodelta_write_json_value() { .write_json_value(json!({"assignment": [2, 1, 2, 1]})) .unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, vec![vec![1u16, 2, 1, 2], vec![2, 1, 2, 1]]); } @@ -240,7 +240,7 @@ fn writer_finish_is_idempotent() { writer.finish().unwrap(); writer.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, vec![vec![1u16, 2, 3, 4]]); } @@ -268,7 +268,7 @@ fn writer_write_ben_file_standard_roundtrip() { writer.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, vec![vec![1u16, 2, 3], vec![4, 5, 6]]); } @@ -295,7 +295,7 @@ fn writer_write_ben_file_mkv_roundtrip() { writer.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, 3); @@ -326,7 +326,7 @@ fn writer_write_ben_file_twodelta_roundtrip() { writer.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -386,7 +386,7 @@ fn writer_twodelta_anchor_count_overflow_u16max() { writer.write_assignment(assign.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); assert_eq!(total, n); } @@ -408,7 +408,7 @@ fn writer_twodelta_delta_count_overflow_u16max() { writer.write_assignment(delta.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); let total: usize = results.iter().map(|(_, c)| *c as usize).sum(); assert_eq!(total, n_delta + 1); @@ -447,7 +447,7 @@ fn writer_translate_ben_twodelta_chunk_flush() { writer.finish().unwrap(); } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); } @@ -482,7 +482,7 @@ fn writer_mkv_count_overflow_u16max() { writer.write_assignment(assign.clone()).unwrap(); } } - let reader = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); assert_eq!(total, n); } diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index ea5fa08..051723e 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -1,7 +1,7 @@ //! Sample extraction helpers for BEN and XBEN streams. use crate::codec::decode::decode_ben32_line; -use crate::io::reader::{AssignmentReader, XZAssignmentReader}; +use crate::io::reader::BenStreamReader; use serde_json::Error as SerdeError; use std::fs::File; use std::io::Cursor; @@ -67,7 +67,7 @@ pub fn extract_assignment_ben( } let mut current_sample = 1; - let inner_decoder = AssignmentReader::new(&mut reader).map_err(io::Error::from)?; + let inner_decoder = BenStreamReader::from_ben(&mut reader).map_err(io::Error::from)?; for record in inner_decoder { let (assignment, count) = record.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + count as usize > sample_number { @@ -99,22 +99,28 @@ pub fn extract_assignment_xben( return Err(SampleError::InvalidSampleNumber); } - let inner_decoder = XZAssignmentReader::new(&mut reader) + let inner_decoder = BenStreamReader::from_xben(&mut reader) .map_err(|e| SampleError::new_io_error(io::Error::from(e)))?; let variant = inner_decoder.variant(); let frame_iterator = inner_decoder.into_frames(); let mut current_sample = 1; for frame in frame_iterator { - let frame = frame.map_err(SampleError::new_io_error)?; - if current_sample == sample_number || current_sample + frame.1 as usize > sample_number { - // XZAssignmentFrameReader guarantees complete zero-sentinel - // frames, so decode_ben32_line always succeeds here. - let (assignment, _) = decode_ben32_line(Cursor::new(&frame.0), variant) - .expect("complete frame from XZAssignmentFrameReader"); + let (decode_frame, count) = frame.map_err(SampleError::new_io_error)?; + if current_sample == sample_number || current_sample + count as usize > sample_number { + // The frame iterator guarantees complete zero-sentinel ben32 + // frames in the XBEN arm, so decode_ben32_line always succeeds. + let bytes = match &decode_frame { + crate::io::reader::DecodeFrame::XBen(b, _) => b, + crate::io::reader::DecodeFrame::Ben(_) => { + unreachable!("XBEN reader yields XBen frames") + } + }; + let (assignment, _) = decode_ben32_line(Cursor::new(bytes), variant) + .expect("complete frame from XBEN frame reader"); return Ok(assignment); } - current_sample += frame.1 as usize; + current_sample += count as usize; } Err(SampleError::SampleNotFound { diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 108f095..fef1ec5 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -7,7 +7,7 @@ use crate::codec::decode::decode_ben_line; use crate::codec::BenEncodeFrame; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::reader::AssignmentReader; +use crate::io::reader::BenStreamReader; use crate::io::writer::AssignmentWriter; use crate::progress::Spinner; use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; @@ -132,7 +132,7 @@ fn relabel_ben_file_via_decoder( where F: FnMut(&[u16]) -> io::Result>, { - let mut decoder = AssignmentReader::new(reader)?.silent(true); + let mut decoder = BenStreamReader::from_ben(reader)?.silent(true); let mut encoder = AssignmentWriter::new(writer, variant)?; let mut sample_number = 0usize; let spinner = Spinner::new("Relabeling line"); diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index fe5dae7..9c0d853 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -1,4 +1,4 @@ -//! Rigorous tests for `AssignmentReader` with the MkvChain and TwoDelta BEN variants. +//! Rigorous tests for `BenStreamReader` with the MkvChain and TwoDelta BEN variants. //! //! Standard-variant tests already exist in `test_coverage.rs`. This file adds //! equivalent depth for the two more complex variants. The helpers intentionally @@ -7,7 +7,7 @@ use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::codec::encode::encode_jsonl_to_ben; use binary_ensemble::format::banners::{MKVCHAIN_BEN_BANNER, TWODELTA_BEN_BANNER}; -use binary_ensemble::io::reader::{AssignmentFrameReader, AssignmentReader}; +use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::BenVariant; @@ -29,7 +29,7 @@ fn encode_ben(assignments: &[Vec], variant: BenVariant) -> Vec { /// Expand all repetitions by calling `for_each_assignment`. fn expand_assignments(ben: &[u8]) -> Vec> { - let mut decoder = AssignmentReader::new(ben).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben).unwrap().silent(true); let mut out = Vec::new(); decoder .for_each_assignment(|a, count| { @@ -60,14 +60,14 @@ mod mkvchain { #[test] fn variant_accessor_returns_mkvchain() { let ben = encode_ben(&[vec![1u16, 2]], BenVariant::MkvChain); - let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(decoder.variant(), BenVariant::MkvChain); } #[test] fn empty_payload_yields_nothing() { let ben = MKVCHAIN_BEN_BANNER.to_vec(); - let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert!(frames.is_empty()); } @@ -79,7 +79,7 @@ mod mkvchain { let assignment = vec![3u16, 3, 1, 2, 2, 1]; let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); assert_eq!(decoded, assignment); @@ -91,7 +91,7 @@ mod mkvchain { let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![2u16, 1, 3]]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); for expected in &assignments { let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1, "distinct assignment should have count=1"); @@ -107,7 +107,7 @@ mod mkvchain { let assignments = vec![assignment.clone(); 5]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 5, "expected compressed count=5, got {count}"); assert_eq!(decoded, assignment); @@ -130,7 +130,7 @@ mod mkvchain { ]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (d1, c1) = decoder.next().unwrap().unwrap(); assert_eq!(c1, 3); @@ -157,7 +157,7 @@ mod mkvchain { .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - let records: Vec<_> = AssignmentReader::new(ben.as_slice()) + let records: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .collect::>>() @@ -173,7 +173,7 @@ mod mkvchain { let assignments: Vec> = (0u16..8).map(|i| vec![i, i + 1]).collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - let decoded: Vec> = AssignmentReader::new(ben.as_slice()) + let decoded: Vec> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .map(|r| r.unwrap().0) @@ -188,7 +188,7 @@ mod mkvchain { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_ben(&assignments, BenVariant::MkvChain); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -207,7 +207,7 @@ mod mkvchain { .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -219,7 +219,7 @@ mod mkvchain { fn count_samples_empty_stream() { let ben = MKVCHAIN_BEN_BANNER.to_vec(); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -236,7 +236,7 @@ mod mkvchain { let ben = encode_ben(&vec![assignment.clone(); 3], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -255,7 +255,7 @@ mod mkvchain { let ben = encode_ben(&vec![assignment; 3], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -282,7 +282,7 @@ mod mkvchain { let ben = encode_ben(&[a.clone(), a.clone(), b.clone()], BenVariant::MkvChain); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -305,7 +305,7 @@ mod mkvchain { let ben = encode_ben(&assignments, BenVariant::MkvChain); let mut via_reader = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut via_reader) .unwrap(); @@ -324,7 +324,7 @@ mod mkvchain { let ben = encode_ben(&vec![assignment.clone(); 4], BenVariant::MkvChain); let mut seen_count = 0u16; - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|a, count| { @@ -352,7 +352,7 @@ mod mkvchain { let ben = encode_ben(&assignments, BenVariant::MkvChain); let mut frames: Vec<(Vec, u16)> = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|assignment, count| { @@ -373,7 +373,7 @@ mod mkvchain { let ben = encode_ben(&assignments, BenVariant::MkvChain); let mut seen = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|assignment, _count| { @@ -384,7 +384,7 @@ mod mkvchain { assert_eq!(seen, vec![a]); } - // ─── into_frames (AssignmentFrameReader) ────────────────────────────────── + // ─── into_frames (BenStreamFrameReader) ────────────────────────────────── #[test] fn frame_reader_yields_count_in_tuple() { @@ -392,7 +392,7 @@ mod mkvchain { let assignment = vec![5u16, 6, 7]; let ben = encode_ben(&vec![assignment; 3], BenVariant::MkvChain); - let frames: Vec<_> = AssignmentFrameReader::new(Cursor::new(ben)) + let frames: Vec<_> = BenStreamFrameReader::from_ben(Cursor::new(ben)) .unwrap() .collect::>>() .unwrap(); @@ -407,7 +407,7 @@ mod mkvchain { // A×2, B×1 → 2 frames with counts [2, 1]. let ben = encode_ben(&[a.clone(), a.clone(), b.clone()], BenVariant::MkvChain); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -423,7 +423,7 @@ mod mkvchain { let assignment = vec![3u16, 3, 1, 2]; let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); - let (frame, _count) = AssignmentReader::new(ben.as_slice()) + let (frame, _count) = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -431,7 +431,7 @@ mod mkvchain { .unwrap() .unwrap(); - let decoded = frame.expand(None).unwrap(); + let decoded = frame.expand_self_contained().unwrap(); assert_eq!(decoded, assignment); } @@ -452,7 +452,7 @@ mod mkvchain { .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_indices(vec![3usize, 6]) @@ -467,7 +467,7 @@ mod mkvchain { let a = vec![1u16; 4]; let ben = encode_ben(&vec![a.clone(); 5], BenVariant::MkvChain); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_indices(vec![2usize, 4]) @@ -493,7 +493,7 @@ mod mkvchain { .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_range(2, 5) @@ -510,7 +510,7 @@ mod mkvchain { let a = vec![99u16; 2]; let ben = encode_ben(&vec![a.clone(); 6], BenVariant::MkvChain); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_every(2, 1) @@ -536,7 +536,7 @@ mod mkvchain { .collect(); let ben = encode_ben(&assignments, BenVariant::MkvChain); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_every(2, 2) @@ -553,7 +553,7 @@ mod mkvchain { let assignment = vec![1u16, 1]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .next() .unwrap() @@ -566,7 +566,7 @@ mod mkvchain { let assignment = vec![1u16, 2, 3, 4, 5]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 5]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .next() .unwrap() @@ -579,7 +579,7 @@ mod mkvchain { let assignment = vec![1u16, 2]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .count_samples() .unwrap_err(); @@ -591,7 +591,7 @@ mod mkvchain { let assignment = vec![1u16, 2]; let ben = encode_ben(&[assignment], BenVariant::MkvChain); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .write_all_jsonl(io::sink()) .unwrap_err(); @@ -631,7 +631,7 @@ mod twodelta { fn variant_accessor_returns_twodelta() { let ben = encode_twodelta(&[vec![1u16, 2]]); assert_eq!( - AssignmentReader::new(ben.as_slice()).unwrap().variant(), + BenStreamReader::from_ben(ben.as_slice()).unwrap().variant(), BenVariant::TwoDelta ); } @@ -675,7 +675,7 @@ mod twodelta { let next = vec![2u16, 1, 2, 1, 2]; let ben = encode_twodelta(&[anchor, next.clone()]); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let _ = decoder.next().unwrap().unwrap(); // skip anchor let (decoded_next, _) = decoder.next().unwrap().unwrap(); assert_eq!(decoded_next, next); @@ -711,7 +711,7 @@ mod twodelta { let anchor = vec![1u16, 1, 2, 2]; let ben = encode_twodelta(&vec![anchor.clone(); 3]); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 3, "anchor count should be 3"); assert_eq!(decoded, anchor); @@ -728,7 +728,7 @@ mod twodelta { .collect(); let ben = encode_twodelta(&assignments); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (d_anchor, c_anchor) = decoder.next().unwrap().unwrap(); assert_eq!(c_anchor, 1, "anchor count"); @@ -778,7 +778,7 @@ mod twodelta { fn count_samples_single_anchor() { let ben = encode_twodelta(&[vec![1u16, 2, 3]]); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -793,7 +793,7 @@ mod twodelta { let assignments = vec![a.clone(), b.clone(), a.clone()]; let ben = encode_twodelta(&assignments); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -812,7 +812,7 @@ mod twodelta { .collect(); let ben = encode_twodelta(&assignments); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -828,7 +828,7 @@ mod twodelta { let ben = encode_twodelta(&[assignment.clone()]); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -850,7 +850,7 @@ mod twodelta { let ben = encode_twodelta(&assignments); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -876,7 +876,7 @@ mod twodelta { let ben = encode_twodelta(&assignments); let mut out = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut out) .unwrap(); @@ -898,7 +898,7 @@ mod twodelta { let ben = encode_twodelta(&[a.clone(), b.clone(), a.clone()]); let mut via_reader = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .write_all_jsonl(&mut via_reader) .unwrap(); @@ -917,7 +917,7 @@ mod twodelta { let ben = encode_twodelta(&vec![anchor.clone(); 4]); let mut seen: Vec<(Vec, u16)> = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|a, count| { @@ -941,7 +941,7 @@ mod twodelta { let ben = encode_twodelta(&assignments); let mut frames: Vec<(Vec, u16)> = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|assignment, count| { @@ -962,7 +962,7 @@ mod twodelta { let ben = encode_twodelta(&[a.clone(), b.clone(), c.clone()]); let mut seen = Vec::new(); - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|assignment, _count| { @@ -976,7 +976,7 @@ mod twodelta { assert_eq!(seen[1], b); } - // ─── into_frames (AssignmentFrameReader) ────────────────────────────────── + // ─── into_frames (BenStreamFrameReader) ────────────────────────────────── #[test] fn into_frames_count_is_preserved_through_re_encoding() { @@ -989,7 +989,7 @@ mod twodelta { .collect(); let ben = encode_twodelta(&assignments); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -1010,7 +1010,7 @@ mod twodelta { let input = vec![a.clone(), b.clone(), c.clone()]; let ben = encode_twodelta(&input); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -1019,7 +1019,7 @@ mod twodelta { assert_eq!(frames.len(), 3); for (i, (frame, _count)) in frames.iter().enumerate() { - let decoded = frame.expand(None).unwrap(); + let decoded = frame.expand_self_contained().unwrap(); assert_eq!(decoded, input[i], "frame {i} decoded incorrectly"); } } @@ -1028,7 +1028,7 @@ mod twodelta { fn into_frames_from_anchor_only_has_single_frame_with_count_one() { let assignment = vec![1u16, 2, 3]; let ben = encode_twodelta(&[assignment]); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -1048,7 +1048,7 @@ mod twodelta { .collect(); let ben = encode_twodelta(&input); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_frames() @@ -1068,7 +1068,7 @@ mod twodelta { let input = vec![a.clone(), b.clone(), c.clone(), a.clone(), b.clone()]; let ben = encode_twodelta(&input); - let selected: Vec> = AssignmentReader::new(ben.as_slice()) + let selected: Vec> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_indices(vec![1usize, 3, 5]) @@ -1087,7 +1087,7 @@ mod twodelta { let ben = encode_twodelta(&input); // Range [2, 4] → 3 assignments: b, c, a. - let selected: Vec> = AssignmentReader::new(ben.as_slice()) + let selected: Vec> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_range(2, 4) @@ -1113,7 +1113,7 @@ mod twodelta { ]; let ben = encode_twodelta(&input); - let selected: Vec> = AssignmentReader::new(ben.as_slice()) + let selected: Vec> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_every(3, 1) @@ -1136,7 +1136,7 @@ mod twodelta { .collect(); let ben = encode_twodelta(&assignments); - let selected: Vec<(Vec, u16)> = AssignmentReader::new(ben.as_slice()) + let selected: Vec<(Vec, u16)> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .into_subsample_by_indices(vec![1usize, 3, 4]) @@ -1155,7 +1155,7 @@ mod twodelta { let assignment = vec![1u16, 2, 3]; let ben = encode_twodelta(&[assignment]); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .next() .unwrap() @@ -1170,7 +1170,7 @@ mod twodelta { let ben = encode_twodelta(&[a.clone(), b.clone()]); let truncated = &ben[..ben.len() - 1]; - let mut decoder = AssignmentReader::new(truncated).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(truncated).unwrap().silent(true); let _ = decoder.next().unwrap().unwrap(); // anchor succeeds let err = decoder.next().unwrap().unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); @@ -1182,7 +1182,7 @@ mod twodelta { let b = vec![2u16, 2, 1, 1]; let ben = encode_twodelta(&[a, b]); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .count_samples() .unwrap_err(); @@ -1195,7 +1195,7 @@ mod twodelta { let b = vec![2u16, 2, 1, 1]; let ben = encode_twodelta(&[a, b]); let truncated = &ben[..ben.len() - 1]; - let err = AssignmentReader::new(truncated) + let err = BenStreamReader::from_ben(truncated) .unwrap() .write_all_jsonl(io::sink()) .unwrap_err(); diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 4472e57..f5886ec 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -15,8 +15,7 @@ use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::reader::{ - AssignmentFrameReader, AssignmentReader, DecoderInitError, XZAssignmentFrameReader, - XZAssignmentReader, + BenStreamFrameReader, BenStreamReader, DecodeFrame, DecoderInitError, }; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::json::graph::{ @@ -329,12 +328,12 @@ fn decoder_init_error_converts_to_io_error_from_invalid_format() { } // ────────────────────────────────────────────────────────────────────────────── -// io::reader – AssignmentReader +// io::reader – BenStreamReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_rejects_empty_input() { - match AssignmentReader::new(io::empty()) { + match BenStreamReader::from_ben(io::empty()) { Err(DecoderInitError::Io(_)) => {} Ok(_) => panic!("expected Io error"), Err(e) => panic!("unexpected error variant: {e}"), @@ -343,7 +342,7 @@ fn ben_decoder_rejects_empty_input() { #[test] fn ben_decoder_rejects_wrong_banner() { - match AssignmentReader::new(b"BAD BAD BAD BAD!!".as_slice()) { + match BenStreamReader::from_ben(b"BAD BAD BAD BAD!!".as_slice()) { Err(DecoderInitError::InvalidFileFormat(_)) => {} Ok(_) => panic!("expected InvalidFileFormat error"), Err(e) => panic!("unexpected error variant: {e}"), @@ -354,7 +353,7 @@ fn ben_decoder_rejects_wrong_banner() { fn ben_decoder_rejects_xz_data_with_helpful_message() { // Manufacture a valid XZ header prefix. let xz_magic = b"\xFD\x37\x7A\x58\x5A\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - match AssignmentReader::new(xz_magic.as_slice()) { + match BenStreamReader::from_ben(xz_magic.as_slice()) { Err(DecoderInitError::InvalidFileFormat(ref header)) => { let e = DecoderInitError::InvalidFileFormat(header.clone()); let msg = e.to_string(); @@ -370,7 +369,7 @@ fn ben_decoder_standard_single_assignment_round_trip() { let assignment = vec![1u16, 1, 2, 3, 3, 3]; let ben = encode_standard_ben(&[assignment.clone()]); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); assert_eq!(decoded, assignment); @@ -382,7 +381,7 @@ fn ben_decoder_standard_multiple_assignments_round_trip() { let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![1u16, 1, 1]]; let ben = encode_standard_ben(&assignments); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); for expected in &assignments { let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); @@ -407,7 +406,7 @@ fn ben_decoder_mkv_preserves_repetition_counts() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let (a1, c1) = decoder.next().unwrap().unwrap(); assert_eq!(a1, vec![1u16, 2, 3]); @@ -424,7 +423,7 @@ fn ben_decoder_mkv_preserves_repetition_counts() { fn ben_decoder_count_samples_standard() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 3); } @@ -443,7 +442,7 @@ fn ben_decoder_count_samples_mkv_with_repetitions() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 4); } @@ -452,7 +451,7 @@ fn ben_decoder_write_all_jsonl_produces_correct_output() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; let ben = encode_standard_ben(&assignments); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap(); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let mut out = Vec::new(); decoder.write_all_jsonl(&mut out).unwrap(); @@ -470,7 +469,7 @@ fn ben_decoder_for_each_assignment_early_stop() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_standard_ben(&assignments); - let mut decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let mut seen = Vec::new(); decoder .for_each_assignment(|a, _count| { @@ -485,7 +484,7 @@ fn ben_decoder_for_each_assignment_early_stop() { } // ────────────────────────────────────────────────────────────────────────────── -// io::reader – XZAssignmentReader +// io::reader – BenStreamReader // ────────────────────────────────────────────────────────────────────────────── fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { @@ -508,7 +507,7 @@ fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { fn xben_decoder_reads_variant_from_banner_standard() { let assignments = vec![vec![1u16, 2, 3]]; let xben = make_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_xben(xben.as_slice()).unwrap(); assert_eq!(decoder.variant(), BenVariant::Standard); } @@ -516,7 +515,7 @@ fn xben_decoder_reads_variant_from_banner_standard() { fn xben_decoder_reads_variant_from_banner_mkvchain() { let assignments = vec![vec![1u16, 2, 3]]; let xben = make_xben(&assignments, BenVariant::MkvChain); - let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_xben(xben.as_slice()).unwrap(); assert_eq!(decoder.variant(), BenVariant::MkvChain); } @@ -526,7 +525,7 @@ fn xben_decoder_reads_variant_from_banner_twodelta() { let base = vec![1u16, 1, 2, 2]; let second = vec![1u16, 2, 2, 1]; // swap positions 1 & 3 let xben = make_xben(&[base, second], BenVariant::TwoDelta); - let decoder = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let decoder = BenStreamReader::from_xben(xben.as_slice()).unwrap(); assert_eq!(decoder.variant(), BenVariant::TwoDelta); } @@ -1353,14 +1352,14 @@ fn relabel_ben_file_with_map_as_variant_permutes_correctly() { } // ────────────────────────────────────────────────────────────────────────────── -// AssignmentReader – iterator interface +// BenStreamReader – iterator interface // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_iterator_collects_all_frames() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6], vec![7u16, 8, 9]]; let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert_eq!(frames.len(), 3); for (i, (a, count)) in frames.iter().enumerate() { @@ -1372,7 +1371,7 @@ fn ben_decoder_iterator_collects_all_frames() { #[test] fn ben_decoder_iterator_on_empty_payload_yields_nothing() { let ben = STANDARD_BEN_BANNER.to_vec(); // banner only, no frames - let decoder = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert!(frames.is_empty()); } @@ -1484,7 +1483,7 @@ fn ben_decoder_accepts_cursor_reader() { let assignment = vec![1u16, 2, 3]; let ben = encode_standard_ben(&[assignment.clone()]); let cursor = Cursor::new(ben); - let mut decoder = AssignmentReader::new(cursor).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(cursor).unwrap().silent(true); let (decoded, _) = decoder.next().unwrap().unwrap(); assert_eq!(decoded, assignment); } @@ -1851,14 +1850,14 @@ fn sort_by_ordering_large_graph_multilevel_verifies_permutation() { } // ────────────────────────────────────────────────────────────────────────────── -// XZAssignmentReader / XZAssignmentFrameReader +// BenStreamReader / BenStreamFrameReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn xben_decoder_iterator_standard_collects_all() { let assignments = vec![vec![1u16, 1, 2, 2], vec![3u16, 3, 3, 3]]; let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(decoder.variant(), BenVariant::Standard); let results: Vec> = decoder.map(|r| r.unwrap().0).collect(); assert_eq!(results, assignments); @@ -1872,7 +1871,7 @@ fn xben_decoder_count_samples_standard() { vec![5u16, 6, 5, 6], ]; let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 3); } @@ -1880,7 +1879,7 @@ fn xben_decoder_count_samples_standard() { fn xben_decoder_count_samples_mkvchain() { let assignments: Vec> = (0..5u16).map(|i| vec![i, i + 1]).collect(); let xben = encode_xben(&assignments, BenVariant::MkvChain); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); assert_eq!(decoder.count_samples().unwrap(), 5); } @@ -1888,25 +1887,29 @@ fn xben_decoder_count_samples_mkvchain() { fn xben_frame_decoder_new_and_iterate() { let assignments = vec![vec![1u16, 1, 2], vec![2u16, 2, 1]]; let xben = encode_xben(&assignments, BenVariant::Standard); - let frame_iter = XZAssignmentFrameReader::new(Cursor::new(xben)).unwrap(); - let frames: Vec<(Vec, u16)> = frame_iter.map(|r| r.unwrap()).collect(); + let frame_iter = BenStreamFrameReader::from_xben(Cursor::new(xben)).unwrap(); + let frames: Vec<(DecodeFrame, u16)> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); - for (frame_bytes, count) in &frames { + for (frame, count) in &frames { assert_eq!(*count, 1u16); // Every standard ben32 frame ends with the 4-zero sentinel - assert!(frame_bytes.ends_with(&[0u8, 0, 0, 0])); + let bytes = match frame { + DecodeFrame::XBen(b, _) => b, + DecodeFrame::Ben(_) => panic!("xben frame iterator yielded BEN arm"), + }; + assert!(bytes.ends_with(&[0u8, 0, 0, 0])); } } // ────────────────────────────────────────────────────────────────────────────── -// AssignmentFrameReader +// BenStreamFrameReader // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_frame_decoder_standard_iterates() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6]]; let ben = encode_standard_ben(&assignments); - let frame_iter = AssignmentFrameReader::new(Cursor::new(ben)).unwrap(); + let frame_iter = BenStreamFrameReader::from_ben(Cursor::new(ben)).unwrap(); let frames: Vec<_> = frame_iter.map(|r| r.unwrap()).collect(); assert_eq!(frames.len(), 2); assert_eq!(frames[0].1, 1); @@ -1922,8 +1925,8 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_slice(), &mut ben, BenVariant::TwoDelta).unwrap(); - // AssignmentFrameReader should re-encode TwoDelta frames back to standard BEN frames - let decoder = AssignmentReader::new(Cursor::new(ben)) + // BenStreamFrameReader should re-encode TwoDelta frames back to standard BEN frames + let decoder = BenStreamReader::from_ben(Cursor::new(ben)) .unwrap() .silent(true); let frame_iter = decoder.into_frames(); @@ -1932,14 +1935,14 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { } // ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — AssignmentReader subsample methods +// SubsampleFrameDecoder — BenStreamReader subsample methods // ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_subsample_by_indices() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 4]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)) + let decoder = BenStreamReader::from_ben(Cursor::new(ben)) .unwrap() .silent(true); // 1-based indices: 2, 5, 8 @@ -1957,7 +1960,7 @@ fn ben_decoder_subsample_by_indices() { fn ben_decoder_subsample_by_range() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 3]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)) + let decoder = BenStreamReader::from_ben(Cursor::new(ben)) .unwrap() .silent(true); // Inclusive 1-based range [3, 6] @@ -1974,7 +1977,7 @@ fn ben_decoder_subsample_by_range() { fn ben_decoder_subsample_every_nth() { let assignments: Vec> = (0u16..10).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)) + let decoder = BenStreamReader::from_ben(Cursor::new(ben)) .unwrap() .silent(true); // Every 3rd sample starting at 1-based offset 1: samples 1, 4, 7, 10 @@ -1993,7 +1996,7 @@ fn ben_decoder_subsample_every_nth() { fn ben_decoder_subsample_by_indices_dedup() { let assignments: Vec> = (0u16..5).map(|i| vec![i; 2]).collect(); let ben = encode_standard_ben(&assignments); - let decoder = AssignmentReader::new(Cursor::new(ben)) + let decoder = BenStreamReader::from_ben(Cursor::new(ben)) .unwrap() .silent(true); // Duplicate index 2 → after dedup only samples 2 and 3 are selected @@ -2007,14 +2010,14 @@ fn ben_decoder_subsample_by_indices_dedup() { } // ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — XZAssignmentReader subsample methods +// SubsampleFrameDecoder — BenStreamReader subsample methods // ────────────────────────────────────────────────────────────────────────────── #[test] fn xben_decoder_subsample_by_indices() { let assignments: Vec> = (1u16..=5).map(|i| vec![i; 4]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let selected: Vec> = decoder .into_subsample_by_indices(vec![1usize, 3, 5]) .map(|r| r.unwrap().0) @@ -2029,7 +2032,7 @@ fn xben_decoder_subsample_by_indices() { fn xben_decoder_subsample_by_range() { let assignments: Vec> = (0u16..6).map(|i| vec![i; 3]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let selected: Vec> = decoder .into_subsample_by_range(2, 4) .map(|r| r.unwrap().0) @@ -2043,7 +2046,7 @@ fn xben_decoder_subsample_by_range() { fn xben_decoder_subsample_every() { let assignments: Vec> = (0u16..6).map(|i| vec![i; 2]).collect(); let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = XZAssignmentReader::new(Cursor::new(xben)).unwrap(); + let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); // Every 2nd sample starting from offset 1: samples 1, 3, 5 let selected: Vec> = decoder .into_subsample_every(2, 1) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 5f1ed56..0be2e6d 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -8,8 +8,8 @@ use binary_ensemble::codec::encode::{ }; use binary_ensemble::codec::BenEncodeFrame; use binary_ensemble::io::reader::{ - build_frame_iter, count_samples_from_file, AssignmentReader, DecodeFrame, DecoderInitError, - SubsampleFrameDecoder, XZAssignmentReader, + build_frame_iter, count_samples_from_file, BenStreamReader, BenWireFormat, DecodeFrame, + DecoderInitError, SubsampleFrameDecoder, }; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::ops::extract::extract_assignment_ben; @@ -172,7 +172,7 @@ fn strat_threads_levels() -> impl Strategy { // ---------- Tests ---------- proptest! { - // JSONL -> BEN(Standard) -> JSONL round-trip via BenEncoder/AssignmentReader entry points. + // JSONL -> BEN(Standard) -> JSONL round-trip via BenEncoder/BenStreamReader entry points. #[test] fn fuzz_roundtrip_ben_standard(seq in strat_assignment_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -321,7 +321,7 @@ proptest! { prop_assert_eq!(direct, via); } - // Iterator surface: XZAssignmentReader -> records matches direct JSONL + // Iterator surface: BenStreamReader -> records matches direct JSONL #[test] fn fuzz_xbendecoder_iterator_matches_jsonl(seq in strat_assignment_seq(), params in strat_threads_levels()) { let (threads, level) = params; @@ -338,7 +338,7 @@ proptest! { None, ).unwrap(); - let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let mut dec = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let iter_jsonl = jsonl_from_records(&recs, 0); @@ -350,7 +350,7 @@ proptest! { prop_assert_eq!(iter_jsonl, direct); } - // Iterator surface: XZAssignmentReader over TwoDelta XBEN matches direct JSONL. + // Iterator surface: BenStreamReader over TwoDelta XBEN matches direct JSONL. #[test] fn fuzz_xbendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq(), params in strat_threads_levels()) { let (threads, level) = params; @@ -367,7 +367,7 @@ proptest! { None, ).unwrap(); - let mut dec = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let mut dec = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let iter_jsonl = jsonl_from_records(&recs, 0); @@ -377,7 +377,7 @@ proptest! { prop_assert_eq!(iter_jsonl, direct); } - // Iterator surface: AssignmentReader over BEN produced by BenEncoder. + // Iterator surface: BenStreamReader over BEN produced by BenEncoder. #[test] fn fuzz_bendecoder_iterator_matches_jsonl(seq in strat_assignment_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -386,15 +386,15 @@ proptest! { let mut ben = Vec::new(); encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::Standard).unwrap(); - // Iterate AssignmentReader - let mut dec = AssignmentReader::new(ben.as_slice()).unwrap(); + // Iterate BenStreamReader + let mut dec = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let out = jsonl_from_records(&recs, 0); prop_assert_eq!(out, jsonl); } - // Iterator surface: AssignmentReader over TwoDelta BEN matches JSONL. + // Iterator surface: BenStreamReader over TwoDelta BEN matches JSONL. #[test] fn fuzz_bendecoder_iterator_matches_jsonl_twodelta(seq in strat_twodelta_seq()) { let jsonl = jsonl_from_assignments(&seq); @@ -402,7 +402,7 @@ proptest! { let mut ben = Vec::new(); encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::TwoDelta).unwrap(); - let mut dec = AssignmentReader::new(ben.as_slice()).unwrap(); + let mut dec = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let recs = collect_records(&mut dec).unwrap(); let out = jsonl_from_records(&recs, 0); prop_assert_eq!(out, jsonl); @@ -431,7 +431,7 @@ proptest! { let mut want: Vec = (1..=n).step_by(3).collect(); // 1,4,7,… if want.is_empty() { want.push(1); } - let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let xb = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_by_indices(want.clone()); let recs = collect_records(&mut sub).unwrap(); @@ -476,7 +476,7 @@ proptest! { } } - let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let xb = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_every(step, offset); let recs = collect_records(&mut sub).unwrap(); @@ -511,7 +511,7 @@ proptest! { let truth: Vec> = (s..=e).map(|i| seq[i-1].clone()).collect(); - let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let xb = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_by_range(s, e); let recs = collect_records(&mut sub).unwrap(); @@ -535,7 +535,7 @@ proptest! { want.push(1); } - let mut sub = AssignmentReader::new(ben.as_slice()) + let mut sub = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .into_subsample_by_indices(want.clone()); let recs = collect_records(&mut sub).unwrap(); @@ -579,7 +579,7 @@ fn invalid_ben_header_yields_error() { bogus.extend_from_slice(b"NOT A BEN HEADER!"); bogus.resize(17, 0); - let err = AssignmentReader::new(Cursor::new(bogus)) + let err = BenStreamReader::from_ben(Cursor::new(bogus)) .err() .expect("expeced InvalidFileFormat error"); match err { @@ -597,7 +597,7 @@ fn xben_decoder_rejects_bad_banner() { let mut xz = Vec::new(); xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); - let err = XZAssignmentReader::new(xz.as_slice()) + let err = BenStreamReader::from_xben(xz.as_slice()) .err() .expect("expeced InvalidFileFormat error"); assert_eq!( @@ -624,7 +624,7 @@ fn subsample_every_respects_offset() { .unwrap(); // Keep every 1 starting at offset=2 -> only second sample. - let xb = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let xb = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let mut sub = xb.into_subsample_every(1, 2); let recs = collect_records(&mut sub).unwrap(); @@ -707,7 +707,7 @@ fn ben_new_invalid_header_detects_xz() { .unwrap(); // Try to treat it as BEN - let err = AssignmentReader::new(xz.as_slice()) + let err = BenStreamReader::from_ben(xz.as_slice()) .err() .expect("expected error"); match err { @@ -733,7 +733,7 @@ fn xben_new_invalid_banner() { None, ) .unwrap(); - let err = XZAssignmentReader::new(wrong.as_slice()) + let err = BenStreamReader::from_xben(wrong.as_slice()) .err() .expect("expected invalid data"); assert_eq!( @@ -763,7 +763,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { // Trim the last byte to force partial frame after decompress let trimmed = &xz[..xz.len() - 1]; // Iterating should surface UnexpectedEof (partial frame) - let mut it = XZAssignmentReader::new(trimmed).unwrap(); + let mut it = BenStreamReader::from_xben(trimmed).unwrap(); // Drain until error while let Some(res) = it.next() { if let Err(e) = res { @@ -847,7 +847,7 @@ fn subsample_by_indices_sorts_and_dedups() { None, ) .unwrap(); - let xb = XZAssignmentReader::new(xz.as_slice()).unwrap(); + let xb = BenStreamReader::from_xben(xz.as_slice()).unwrap(); // Deliberately unsorted and duplicated indices let mut sub = xb.into_subsample_by_indices(vec![5, 2, 2, 1, 5, 3]); @@ -945,7 +945,7 @@ fn ben_encoder_write_assignment_path_roundtrips() { #[test] fn ben_decoder_new_reports_short_header_as_io_error() { - let err = AssignmentReader::new([1u8, 2, 3].as_slice()).err().unwrap(); + let err = BenStreamReader::from_ben([1u8, 2, 3].as_slice()).err().unwrap(); match err { DecoderInitError::Io(e) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), other => panic!("unexpected error: {other:?}"), @@ -957,7 +957,7 @@ fn ben_decoder_write_all_jsonl_propagates_frame_errors() { let mut malformed = b"STANDARD BEN FILE".to_vec(); malformed.extend_from_slice(&[3]); // start of a frame, but truncated - let mut decoder = AssignmentReader::new(malformed.as_slice()).unwrap(); + let mut decoder = BenStreamReader::from_ben(malformed.as_slice()).unwrap(); let err = decoder.write_all_jsonl(Vec::new()).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); } @@ -967,7 +967,7 @@ fn ben_decoder_count_samples_propagates_frame_errors() { let mut malformed = b"STANDARD BEN FILE".to_vec(); malformed.extend_from_slice(&[3]); - let err = AssignmentReader::new(malformed.as_slice()) + let err = BenStreamReader::from_ben(malformed.as_slice()) .unwrap() .count_samples() .unwrap_err(); @@ -992,11 +992,11 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { .unwrap(); let mut frames = - binary_ensemble::io::reader::XZAssignmentFrameReader::new(xz.as_slice()).unwrap(); + binary_ensemble::io::reader::BenStreamFrameReader::from_xben(xz.as_slice()).unwrap(); assert!(frames.next().unwrap().is_ok()); let trimmed = &xz[..xz.len() - 1]; - let mut frames = binary_ensemble::io::reader::XZAssignmentFrameReader::new(trimmed).unwrap(); + let mut frames = binary_ensemble::io::reader::BenStreamFrameReader::from_xben(trimmed).unwrap(); loop { match frames.next() { Some(Err(e)) => { @@ -1075,7 +1075,7 @@ impl std::io::Read for FailAfterN { fn ben_decoder_frame_read_error_paths() { let banner = b"STANDARD BEN FILE".to_vec(); - let err = AssignmentReader::new(FailAfterN { + let err = BenStreamReader::from_ben(FailAfterN { data: [banner.clone(), vec![3]].concat(), pos: 0, fail_at: 18, @@ -1086,7 +1086,7 @@ fn ben_decoder_frame_read_error_paths() { .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); - let err = AssignmentReader::new(FailAfterN { + let err = BenStreamReader::from_ben(FailAfterN { data: [banner.clone(), vec![3, 3, 0]].concat(), pos: 0, fail_at: 20, @@ -1097,7 +1097,7 @@ fn ben_decoder_frame_read_error_paths() { .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); - let err = AssignmentReader::new(FailAfterN { + let err = BenStreamReader::from_ben(FailAfterN { data: [banner.clone(), vec![3, 3, 0, 0, 0, 1]].concat(), pos: 0, fail_at: 23, @@ -1119,7 +1119,7 @@ fn ben_decoder_mkv_count_read_error_path() { ) .unwrap(); let truncated = ben[..ben.len() - 1].to_vec(); - let err = AssignmentReader::new(truncated.as_slice()) + let err = BenStreamReader::from_ben(truncated.as_slice()) .unwrap() .next() .unwrap() @@ -1203,7 +1203,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1222,7 +1222,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - XZAssignmentReader::new(xben.as_slice()) + BenStreamReader::from_xben(xben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1245,7 +1245,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { ) .unwrap(); assert_eq!( - XZAssignmentReader::new(twodelta_xben.as_slice()) + BenStreamReader::from_xben(twodelta_xben.as_slice()) .unwrap() .count_samples() .unwrap(), @@ -1284,17 +1284,14 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { let xben_path = unique_temp_path("sample.xben"); fs::write(&xben_path, &xben).unwrap(); - let ben_iter = build_frame_iter(&ben_path, "ben").unwrap(); + let ben_iter = build_frame_iter(&ben_path, BenWireFormat::Ben).unwrap(); assert_eq!(collect_frames(ben_iter).unwrap().len(), 2); - let xben_iter = build_frame_iter(&xben_path, "xben").unwrap(); + let xben_iter = build_frame_iter(&xben_path, BenWireFormat::XBen).unwrap(); assert_eq!(collect_frames(xben_iter).unwrap().len(), 2); - assert_eq!(count_samples_from_file(&ben_path, "ben").unwrap(), 3); - assert_eq!(count_samples_from_file(&xben_path, "xben").unwrap(), 3); - - let err = build_frame_iter(&ben_path, "wat").err().unwrap(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + assert_eq!(count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), 3); + assert_eq!(count_samples_from_file(&xben_path, BenWireFormat::XBen).unwrap(), 3); fs::remove_file(ben_path).unwrap(); fs::remove_file(xben_path).unwrap(); @@ -1316,7 +1313,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { ) .unwrap(); - let mut by_indices = AssignmentReader::new(ben.as_slice()) + let mut by_indices = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .into_subsample_by_indices(vec![4, 1, 1, 3]); let picked = collect_records(&mut by_indices).unwrap(); @@ -1325,7 +1322,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { vec![1, 3, 4] ); - let mut by_range = AssignmentReader::new(ben.as_slice()) + let mut by_range = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .into_subsample_by_range(2, 3); let picked = collect_records(&mut by_range).unwrap(); @@ -1334,7 +1331,7 @@ fn ben_decoder_subsample_helpers_work_on_public_api() { vec![2, 3] ); - let mut every = AssignmentReader::new(ben.as_slice()) + let mut every = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .into_subsample_every(2, 2); let picked = collect_records(&mut every).unwrap(); @@ -1363,7 +1360,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { encoder.finish().unwrap(); } - let records = collect_records(AssignmentReader::new(ben.as_slice()).unwrap()).unwrap(); + let records = collect_records(BenStreamReader::from_ben(ben.as_slice()).unwrap()).unwrap(); assert_eq!( records, vec![ @@ -1377,13 +1374,8 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { decode_ben_to_jsonl(ben.as_slice(), &mut jsonl).unwrap(); assert_eq!(jsonl, jsonl_from_assignments(&assignments)); - let frames = AssignmentReader::new(ben.as_slice()).unwrap().into_frames(); - assert_eq!( - collect_frames(frames.map(|res| res.map(|(f, cnt)| (DecodeFrame::Ben(f), cnt)))) - .unwrap() - .len(), - 3 - ); + let frames = BenStreamReader::from_ben(ben.as_slice()).unwrap().into_frames(); + assert_eq!(collect_frames(frames).unwrap().len(), 3); } #[test] @@ -1456,14 +1448,14 @@ fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { .unwrap(); assert_eq!( - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .count_samples() .unwrap(), 4 ); - let frames: Vec<_> = AssignmentReader::new(ben.as_slice()) + let frames: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .into_frames() .collect::>>() @@ -1478,6 +1470,6 @@ fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { let ben_path = unique_temp_path("twodelta_sample.ben"); fs::write(&ben_path, &ben).unwrap(); - assert_eq!(count_samples_from_file(&ben_path, "ben").unwrap(), 4); + assert_eq!(count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), 4); fs::remove_file(ben_path).unwrap(); } diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 2fc564b..c3fa9ab 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -17,7 +17,7 @@ use binary_ensemble::io::bundle::writer::{ AddAssetOptions, BendlAppender, BendlTruncate, BendlWriter, }; use binary_ensemble::io::bundle::BendlReader; -use binary_ensemble::io::reader::{AssignmentReader, XZAssignmentReader}; +use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::ops::relabel::relabel_ben_file_with_map; use std::cell::RefCell; @@ -26,7 +26,7 @@ use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; use std::rc::Rc; fn expand_ben(bytes: &[u8]) -> Vec> { - AssignmentReader::new(bytes) + BenStreamReader::from_ben(bytes) .unwrap() .silent(true) .flat_map(|record| { @@ -188,7 +188,7 @@ fn tiny_bendl_bundle() -> Vec { fn assert_ben_bytes_do_not_panic(bytes: Vec) { let outcome = std::panic::catch_unwind(|| { - if let Ok(reader) = AssignmentReader::new(bytes.as_slice()) { + if let Ok(reader) = BenStreamReader::from_ben(bytes.as_slice()) { for record in reader.silent(true).take(16) { let _ = record; } @@ -199,7 +199,7 @@ fn assert_ben_bytes_do_not_panic(bytes: Vec) { fn assert_xben_bytes_do_not_panic(bytes: Vec) { let outcome = std::panic::catch_unwind(|| { - if let Ok(reader) = XZAssignmentReader::new(bytes.as_slice()) { + if let Ok(reader) = BenStreamReader::from_xben(bytes.as_slice()) { for record in reader.silent(true).take(16) { let _ = record; } @@ -234,7 +234,7 @@ fn mkvchain_writer_splits_repetition_count_longer_than_u16_max() { writer.finish().unwrap(); } - let mut reader = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); let first = reader.next().unwrap().unwrap(); let second = reader.next().unwrap().unwrap(); assert!(reader.next().is_none()); @@ -256,7 +256,7 @@ fn twodelta_writer_splits_repetition_count_longer_than_u16_max() { let mut total = 0usize; let mut unique_frames = 0usize; - AssignmentReader::new(ben.as_slice()) + BenStreamReader::from_ben(ben.as_slice()) .unwrap() .silent(true) .for_each_assignment(|assignment, count| { @@ -290,7 +290,7 @@ fn xben_mkvchain_splits_repetition_count_longer_than_u16_max() { ) .unwrap(); - let mut reader = XZAssignmentReader::new(xben.as_slice()) + let mut reader = BenStreamReader::from_xben(xben.as_slice()) .unwrap() .silent(true); assert_eq!(reader.next().unwrap().unwrap(), (vec![4, 4, 5], u16::MAX)); @@ -302,7 +302,7 @@ fn xben_mkvchain_splits_repetition_count_longer_than_u16_max() { fn malformed_ben_bit_widths_return_invalid_data() { let mut ben = STANDARD_BEN_BANNER.to_vec(); ben.extend_from_slice(&[0, 1, 0, 0, 0, 0]); - let err = AssignmentReader::new(ben.as_slice()) + let err = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .next() .unwrap() @@ -323,7 +323,7 @@ fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { ben.extend_from_slice(anchor.as_slice()); ben.extend_from_slice(&[0, 1, 0, 2, 0, 0, 0, 0, 0, 1]); - let mut reader = AssignmentReader::new(ben.as_slice()).unwrap().silent(true); + let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); assert_eq!(reader.next().unwrap().unwrap(), (vec![1, 2], 1)); let err = reader.next().unwrap().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -463,7 +463,7 @@ fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { ) .unwrap(); - let mut reader = XZAssignmentReader::new(xben.as_slice()).unwrap(); + let mut reader = BenStreamReader::from_xben(xben.as_slice()).unwrap(); let err = reader.next().unwrap().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } @@ -473,7 +473,7 @@ fn zero_count_frames_are_rejected() { let frame = BenEncodeFrame::from_assignment(vec![1u16], BenVariant::MkvChain, Some(0)); let mut ben = MKVCHAIN_BEN_BANNER.to_vec(); ben.extend_from_slice(frame.as_slice()); - let err = AssignmentReader::new(ben.as_slice()) + let err = BenStreamReader::from_ben(ben.as_slice()) .unwrap() .next() .unwrap() @@ -493,7 +493,7 @@ fn zero_count_frames_are_rejected() { None, ) .unwrap(); - let err = XZAssignmentReader::new(xben.as_slice()) + let err = BenStreamReader::from_xben(xben.as_slice()) .unwrap() .next() .unwrap() From 74d0dd5848226da7224af26aad76b2e4091c6523 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 7 May 2026 15:12:23 -0600 Subject: [PATCH 089/221] small writer extraction --- ben/src/io/writer/assignment_writer.rs | 43 ++------------------- ben/src/io/writer/twodelta.rs | 47 +++++++++++++++++++++++ ben/src/io/writer/xz_assignment_writer.rs | 41 ++------------------ 3 files changed, 54 insertions(+), 77 deletions(-) diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs index b6b09a3..af330bb 100644 --- a/ben/src/io/writer/assignment_writer.rs +++ b/ben/src/io/writer/assignment_writer.rs @@ -1,3 +1,4 @@ +use super::twodelta::twodelta_repeat_runs; use super::utils::parse_json_assignment; use crate::codec::encode::encode_twodelta_frame_with_hint; use crate::codec::BenEncodeFrame; @@ -172,46 +173,8 @@ pub(super) fn twodelta_repeat_frame( assignment: &[u16], count: u16, ) -> io::Result { - let first = assignment.first().copied().unwrap_or(0); - let second = assignment - .iter() - .copied() - .find(|&value| value != first) - .unwrap_or_else(|| if first == u16::MAX { 0 } else { first + 1 }); - - let mut run_lengths = Vec::new(); - let mut current = first; - let mut run_len = 0u16; - - for &value in assignment { - if value != first && value != second { - continue; - } - if value == current { - if run_len == u16::MAX { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "TwoDelta repeat frame contains a run longer than u16::MAX", - )); - } - run_len += 1; - } else { - if run_len > 0 { - run_lengths.push(run_len); - } - current = value; - run_len = 1; - } - } - if run_len > 0 { - run_lengths.push(run_len); - } - - Ok(BenEncodeFrame::from_run_lengths( - (first, second), - run_lengths, - Some(count), - )) + let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; + Ok(BenEncodeFrame::from_run_lengths(pair, run_lengths, Some(count))) } impl Drop for AssignmentWriter { diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 6901077..c97164a 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -1,5 +1,52 @@ +use std::io; + pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// Default number of delta frames per columnar chunk in XBEN TwoDelta. pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; + +/// Walk a TwoDelta repeat-eligible assignment and emit the `(pair, run_lengths)` +/// describing it. +/// +/// Used by both the BEN and XBEN writers to construct the body of a TwoDelta +/// "repeat" frame: each writer wraps the result in its own frame type. Returns +/// an `InvalidInput` error if any run exceeds `u16::MAX` in length. +pub(super) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16), Vec)> { + let first = assignment.first().copied().unwrap_or(0); + let second = assignment + .iter() + .copied() + .find(|&value| value != first) + .unwrap_or_else(|| if first == u16::MAX { 0 } else { first + 1 }); + + let mut run_lengths = Vec::new(); + let mut current = first; + let mut run_len = 0u16; + + for &value in assignment { + if value != first && value != second { + continue; + } + if value == current { + if run_len == u16::MAX { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "TwoDelta repeat frame contains a run longer than u16::MAX", + )); + } + run_len += 1; + } else { + if run_len > 0 { + run_lengths.push(run_len); + } + current = value; + run_len = 1; + } + } + if run_len > 0 { + run_lengths.push(run_len); + } + + Ok(((first, second), run_lengths)) +} diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs index 3bd90be..4103862 100644 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ b/ben/src/io/writer/xz_assignment_writer.rs @@ -1,6 +1,7 @@ use super::frames::BufferedDeltaFrame; use super::twodelta::{ - DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, + twodelta_repeat_runs, DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, + XBEN_TWODELTA_FULL_TAG, }; use super::utils::{encode_xben_twodelta_full_frame, parse_json_assignment}; use crate::codec::decode::decode_ben_line; @@ -393,43 +394,9 @@ pub(super) fn twodelta_repeat_buffered_frame( assignment: &[u16], count: u16, ) -> io::Result { - let first = assignment.first().copied().unwrap_or(0); - let second = assignment - .iter() - .copied() - .find(|&value| value != first) - .unwrap_or_else(|| if first == u16::MAX { 0 } else { first + 1 }); - - let mut run_lengths = Vec::new(); - let mut current = first; - let mut run_len = 0u16; - - for &value in assignment { - if value != first && value != second { - continue; - } - if value == current { - if run_len == u16::MAX { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "TwoDelta repeat frame contains a run longer than u16::MAX", - )); - } - run_len += 1; - } else { - if run_len > 0 { - run_lengths.push(run_len); - } - current = value; - run_len = 1; - } - } - if run_len > 0 { - run_lengths.push(run_len); - } - + let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; Ok(BufferedDeltaFrame { - pair: (first, second), + pair, run_lengths, count, }) From c712179e5fc40e86a81d378e132af747d283dc66 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 00:00:30 -0600 Subject: [PATCH 090/221] consolodate relabel module --- ben/src/cli/reben/ben_mode.rs | 56 +- ben/src/cli/reben/tests.rs | 44 ++ ben/src/io/writer/frame_writer.rs | 87 +++ ben/src/io/writer/mod.rs | 1 + ben/src/ops/relabel/errors.rs | 9 +- ben/src/ops/relabel/mod.rs | 974 ++++++++--------------------- ben/src/ops/relabel/permutation.rs | 248 ++++++++ ben/src/ops/relabel/tests.rs | 565 ++++++++++++----- ben/tests/test_coverage.rs | 163 +++-- ben/tests/test_stress_edges.rs | 9 +- 10 files changed, 1194 insertions(+), 962 deletions(-) create mode 100644 ben/src/io/writer/frame_writer.rs create mode 100644 ben/src/ops/relabel/permutation.rs diff --git a/ben/src/cli/reben/ben_mode.rs b/ben/src/cli/reben/ben_mode.rs index f44bc25..aa5eb50 100644 --- a/ben/src/cli/reben/ben_mode.rs +++ b/ben/src/cli/reben/ben_mode.rs @@ -4,12 +4,7 @@ use super::helpers::{ to_ben_variant, to_graph_ordering, }; use crate::json::graph::{sort_json_file_by_key, sort_json_file_by_ordering}; -use crate::ops::relabel::{ - convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, - relabel_ben_file_as_variant_limit, relabel_ben_file_limit, relabel_ben_file_with_map, - relabel_ben_file_with_map_as_variant, relabel_ben_file_with_map_as_variant_limit, - relabel_ben_file_with_map_limit, -}; +use crate::ops::relabel::{relabel_ben_file, RelabelOptions}; use serde_json::json; use std::fs::File; use std::io::{BufReader, BufWriter, Write}; @@ -53,25 +48,19 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; let writer = BufWriter::new(output_file); - if args.convert_only { - let variant = output_variant.expect("checked above"); - if let Some(limit) = args.n_items { - convert_ben_file_limit(reader, writer, variant, limit) - } else { - convert_ben_file(reader, writer, variant) - } - } else if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_as_variant_limit(reader, writer, variant, limit) + let options = if args.convert_only { + RelabelOptions::convert_to(output_variant.expect("checked above")) + } else { + let base = RelabelOptions::first_seen(); + if let Some(variant) = output_variant { + base.with_target_variant(variant) } else { - relabel_ben_file_as_variant(reader, writer, variant) + base } - } else if let Some(limit) = args.n_items { - relabel_ben_file_limit(reader, writer, limit) - } else { - relabel_ben_file(reader, writer) } - .map_err(|e| format!("BEN relabeling failed: {e}"))?; + .with_max_samples_opt(args.n_items); + relabel_ben_file(reader, writer, options) + .map_err(|e| format!("BEN relabeling failed: {e}"))?; return Ok(()); } @@ -159,23 +148,14 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { map_file_name, ); - if let Some(variant) = output_variant { - if let Some(limit) = args.n_items { - relabel_ben_file_with_map_as_variant_limit( - reader, - writer, - new_to_old_node_map, - variant, - limit, - ) - } else { - relabel_ben_file_with_map_as_variant(reader, writer, new_to_old_node_map, variant) - } - } else if let Some(limit) = args.n_items { - relabel_ben_file_with_map_limit(reader, writer, new_to_old_node_map, limit) + let base = RelabelOptions::node_permutation(new_to_old_node_map); + let options = if let Some(variant) = output_variant { + base.with_target_variant(variant) } else { - relabel_ben_file_with_map(reader, writer, new_to_old_node_map) + base } - .map_err(|e| format!("BEN relabeling with map {map_file_name:?} failed: {e}"))?; + .with_max_samples_opt(args.n_items); + relabel_ben_file(reader, writer, options) + .map_err(|e| format!("BEN relabeling with map {map_file_name:?} failed: {e}"))?; Ok(()) } diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 67896b3..407b165 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -493,6 +493,50 @@ fn read_node_permutation_map_file_rejects_non_integer_index() { let _ = fs::remove_file(&map_path); } +/// Pin today's behavior when a JSON map has two old indices targeting the +/// same new index: `HashMap::insert` overwrites the prior `(new, old)` entry, +/// shrinking the inverted map. The remaining slots no longer cover +/// `0..=max_key` contiguously, so the relabel driver returns +/// `NonContiguousMap` from `dense_permutation`. This is reachable from valid +/// JSON because `serde_json` retains the last value when the input has +/// duplicate JSON keys, and even with unique keys two distinct old indices +/// can target the same new index. +#[test] +fn read_node_permutation_map_file_duplicate_new_index_creates_gap() { + use crate::ops::relabel::{relabel_ben_file, RelabelOptions}; + + let map_path = unique_path("dup_new_index_map.json"); + // old→new: {0→1, 1→1, 2→2}. Inverted: {1: 1 (overwrites 0), 2: 2}. + // Slot 0 is missing in the inverted map, so dense_permutation rejects. + fs::write( + &map_path, + b"{\"node_permutation_old_to_new\":{\"0\":1,\"1\":1,\"2\":2}}", + ) + .unwrap(); + let (map, _label) = read_node_permutation_map_file(map_path.to_str().unwrap()).unwrap(); + assert_eq!(map.len(), 2, "duplicate new index must overwrite, shrinking the map"); + + // Build a tiny BEN file to drive the relabel through dense_permutation. + let mut ben = Vec::new(); + crate::codec::encode::encode_jsonl_to_ben( + b"{\"assignment\":[1,2,3],\"sample\":1}\n".as_slice(), + &mut ben, + crate::BenVariant::Standard, + ) + .unwrap(); + + let err = relabel_ben_file( + ben.as_slice(), + Vec::new(), + RelabelOptions::node_permutation(map), + ) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("contiguous")); + + let _ = fs::remove_file(&map_path); +} + #[test] fn read_node_permutation_map_file_rejects_non_integer_value() { let map_path = unique_path("bad_value_map.json"); diff --git a/ben/src/io/writer/frame_writer.rs b/ben/src/io/writer/frame_writer.rs new file mode 100644 index 0000000..b542a42 --- /dev/null +++ b/ben/src/io/writer/frame_writer.rs @@ -0,0 +1,87 @@ +use super::assignment_writer::twodelta_repeat_frame; +use crate::codec::encode::encode_twodelta_frame_with_hint; +use crate::codec::BenEncodeFrame; +use crate::format::banners::banner_for_variant; +use crate::BenVariant; +use std::collections::HashMap; +use std::io::{self, Write}; + +/// A writer that emits one BEN frame per call, preserving input frame +/// boundaries instead of merging adjacent identical assignments. +/// +/// This sidesteps the merge buffer in [`super::AssignmentWriter`]: callers +/// supply a `(assignment, count)` pair and receive one counted frame on the +/// wire. For [`BenVariant::Standard`] targets, which cannot encode +/// repetition counts, a count of `N` is expanded into `N` one-sample frames. +/// +/// For [`BenVariant::TwoDelta`], the writer maintains its own +/// `previous_sample` and `previous_masks` so subsequent frames encode delta +/// transitions identically to `AssignmentWriter`. +pub(crate) struct FrameWriter { + writer: W, + variant: BenVariant, + previous_sample: Vec, + previous_masks: HashMap>, +} + +impl FrameWriter { + pub(crate) fn new(mut writer: W, variant: BenVariant) -> io::Result { + writer.write_all(banner_for_variant(variant))?; + Ok(Self { + writer, + variant, + previous_sample: Vec::new(), + previous_masks: HashMap::new(), + }) + } + + pub(crate) fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { + if count == 0 { + return Ok(()); + } + match self.variant { + BenVariant::Standard => { + let frame = + BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); + for _ in 0..count { + self.writer.write_all(frame.as_slice())?; + } + } + BenVariant::MkvChain => { + let frame = BenEncodeFrame::from_assignment( + &assignment, + BenVariant::MkvChain, + Some(count), + ); + self.writer.write_all(frame.as_slice())?; + } + BenVariant::TwoDelta => { + if self.previous_sample.is_empty() { + for (idx, &val) in assignment.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } + let frame = BenEncodeFrame::from_assignment( + &assignment, + BenVariant::MkvChain, + Some(count), + ); + self.writer.write_all(frame.as_slice())?; + } else if self.previous_sample == assignment { + let frame = twodelta_repeat_frame(&assignment, count)?; + self.writer.write_all(frame.as_slice())?; + } else { + let frame = encode_twodelta_frame_with_hint( + &self.previous_sample, + &assignment, + None, + Some(&mut self.previous_masks), + Some(count), + )?; + self.writer.write_all(frame.as_slice())?; + } + self.previous_sample = assignment; + } + } + Ok(()) + } +} diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 822ab8d..47f3211 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -1,4 +1,5 @@ pub mod assignment_writer; +pub(crate) mod frame_writer; pub(crate) mod frames; #[cfg(test)] pub(crate) mod tests; diff --git a/ben/src/ops/relabel/errors.rs b/ben/src/ops/relabel/errors.rs index bbc1467..081e5a3 100644 --- a/ben/src/ops/relabel/errors.rs +++ b/ben/src/ops/relabel/errors.rs @@ -5,19 +5,22 @@ use thiserror::Error; #[derive(Debug, Error)] pub enum RelabelError { #[error( - "relabel map must cover a contiguous range of new indices \ + "node permutation map must cover a contiguous range of new indices \ (max index: {max_key}, but {missing} entries are missing)" )] NonContiguousMap { max_key: usize, missing: usize }, - #[error("relabel map length {map_len} does not match assignment length {assignment_len}")] + #[error( + "node permutation map length {map_len} does not match assignment length {assignment_len}" + )] LengthMismatch { map_len: usize, assignment_len: usize, }, #[error( - "relabel map references old index {old_idx}, but assignment length is {assignment_len}" + "node permutation map references old index {old_idx}, \ + but assignment length is {assignment_len}" )] OldIndexOutOfRange { old_idx: usize, diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index fef1ec5..5202472 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -1,547 +1,311 @@ //! Relabeling operations for BEN files. +//! +//! All seven logical relabel/convert operations route through the single +//! [`relabel_ben_file`] driver, parameterised by [`RelabelOptions`]. mod errors; -use errors::RelabelError; +mod permutation; + +#[cfg(test)] +mod tests; use crate::codec::decode::decode_ben_line; use crate::codec::BenEncodeFrame; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::BenStreamReader; +use crate::io::writer::frame_writer::FrameWriter; use crate::io::writer::AssignmentWriter; use crate::progress::Spinner; -use crate::util::rle::{assign_slice_to_rle, rle_to_vec_in_place}; use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; +use permutation::{ + dense_permutation, first_seen_relabel_assignment, first_seen_relabel_rle, permute_assignment, +}; use std::collections::HashMap; use std::io::{self, Cursor, Read, Write}; -/// Convert a sparse permutation map into a dense index vector. -/// -/// # Arguments -/// -/// * `new_to_old_node_map` - The sparse map from new index to old index. -/// -/// # Returns -/// -/// Returns a dense permutation vector where `perm[new_idx] == old_idx`. -fn dense_permutation(new_to_old_node_map: &HashMap) -> io::Result> { - let Some(max_key) = new_to_old_node_map.keys().copied().max() else { - return Ok(Vec::new()); - }; - - let mut permutation = vec![usize::MAX; max_key + 1]; - for (&new_idx, &old_idx) in new_to_old_node_map { - permutation[new_idx] = old_idx; - } - - let missing = permutation.iter().filter(|&&x| x == usize::MAX).count(); - if missing > 0 { - return Err(io::Error::from(RelabelError::NonContiguousMap { - max_key, - missing, - })); - } +/// What value-level transform to apply to each decoded assignment. +#[non_exhaustive] +pub enum RelabelTransform { + /// Pass each assignment through unchanged. + Identity, + /// Rewrite labels in first-appearance order, starting at 1. + FirstSeen, + /// Reorder elements according to a `new_idx -> old_idx` map. + NodePermutation(HashMap), +} - Ok(permutation) +/// Whether the driver may merge adjacent equal output assignments. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[non_exhaustive] +pub enum RunPolicy { + /// Each input frame produces a separate output frame; counts are preserved + /// where the target variant can encode them, and expanded to one-sample + /// frames otherwise. + PreserveFrameBoundaries, + /// Adjacent identical output assignments are merged into a single counted + /// frame where the target variant can encode counts. + CollapseAdjacentEqualAssignments, } -/// Remap an assignment vector's district labels in first-seen order. -/// -/// # Arguments -/// -/// * `assignment` - The original assignment slice whose labels should be remapped. +/// Options for [`relabel_ben_file`]. /// -/// # Returns -/// -/// Returns a new vector with labels replaced by sequential integers starting at 1, -/// assigned in the order they first appear. -fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { - let mut label_map = HashMap::new(); - let mut next_label = 0u16; - let mut out = Vec::with_capacity(assignment.len()); - - for &value in assignment { - let mapped = match label_map.get(&value) { - Some(mapped) => *mapped, - None => { - next_label += 1; - label_map.insert(value, next_label); - next_label - } - }; - out.push(mapped); - } - - out +/// Constructed via [`RelabelOptions::first_seen`], +/// [`RelabelOptions::node_permutation`], or [`RelabelOptions::convert_to`], +/// then refined with the `with_*` builder methods. +#[non_exhaustive] +pub struct RelabelOptions { + transform: RelabelTransform, + target_variant: Option, + max_samples: Option, + run_policy: RunPolicy, } -/// Reorder an assignment vector according to a dense permutation. -/// -/// # Arguments -/// -/// * `assignment` - The original assignment slice to permute. -/// * `permutation` - A dense permutation vector where `permutation[new_idx] == old_idx`. -/// -/// # Returns -/// -/// Returns a new vector with elements rearranged so that `out[new_idx] == assignment[old_idx]`, -/// or an error if the lengths do not match. -fn permute_assignment(assignment: &[u16], permutation: &[usize]) -> io::Result> { - if assignment.len() != permutation.len() { - return Err(io::Error::from(RelabelError::LengthMismatch { - map_len: permutation.len(), - assignment_len: assignment.len(), - })); +impl RelabelOptions { + /// First-seen district relabeling, preserving the input variant and frame + /// boundaries. + pub fn first_seen() -> Self { + Self { + transform: RelabelTransform::FirstSeen, + target_variant: None, + max_samples: None, + run_policy: RunPolicy::PreserveFrameBoundaries, + } } - let mut out = vec![0u16; permutation.len()]; - for (new_idx, &old_idx) in permutation.iter().enumerate() { - if old_idx >= assignment.len() { - return Err(io::Error::from(RelabelError::OldIndexOutOfRange { - old_idx, - assignment_len: assignment.len(), - })); + /// Node permutation through `new_idx -> old_idx`, preserving the input + /// variant and frame boundaries. + pub fn node_permutation(map: HashMap) -> Self { + Self { + transform: RelabelTransform::NodePermutation(map), + target_variant: None, + max_samples: None, + run_policy: RunPolicy::PreserveFrameBoundaries, } - out[new_idx] = assignment[old_idx]; } - Ok(out) -} - -/// Decode a BEN stream, apply a per-assignment transform, and re-encode into the target variant. -/// -/// # Arguments -/// -/// * `reader` - The full BEN input stream, including its banner. -/// * `writer` - The destination for the re-encoded BEN output. -/// * `variant` - The target BEN variant to encode into. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// * `transform` - A closure that takes ownership of each decoded assignment -/// vector and returns the transformed version. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been processed. -fn relabel_ben_file_via_decoder( - reader: R, - writer: W, - variant: BenVariant, - max_samples: Option, - mut transform: F, -) -> io::Result<()> -where - F: FnMut(&[u16]) -> io::Result>, -{ - let mut decoder = BenStreamReader::from_ben(reader)?.silent(true); - let mut encoder = AssignmentWriter::new(writer, variant)?; - let mut sample_number = 0usize; - let spinner = Spinner::new("Relabeling line"); - decoder.for_each_assignment(|assignment, count| { - if max_samples.is_some_and(|limit| sample_number >= limit) { - return Ok(false); + /// Convert to `target` without relabeling, collapsing adjacent equal + /// assignments to preserve today's conversion compression behavior. + pub fn convert_to(target: BenVariant) -> Self { + Self { + transform: RelabelTransform::Identity, + target_variant: Some(target), + max_samples: None, + run_policy: RunPolicy::CollapseAdjacentEqualAssignments, } + } - let relabeled = transform(assignment)?; - let out_count = max_samples - .map(|limit| (limit - sample_number).min(count as usize)) - .unwrap_or(count as usize); + pub fn with_max_samples(mut self, n: usize) -> Self { + self.max_samples = Some(n); + self + } - for _ in 1..out_count { - encoder.write_assignment(relabeled.clone())?; - } - encoder.write_assignment(relabeled)?; + /// Set the sample limit. `Some(n)` sets the limit; `None` clears it. + pub fn with_max_samples_opt(mut self, n: Option) -> Self { + self.max_samples = n; + self + } - sample_number += out_count; - spinner.set_count(sample_number as u64); - Ok(true) - })?; + pub fn with_target_variant(mut self, target: BenVariant) -> Self { + self.target_variant = Some(target); + self + } - encoder.finish()?; - Ok(()) -} + pub fn with_run_policy(mut self, policy: RunPolicy) -> Self { + self.run_policy = policy; + self + } -/// Determine the BEN variant from a 17-byte file banner. -/// -/// # Arguments -/// -/// * `header` - The 17-byte banner read from the start of a BEN file. -/// -/// # Returns -/// -/// Returns the detected `BenVariant`, or an error if the banner is not recognized. -fn detect_ben_variant(header: &[u8; 17]) -> io::Result { - match header { - b"STANDARD BEN FILE" => Ok(BenVariant::Standard), - b"MKVCHAIN BEN FILE" => Ok(BenVariant::MkvChain), - b"TWODELTA BEN FILE" => Ok(BenVariant::TwoDelta), - _ => Err(io::Error::from(FormatError::UnknownBanner { - actual: header.to_vec(), - })), + pub fn transform(&self) -> &RelabelTransform { + &self.transform } -} -/// Shared implementation for converting a BEN file into a different variant without relabeling. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the converted BEN output. -/// * `target_variant` - The BEN variant to encode into. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been converted. -fn convert_ben_file_impl( - mut reader: R, - writer: W, - target_variant: BenVariant, - max_samples: Option, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - let _input_variant = detect_ben_variant(&check_buffer)?; - - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder(chained, writer, target_variant, max_samples, |a| { - Ok(a.to_vec()) - }) + pub fn target_variant(&self) -> Option { + self.target_variant + } + + pub fn max_samples(&self) -> Option { + self.max_samples + } + + pub fn run_policy(&self) -> RunPolicy { + self.run_policy + } } -/// Rewrite a BEN file into the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the converted BEN output. -/// * `target_variant` - The BEN variant to encode into. +/// Process a BEN file according to the supplied options. /// -/// # Returns -/// -/// Returns `Ok(())` after the full BEN file has been converted. -pub fn convert_ben_file( +/// All seven logical relabel/convert operations route through this driver. +/// Internally chooses between an RLE-fast-path byte walker (first-seen +/// relabeling, no variant change, frame-preserving, Standard/MkvChain input) +/// and the high-level decoder driver (everything else). +pub fn relabel_ben_file( reader: R, writer: W, - target_variant: BenVariant, + options: RelabelOptions, ) -> io::Result<()> { - convert_ben_file_impl(reader, writer, target_variant, None) + let mut reader = reader; + let mut banner = [0u8; BANNER_LEN]; + reader.read_exact(&mut banner)?; + let input_variant = variant_from_banner(&banner).ok_or_else(|| { + io::Error::from(FormatError::UnknownBanner { + actual: banner.to_vec(), + }) + })?; + + if can_use_first_seen_fast_path( + &options.transform, + options.target_variant, + input_variant, + options.run_policy, + ) { + let mut writer = writer; + writer.write_all(&banner)?; + return relabel_first_seen_via_byte_walk( + reader, + writer, + input_variant, + options.max_samples, + ); + } + + let target_variant = options.target_variant.unwrap_or(input_variant); + let chained = Cursor::new(banner).chain(reader); + let permutation = match &options.transform { + RelabelTransform::NodePermutation(map) => Some(dense_permutation(map)?), + _ => None, + }; + relabel_via_decoder( + chained, + writer, + target_variant, + options.max_samples, + options.run_policy, + |a| match &options.transform { + RelabelTransform::Identity => Ok(a.to_vec()), + RelabelTransform::FirstSeen => Ok(first_seen_relabel_assignment(a)), + RelabelTransform::NodePermutation(_) => { + permute_assignment(a, permutation.as_ref().expect("set above")) + } + }, + ) } -/// Rewrite at most `max_samples` expanded samples into the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the converted BEN output. -/// * `target_variant` - The BEN variant to encode into. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been converted. -pub fn convert_ben_file_limit( +/// Convert a BEN file to the requested variant without relabeling. +pub fn convert_ben_file( reader: R, writer: W, - target_variant: BenVariant, - max_samples: usize, -) -> io::Result<()> { - convert_ben_file_impl(reader, writer, target_variant, Some(max_samples)) -} - -/// Canonicalize the labels used inside each BEN frame. -/// -/// Labels are reassigned in first-seen order within each assignment vector, -/// which can improve downstream compression ratios. -/// -/// # Arguments -/// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// -/// # Returns -/// -/// Returns `Ok(())` after all frames have been relabeled and written. -pub fn relabel_ben_lines( - mut reader: R, - mut writer: W, - variant: BenVariant, + target: BenVariant, ) -> io::Result<()> { - relabel_ben_lines_impl(&mut reader, &mut writer, variant, None) + relabel_ben_file(reader, writer, RelabelOptions::convert_to(target)) } -/// Canonicalize up to a bounded number of samples from a BEN frame stream. -/// -/// Labels are reassigned in first-seen order within each assignment vector, -/// which can improve downstream compression ratios. -/// -/// # Arguments -/// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and -/// written. -pub fn relabel_ben_lines_limit( - mut reader: R, - mut writer: W, - variant: BenVariant, - max_samples: usize, -) -> io::Result<()> { - relabel_ben_lines_impl(&mut reader, &mut writer, variant, Some(max_samples)) +/// True when the driver may take the byte-walking RLE fast path. +/// +/// The predicate is one boolean computed once. See `risks` in the plan for +/// why it is its own pure function and gets a dedicated unit-test matrix. +fn can_use_first_seen_fast_path( + transform: &RelabelTransform, + target_variant: Option, + input: BenVariant, + run_policy: RunPolicy, +) -> bool { + matches!(transform, RelabelTransform::FirstSeen) + && target_variant.is_none() + && run_policy == RunPolicy::PreserveFrameBoundaries + && matches!(input, BenVariant::Standard | BenVariant::MkvChain) } -/// Shared implementation for canonical BEN relabeling. -/// -/// # Arguments +/// Decode a BEN stream, apply a per-assignment transform, and re-encode into +/// the target variant. /// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. -fn relabel_ben_lines_impl( - mut reader: R, - mut writer: W, - variant: BenVariant, +/// With [`RunPolicy::PreserveFrameBoundaries`], the implementation never +/// merges across input frame boundaries: MkvChain/TwoDelta targets receive +/// counted output frames, Standard targets receive `count` one-sample frames +/// because Standard cannot encode repetition counts. With +/// [`RunPolicy::CollapseAdjacentEqualAssignments`], the existing +/// [`AssignmentWriter`] merging path is used. +fn relabel_via_decoder( + reader: R, + writer: W, + target_variant: BenVariant, max_samples: Option, -) -> io::Result<()> { - let mut sample_number = 0; - let mut label_map = HashMap::new(); + run_policy: RunPolicy, + mut transform: F, +) -> io::Result<()> +where + F: FnMut(&[u16]) -> io::Result>, +{ + let mut decoder = BenStreamReader::from_ben(reader)?.silent(true); + let mut sample_number = 0usize; let spinner = Spinner::new("Relabeling line"); - loop { - if max_samples.is_some_and(|limit| sample_number >= limit) { - break; - } - let mut tmp_buffer = [0u8]; - let max_val_bits = match reader.read_exact(&mut tmp_buffer) { - Ok(_) => tmp_buffer[0], - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - break; - } - return Err(e); - } - }; - - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - - let mut label = 0; - label_map.clear(); - label_map.reserve(ben_line.len()); - for (val, _len) in &mut ben_line { - let new_val = match label_map.get(val) { - Some(v) => *v, - None => { - label += 1; - label_map.insert(*val, label); - label + match run_policy { + RunPolicy::CollapseAdjacentEqualAssignments => { + let mut encoder = AssignmentWriter::new(writer, target_variant)?; + decoder.for_each_assignment(|assignment, count| { + if max_samples.is_some_and(|limit| sample_number >= limit) { + return Ok(false); } - }; - *val = new_val; - } - let count_occurrences = if variant == BenVariant::MkvChain { - let count = reader.read_u16::()?; - let out_count = max_samples - .map(|limit| ((limit - sample_number).min(count as usize)) as u16) - .unwrap_or(count); - out_count - } else { - 1 - }; + let relabeled = transform(assignment)?; + let out_count = max_samples + .map(|limit| (limit - sample_number).min(count as usize)) + .unwrap_or(count as usize); - let relabeled = - BenEncodeFrame::from_rle(ben_line, variant, Some(count_occurrences)); - writer.write_all(relabeled.as_slice())?; - - sample_number += count_occurrences as usize; - - spinner.set_count(sample_number as u64); - } - - Ok(()) -} - -/// Relabel an entire BEN file, preserving its leading BEN banner. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// -/// # Returns -/// -/// Returns `Ok(())` after the full BEN file has been relabeled. -pub fn relabel_ben_file(mut reader: R, mut writer: W) -> io::Result<()> { - relabel_ben_file_impl(&mut reader, &mut writer, None) -} + for _ in 1..out_count { + encoder.write_assignment(relabeled.clone())?; + } + if out_count > 0 { + encoder.write_assignment(relabeled)?; + } -/// Relabel at most `max_samples` expanded samples from a BEN file, preserving -/// its leading BEN banner. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled. -pub fn relabel_ben_file_limit( - mut reader: R, - mut writer: W, - max_samples: usize, -) -> io::Result<()> { - relabel_ben_file_impl(&mut reader, &mut writer, Some(max_samples)) -} + sample_number += out_count; + spinner.set_count(sample_number as u64); + Ok(true) + })?; + encoder.finish()?; + } + RunPolicy::PreserveFrameBoundaries => { + let mut writer = FrameWriter::new(writer, target_variant)?; + decoder.for_each_assignment(|assignment, count| { + if max_samples.is_some_and(|limit| sample_number >= limit) { + return Ok(false); + } -/// Shared implementation for BEN-file canonical relabeling. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. -fn relabel_ben_file_impl( - mut reader: R, - mut writer: W, - max_samples: Option, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; + let relabeled = transform(assignment)?; + let out_count = max_samples + .map(|limit| (limit - sample_number).min(count as usize)) + .unwrap_or(count as usize); - let variant = variant_from_banner(&check_buffer).ok_or_else(|| { - io::Error::from(FormatError::UnknownBanner { - actual: check_buffer.to_vec(), - }) - })?; + if out_count > 0 { + writer.write_frame(relabeled, out_count as u16)?; + } - match variant { - BenVariant::Standard | BenVariant::MkvChain => { - writer.write_all(&check_buffer)?; - relabel_ben_lines_impl(&mut reader, &mut writer, variant, max_samples)? - } - BenVariant::TwoDelta => { - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - &mut writer, - variant, - max_samples, - |assignment| Ok(first_seen_relabel_assignment(assignment)), - )? + sample_number += out_count; + spinner.set_count(sample_number as u64); + Ok(true) + })?; } } Ok(()) } -/// Relabel BEN frames using an externally supplied node map. +/// Byte-walking RLE fast path for first-seen relabeling on Standard/MkvChain. /// -/// `new_to_old_node_map` maps the new node index to the position that should be -/// read from the original assignment vector. -/// -/// # Arguments -/// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// -/// # Returns -/// -/// Returns `Ok(())` after all frames have been relabeled and written. -pub fn relabel_ben_lines_with_map( +/// Walks 6-byte frame headers, decodes the RLE in place, applies first-seen +/// relabeling on the `(val, len)` pairs, and re-encodes. Skips assignment +/// vector materialization entirely. The output banner has been emitted by the +/// caller before this is invoked. +fn relabel_first_seen_via_byte_walk( mut reader: R, mut writer: W, - new_to_old_node_map: HashMap, - variant: BenVariant, -) -> io::Result<()> { - relabel_ben_lines_with_map_impl(&mut reader, &mut writer, new_to_old_node_map, variant, None) -} - -/// Relabel BEN frames using an externally supplied node map, up to a bounded -/// number of expanded samples. -/// -/// # Arguments -/// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and -/// written. -pub fn relabel_ben_lines_with_map_limit( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, - variant: BenVariant, - max_samples: usize, -) -> io::Result<()> { - relabel_ben_lines_with_map_impl( - &mut reader, - &mut writer, - new_to_old_node_map, - variant, - Some(max_samples), - ) -} - -/// Shared implementation for mapped BEN relabeling. -/// -/// # Arguments -/// -/// * `reader` - The BEN input stream without its 17-byte file banner. -/// * `writer` - The destination for the relabeled BEN frames. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each frame. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. -fn relabel_ben_lines_with_map_impl( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, - variant: BenVariant, + input_variant: BenVariant, max_samples: Option, ) -> io::Result<()> { - let mut sample_number = 0; - let permutation = dense_permutation(&new_to_old_node_map)?; - let mut assignment_vec = Vec::new(); - let mut new_assignment_vec = vec![0u16; permutation.len()]; - let mut new_rle = Vec::new(); + let mut sample_number = 0usize; let spinner = Spinner::new("Relabeling line"); loop { if max_samples.is_some_and(|limit| sample_number >= limit) { @@ -561,41 +325,21 @@ fn relabel_ben_lines_with_map_impl( let max_len_bits = reader.read_u8()?; let n_bytes = reader.read_u32::()?; - let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - rle_to_vec_in_place(&ben_line, &mut assignment_vec); - - if assignment_vec.len() != permutation.len() { - return Err(io::Error::from(RelabelError::LengthMismatch { - map_len: permutation.len(), - assignment_len: assignment_vec.len(), - })); - } - - for (new_idx, &old_idx) in permutation.iter().enumerate() { - if old_idx >= assignment_vec.len() { - return Err(io::Error::from(RelabelError::OldIndexOutOfRange { - old_idx, - assignment_len: assignment_vec.len(), - })); - } - new_assignment_vec[new_idx] = assignment_vec[old_idx]; - } - - assign_slice_to_rle(&new_assignment_vec, &mut new_rle); + let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + first_seen_relabel_rle(&mut ben_line); - let count_occurrences = if variant == BenVariant::MkvChain { + let count_occurrences = if input_variant == BenVariant::MkvChain { let count = reader.read_u16::()?; - let out_count = max_samples + max_samples .map(|limit| ((limit - sample_number).min(count as usize)) as u16) - .unwrap_or(count); - out_count + .unwrap_or(count) } else { 1 }; - let relabeled = - BenEncodeFrame::from_rle(new_rle.clone(), variant, Some(count_occurrences)); - writer.write_all(relabeled.as_slice())?; + let frame = + BenEncodeFrame::from_rle(ben_line, input_variant, Some(count_occurrences)); + writer.write_all(frame.as_slice())?; sample_number += count_occurrences as usize; spinner.set_count(sample_number as u64); @@ -603,233 +347,3 @@ fn relabel_ben_lines_with_map_impl( Ok(()) } - -/// Relabel an entire BEN file using an externally supplied node map. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// -/// # Returns -/// -/// Returns `Ok(())` after the full BEN file has been relabeled. -pub fn relabel_ben_file_with_map( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, -) -> io::Result<()> { - relabel_ben_file_with_map_impl(&mut reader, &mut writer, new_to_old_node_map, None) -} - -/// Relabel at most `max_samples` expanded samples from a BEN file using an -/// externally supplied node map. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled. -pub fn relabel_ben_file_with_map_limit( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, - max_samples: usize, -) -> io::Result<()> { - relabel_ben_file_with_map_impl( - &mut reader, - &mut writer, - new_to_old_node_map, - Some(max_samples), - ) -} - -/// Shared implementation for BEN-file mapped relabeling. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN file. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `max_samples` - Optional upper bound on the number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after all (or up to `max_samples`) samples have been relabeled. -fn relabel_ben_file_with_map_impl( - mut reader: R, - mut writer: W, - new_to_old_node_map: HashMap, - max_samples: Option, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - - let variant = variant_from_banner(&check_buffer).ok_or_else(|| { - io::Error::from(FormatError::UnknownBanner { - actual: check_buffer.to_vec(), - }) - })?; - - match variant { - BenVariant::Standard | BenVariant::MkvChain => { - writer.write_all(&check_buffer)?; - relabel_ben_lines_with_map_impl( - &mut reader, - &mut writer, - new_to_old_node_map, - variant, - max_samples, - )? - } - BenVariant::TwoDelta => { - let permutation = dense_permutation(&new_to_old_node_map)?; - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - &mut writer, - variant, - max_samples, - |assignment| permute_assignment(assignment, &permutation), - )? - } - } - - Ok(()) -} - -/// Canonicalize BEN assignments and write them using the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN output. -/// * `target_variant` - The BEN variant to encode into. -/// -/// # Returns -/// -/// Returns `Ok(())` after the full BEN file has been relabeled and converted. -pub fn relabel_ben_file_as_variant( - mut reader: R, - writer: W, - target_variant: BenVariant, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - let _input_variant = detect_ben_variant(&check_buffer)?; - - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - Ok(first_seen_relabel_assignment(&assignment)) - }) -} - -/// Canonicalize up to `max_samples` expanded samples and write the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN output. -/// * `target_variant` - The BEN variant to encode into. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and converted. -pub fn relabel_ben_file_as_variant_limit( - mut reader: R, - writer: W, - target_variant: BenVariant, - max_samples: usize, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - let _input_variant = detect_ben_variant(&check_buffer)?; - - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - writer, - target_variant, - Some(max_samples), - |assignment| Ok(first_seen_relabel_assignment(assignment)), - ) -} - -/// Relabel a BEN file with a supplied node map and write the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN output. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `target_variant` - The BEN variant to encode into. -/// -/// # Returns -/// -/// Returns `Ok(())` after the full BEN file has been relabeled and converted. -pub fn relabel_ben_file_with_map_as_variant( - mut reader: R, - writer: W, - new_to_old_node_map: HashMap, - target_variant: BenVariant, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - let _input_variant = detect_ben_variant(&check_buffer)?; - - let permutation = dense_permutation(&new_to_old_node_map)?; - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder(chained, writer, target_variant, None, |assignment| { - permute_assignment(assignment, &permutation) - }) -} - -/// Relabel up to `max_samples` expanded samples with a supplied node map and write the requested BEN variant. -/// -/// # Arguments -/// -/// * `reader` - The input BEN stream, including its banner. -/// * `writer` - The destination for the relabeled BEN output. -/// * `new_to_old_node_map` - The permutation describing how node positions -/// should be reordered. -/// * `target_variant` - The BEN variant to encode into. -/// * `max_samples` - The maximum number of expanded samples to write. -/// -/// # Returns -/// -/// Returns `Ok(())` after up to `max_samples` samples have been relabeled and converted. -pub fn relabel_ben_file_with_map_as_variant_limit( - mut reader: R, - writer: W, - new_to_old_node_map: HashMap, - target_variant: BenVariant, - max_samples: usize, -) -> io::Result<()> { - let mut check_buffer = [0u8; BANNER_LEN]; - reader.read_exact(&mut check_buffer)?; - let _input_variant = detect_ben_variant(&check_buffer)?; - - let permutation = dense_permutation(&new_to_old_node_map)?; - let chained = Cursor::new(check_buffer).chain(reader); - relabel_ben_file_via_decoder( - chained, - writer, - target_variant, - Some(max_samples), - |assignment| permute_assignment(assignment, &permutation), - ) -} - -#[cfg(test)] -mod tests; diff --git a/ben/src/ops/relabel/permutation.rs b/ben/src/ops/relabel/permutation.rs new file mode 100644 index 0000000..bef676e --- /dev/null +++ b/ben/src/ops/relabel/permutation.rs @@ -0,0 +1,248 @@ +use super::errors::RelabelError; +use std::collections::HashMap; +use std::io; + +/// Convert a sparse permutation map into a dense index vector. +pub(super) fn dense_permutation( + new_to_old_node_map: &HashMap, +) -> io::Result> { + let Some(max_key) = new_to_old_node_map.keys().copied().max() else { + return Ok(Vec::new()); + }; + + let mut permutation = vec![usize::MAX; max_key + 1]; + for (&new_idx, &old_idx) in new_to_old_node_map { + permutation[new_idx] = old_idx; + } + + let missing = permutation.iter().filter(|&&x| x == usize::MAX).count(); + if missing > 0 { + return Err(io::Error::from(RelabelError::NonContiguousMap { + max_key, + missing, + })); + } + + Ok(permutation) +} + +/// Remap an assignment vector's district labels in first-seen order, starting at 1. +pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { + let mut label_map = HashMap::new(); + let mut next_label = 0u16; + let mut out = Vec::with_capacity(assignment.len()); + + for &value in assignment { + let mapped = match label_map.get(&value) { + Some(mapped) => *mapped, + None => { + next_label += 1; + label_map.insert(value, next_label); + next_label + } + }; + out.push(mapped); + } + + out +} + +/// Rewrite the value of each `(val, len)` RLE pair in first-seen order, in place. +pub(super) fn first_seen_relabel_rle(runs: &mut [(u16, u16)]) { + let mut label_map = HashMap::new(); + let mut label = 0u16; + label_map.reserve(runs.len()); + for (val, _len) in runs { + let new_val = match label_map.get(val) { + Some(v) => *v, + None => { + label += 1; + label_map.insert(*val, label); + label + } + }; + *val = new_val; + } +} + +/// Reorder an assignment vector according to a dense permutation. +pub(super) fn permute_assignment( + assignment: &[u16], + permutation: &[usize], +) -> io::Result> { + if assignment.len() != permutation.len() { + return Err(io::Error::from(RelabelError::LengthMismatch { + map_len: permutation.len(), + assignment_len: assignment.len(), + })); + } + + let mut out = vec![0u16; permutation.len()]; + for (new_idx, &old_idx) in permutation.iter().enumerate() { + if old_idx >= assignment.len() { + return Err(io::Error::from(RelabelError::OldIndexOutOfRange { + old_idx, + assignment_len: assignment.len(), + })); + } + out[new_idx] = assignment[old_idx]; + } + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::util::rle::{assign_to_rle, rle_to_vec}; + + // ── dense_permutation ─────────────────────────────────────────────── + + #[test] + fn dense_permutation_empty_map_returns_empty_vec() { + let map = HashMap::new(); + assert!(dense_permutation(&map).unwrap().is_empty()); + } + + #[test] + fn dense_permutation_contiguous_map_yields_dense_vec() { + let map: HashMap = [(0, 2), (1, 0), (2, 1)].into_iter().collect(); + assert_eq!(dense_permutation(&map).unwrap(), vec![2, 0, 1]); + } + + #[test] + fn dense_permutation_non_contiguous_below_max_errors() { + let map: HashMap = [(0, 0), (2, 1)].into_iter().collect(); + let err = dense_permutation(&map).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("contiguous")); + } + + #[test] + fn dense_permutation_non_zero_start_errors() { + // {1 -> 10}: slot 0 missing; pin today's behavior. + let map: HashMap = [(1, 10)].into_iter().collect(); + let err = dense_permutation(&map).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn dense_permutation_duplicate_old_indices_allowed_pinning_current_behavior() { + // {0 -> 5, 1 -> 5}: produces a non-bijective dense Vec without error. + let map: HashMap = [(0, 5), (1, 5)].into_iter().collect(); + assert_eq!(dense_permutation(&map).unwrap(), vec![5, 5]); + } + + // ── first_seen_relabel_assignment ────────────────────────────────── + + #[test] + fn first_seen_relabel_assignment_empty() { + assert!(first_seen_relabel_assignment(&[]).is_empty()); + } + + #[test] + fn first_seen_relabel_assignment_all_same() { + assert_eq!(first_seen_relabel_assignment(&[7, 7, 7]), vec![1, 1, 1]); + } + + #[test] + fn first_seen_relabel_assignment_monotonic() { + assert_eq!( + first_seen_relabel_assignment(&[2, 3, 4, 5]), + vec![1, 2, 3, 4] + ); + } + + #[test] + fn first_seen_relabel_assignment_reversed() { + assert_eq!( + first_seen_relabel_assignment(&[5, 4, 3, 2]), + vec![1, 2, 3, 4] + ); + } + + #[test] + fn first_seen_relabel_assignment_with_gaps() { + assert_eq!( + first_seen_relabel_assignment(&[1, 5, 9, 5, 1, 9]), + vec![1, 2, 3, 2, 1, 3] + ); + } + + // ── first_seen_relabel_rle ───────────────────────────────────────── + + #[test] + fn first_seen_relabel_rle_basic() { + let mut runs = vec![(2u16, 3u16), (3, 1), (2, 2), (5, 1)]; + first_seen_relabel_rle(&mut runs); + assert_eq!(runs, vec![(1, 3), (2, 1), (1, 2), (3, 1)]); + } + + /// Cross-check: assignment-level and RLE-level first-seen relabeling must + /// agree for any input. This pins the equivalence as a property, not a + /// coincidence (decision #6 / risk mitigation). + #[test] + fn first_seen_relabel_assignment_equals_rle_path() { + let inputs: Vec> = vec![ + vec![], + vec![7], + vec![1, 1, 1, 1], + vec![2, 3, 4, 5, 5, 3, 4, 2], + vec![5, 4, 3, 2, 1], + vec![1, 5, 9, 5, 1, 9, 9, 1], + vec![3, 3, 1, 1, 2, 2, 3, 3, 4], + ]; + for input in inputs { + let from_assignment = first_seen_relabel_assignment(&input); + + let mut runs = assign_to_rle(input.clone()); + first_seen_relabel_rle(&mut runs); + let from_rle = rle_to_vec(runs); + + assert_eq!( + from_assignment, from_rle, + "divergence on input: {:?}", + input + ); + } + } + + // ── permute_assignment ───────────────────────────────────────────── + + #[test] + fn permute_assignment_identity() { + let assignment = vec![10u16, 20, 30]; + let perm = vec![0, 1, 2]; + assert_eq!( + permute_assignment(&assignment, &perm).unwrap(), + vec![10, 20, 30] + ); + } + + #[test] + fn permute_assignment_reversal() { + let assignment = vec![10u16, 20, 30]; + let perm = vec![2, 1, 0]; + assert_eq!( + permute_assignment(&assignment, &perm).unwrap(), + vec![30, 20, 10] + ); + } + + #[test] + fn permute_assignment_length_mismatch() { + let assignment = vec![1u16, 2, 3]; + let perm = vec![0, 1]; + let err = permute_assignment(&assignment, &perm).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("length")); + } + + #[test] + fn permute_assignment_old_index_out_of_range() { + let assignment = vec![1u16, 2, 3]; + let perm = vec![0, 1, 99]; + let err = permute_assignment(&assignment, &perm).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("old index")); + } +} diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 30d8ce8..1c730b4 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -2,6 +2,7 @@ use super::*; use crate::codec::decode::decode_ben_to_jsonl; use crate::codec::encode::encode_jsonl_to_ben; use crate::codec::BenEncodeFrame; +use crate::format::banners::BANNER_LEN; use crate::util::rle::assign_to_rle; use crate::BenVariant; use rand::seq::SliceRandom; @@ -41,6 +42,15 @@ where map } +/// Wrap a banner-stripped frame payload back into a full BEN file by prepending +/// the banner. Tests that previously fed banner-less buffers feed a full BEN +/// file under the new API and call [`relabel_ben_file`] directly. +fn with_banner(variant: BenVariant, payload: &[u8]) -> Vec { + let mut out = crate::format::banners::banner_for_variant(variant).to_vec(); + out.extend_from_slice(payload); + out +} + #[test] fn test_relabel_ben_line_simple() { let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; @@ -50,10 +60,17 @@ fn test_relabel_ben_line_simple() { let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); - relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap(); + relabel_ben_file( + with_banner_in.as_slice(), + &mut buf, + RelabelOptions::first_seen(), + ) + .unwrap(); - assert_eq!(buf, expected); + assert_eq!(&buf[..BANNER_LEN], crate::format::banners::STANDARD_BEN_BANNER); + assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } #[test] @@ -78,7 +95,7 @@ fn test_relabel_simple_file() { let mut output2 = Vec::new(); let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file(output.as_slice(), writer2).unwrap(); + relabel_ben_file(output.as_slice(), writer2, RelabelOptions::first_seen()).unwrap(); let mut output3 = Vec::new(); let writer3 = io::BufWriter::new(&mut output3); @@ -125,7 +142,7 @@ fn test_relabel_simple_file_mkv() { let mut output2 = Vec::new(); let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file(output.as_slice(), writer2).unwrap(); + relabel_ben_file(output.as_slice(), writer2, RelabelOptions::first_seen()).unwrap(); let mut output3 = Vec::new(); let writer3 = io::BufWriter::new(&mut output3); @@ -168,7 +185,12 @@ fn test_relabel_simple_file_mkv_with_limit() { .unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file_limit(encoded.as_slice(), io::BufWriter::new(&mut relabeled), 2).unwrap(); + relabel_ben_file( + encoded.as_slice(), + io::BufWriter::new(&mut relabeled), + RelabelOptions::first_seen().with_max_samples(2), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); @@ -199,7 +221,12 @@ fn test_relabel_simple_file_twodelta() { .unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file(encoded.as_slice(), io::BufWriter::new(&mut relabeled)).unwrap(); + relabel_ben_file( + encoded.as_slice(), + io::BufWriter::new(&mut relabeled), + RelabelOptions::first_seen(), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); @@ -236,16 +263,17 @@ fn test_relabel_ben_line_with_map() { new_to_old_map.insert(7, 4); new_to_old_map.insert(8, 5); + let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), + relabel_ben_file( + with_banner_in.as_slice(), &mut buf, - new_to_old_map, - BenVariant::Standard, + RelabelOptions::node_permutation(new_to_old_map), ) .unwrap(); - assert_eq!(buf, expected); + assert_eq!(&buf[..BANNER_LEN], crate::format::banners::STANDARD_BEN_BANNER); + assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } #[test] @@ -260,16 +288,16 @@ fn test_relabel_ben_line_with_shuffle() { let out_rle = assign_to_rle(out_assign); let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), + relabel_ben_file( + with_banner_in.as_slice(), &mut buf, - new_to_old_map, - BenVariant::Standard, + RelabelOptions::node_permutation(new_to_old_map), ) .unwrap(); - assert_eq!(buf, expected); + assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } #[test] @@ -291,16 +319,16 @@ fn test_relabel_ben_line_with_large_shuffle() { let out_rle = assign_to_rle(out_assign); let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); - relabel_ben_lines_with_map( - input.as_slice(), + relabel_ben_file( + with_banner_in.as_slice(), &mut buf, - new_to_old_map, - BenVariant::Standard, + RelabelOptions::node_permutation(new_to_old_map), ) .unwrap(); - assert_eq!(buf, expected); + assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } #[test] @@ -340,7 +368,12 @@ fn test_relabel_simple_file_with_map() { let mut output2 = Vec::new(); let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); + relabel_ben_file( + output.as_slice(), + writer2, + RelabelOptions::node_permutation(new_to_old_map), + ) + .unwrap(); let mut output3 = Vec::new(); let writer3 = io::BufWriter::new(&mut output3); @@ -402,7 +435,12 @@ fn test_relabel_simple_file_with_map_mkv() { let mut output2 = Vec::new(); let writer2 = io::BufWriter::new(&mut output2); - relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap(); + relabel_ben_file( + output.as_slice(), + writer2, + RelabelOptions::node_permutation(new_to_old_map), + ) + .unwrap(); let mut output3 = Vec::new(); let writer3 = io::BufWriter::new(&mut output3); @@ -450,10 +488,10 @@ fn test_relabel_simple_file_with_map_twodelta() { .unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file_with_map( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut relabeled), - new_to_old_map, + RelabelOptions::node_permutation(new_to_old_map), ) .unwrap(); @@ -490,11 +528,10 @@ fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { .unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file_with_map_limit( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut relabeled), - new_to_old_map, - 2, + RelabelOptions::node_permutation(new_to_old_map).with_max_samples(2), ) .unwrap(); @@ -511,22 +548,33 @@ fn test_relabel_simple_file_with_map_mkv_limit_truncates_counts() { #[test] fn test_relabel_file_rejects_invalid_header() { - let err = relabel_ben_file(b"not a valid banner".as_slice(), Vec::new()).unwrap_err(); + let err = relabel_ben_file( + b"not a valid banner".as_slice(), + Vec::new(), + RelabelOptions::first_seen(), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!(err.to_string(), "unrecognized BEN banner (got [110, 111, 116, 32, 97, 32, 118, 97, 108, 105, 100, 32, 98, 97, 110, 110, 101]; expected one of \"STANDARD BEN FILE\", \"MKVCHAIN BEN FILE\", or \"TWODELTA BEN FILE\")"); } #[test] fn test_relabel_file_with_map_rejects_invalid_header() { - let err = - relabel_ben_file_with_map(b"not a valid banner".as_slice(), Vec::new(), HashMap::new()) - .unwrap_err(); + let err = relabel_ben_file( + b"not a valid banner".as_slice(), + Vec::new(), + RelabelOptions::node_permutation(HashMap::new()), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!(err.to_string(), "unrecognized BEN banner (got [110, 111, 116, 32, 97, 32, 118, 97, 108, 105, 100, 32, 98, 97, 110, 110, 101]; expected one of \"STANDARD BEN FILE\", \"MKVCHAIN BEN FILE\", or \"TWODELTA BEN FILE\")"); } #[test] fn test_relabel_lines_propagate_non_eof_reader_error() { + // Reader returns a valid Standard banner via Cursor, then the BoomReader + // produces a non-EOF I/O error on the body. The byte-walk fast path + // returns this I/O error unchanged. struct BoomReader { returned_first: bool, } @@ -542,14 +590,12 @@ fn test_relabel_lines_propagate_non_eof_reader_error() { } } - let err = relabel_ben_lines( + let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain( BoomReader { returned_first: false, }, - Vec::new(), - BenVariant::Standard, - ) - .unwrap_err(); + ); + let err = relabel_ben_file(chained, Vec::new(), RelabelOptions::first_seen()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); } @@ -570,13 +616,15 @@ fn test_relabel_lines_with_map_propagate_non_eof_reader_error() { } } - let err = relabel_ben_lines_with_map( + let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain( BoomReader { returned_first: false, }, + ); + let err = relabel_ben_file( + chained, Vec::new(), - HashMap::new(), - BenVariant::Standard, + RelabelOptions::node_permutation(HashMap::new()), ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); @@ -657,11 +705,10 @@ fn test_convert_ben_file_limit_truncates() { .unwrap(); let mut converted = Vec::new(); - convert_ben_file_limit( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut converted), - BenVariant::Standard, - 2, + RelabelOptions::convert_to(BenVariant::Standard).with_max_samples(2), ) .unwrap(); @@ -693,18 +740,14 @@ fn test_relabel_ben_lines_limit_standard() { ) .unwrap(); - let mut relabeled = Vec::new(); - relabel_ben_lines_limit( - &encoded[17..], - io::BufWriter::new(&mut relabeled), - BenVariant::Standard, - 2, + let mut full_relabeled = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + io::BufWriter::new(&mut full_relabeled), + RelabelOptions::first_seen().with_max_samples(2), ) .unwrap(); - let mut full_relabeled = b"STANDARD BEN FILE".to_vec(); - full_relabeled.extend_from_slice(&relabeled); - let mut decoded = Vec::new(); decode_ben_to_jsonl(full_relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); let output_str = String::from_utf8(decoded).unwrap(); @@ -735,19 +778,14 @@ fn test_relabel_ben_lines_with_map_limit_standard() { let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); - let mut relabeled = Vec::new(); - relabel_ben_lines_with_map_limit( - &encoded[17..], - io::BufWriter::new(&mut relabeled), - map, - BenVariant::Standard, - 1, + let mut full_relabeled = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + io::BufWriter::new(&mut full_relabeled), + RelabelOptions::node_permutation(map).with_max_samples(1), ) .unwrap(); - let mut full_relabeled = b"STANDARD BEN FILE".to_vec(); - full_relabeled.extend_from_slice(&relabeled); - let mut decoded = Vec::new(); decode_ben_to_jsonl(full_relabeled.as_slice(), io::BufWriter::new(&mut decoded)).unwrap(); let output_str = String::from_utf8(decoded).unwrap(); @@ -772,10 +810,10 @@ fn test_relabel_ben_file_as_variant_standard_to_twodelta() { .unwrap(); let mut converted = Vec::new(); - relabel_ben_file_as_variant( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut converted), - BenVariant::TwoDelta, + RelabelOptions::first_seen().with_target_variant(BenVariant::TwoDelta), ) .unwrap(); @@ -806,11 +844,12 @@ fn test_relabel_ben_file_as_variant_limit() { .unwrap(); let mut converted = Vec::new(); - relabel_ben_file_as_variant_limit( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut converted), - BenVariant::MkvChain, - 2, + RelabelOptions::first_seen() + .with_target_variant(BenVariant::MkvChain) + .with_max_samples(2), ) .unwrap(); @@ -844,11 +883,10 @@ fn test_relabel_ben_file_with_map_as_variant() { let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); let mut converted = Vec::new(); - relabel_ben_file_with_map_as_variant( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut converted), - map, - BenVariant::MkvChain, + RelabelOptions::node_permutation(map).with_target_variant(BenVariant::MkvChain), ) .unwrap(); @@ -881,12 +919,12 @@ fn test_relabel_ben_file_with_map_as_variant_limit() { let map: HashMap = [(0, 2), (1, 0), (2, 1)].iter().cloned().collect(); let mut converted = Vec::new(); - relabel_ben_file_with_map_as_variant_limit( + relabel_ben_file( encoded.as_slice(), io::BufWriter::new(&mut converted), - map, - BenVariant::Standard, - 2, + RelabelOptions::node_permutation(map) + .with_target_variant(BenVariant::Standard) + .with_max_samples(2), ) .unwrap(); @@ -917,79 +955,35 @@ fn test_convert_ben_file_rejects_invalid_banner() { #[test] fn test_relabel_ben_file_as_variant_rejects_invalid_banner() { - let err = relabel_ben_file_as_variant( + let err = relabel_ben_file( b"not a valid banner".as_slice(), Vec::new(), - BenVariant::Standard, + RelabelOptions::first_seen().with_target_variant(BenVariant::Standard), ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } -// ── dense_permutation error paths ──────────────────────────────────── - -#[test] -fn test_dense_permutation_empty_map() { - let map = HashMap::new(); - let perm = dense_permutation(&map).unwrap(); - assert!(perm.is_empty()); -} - -#[test] -fn test_dense_permutation_non_contiguous() { - let map: HashMap = [(0, 0), (2, 1)].iter().cloned().collect(); - let err = dense_permutation(&map).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("contiguous")); -} - -// ── permute_assignment error paths ─────────────────────────────────── - -#[test] -fn test_permute_assignment_length_mismatch() { - let assignment = vec![1u16, 2, 3]; - let perm = vec![0, 1]; - let err = permute_assignment(&assignment, &perm).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("length")); -} - -#[test] -fn test_permute_assignment_index_out_of_range() { - let assignment = vec![1u16, 2, 3]; - let perm = vec![0, 1, 99]; - let err = permute_assignment(&assignment, &perm).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidInput); - assert!(err.to_string().contains("old index")); -} - -// ── first_seen_relabel_assignment ────────────────────────────────────────── - -#[test] -fn test_first_seen_relabel_assignment() { - assert_eq!(first_seen_relabel_assignment(&[5, 3, 5, 7]), vec![1, 2, 1, 3]); - assert_eq!(first_seen_relabel_assignment(&[]), Vec::::new()); - assert_eq!(first_seen_relabel_assignment(&[42]), vec![1]); -} - // ── relabel_ben_lines_with_map: LengthMismatch ───────────────────── #[test] fn test_relabel_ben_length_mismatch() { - // Build a BEN stream with assignment length 3 ([1,2,3]), - // then supply a permutation of length 5 — triggers LengthMismatch. + // BEN stream with assignment length 3 ([1,2,3]); permutation of length 5 + // — triggers LengthMismatch. let jsonl = r#"{"assignment":[1,2,3],"sample":1} "#; let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); - let body = &ben[17..]; // strip banner - // Permutation of length 5 (identity, doesn't matter — length check comes first) let map: HashMap = (0..5).map(|i| (i, i)).collect(); let mut output = Vec::new(); - let err = - relabel_ben_lines_with_map(body, &mut output, map, BenVariant::Standard).unwrap_err(); + let err = relabel_ben_file( + ben.as_slice(), + &mut output, + RelabelOptions::node_permutation(map), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); assert!( err.to_string().contains("length") || err.to_string().contains("mismatch"), @@ -1000,26 +994,32 @@ fn test_relabel_ben_length_mismatch() { #[test] fn test_relabel_ben_lines_non_eof_read_error_propagates() { - // relabel_ben_lines_impl returns a non-EOF I/O error when the reader fails. + // The byte-walk fast path returns a non-EOF I/O error when the reader fails. + let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()) + .chain(ErrorAfterOneByte); let mut output = Vec::new(); - let err = relabel_ben_lines(ErrorAfterOneByte, &mut output, BenVariant::Standard).unwrap_err(); + let err = + relabel_ben_file(chained, &mut output, RelabelOptions::first_seen()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } #[test] fn test_relabel_ben_file_with_map_non_eof_read_error_propagates() { - // relabel_ben_file_impl returns a non-EOF I/O error when the reader fails. let map: HashMap = (0..4).map(|i| (i, i)).collect(); + let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()) + .chain(ErrorAfterOneByte); let mut output = Vec::new(); - let err = - relabel_ben_lines_with_map(ErrorAfterOneByte, &mut output, map, BenVariant::Standard) - .unwrap_err(); + let err = relabel_ben_file( + chained, + &mut output, + RelabelOptions::node_permutation(map), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } #[test] fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { - // relabel_ben_file_via_decoder propagates decode errors for TwoDelta streams. // Build a valid 2-sample TwoDelta BEN file, then corrupt the delta frame. let mut ben: Vec = Vec::new(); { @@ -1028,16 +1028,14 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } - // Locate the delta frame start: banner(17) + max_val_bits(1) + max_len_bits(1) + - // n_bytes(4 BE) + payload(n_bytes) + count(2) = anchor_end. let banner_len = 17usize; let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; let anchor_end = banner_len + 6 + n_bytes + 2; - // Set delta frame's max_len_bits (5th byte) to 0 to trigger InvalidData. ben[anchor_end + 4] = 0; let mut output = Vec::new(); - let err = relabel_ben_file(ben.as_slice(), &mut output).unwrap_err(); + let err = relabel_ben_file(ben.as_slice(), &mut output, RelabelOptions::first_seen()) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -1057,7 +1055,292 @@ fn test_relabel_ben_file_with_map_twodelta_malformed_frame_error_propagates() { let map: HashMap = (0..4).map(|i| (i, i)).collect(); let mut output = Vec::new(); - let err = relabel_ben_file_with_map(ben.as_slice(), &mut output, map) - .unwrap_err(); + let err = relabel_ben_file( + ben.as_slice(), + &mut output, + RelabelOptions::node_permutation(map), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } + +// ── Verification: predicate matrix + frame-preservation + cross-policy ── + +#[test] +fn fast_path_predicate_matrix() { + use BenVariant::*; + use RunPolicy::*; + let transforms = [ + ("Identity", RelabelTransform::Identity), + ("FirstSeen", RelabelTransform::FirstSeen), + ( + "NodePermutation", + RelabelTransform::NodePermutation(HashMap::new()), + ), + ]; + let inputs = [Standard, MkvChain, TwoDelta]; + let target_states = [None, Some(Standard)]; + let policies = [PreserveFrameBoundaries, CollapseAdjacentEqualAssignments]; + + let mut true_cases = 0; + for (tname, t) in &transforms { + for &input in &inputs { + for &target in &target_states { + for &policy in &policies { + let result = can_use_first_seen_fast_path(t, target, input, policy); + let expected = matches!(tname, &"FirstSeen") + && target.is_none() + && policy == PreserveFrameBoundaries + && (input == Standard || input == MkvChain); + assert_eq!( + result, expected, + "({}, target={:?}, input={:?}, policy={:?})", + tname, target, input, policy + ); + if result { + true_cases += 1; + } + } + } + } + } + assert_eq!(true_cases, 2, "expected exactly two true matrix entries"); +} + +/// Forced-slow vs. fast-path equivalence for first-seen relabeling on +/// Standard input. Forcing the slow path uses `with_target_variant(input)` +/// per decision #5 (`is_none()` semantics in the predicate). +#[test] +fn fast_path_matches_slow_path_standard() { + let file = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[5,5,3],\"sample\":2}\n", + "{\"assignment\":[1,2,3],\"sample\":3}\n", + ); + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut fast_out = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + &mut fast_out, + RelabelOptions::first_seen(), + ) + .unwrap(); + + let mut slow_out = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + &mut slow_out, + RelabelOptions::first_seen().with_target_variant(BenVariant::Standard), + ) + .unwrap(); + + let mut fast_jsonl = Vec::new(); + decode_ben_to_jsonl(fast_out.as_slice(), &mut fast_jsonl).unwrap(); + let mut slow_jsonl = Vec::new(); + decode_ben_to_jsonl(slow_out.as_slice(), &mut slow_jsonl).unwrap(); + assert_eq!(fast_jsonl, slow_jsonl); +} + +#[test] +fn fast_path_matches_slow_path_mkvchain() { + let file = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[3,1,2],\"sample\":2}\n", + "{\"assignment\":[5,4,2],\"sample\":3}\n", + ); + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::MkvChain, + ) + .unwrap(); + + let mut fast_out = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + &mut fast_out, + RelabelOptions::first_seen(), + ) + .unwrap(); + + // Force the slow path by setting target_variant to the input variant. + let mut slow_out = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + &mut slow_out, + RelabelOptions::first_seen().with_target_variant(BenVariant::MkvChain), + ) + .unwrap(); + + // Decoded equivalence is the load-bearing assertion. Byte-identity is also + // expected here (per plan verification step 4) — tighten if it holds. + let mut fast_jsonl = Vec::new(); + decode_ben_to_jsonl(fast_out.as_slice(), &mut fast_jsonl).unwrap(); + let mut slow_jsonl = Vec::new(); + decode_ben_to_jsonl(slow_out.as_slice(), &mut slow_jsonl).unwrap(); + assert_eq!(fast_jsonl, slow_jsonl); +} + +#[test] +fn collapse_policy_disables_fast_path() { + // With CollapseAdjacentEqualAssignments + first-seen on Standard input, + // the predicate must be false (fast path disabled). We verify behaviorally + // by running both: the merging path should produce the same decoded + // content but takes a different code path internally. + let file = concat!( + "{\"assignment\":[3,1,2],\"sample\":1}\n", + "{\"assignment\":[3,1,2],\"sample\":2}\n", + ); + let mut encoded = Vec::new(); + encode_jsonl_to_ben( + file.as_bytes(), + io::BufWriter::new(&mut encoded), + BenVariant::Standard, + ) + .unwrap(); + + let mut out = Vec::new(); + relabel_ben_file( + encoded.as_slice(), + &mut out, + RelabelOptions::first_seen() + .with_run_policy(RunPolicy::CollapseAdjacentEqualAssignments), + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); + let s = String::from_utf8(decoded).unwrap(); + assert!(s.contains("\"assignment\":[1,2,3]")); +} + +/// Decision #9: with `PreserveFrameBoundaries`, two adjacent input frames +/// with the same assignment but distinct counts must remain distinct counted +/// frames at MkvChain target — not merged into one frame with summed count. +/// With `CollapseAdjacentEqualAssignments`, they are merged. +#[test] +fn run_policy_pins_frame_preservation_and_collapse() { + // Build an MkvChain BEN file with two adjacent equal-assignment frames of + // counts 5 and 7 (12 total samples). + let mut input = Vec::new(); + { + let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; + input.extend_from_slice(banner); + let frame_a = + BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = + BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + input.extend_from_slice(frame_a.as_slice()); + input.extend_from_slice(frame_b.as_slice()); + } + + // Identity transform via convert_to(MkvChain), preserving frame boundaries. + let mut preserved = Vec::new(); + relabel_ben_file( + input.as_slice(), + &mut preserved, + RelabelOptions::convert_to(BenVariant::MkvChain) + .with_run_policy(RunPolicy::PreserveFrameBoundaries), + ) + .unwrap(); + + // Strip banner and count MkvChain frames by walking headers. + fn count_mkvchain_frames(ben: &[u8]) -> usize { + let mut i = BANNER_LEN; + let mut frames = 0; + while i < ben.len() { + // header: max_val_bits(1), max_len_bits(1), n_bytes(4), payload(n_bytes), count(2) + let n_bytes = + u32::from_be_bytes(ben[i + 2..i + 6].try_into().unwrap()) as usize; + i += 6 + n_bytes + 2; + frames += 1; + } + frames + } + + assert_eq!( + count_mkvchain_frames(&preserved), + 2, + "PreserveFrameBoundaries must keep both counted frames" + ); + + let mut collapsed = Vec::new(); + relabel_ben_file( + input.as_slice(), + &mut collapsed, + RelabelOptions::convert_to(BenVariant::MkvChain) + .with_run_policy(RunPolicy::CollapseAdjacentEqualAssignments), + ) + .unwrap(); + + assert_eq!( + count_mkvchain_frames(&collapsed), + 1, + "CollapseAdjacentEqualAssignments must merge into one count=12 frame" + ); + + // Decoded sample count is invariant across policies for MkvChain target. + let mut a = Vec::new(); + decode_ben_to_jsonl(preserved.as_slice(), &mut a).unwrap(); + let mut b = Vec::new(); + decode_ben_to_jsonl(collapsed.as_slice(), &mut b).unwrap(); + assert_eq!( + a.iter().filter(|&&c| c == b'\n').count(), + 12, + "preserved decodes 12 samples" + ); + assert_eq!( + b.iter().filter(|&&c| c == b'\n').count(), + 12, + "collapsed decodes 12 samples" + ); +} + +/// Cross-policy invariant for Standard targets: byte-identical output +/// regardless of run policy, because Standard cannot encode counts. +#[test] +fn standard_target_cross_policy_byte_identity() { + // Build the same (5, 7) MkvChain fixture. + let mut input = Vec::new(); + { + let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; + input.extend_from_slice(banner); + let frame_a = + BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = + BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + input.extend_from_slice(frame_a.as_slice()); + input.extend_from_slice(frame_b.as_slice()); + } + + let mut preserve_out = Vec::new(); + relabel_ben_file( + input.as_slice(), + &mut preserve_out, + RelabelOptions::convert_to(BenVariant::Standard) + .with_run_policy(RunPolicy::PreserveFrameBoundaries), + ) + .unwrap(); + + let mut collapse_out = Vec::new(); + relabel_ben_file( + input.as_slice(), + &mut collapse_out, + RelabelOptions::convert_to(BenVariant::Standard) + .with_run_policy(RunPolicy::CollapseAdjacentEqualAssignments), + ) + .unwrap(); + + assert_eq!( + preserve_out, collapse_out, + "Standard target must be byte-identical across run policies" + ); +} diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index f5886ec..0dde453 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -21,11 +21,7 @@ use binary_ensemble::io::writer::AssignmentWriter; use binary_ensemble::json::graph::{ sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, }; -use binary_ensemble::ops::relabel::{ - convert_ben_file, convert_ben_file_limit, relabel_ben_file, relabel_ben_file_as_variant, - relabel_ben_file_as_variant_limit, relabel_ben_file_with_map_as_variant, - relabel_ben_file_with_map_as_variant_limit, relabel_ben_lines_limit, -}; +use binary_ensemble::ops::relabel::{convert_ben_file, relabel_ben_file, RelabelOptions}; use binary_ensemble::util::rle::{assign_to_rle, rle_to_vec}; use binary_ensemble::BenVariant; @@ -813,7 +809,12 @@ fn convert_ben_file_limit_truncates_to_max_samples() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::Standard, 4).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::convert_to(BenVariant::Standard).with_max_samples(4), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); @@ -826,7 +827,12 @@ fn convert_ben_file_limit_zero_produces_banner_only() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::Standard, 0).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::convert_to(BenVariant::Standard).with_max_samples(0), + ) + .unwrap(); // Banner must be present; no frames. assert!(out.starts_with(STANDARD_BEN_BANNER)); @@ -849,14 +855,13 @@ fn relabel_ben_lines_limit_truncates_standard() { ]; let ben = encode_standard_ben(&assignments); - // Relabel only the payload (strip the 17-byte banner first). - let payload = &ben[BANNER_LEN..]; - let mut relabeled_payload = Vec::new(); - relabel_ben_lines_limit(payload, &mut relabeled_payload, BenVariant::Standard, 2).unwrap(); - - // Reconstruct a full BEN file so we can decode it. - let mut full = STANDARD_BEN_BANNER.to_vec(); - full.extend_from_slice(&relabeled_payload); + let mut full = Vec::new(); + relabel_ben_file( + ben.as_slice(), + &mut full, + RelabelOptions::first_seen().with_max_samples(2), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(full.as_slice(), &mut decoded).unwrap(); @@ -877,7 +882,12 @@ fn relabel_ben_file_as_variant_standard_to_standard() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_as_variant(ben.as_slice(), &mut out, BenVariant::Standard).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::first_seen().with_target_variant(BenVariant::Standard), + ) + .unwrap(); assert!(out.starts_with(STANDARD_BEN_BANNER)); @@ -908,7 +918,12 @@ fn relabel_ben_file_as_variant_standard_to_mkvchain() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_as_variant(ben.as_slice(), &mut out, BenVariant::MkvChain).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::first_seen().with_target_variant(BenVariant::MkvChain), + ) + .unwrap(); assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); @@ -919,10 +934,10 @@ fn relabel_ben_file_as_variant_standard_to_mkvchain() { #[test] fn relabel_ben_file_as_variant_rejects_invalid_header() { - let err = relabel_ben_file_as_variant( + let err = relabel_ben_file( b"TOTALLY WRONG!!!!!!".as_slice(), Vec::new(), - BenVariant::Standard, + RelabelOptions::first_seen().with_target_variant(BenVariant::Standard), ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -934,7 +949,14 @@ fn relabel_ben_file_as_variant_limit_truncates_output() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_as_variant_limit(ben.as_slice(), &mut out, BenVariant::Standard, 3).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::first_seen() + .with_target_variant(BenVariant::Standard) + .with_max_samples(3), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); @@ -947,7 +969,14 @@ fn relabel_ben_file_as_variant_limit_zero_gives_empty() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_as_variant_limit(ben.as_slice(), &mut out, BenVariant::Standard, 0).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::first_seen() + .with_target_variant(BenVariant::Standard) + .with_max_samples(0), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); @@ -969,11 +998,11 @@ fn relabel_ben_file_with_map_as_variant_standard_to_standard() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_with_map_as_variant( + relabel_ben_file( ben.as_slice(), &mut out, - reverse_map_3(), - BenVariant::Standard, + RelabelOptions::node_permutation(reverse_map_3()) + .with_target_variant(BenVariant::Standard), ) .unwrap(); @@ -996,11 +1025,11 @@ fn relabel_ben_file_with_map_as_variant_standard_to_mkvchain() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_with_map_as_variant( + relabel_ben_file( ben.as_slice(), &mut out, - reverse_map_3(), - BenVariant::MkvChain, + RelabelOptions::node_permutation(reverse_map_3()) + .with_target_variant(BenVariant::MkvChain), ) .unwrap(); @@ -1013,11 +1042,11 @@ fn relabel_ben_file_with_map_as_variant_standard_to_mkvchain() { #[test] fn relabel_ben_file_with_map_as_variant_rejects_invalid_header() { - let err = relabel_ben_file_with_map_as_variant( + let err = relabel_ben_file( b"NOT A VALID BEN!!".as_slice(), Vec::new(), - reverse_map_3(), - BenVariant::Standard, + RelabelOptions::node_permutation(reverse_map_3()) + .with_target_variant(BenVariant::Standard), ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -1035,12 +1064,12 @@ fn relabel_ben_file_with_map_as_variant_limit_truncates() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_with_map_as_variant_limit( + relabel_ben_file( ben.as_slice(), &mut out, - reverse_map_3(), - BenVariant::Standard, - 3, + RelabelOptions::node_permutation(reverse_map_3()) + .with_target_variant(BenVariant::Standard) + .with_max_samples(3), ) .unwrap(); @@ -1055,12 +1084,12 @@ fn relabel_ben_file_with_map_as_variant_limit_zero_gives_empty() { let ben = encode_standard_ben(&assignments); let mut out = Vec::new(); - relabel_ben_file_with_map_as_variant_limit( + relabel_ben_file( ben.as_slice(), &mut out, - reverse_map_3(), - BenVariant::Standard, - 0, + RelabelOptions::node_permutation(reverse_map_3()) + .with_target_variant(BenVariant::Standard) + .with_max_samples(0), ) .unwrap(); @@ -1082,8 +1111,12 @@ fn relabel_file_with_map_detects_gap_in_permutation() { // Map {0→0, 2→2} – index 1 is missing. let bad_map: HashMap = [(0, 0), (2, 2)].iter().cloned().collect(); - use binary_ensemble::ops::relabel::relabel_ben_file_with_map; - let err = relabel_ben_file_with_map(ben.as_slice(), Vec::new(), bad_map).unwrap_err(); + let err = relabel_ben_file( + ben.as_slice(), + Vec::new(), + RelabelOptions::node_permutation(bad_map), + ) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); } @@ -1112,7 +1145,12 @@ fn convert_ben_file_limit_with_mkvchain_repetitions() { encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); let mut out = Vec::new(); - convert_ben_file_limit(ben.as_slice(), &mut out, BenVariant::MkvChain, 3).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::convert_to(BenVariant::MkvChain).with_max_samples(3), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); @@ -1138,7 +1176,12 @@ fn relabel_ben_file_twodelta_canonicalizes_labels() { encode_jsonl_to_ben(file.as_bytes(), &mut ben, BenVariant::TwoDelta).unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file(ben.as_slice(), &mut relabeled).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut relabeled, + RelabelOptions::first_seen(), + ) + .unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(relabeled.as_slice(), &mut decoded).unwrap(); @@ -1316,7 +1359,12 @@ fn relabel_ben_file_as_variant_mkvchain_to_standard() { encode_jsonl_to_ben(jsonl.as_bytes(), &mut mkv_ben, BenVariant::MkvChain).unwrap(); let mut out = Vec::new(); - relabel_ben_file_as_variant(mkv_ben.as_slice(), &mut out, BenVariant::Standard).unwrap(); + relabel_ben_file( + mkv_ben.as_slice(), + &mut out, + RelabelOptions::first_seen().with_target_variant(BenVariant::Standard), + ) + .unwrap(); assert!(out.starts_with(STANDARD_BEN_BANNER)); @@ -1341,8 +1389,12 @@ fn relabel_ben_file_with_map_as_variant_permutes_correctly() { let map: HashMap = [(0, 3), (1, 2), (2, 1), (3, 0)].iter().cloned().collect(); let mut out = Vec::new(); - relabel_ben_file_with_map_as_variant(ben.as_slice(), &mut out, map, BenVariant::Standard) - .unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut out, + RelabelOptions::node_permutation(map).with_target_variant(BenVariant::Standard), + ) + .unwrap(); let decoded_str = decode_ben_to_string(&out); assert!( @@ -1387,11 +1439,21 @@ fn relabel_ben_file_standard_is_idempotent() { // First relabeling. let mut relabeled1 = Vec::new(); - relabel_ben_file(ben.as_slice(), &mut relabeled1).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut relabeled1, + RelabelOptions::first_seen(), + ) + .unwrap(); // Second relabeling on already-canonical output. let mut relabeled2 = Vec::new(); - relabel_ben_file(relabeled1.as_slice(), &mut relabeled2).unwrap(); + relabel_ben_file( + relabeled1.as_slice(), + &mut relabeled2, + RelabelOptions::first_seen(), + ) + .unwrap(); // The decoded output of both should be identical. let mut decoded1 = Vec::new(); @@ -1424,7 +1486,12 @@ fn single_unique_label_relabeled_to_one() { let ben = encode_standard_ben(&[assignment]); let mut relabeled = Vec::new(); - relabel_ben_file(ben.as_slice(), &mut relabeled).unwrap(); + relabel_ben_file( + ben.as_slice(), + &mut relabeled, + RelabelOptions::first_seen(), + ) + .unwrap(); let decoded_str = decode_ben_to_string(&relabeled); // All 99s should become 1s. diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index c3fa9ab..3d15934 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -19,7 +19,7 @@ use binary_ensemble::io::bundle::writer::{ use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::AssignmentWriter; -use binary_ensemble::ops::relabel::relabel_ben_file_with_map; +use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; use std::cell::RefCell; use std::collections::HashMap; use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; @@ -383,7 +383,12 @@ fn relabel_map_out_of_range_old_indices_error_cleanly() { } let out_of_range_old = HashMap::from([(0usize, 0usize), (1, 2)]); - let err = relabel_ben_file_with_map(ben.as_slice(), Vec::new(), out_of_range_old).unwrap_err(); + let err = relabel_ben_file( + ben.as_slice(), + Vec::new(), + RelabelOptions::node_permutation(out_of_range_old), + ) + .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); } From 46655ebce67e81afb43dec2903b59fc1d9a65e17 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 00:09:02 -0600 Subject: [PATCH 091/221] small dedupe in graph module --- ben/src/json/graph/mod.rs | 50 ++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index 9f52c3e..e5bc88a 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -50,15 +50,9 @@ pub fn sort_json_file_by_key( tracing::trace!("Sorting JSON file by key: {}", key); let (result, order) = if nx_graph.directed { - let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; - let order = petxgraph::sort_by_key(&mut petx, key); - let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; - (result, order) + reorder_directed(nx_graph, |p| petxgraph::sort_by_key(p, key))? } else { - let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; - let order = petxgraph::sort_by_key(&mut petx, key); - let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; - (result, order) + reorder_undirected(nx_graph, |p| petxgraph::sort_by_key(p, key))? }; write_nx_graph(writer, &result)?; @@ -90,15 +84,9 @@ pub fn sort_json_file_by_ordering( tracing::trace!("Sorting JSON file by ordering method: {:?}", method); let (result, order) = if nx_graph.directed { - let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; - let order = run_ordering_method(&mut petx, method); - let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; - (result, order) + reorder_directed(nx_graph, |p| run_ordering_method(p, method))? } else { - let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; - let order = run_ordering_method(&mut petx, method); - let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; - (result, order) + reorder_undirected(nx_graph, |p| run_ordering_method(p, method))? }; write_nx_graph(writer, &result)?; @@ -174,5 +162,35 @@ fn nx_err(e: NxPetgraphError) -> Error { Error::new(ErrorKind::InvalidData, e) } +/// Convert an [`NxGraphAdjFormat`] into a directed [`PetxGraph`], apply an +/// in-place reordering operation, and convert back to JSON adjacency form. +fn reorder_directed( + nx_graph: NxGraphAdjFormat, + op: F, +) -> Result<(NxGraphAdjFormat, Vec)> +where + F: FnOnce(&mut PetxGraph) -> Vec, +{ + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = op(&mut petx); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + Ok((result, order)) +} + +/// Convert an [`NxGraphAdjFormat`] into an undirected [`PetxGraph`], apply an +/// in-place reordering operation, and convert back to JSON adjacency form. +fn reorder_undirected( + nx_graph: NxGraphAdjFormat, + op: F, +) -> Result<(NxGraphAdjFormat, Vec)> +where + F: FnOnce(&mut PetxGraph) -> Vec, +{ + let mut petx: PetxGraph = nx_graph.try_into().map_err(nx_err)?; + let order = op(&mut petx); + let result: NxGraphAdjFormat = (&petx).try_into().map_err(nx_err)?; + Ok((result, order)) +} + #[cfg(test)] mod tests; From ecc39df15ef0fc38506e3fb0153810fa038da308 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 18:07:37 -0600 Subject: [PATCH 092/221] Unify stream writers --- ben-py/src/encode/encoder.rs | 8 +- ben-py/src/encode/types.rs | 4 +- ben/src/cli/pcben/mod.rs | 9 +- ben/src/codec/decode/tests/twodelta.rs | 4 +- ben/src/codec/decode/xz.rs | 5 +- ben/src/codec/encode/jsonl.rs | 44 +-- ben/src/codec/encode/mod.rs | 2 +- ben/src/codec/encode/xz.rs | 17 +- ben/src/codec/translate/errors.rs | 2 +- ben/src/codec/translate/mod.rs | 2 +- ben/src/io/bundle/mod.rs | 3 +- ben/src/io/bundle/writer.rs | 68 ++-- ben/src/io/reader/tests.rs | 44 +-- ben/src/io/writer/assignment_writer.rs | 185 ---------- ben/src/io/writer/frame_writer.rs | 87 ----- ben/src/io/writer/frames.rs | 2 +- ben/src/io/writer/mod.rs | 9 +- ben/src/io/writer/options.rs | 62 ++++ ben/src/io/writer/stream_writer/ben.rs | 136 +++++++ ben/src/io/writer/stream_writer/mod.rs | 300 ++++++++++++++++ ben/src/io/writer/stream_writer/xben.rs | 420 ++++++++++++++++++++++ ben/src/io/writer/tests.rs | 412 +++++++++++++++++---- ben/src/io/writer/twodelta.rs | 6 +- ben/src/io/writer/utils.rs | 4 +- ben/src/io/writer/xz_assignment_writer.rs | 410 --------------------- ben/src/ops/relabel/mod.rs | 10 +- ben/src/ops/relabel/tests.rs | 4 +- ben/tests/test_assignment_reader.rs | 6 +- ben/tests/test_coverage.rs | 26 +- ben/tests/test_impls_pipeline.rs | 44 +-- ben/tests/test_stress_edges.rs | 16 +- 31 files changed, 1409 insertions(+), 942 deletions(-) delete mode 100644 ben/src/io/writer/assignment_writer.rs delete mode 100644 ben/src/io/writer/frame_writer.rs create mode 100644 ben/src/io/writer/options.rs create mode 100644 ben/src/io/writer/stream_writer/ben.rs create mode 100644 ben/src/io/writer/stream_writer/mod.rs create mode 100644 ben/src/io/writer/stream_writer/xben.rs delete mode 100644 ben/src/io/writer/xz_assignment_writer.rs diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 8d05ef7..0ee0d9b 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -5,7 +5,7 @@ use binary_ensemble::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, STANDARDIZED_NAME_GRAPH, FINALIZED_YES, HEADER_SIZE, }; -use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::io::writer::BenStreamWriter; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; use std::cell::RefCell; @@ -16,7 +16,7 @@ use std::rc::Rc; #[pyclass(name = "BenEncoder", unsendable)] pub struct PyBenEncoder { file: Option, - encoder: Option>, + encoder: Option>, mode: OutputMode, } @@ -126,10 +126,10 @@ impl PyBenEncoder { } }; - // Construct the AssignmentWriter on a clone of the shared slot. + // Construct the BenStreamWriter on a clone of the shared slot. // This writes the BEN banner as its first action, which in the // bundle case becomes the first byte of the stream region. - let encoder = AssignmentWriter::new(SharedFileWriter(Rc::clone(&file)), ben_var) + let encoder = BenStreamWriter::for_ben(SharedFileWriter(Rc::clone(&file)), ben_var) .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {e}")))?; Ok(PyBenEncoder { diff --git a/ben-py/src/encode/types.rs b/ben-py/src/encode/types.rs index 02d8ee3..19600d1 100644 --- a/ben-py/src/encode/types.rs +++ b/ben-py/src/encode/types.rs @@ -5,14 +5,14 @@ use std::io::{self, BufWriter, Write}; use std::rc::Rc; /// Handle to the underlying output file shared between the live -/// `AssignmentWriter` and the `PyBenEncoder` that owns it. Needed so the +/// `BenStreamWriter` and the `PyBenEncoder` that owns it. Needed so the /// encoder can reach the buffered file after the inner assignment writer /// has finished, in order to patch the bundle header and write the /// trailing directory. pub(super) type SharedFileSlot = Rc>>; /// Wrapper around a shared buffered file that implements `Write`. The -/// `AssignmentWriter` holds one of these and delegates every write into +/// `BenStreamWriter` holds one of these and delegates every write into /// the shared slot. pub(super) struct SharedFileWriter(pub SharedFileSlot); diff --git a/ben/src/cli/pcben/mod.rs b/ben/src/cli/pcben/mod.rs index e7e9f9a..517fbd9 100644 --- a/ben/src/cli/pcben/mod.rs +++ b/ben/src/cli/pcben/mod.rs @@ -1,6 +1,6 @@ use crate::cli::common::{check_overwrite, set_quiet, set_verbose, CliError, CliResult}; use crate::io::reader::BenStreamReader; -use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; +use crate::io::writer::BenStreamWriter; use crate::BenVariant; use clap::{Parser, ValueEnum}; use pipe::pipe; @@ -227,7 +227,7 @@ fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { /// Read zero-based assignment vectors and encode them as BEN. fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { - let mut ben_writer = AssignmentWriter::new(writer, BenVariant::MkvChain)?; + let mut ben_writer = BenStreamWriter::for_ben(writer, BenVariant::MkvChain)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) @@ -237,13 +237,15 @@ fn assignment_encode_ben(reader: R, writer: W) -> i .collect(); ben_writer.write_assignment(assignment)?; } + ben_writer.finish()?; Ok(()) } /// Read zero-based assignment vectors and encode them as XBEN. fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { let encoder = XzEncoder::new(writer, 9); - let mut xben_writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain)?; + let mut xben_writer = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None)?; for line in reader.lines() { let assignment: Vec = serde_json::from_str::>(&line.unwrap()) @@ -253,6 +255,7 @@ fn assignment_encode_xben(reader: R, writer: W) -> .collect(); xben_writer.write_json_value(json!({ "assignment": assignment }))?; } + xben_writer.finish()?; Ok(()) } diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index f3f99ed..8c0c35c 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -4,7 +4,7 @@ use crate::codec::decode::{ }; use crate::codec::encode::{encode_ben_to_xben, encode_twodelta_frame}; use crate::codec::frames::BenEncodeFrame; -use crate::io::writer::AssignmentWriter; +use crate::io::writer::BenStreamWriter; use crate::util::rle::rle_to_vec; use crate::BenVariant; use serde_json::{json, Value}; @@ -14,7 +14,7 @@ use std::io::BufReader; fn make_twodelta_ben(assignments: &[Vec]) -> Vec { let mut out = Vec::new(); { - let mut w = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut out, BenVariant::TwoDelta).unwrap(); for a in assignments { w.write_assignment(a.clone()).unwrap(); } diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 0e356f7..fbc9cc7 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -2,7 +2,7 @@ use crate::codec::translate::ben32_to_ben_lines; use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::BenStreamReader; -use crate::io::writer::AssignmentWriter; +use crate::io::writer::BenStreamWriter; use crate::progress::Spinner; use crate::BenVariant; use std::io::{self, BufRead, BufReader, Read, Write}; @@ -44,7 +44,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: BufReader::new(decoder), BenVariant::TwoDelta, ); - let mut ben = AssignmentWriter::new(writer, BenVariant::TwoDelta)?; + let mut ben = BenStreamWriter::for_ben(writer, BenVariant::TwoDelta)?; for record in &mut xben { let (assignment, count) = record?; ben.write_assignment(assignment.clone())?; @@ -52,6 +52,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: ben.write_assignment(assignment.clone())?; } } + ben.finish()?; return Ok(()); } None => { diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 923608a..5fba9aa 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -1,11 +1,9 @@ -use crate::codec::encode::xz::XZ_DEFAULT_MT_BLOCK_SIZE; -use crate::codec::encode::EncodeError; -use crate::io::writer::{AssignmentWriter, XZAssignmentWriter}; +use crate::codec::encode::xz::{build_mt_stream, resolve_threads}; +use crate::io::writer::BenStreamWriter; use crate::progress::Spinner; use crate::BenVariant; use serde_json::Value; use std::io::{self, BufRead, Result, Write}; -use xz2::stream::MtStreamBuilder; use xz2::write::XzEncoder; /// Encode JSONL assignment records directly into an XBEN stream. @@ -26,8 +24,8 @@ use xz2::write::XzEncoder; /// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for /// Standard and MkvChain variants. /// * `block_size` - Optional per-block size in bytes for the MT encoder. -/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or -/// `0` (liblzma auto) for single-thread runs. +/// `None` defaults to [`crate::codec::encode::xz::XZ_DEFAULT_MT_BLOCK_SIZE`] +/// when threads > 1, or `0` (liblzma auto) for single-thread runs. /// /// # Returns /// @@ -41,34 +39,12 @@ pub fn encode_jsonl_to_xben( chunk_size: Option, block_size: Option, ) -> Result<()> { - let n_cpus: u32 = n_threads - .unwrap_or(1) - .min( - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(1) as u32, - ) - .max(1); - + let n_cpus = resolve_threads(n_threads); let level = compression_level.unwrap_or(9).clamp(0, 9); - - let resolved_block_size = match block_size { - Some(n) => n, - None if n_cpus > 1 => XZ_DEFAULT_MT_BLOCK_SIZE, - None => 0, - }; - - let mt = MtStreamBuilder::new() - .threads(n_cpus) - .preset(level) - .block_size(resolved_block_size) - .encoder() - .map_err(|e| io::Error::from(EncodeError::XzInit(e)))?; + let mt = build_mt_stream(n_cpus, level, block_size)?; let encoder = XzEncoder::new_stream(writer, mt); - let mut ben_encoder = XZAssignmentWriter::new(encoder, variant)?; - if let Some(cs) = chunk_size { - ben_encoder = ben_encoder.with_chunk_size(cs); - } + + let mut ben_encoder = BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size)?; let mut line_num = 1u64; let spinner = Spinner::new("Encoding line"); @@ -87,6 +63,7 @@ pub fn encode_jsonl_to_xben( ben_encoder.write_json_value(data)?; } + ben_encoder.finish()?; Ok(()) } @@ -112,7 +89,7 @@ pub fn encode_jsonl_to_ben( ) -> Result<()> { let mut line_num = 1u64; let spinner = Spinner::new("Encoding line"); - let mut ben_encoder = AssignmentWriter::new(writer, variant)?; + let mut ben_encoder = BenStreamWriter::for_ben(writer, variant)?; for line_result in reader.lines() { spinner.set_count(line_num); line_num += 1; @@ -126,5 +103,6 @@ pub fn encode_jsonl_to_ben( ben_encoder.write_json_value(data)?; } + ben_encoder.finish()?; Ok(()) } diff --git a/ben/src/codec/encode/mod.rs b/ben/src/codec/encode/mod.rs index 71dde76..50ea4de 100644 --- a/ben/src/codec/encode/mod.rs +++ b/ben/src/codec/encode/mod.rs @@ -5,7 +5,7 @@ pub mod errors; mod jsonl; pub mod path; mod twodelta; -mod xz; +pub(crate) mod xz; pub(crate) use ben::encode_ben32_assignments; pub use errors::EncodeError; diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index fee0a80..39c5ecc 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -1,7 +1,7 @@ use crate::codec::encode::errors::EncodeError; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; -use crate::io::writer::XZAssignmentWriter; +use crate::io::writer::BenStreamWriter; use std::io::{self, BufRead, Cursor, Read, Result, Write}; use xz2::stream::{MtStreamBuilder, Stream}; use xz2::write::XzEncoder; @@ -17,7 +17,7 @@ use xz2::write::XzEncoder; pub const XZ_DEFAULT_MT_BLOCK_SIZE: u64 = 16 * 1024 * 1024; /// Resolve `n_threads` against the host's available parallelism. -fn resolve_threads(n_threads: Option) -> u32 { +pub(crate) fn resolve_threads(n_threads: Option) -> u32 { n_threads .unwrap_or(1) .min(host_parallelism()) @@ -57,7 +57,7 @@ pub fn cpus_from_signed(n: i32) -> u32 { /// liblzma. When it is `None`, we default to [`XZ_DEFAULT_MT_BLOCK_SIZE`] /// for `n_threads > 1` and to `0` (liblzma's "auto") for the single-thread /// case so single-thread encoding does not pay any block-overhead cost. -fn build_mt_stream( +pub(crate) fn build_mt_stream( n_threads: u32, level: u32, block_size: Option, @@ -167,13 +167,10 @@ pub fn encode_ben_to_xben( actual: check_buffer.to_vec(), }) })?; - let mut ben_encoder = XZAssignmentWriter::new(encoder, variant)?; - if let Some(cs) = chunk_size { - ben_encoder = ben_encoder.with_chunk_size(cs); - } - - ben_encoder.write_ben_file(Cursor::new(check_buffer).chain(reader))?; - + let mut ben_encoder = + BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size)?; + ben_encoder.ingest_ben_stream(Cursor::new(check_buffer).chain(reader))?; + ben_encoder.finish()?; Ok(()) } diff --git a/ben/src/codec/translate/errors.rs b/ben/src/codec/translate/errors.rs index 0958d8e..11f8b1e 100644 --- a/ben/src/codec/translate/errors.rs +++ b/ben/src/codec/translate/errors.rs @@ -15,7 +15,7 @@ pub enum TranslateError { #[error( "TwoDelta BEN streams cannot be translated to ben32; \ - use XZAssignmentWriter/BenStreamReader for TwoDelta compressed I/O" + use BenStreamWriter/BenStreamReader for TwoDelta compressed I/O" )] TwoDeltaUnsupported, diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 3550e35..d6b740d 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -2,7 +2,7 @@ //! //! The ben32 intermediate format is used only by the Standard and MkvChain //! variants. TwoDelta streams use a separate columnar layout and bypass -//! ben32 entirely — see [`XZAssignmentWriter`](crate::io::writer::XZAssignmentWriter) +//! ben32 entirely — see [`BenStreamWriter`](crate::io::writer::BenStreamWriter) //! and [`BenStreamReader`](crate::io::reader::BenStreamReader) for the //! TwoDelta compressed-I/O path. diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index b536eb4..57a1cd9 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -23,6 +23,5 @@ mod tests; pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; pub use writer::{ - AddAssetOptions, BendlStreamHandle, BendlWriteError, BendlWriter, BundleAssignmentSink, - BundleAssignmentStreamCtx, + AddAssetOptions, BendlStreamHandle, BendlWriteError, BendlWriter, BundleAssignmentStreamCtx, }; diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index eb7cc06..2151ac8 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -15,10 +15,8 @@ //! 2. **stream phase** — the caller invokes [`BendlWriter::begin_stream`] //! to enter the stream region. The returned handle wraps the raw //! underlying writer so the caller can plumb it into -//! [`crate::io::writer::AssignmentWriter`] or -//! [`crate::io::writer::XZAssignmentWriter`]. When the stream is -//! complete the caller records the sample count via -//! [`BendlWriter::end_stream`]. +//! [`crate::io::writer::BenStreamWriter`]. When the stream is complete +//! the caller records the sample count via [`BendlWriter::end_stream`]. //! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing //! directory and patches the header. //! @@ -304,8 +302,8 @@ impl BendlWriter { handle.finish(sample_count) } - /// Open a BEN assignment stream backed by an - /// [`crate::io::writer::AssignmentWriter`] and invoke `f` with a + /// Open a BEN assignment stream backed by a + /// [`crate::io::writer::BenStreamWriter`] and invoke `f` with a /// context that can encode assignments into it. /// /// The context tracks how many `write_assignment` / `write_json_value` @@ -324,7 +322,9 @@ impl BendlWriter { let mut handle = self.begin_stream()?; let mut sample_count: i64 = 0; { - let mut ben = crate::io::writer::AssignmentWriter::new(&mut handle, variant)?; + let writer_ref: &mut dyn Write = &mut handle; + let mut ben = + crate::io::writer::BenStreamWriter::for_ben(writer_ref, variant)?; { let mut ctx = BundleAssignmentStreamCtx { writer: &mut ben, @@ -333,18 +333,22 @@ impl BendlWriter { f(&mut ctx)?; } ben.finish()?; - // `ben` is dropped here, releasing its borrow on `handle`. } handle.finish(sample_count) } - /// Open an XBEN assignment stream backed by an - /// [`crate::io::writer::XZAssignmentWriter`] and invoke `f` with a + /// Open an XBEN assignment stream backed by a + /// [`crate::io::writer::BenStreamWriter`] and invoke `f` with a /// context that can encode assignments into it. /// /// The closure sees the same counting [`BundleAssignmentStreamCtx`] /// type used by [`BendlWriter::write_ben_stream`], so callers can be /// written to be generic over the assignment container. + /// + /// The XBEN encoder uses bundle compression preset + /// [`crate::io::bundle::format::DEFAULT_XZ_PRESET`], not the codec's + /// MT-stream defaults — bundle assignment streams are intentionally + /// single-threaded with a milder preset. pub fn write_xben_stream( &mut self, variant: crate::BenVariant, @@ -356,8 +360,11 @@ impl BendlWriter { let mut handle = self.begin_stream()?; let mut sample_count: i64 = 0; { - let encoder = xz2::write::XzEncoder::new(&mut handle, DEFAULT_XZ_PRESET); - let mut xben = crate::io::writer::XZAssignmentWriter::new(encoder, variant)?; + let writer_ref: &mut dyn Write = &mut handle; + let encoder = xz2::write::XzEncoder::new(writer_ref, DEFAULT_XZ_PRESET); + let mut xben = crate::io::writer::BenStreamWriter::for_xben_with_encoder( + encoder, variant, None, + )?; { let mut ctx = BundleAssignmentStreamCtx { writer: &mut xben, @@ -366,9 +373,6 @@ impl BendlWriter { f(&mut ctx)?; } xben.finish()?; - // `xben` is dropped here, which drops its inner `XzEncoder`, - // which in turn finalizes the xz stream and flushes the last - // bytes out to `handle`. } handle.finish(sample_count) } @@ -424,8 +428,8 @@ impl BendlWriter { /// Mutable handle to the stream region held by a [`BendlWriter`]. /// /// The handle implements `Write` so it can be wrapped in -/// `AssignmentWriter::new(handle, variant)` or -/// `XZAssignmentWriter::new(handle, variant)` directly. +/// `BenStreamWriter::for_ben(handle, variant)` or +/// `BenStreamWriter::for_xben_with_encoder(encoder, variant, ...)` directly. pub struct BendlStreamHandle<'a, W: Write + Seek> { parent: &'a mut BendlWriter, start_offset: u64, @@ -456,36 +460,22 @@ impl<'a, W: Write + Seek> Write for BendlStreamHandle<'a, W> { } } -/// Minimal trait that hides the concrete assignment-writer type behind a -/// pair of methods that both [`crate::io::writer::AssignmentWriter`] and -/// [`crate::io::writer::XZAssignmentWriter`] implement. -/// -/// The bundle layer uses this to let a single -/// [`BundleAssignmentStreamCtx`] wrap either container. -pub trait BundleAssignmentSink { - /// Encode one assignment vector. +/// Bundle-private adapter that hides the concrete `BenStreamWriter` +/// behind two methods, so [`BundleAssignmentStreamCtx`] can stay non-generic +/// without forcing the public API to expose the writer's `W` parameter or +/// to grow a second lifetime. +trait BundleAssignmentSink { fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()>; - /// Encode one JSON assignment record. fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()>; } -impl BundleAssignmentSink for crate::io::writer::AssignmentWriter { - fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { - crate::io::writer::AssignmentWriter::write_assignment(self, assign_vec) - } - - fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { - crate::io::writer::AssignmentWriter::write_json_value(self, data) - } -} - -impl BundleAssignmentSink for crate::io::writer::XZAssignmentWriter { +impl BundleAssignmentSink for crate::io::writer::BenStreamWriter { fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { - crate::io::writer::XZAssignmentWriter::write_assignment(self, assign_vec) + crate::io::writer::BenStreamWriter::write_assignment(self, assign_vec) } fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { - crate::io::writer::XZAssignmentWriter::write_json_value(self, data) + crate::io::writer::BenStreamWriter::write_json_value(self, data) } } diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 1dfc156..d53867d 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -2,7 +2,7 @@ use crate::codec::encode::encode_jsonl_to_xben; use crate::io::reader::errors::DecoderInitError; use crate::io::reader::subsample::{DecodeFrame, Selection, SubsampleFrameDecoder}; use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat}; -use crate::io::writer::XZAssignmentWriter; +use crate::io::writer::BenStreamWriter; use crate::BenVariant; use std::io::{self, Cursor, Write}; use xz2::write::XzEncoder; @@ -14,12 +14,12 @@ fn make_xben(jsonl: &str, variant: BenVariant) -> Vec { xben } -/// Build a minimal XBEN stream using XZAssignmentWriter directly. +/// Build a minimal XBEN stream using BenStreamWriter directly. fn make_xben_from_assignments(assignments: &[Vec], variant: BenVariant) -> Vec { let mut xben = Vec::new(); { let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + let mut writer = BenStreamWriter::for_xben_with_encoder(encoder, variant, None).unwrap(); for a in assignments { writer.write_assignment(a.clone()).unwrap(); } @@ -753,7 +753,7 @@ fn xz_twodelta_large_assignment_roundtrip() { #[test] fn xz_twodelta_chunk_boundary_roundtrip() { - use crate::io::writer::XZAssignmentWriter; + use crate::io::writer::BenStreamWriter; use xz2::write::XzEncoder; let anchor = vec![1u16, 2, 1, 2]; @@ -762,14 +762,15 @@ fn xz_twodelta_chunk_boundary_roundtrip() { let mut xben = Vec::new(); { let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(3); + let mut writer = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::TwoDelta, Some(3)) + .unwrap(); writer.write_assignment(anchor.clone()).unwrap(); for _ in 0..10 { writer.write_assignment(delta.clone()).unwrap(); writer.write_assignment(anchor.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); @@ -787,7 +788,7 @@ fn xz_twodelta_chunk_boundary_roundtrip() { #[test] fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { - use crate::io::writer::XZAssignmentWriter; + use crate::io::writer::BenStreamWriter; use xz2::write::XzEncoder; let anchor = vec![1u16, 1, 2, 2]; @@ -796,13 +797,14 @@ fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { let mut xben = Vec::new(); { let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(100); + let mut writer = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::TwoDelta, Some(100)) + .unwrap(); writer.write_assignment(anchor.clone()).unwrap(); writer.write_assignment(delta.clone()).unwrap(); writer.write_assignment(delta.clone()).unwrap(); writer.write_assignment(delta.clone()).unwrap(); + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); @@ -821,7 +823,7 @@ fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { fn translate_ben_twodelta_to_xben_roundtrip() { use crate::codec::encode::encode_ben_to_xben; use crate::codec::decode::decode_xben_to_jsonl; - use crate::io::writer::AssignmentWriter; + use crate::io::writer::BenStreamWriter; use std::io::BufReader; let a0 = vec![1u16, 2, 1, 2]; @@ -831,7 +833,7 @@ fn translate_ben_twodelta_to_xben_roundtrip() { let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { w.write_assignment(a.clone()).unwrap(); } @@ -862,7 +864,7 @@ fn translate_ben_twodelta_to_xben_roundtrip() { #[test] fn translate_ben_twodelta_to_xben_with_repetitions() { use crate::codec::encode::encode_ben_to_xben; - use crate::io::writer::AssignmentWriter; + use crate::io::writer::BenStreamWriter; use std::io::BufReader; let anchor = vec![1u16, 2, 1, 2]; @@ -877,7 +879,7 @@ fn translate_ben_twodelta_to_xben_with_repetitions() { let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { w.write_assignment(a.clone()).unwrap(); } @@ -895,7 +897,7 @@ fn translate_ben_twodelta_to_xben_with_repetitions() { #[test] fn translate_ben_twodelta_to_xben_many_deltas() { use crate::codec::encode::encode_ben_to_xben; - use crate::io::writer::AssignmentWriter; + use crate::io::writer::BenStreamWriter; use std::io::BufReader; let a = vec![1u16, 1, 2, 2]; @@ -906,7 +908,7 @@ fn translate_ben_twodelta_to_xben_many_deltas() { let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { w.write_assignment(a.clone()).unwrap(); } @@ -1011,13 +1013,13 @@ fn assignment_reader_mkv_roundtrip() { #[test] fn assignment_reader_twodelta_roundtrip() { use crate::io::reader::BenStreamReader; - use crate::io::writer::AssignmentWriter; + use crate::io::writer::BenStreamWriter; let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { writer.write_assignment(a.clone()).unwrap(); } @@ -1584,12 +1586,12 @@ fn xz_reader_write_all_jsonl_standard_roundtrip() { #[test] fn raw_frame_iter_propagates_twodelta_decode_error() { use crate::io::reader::BenStreamReader; - use crate::io::writer::AssignmentWriter; + use crate::io::writer::BenStreamWriter; // Build a minimal TwoDelta BEN file with two samples. let mut ben: Vec = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } diff --git a/ben/src/io/writer/assignment_writer.rs b/ben/src/io/writer/assignment_writer.rs deleted file mode 100644 index af330bb..0000000 --- a/ben/src/io/writer/assignment_writer.rs +++ /dev/null @@ -1,185 +0,0 @@ -use super::twodelta::twodelta_repeat_runs; -use super::utils::parse_json_assignment; -use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::BenEncodeFrame; -use crate::format::banners::banner_for_variant; -use crate::BenVariant; -use serde_json::Value; -use std::collections::HashMap; -use std::io::{self, Result, Write}; - -/// A struct to make the writing of BEN files easier and more ergonomic. -pub struct AssignmentWriter { - writer: W, - previous_sample: Vec, - previous_masks: HashMap>, - pending_sample: Option>, - sample_count: u16, - variant: BenVariant, - complete: bool, -} - -impl AssignmentWriter { - /// Create a new BEN writer and immediately emit the BEN banner. - /// - /// # Arguments - /// - /// * `writer` - The destination that will receive the BEN stream. - /// * `variant` - The BEN variant to encode. - /// - /// # Returns - /// - /// Returns a new encoder ready to accept assignments. - pub fn new(mut writer: W, variant: BenVariant) -> io::Result { - writer.write_all(banner_for_variant(variant))?; - - Ok(AssignmentWriter { - writer, - previous_sample: Vec::new(), - previous_masks: HashMap::new(), - pending_sample: None, - sample_count: 0, - complete: false, - variant, - }) - } - - /// Encode and write the pending assignment with the accumulated repetition count. - /// - /// For TwoDelta, the first frame is written as an MkvBen frame. Subsequent - /// frames are written as TwoDelta frames encoding the transition from - /// `previous_sample`. This is a no-op when no sample is pending. - /// - /// Note: That on the first call to `flush_pending_frame` when `self.pending_sample` is `None`, - /// the method will simply return `Ok(())` without writing anything. Flushing only happens - /// when there is a pending sample to write. - fn flush_pending_frame(&mut self) -> Result<()> { - let pending_sample = match self.pending_sample.take() { - Some(p) => p, - None => return Ok(()), - }; - - match self.variant { - BenVariant::Standard => { - let frame = - BenEncodeFrame::from_assignment(&pending_sample, BenVariant::Standard, None); - for _ in 0..self.sample_count { - self.writer.write_all(frame.as_slice())?; - } - } - BenVariant::MkvChain => { - let frame = BenEncodeFrame::from_assignment( - &pending_sample, - BenVariant::MkvChain, - Some(self.sample_count), - ); - self.writer.write_all(frame.as_slice())?; - } - BenVariant::TwoDelta => { - if self.previous_sample.is_empty() { - // First frame: encode in MkvChain wire format and build - // the initial position masks. - for (idx, &val) in pending_sample.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - let frame = BenEncodeFrame::from_assignment( - &pending_sample, - BenVariant::MkvChain, - Some(self.sample_count), - ); - self.writer.write_all(frame.as_slice())?; - } else if self.previous_sample == pending_sample { - let frame = twodelta_repeat_frame(&pending_sample, self.sample_count)?; - self.writer.write_all(frame.as_slice())?; - } else { - let frame = encode_twodelta_frame_with_hint( - &self.previous_sample, - &pending_sample, - None, - Some(&mut self.previous_masks), - Some(self.sample_count), - )?; - self.writer.write_all(frame.as_slice())?; - } - } - } - - self.previous_sample = pending_sample; - Ok(()) - } - - /// Encode and write a full assignment vector. - /// - /// Consecutive identical assignments are counted and written as a single - /// frame with the accumulated count for MkvChain and TwoDelta variants. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - if self.pending_sample.as_deref() == Some(assign_vec.as_slice()) { - if self.sample_count == u16::MAX { - self.flush_pending_frame()?; - self.pending_sample = Some(assign_vec); - self.sample_count = 1; - return Ok(()); - } - self.sample_count += 1; - return Ok(()); - } - self.flush_pending_frame()?; - self.pending_sample = Some(assign_vec); - self.sample_count = 1; - Ok(()) - } - - /// Encode and write a JSON assignment record. - /// - /// The input must contain an `assignment` array of integers. Other fields - /// are ignored. - /// - /// # Arguments - /// - /// * `data` - A JSON object containing an `assignment` array. - /// - /// # Returns - /// - /// Returns `Ok(())` after the record has been validated and encoded. - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - let new_assign = parse_json_assignment(data)?; - self.write_assignment(new_assign) - } - - /// Flush any buffered state to the underlying writer. - /// - /// # Returns - /// - /// Returns `Ok(())` once any buffered state has been flushed. - pub fn finish(&mut self) -> Result<()> { - if self.complete { - return Ok(()); - } - self.flush_pending_frame()?; - self.complete = true; - Ok(()) - } -} - -pub(super) fn twodelta_repeat_frame( - assignment: &[u16], - count: u16, -) -> io::Result { - let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; - Ok(BenEncodeFrame::from_run_lengths(pair, run_lengths, Some(count))) -} - -impl Drop for AssignmentWriter { - /// Flush any buffered BEN state during drop. - fn drop(&mut self) { - let _ = self.finish(); - } -} diff --git a/ben/src/io/writer/frame_writer.rs b/ben/src/io/writer/frame_writer.rs deleted file mode 100644 index b542a42..0000000 --- a/ben/src/io/writer/frame_writer.rs +++ /dev/null @@ -1,87 +0,0 @@ -use super::assignment_writer::twodelta_repeat_frame; -use crate::codec::encode::encode_twodelta_frame_with_hint; -use crate::codec::BenEncodeFrame; -use crate::format::banners::banner_for_variant; -use crate::BenVariant; -use std::collections::HashMap; -use std::io::{self, Write}; - -/// A writer that emits one BEN frame per call, preserving input frame -/// boundaries instead of merging adjacent identical assignments. -/// -/// This sidesteps the merge buffer in [`super::AssignmentWriter`]: callers -/// supply a `(assignment, count)` pair and receive one counted frame on the -/// wire. For [`BenVariant::Standard`] targets, which cannot encode -/// repetition counts, a count of `N` is expanded into `N` one-sample frames. -/// -/// For [`BenVariant::TwoDelta`], the writer maintains its own -/// `previous_sample` and `previous_masks` so subsequent frames encode delta -/// transitions identically to `AssignmentWriter`. -pub(crate) struct FrameWriter { - writer: W, - variant: BenVariant, - previous_sample: Vec, - previous_masks: HashMap>, -} - -impl FrameWriter { - pub(crate) fn new(mut writer: W, variant: BenVariant) -> io::Result { - writer.write_all(banner_for_variant(variant))?; - Ok(Self { - writer, - variant, - previous_sample: Vec::new(), - previous_masks: HashMap::new(), - }) - } - - pub(crate) fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { - if count == 0 { - return Ok(()); - } - match self.variant { - BenVariant::Standard => { - let frame = - BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); - for _ in 0..count { - self.writer.write_all(frame.as_slice())?; - } - } - BenVariant::MkvChain => { - let frame = BenEncodeFrame::from_assignment( - &assignment, - BenVariant::MkvChain, - Some(count), - ); - self.writer.write_all(frame.as_slice())?; - } - BenVariant::TwoDelta => { - if self.previous_sample.is_empty() { - for (idx, &val) in assignment.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - let frame = BenEncodeFrame::from_assignment( - &assignment, - BenVariant::MkvChain, - Some(count), - ); - self.writer.write_all(frame.as_slice())?; - } else if self.previous_sample == assignment { - let frame = twodelta_repeat_frame(&assignment, count)?; - self.writer.write_all(frame.as_slice())?; - } else { - let frame = encode_twodelta_frame_with_hint( - &self.previous_sample, - &assignment, - None, - Some(&mut self.previous_masks), - Some(count), - )?; - self.writer.write_all(frame.as_slice())?; - } - self.previous_sample = assignment; - } - } - Ok(()) - } -} diff --git a/ben/src/io/writer/frames.rs b/ben/src/io/writer/frames.rs index 60d4e9e..b3b3130 100644 --- a/ben/src/io/writer/frames.rs +++ b/ben/src/io/writer/frames.rs @@ -1,5 +1,5 @@ /// A buffered delta frame awaiting chunk serialization. -pub(super) struct BufferedDeltaFrame { +pub(crate) struct BufferedDeltaFrame { pub pair: (u16, u16), pub run_lengths: Vec, pub count: u16, diff --git a/ben/src/io/writer/mod.rs b/ben/src/io/writer/mod.rs index 47f3211..c85f444 100644 --- a/ben/src/io/writer/mod.rs +++ b/ben/src/io/writer/mod.rs @@ -1,12 +1,11 @@ -pub mod assignment_writer; -pub(crate) mod frame_writer; pub(crate) mod frames; +pub(crate) mod options; +pub(crate) mod stream_writer; #[cfg(test)] pub(crate) mod tests; pub(crate) mod twodelta; pub(crate) mod utils; -pub mod xz_assignment_writer; -pub use assignment_writer::AssignmentWriter; +pub use options::XzEncodeOptions; +pub use stream_writer::{BenStreamWriter, BenWireFormat}; pub use twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; -pub use xz_assignment_writer::XZAssignmentWriter; diff --git a/ben/src/io/writer/options.rs b/ben/src/io/writer/options.rs new file mode 100644 index 0000000..fb04e31 --- /dev/null +++ b/ben/src/io/writer/options.rs @@ -0,0 +1,62 @@ +//! Encode-side configuration knobs for the unified BEN stream writer. +//! +//! Mirrors the discipline of `RelabelOptions`: a `#[non_exhaustive]` struct +//! with private fields and value-taking builder setters, so adding a knob +//! later is non-breaking. `None` semantically means "use the codec/lzma +//! default" and is distinct from any specific user-provided value; +//! callers who want defaults simply do not call the setter. + +use super::twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; + +/// Encode-side knobs for `BenStreamWriter::for_xben`. +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct XzEncodeOptions { + pub(crate) n_threads: Option, + pub(crate) compression_level: Option, + pub(crate) block_size: Option, + pub(crate) twodelta_chunk_size: usize, +} + +impl XzEncodeOptions { + /// Build the default options. Matches today's `None`/`DEFAULT_TWODELTA_CHUNK_SIZE`. + pub fn new() -> Self { + Self::default() + } + + /// Set the XZ encoder thread count. `0` normalizes to `1`. + pub fn with_n_threads(mut self, n: u32) -> Self { + self.n_threads = Some(n.max(1)); + self + } + + /// Set the XZ compression level. Clamped to `0..=9`. + pub fn with_compression_level(mut self, level: u32) -> Self { + self.compression_level = Some(level.min(9)); + self + } + + /// Set the XZ per-block size in bytes. + pub fn with_block_size(mut self, size: u64) -> Self { + self.block_size = Some(size); + self + } + + /// Set the TwoDelta columnar chunk size. `0` normalizes to `1`. + /// Ignored for Standard and MkvChain XBEN streams. + pub fn with_twodelta_chunk_size(mut self, size: usize) -> Self { + self.twodelta_chunk_size = size.max(1); + self + } +} + +impl Default for XzEncodeOptions { + fn default() -> Self { + Self { + n_threads: None, + compression_level: None, + block_size: None, + twodelta_chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, + } + } +} diff --git a/ben/src/io/writer/stream_writer/ben.rs b/ben/src/io/writer/stream_writer/ben.rs new file mode 100644 index 0000000..b15b2c0 --- /dev/null +++ b/ben/src/io/writer/stream_writer/ben.rs @@ -0,0 +1,136 @@ +//! Plain-BEN encode logic for the unified stream writer. + +use std::collections::HashMap; +use std::io::{self, Write}; + +use crate::codec::encode::encode_twodelta_frame_with_hint; +use crate::codec::BenEncodeFrame; +use crate::BenVariant; + +use super::super::twodelta::twodelta_repeat_runs; + +/// State for the BEN arm. Variant lives here as the single source of truth. +pub(super) struct BenState { + pub(super) writer: W, + pub(super) variant: BenVariant, + pub(super) previous_assignment: Vec, + pub(super) previous_masks: HashMap>, + pub(super) pending_assignment: Option>, + pub(super) pending_count: u16, +} + +impl BenState { + pub(super) fn new(writer: W, variant: BenVariant) -> Self { + Self { + writer, + variant, + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + pending_assignment: None, + pending_count: 0, + } + } + + /// Encode and write the buffered assignment with the accumulated repetition count. + /// No-op when nothing is pending. + pub(super) fn flush_pending_frame(&mut self) -> io::Result<()> { + let pending = match self.pending_assignment.take() { + Some(p) => p, + None => return Ok(()), + }; + let count = self.pending_count; + self.pending_count = 0; + self.encode_and_write_frame(&pending, count)?; + self.previous_assignment = pending; + Ok(()) + } + + /// Encode one `(assignment, count)` directly, used for both flush and `write_frame`. + /// Updates `previous_masks` for TwoDelta. + fn encode_and_write_frame(&mut self, assignment: &[u16], count: u16) -> io::Result<()> { + match self.variant { + BenVariant::Standard => { + let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None); + for _ in 0..count { + self.writer.write_all(frame.as_slice())?; + } + } + BenVariant::MkvChain => { + let frame = + BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count)); + self.writer.write_all(frame.as_slice())?; + } + BenVariant::TwoDelta => { + if self.previous_assignment.is_empty() { + // First frame: encode as MkvChain wire format and seed + // the position masks for subsequent delta frames. + for (idx, &val) in assignment.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } + let frame = BenEncodeFrame::from_assignment( + assignment, + BenVariant::MkvChain, + Some(count), + ); + self.writer.write_all(frame.as_slice())?; + } else if self.previous_assignment.as_slice() == assignment { + let frame = twodelta_repeat_frame(assignment, count)?; + self.writer.write_all(frame.as_slice())?; + } else { + let frame = encode_twodelta_frame_with_hint( + &self.previous_assignment, + assignment, + None, + Some(&mut self.previous_masks), + Some(count), + )?; + self.writer.write_all(frame.as_slice())?; + } + } + } + Ok(()) + } + + pub(super) fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { + if self.pending_count == u16::MAX { + self.flush_pending_frame()?; + self.pending_assignment = Some(assign_vec); + self.pending_count = 1; + return Ok(()); + } + self.pending_count += 1; + return Ok(()); + } + self.flush_pending_frame()?; + self.pending_assignment = Some(assign_vec); + self.pending_count = 1; + Ok(()) + } + + /// Encode one frame with the supplied count, flushing any pending merge state first. + /// Caller has already verified `count != 0` and that the writer is in a valid state. + pub(super) fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { + self.flush_pending_frame()?; + self.encode_and_write_frame(&assignment, count)?; + // For TwoDelta, the next delta is encoded against the just-emitted + // frame. `encode_and_write_frame` already updated `previous_masks` + // when the previous_assignment was empty; in all variants we need + // to update `previous_assignment` here so a subsequent + // `write_assignment` sees the right baseline. + self.previous_assignment = assignment; + Ok(()) + } +} + +pub(crate) fn twodelta_repeat_frame( + assignment: &[u16], + count: u16, +) -> io::Result { + let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; + Ok(BenEncodeFrame::from_run_lengths( + pair, + run_lengths, + Some(count), + )) +} diff --git a/ben/src/io/writer/stream_writer/mod.rs b/ben/src/io/writer/stream_writer/mod.rs new file mode 100644 index 0000000..8be87f0 --- /dev/null +++ b/ben/src/io/writer/stream_writer/mod.rs @@ -0,0 +1,300 @@ +//! Unified writer for the BEN-stack stream layer (layer 3 — see +//! `docs/glossary.md`). +//! +//! Hides the wire-format choice (BEN bit-packed vs ben32 / XBEN columnar) +//! and the transport choice (plain vs xz-compressed) behind one type. + +mod ben; +mod xben; + +#[cfg(test)] +pub(super) mod test_helpers { + pub(crate) use super::ben::twodelta_repeat_frame; + pub(crate) use super::xben::twodelta_repeat_buffered_frame; +} + +use std::io::{self, BufRead, Write}; + +use serde_json::Value; +use xz2::stream::Stream; +use xz2::write::XzEncoder; + +use crate::codec::encode::xz::{build_mt_stream, resolve_threads}; +use crate::format::banners::banner_for_variant; +use crate::BenVariant; + +use super::options::XzEncodeOptions; +use super::utils::parse_json_assignment; + +pub use crate::io::reader::BenWireFormat; + +use ben::BenState; +use xben::XBenInner; + +/// Writer for an encoded BEN-stack stream of samples (layer 3 — see +/// `docs/glossary.md`). +/// +/// Construct with [`BenStreamWriter::for_ben`] for plain BEN or +/// [`BenStreamWriter::for_xben`] for XBEN. `write_assignment` is available +/// on both arms; `write_frame` is plain-BEN-only and preserves one frame +/// boundary per call. Calling `write_frame` on an XBEN writer returns +/// `InvalidInput`. +pub struct BenStreamWriter { + inner: BenStreamInner, + state: WriterState, + /// Tracks whether any sample-writing or direct-ingest operation has + /// touched the writer. `ingest_ben_stream` requires this to be `false`. + body_started: bool, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum WriterState { + Open, + BodyClosed, + Complete, + Failed, +} + +enum BenStreamInner { + Ben(BenState), + XBen(Box>), +} + +impl BenStreamWriter { + /// Open a plain-BEN writer. Emits the BEN banner immediately. + /// + /// On error, the underlying `writer` is dropped — no partial + /// `BenStreamWriter` is returned. The caller treats the output as + /// failed and discards. + pub fn for_ben(mut writer: W, variant: BenVariant) -> io::Result { + writer.write_all(banner_for_variant(variant))?; + Ok(Self { + inner: BenStreamInner::Ben(BenState::new(writer, variant)), + state: WriterState::Open, + body_started: false, + }) + } + + /// Open an XBEN writer. Builds the xz encoder from `options` and emits + /// the BEN banner inside the compressed stream. + pub fn for_xben( + writer: W, + variant: BenVariant, + options: XzEncodeOptions, + ) -> io::Result { + let n_cpus = resolve_threads(options.n_threads); + let level = options.compression_level.unwrap_or(9).min(9); + let mt: Stream = build_mt_stream(n_cpus, level, options.block_size)?; + let encoder = XzEncoder::new_stream(writer, mt); + Self::for_xben_with_encoder(encoder, variant, Some(options.twodelta_chunk_size)) + } + + /// Open an XBEN writer around an already-built xz encoder. Used by codec + /// plumbing that constructs encoders explicitly. The TwoDelta chunk + /// size is passed independently because compression options have + /// already been consumed building the encoder; `None` means default. + pub(crate) fn for_xben_with_encoder( + mut encoder: XzEncoder, + variant: BenVariant, + twodelta_chunk_size: Option, + ) -> io::Result { + encoder.write_all(banner_for_variant(variant))?; + let chunk_size = twodelta_chunk_size + .unwrap_or(super::twodelta::DEFAULT_TWODELTA_CHUNK_SIZE) + .max(1); + Ok(Self { + inner: BenStreamInner::XBen(Box::new(XBenInner::new(encoder, variant, chunk_size))), + state: WriterState::Open, + body_started: false, + }) + } + + /// The BEN variant of this stream. + pub fn variant(&self) -> BenVariant { + match &self.inner { + BenStreamInner::Ben(b) => b.variant, + BenStreamInner::XBen(x) => x.variant(), + } + } + + /// The wire format (BEN vs XBEN) of this stream. + pub fn wire_format(&self) -> BenWireFormat { + match &self.inner { + BenStreamInner::Ben(_) => BenWireFormat::Ben, + BenStreamInner::XBen(_) => BenWireFormat::XBen, + } + } + + /// Encode one assignment vector. Count-capable formats buffer + /// adjacent-equal assignments into counted frames; XBEN-Standard writes + /// each assignment immediately, and Standard BEN expands buffered + /// counts into one-sample frames on flush. + pub fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + match self.state { + WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { + return Err(invalid_input("writer is not in a state that accepts samples")); + } + WriterState::Open => {} + } + + self.body_started = true; + let result = match &mut self.inner { + BenStreamInner::Ben(b) => b.write_assignment(assign_vec), + BenStreamInner::XBen(x) => x.write_assignment(assign_vec), + }; + if result.is_err() { + self.state = WriterState::Failed; + } + result + } + + /// Plain-BEN only: encode one assignment vector with a caller-supplied + /// count. MkvChain/TwoDelta emit one counted frame; Standard expands + /// `count` into one-sample frames. + /// + /// Guard order: writer-state, then mode, then zero-count no-op, then + /// the stateful flush/encode path. + pub fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { + match self.state { + WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { + return Err(invalid_input("writer is not in a state that accepts frames")); + } + WriterState::Open => {} + } + let ben = match &mut self.inner { + BenStreamInner::Ben(b) => b, + BenStreamInner::XBen(_) => { + return Err(invalid_input("write_frame is plain-BEN-only")); + } + }; + if count == 0 { + return Ok(()); + } + + self.body_started = true; + let result = ben.write_frame(assignment, count); + if result.is_err() { + self.state = WriterState::Failed; + } + result + } + + /// Encode one JSON assignment record. + pub fn write_json_value(&mut self, data: Value) -> io::Result<()> { + match self.state { + WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { + return Err(invalid_input("writer is not in a state that accepts samples")); + } + WriterState::Open => {} + } + // JSON parse is preflight: failure does not poison. + let new_assign = parse_json_assignment(data)?; + // From here on, we are in the stateful encode path. + self.body_started = true; + let result = match &mut self.inner { + BenStreamInner::Ben(b) => b.write_assignment(new_assign), + BenStreamInner::XBen(x) => x.write_assignment(new_assign), + }; + if result.is_err() { + self.state = WriterState::Failed; + } + result + } + + /// Crate-private XBEN-only direct ingest. Fresh-writer-only and terminal + /// for sample writes: on success the writer transitions to `BodyClosed` + /// and only `finish()` remains valid. + pub(crate) fn ingest_ben_stream(&mut self, reader: impl BufRead) -> io::Result<()> { + match self.state { + WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { + return Err(invalid_input( + "writer is not in a state that accepts ingest", + )); + } + WriterState::Open => {} + } + let xben = match &mut self.inner { + BenStreamInner::Ben(_) => { + return Err(invalid_input("ingest_ben_stream requires XBEN mode")); + } + BenStreamInner::XBen(x) => x, + }; + if self.body_started { + return Err(invalid_input( + "ingest_ben_stream requires a fresh writer with no prior sample writes", + )); + } + + self.body_started = true; + let result = xben.ingest_ben_stream(reader); + match result { + Ok(()) => { + self.state = WriterState::BodyClosed; + Ok(()) + } + Err(e) => { + self.state = WriterState::Failed; + Err(e) + } + } + } + + /// Flush buffered BEN/XBEN state and finalize the underlying compressed + /// stream when present. Valid from `Open` and `BodyClosed`. Repeated + /// `finish()` after success returns `Ok(())`. Once finalization enters + /// the stateful path, any encode/writer/encoder error transitions the + /// writer to `Failed`; subsequent calls return `InvalidInput`. + pub fn finish(&mut self) -> io::Result<()> { + match self.state { + WriterState::Complete => return Ok(()), + WriterState::Failed => { + return Err(invalid_input("writer was poisoned by a prior error")); + } + WriterState::Open | WriterState::BodyClosed => {} + } + + let result: io::Result<()> = match &mut self.inner { + BenStreamInner::Ben(b) => { + if self.state == WriterState::Open { + b.flush_pending_frame() + } else { + Ok(()) + } + } + BenStreamInner::XBen(x) => { + let flush_res = if self.state == WriterState::Open { + x.flush() + } else { + Ok(()) + }; + match flush_res { + Ok(()) => x.encoder.try_finish(), + Err(e) => Err(e), + } + } + }; + + match result { + Ok(()) => { + self.state = WriterState::Complete; + Ok(()) + } + Err(e) => { + self.state = WriterState::Failed; + Err(e) + } + } + } +} + +impl Drop for BenStreamWriter { + fn drop(&mut self) { + if matches!(self.state, WriterState::Open | WriterState::BodyClosed) { + let _ = self.finish(); + } + } +} + +fn invalid_input(msg: &'static str) -> io::Error { + io::Error::new(io::ErrorKind::InvalidInput, msg) +} diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs new file mode 100644 index 0000000..c9593e0 --- /dev/null +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -0,0 +1,420 @@ +//! XBEN encode logic for the unified stream writer. + +use std::collections::HashMap; +use std::io::{self, BufRead, Read, Write}; + +use byteorder::{BigEndian, ReadBytesExt}; +use xz2::write::XzEncoder; + +use crate::codec::decode::decode_ben_line; +use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; +use crate::codec::translate::ben_to_ben32_lines; +use crate::codec::BenEncodeFrame; +use crate::format::banners::{has_known_banner_prefix, BANNER_LEN}; +use crate::progress::Spinner; +use crate::BenVariant; + +use super::super::frames::BufferedDeltaFrame; +use super::super::twodelta::{ + twodelta_repeat_runs, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, +}; +use super::super::utils::encode_xben_twodelta_full_frame; + +/// XBEN-arm state. Owns the xz encoder and a per-variant inner state. +pub(super) struct XBenInner { + pub(super) encoder: XzEncoder, + pub(super) state: XBenState, +} + +/// Per-variant inner state. Variant lives here as the single source of truth. +pub(super) enum XBenState { + Standard, + MkvChain { + pending_assignment: Option>, + pending_count: u16, + }, + TwoDelta { + previous_assignment: Vec, + previous_masks: HashMap>, + pending_initial_full_assignment: Option>, + pending_initial_full_count: u16, + twodelta_chunk_size: usize, + chunk_buffer: Vec, + }, +} + +impl XBenState { + pub(super) fn new(variant: BenVariant, twodelta_chunk_size: usize) -> Self { + match variant { + BenVariant::Standard => XBenState::Standard, + BenVariant::MkvChain => XBenState::MkvChain { + pending_assignment: None, + pending_count: 0, + }, + BenVariant::TwoDelta => XBenState::TwoDelta { + previous_assignment: Vec::new(), + previous_masks: HashMap::new(), + pending_initial_full_assignment: None, + pending_initial_full_count: 0, + twodelta_chunk_size, + chunk_buffer: Vec::new(), + }, + } + } + + pub(super) fn variant(&self) -> BenVariant { + match self { + XBenState::Standard => BenVariant::Standard, + XBenState::MkvChain { .. } => BenVariant::MkvChain, + XBenState::TwoDelta { .. } => BenVariant::TwoDelta, + } + } +} + +impl XBenInner { + pub(super) fn new(encoder: XzEncoder, variant: BenVariant, twodelta_chunk_size: usize) -> Self { + Self { + encoder, + state: XBenState::new(variant, twodelta_chunk_size), + } + } + + pub(super) fn variant(&self) -> BenVariant { + self.state.variant() + } + + pub(super) fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { + match &mut self.state { + XBenState::Standard => { + let encoded = encode_ben32_assignments(&assign_vec)?; + self.encoder.write_all(&encoded)?; + } + XBenState::MkvChain { + pending_assignment, + pending_count, + } => { + if pending_assignment.as_deref() == Some(assign_vec.as_slice()) { + if *pending_count == u16::MAX { + flush_mkv_pending(&mut self.encoder, pending_assignment, pending_count)?; + *pending_assignment = Some(assign_vec); + *pending_count = 1; + return Ok(()); + } + *pending_count += 1; + return Ok(()); + } + flush_mkv_pending(&mut self.encoder, pending_assignment, pending_count)?; + *pending_assignment = Some(assign_vec); + *pending_count = 1; + } + XBenState::TwoDelta { + previous_assignment, + previous_masks, + pending_initial_full_assignment, + pending_initial_full_count, + twodelta_chunk_size, + chunk_buffer, + } => { + // First assignment ever: buffer as the initial full frame. + if pending_initial_full_assignment.is_none() && previous_assignment.is_empty() { + *pending_initial_full_assignment = Some(assign_vec); + *pending_initial_full_count = 1; + return Ok(()); + } + // Repeat of the pending initial full frame. + if pending_initial_full_assignment.as_deref() == Some(assign_vec.as_slice()) { + if *pending_initial_full_count == u16::MAX { + flush_twodelta_initial( + &mut self.encoder, + pending_initial_full_assignment, + pending_initial_full_count, + previous_assignment, + previous_masks, + )?; + let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; + chunk_buffer.push(repeat); + *previous_assignment = assign_vec; + return Ok(()); + } + *pending_initial_full_count += 1; + return Ok(()); + } + // Repeat of the last delta frame in the current chunk. + if !chunk_buffer.is_empty() + && previous_assignment.as_slice() == assign_vec.as_slice() + { + if chunk_buffer.last().unwrap().count == u16::MAX { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; + chunk_buffer.push(repeat); + } else { + chunk_buffer.last_mut().unwrap().count += 1; + } + return Ok(()); + } + // New distinct assignment: flush the initial full frame if pending. + if pending_initial_full_assignment.is_some() { + flush_twodelta_initial( + &mut self.encoder, + pending_initial_full_assignment, + pending_initial_full_count, + previous_assignment, + previous_masks, + )?; + } + // Encode the delta frame and add it to the chunk buffer. + let frame = encode_twodelta_frame_with_hint( + &*previous_assignment, + &assign_vec, + None, + Some(previous_masks), + None, + )?; + let (pair, run_lengths) = match frame { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!( + "encode_twodelta_frame_with_hint always returns the TwoDelta arm" + ), + }; + chunk_buffer.push(BufferedDeltaFrame { + pair, + run_lengths, + count: 1, + }); + *previous_assignment = assign_vec; + if chunk_buffer.len() >= *twodelta_chunk_size { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + } + } + } + Ok(()) + } + + /// Flush all buffered XBEN state in preparation for `try_finish`. + pub(super) fn flush(&mut self) -> io::Result<()> { + match &mut self.state { + XBenState::Standard => Ok(()), + XBenState::MkvChain { + pending_assignment, + pending_count, + } => flush_mkv_pending(&mut self.encoder, pending_assignment, pending_count), + XBenState::TwoDelta { + previous_assignment, + previous_masks, + pending_initial_full_assignment, + pending_initial_full_count, + chunk_buffer, + .. + } => { + flush_twodelta_initial( + &mut self.encoder, + pending_initial_full_assignment, + pending_initial_full_count, + previous_assignment, + previous_masks, + )?; + flush_chunk_inner(&mut self.encoder, chunk_buffer) + } + } + } + + /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without + /// materializing full assignment vectors. + fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> io::Result<()> { + let chunk_size = match &self.state { + XBenState::TwoDelta { + twodelta_chunk_size, + .. + } => *twodelta_chunk_size, + _ => unreachable!("translate_ben_twodelta_to_xben requires TwoDelta state"), + }; + let chunk_buffer = match &mut self.state { + XBenState::TwoDelta { chunk_buffer, .. } => chunk_buffer, + _ => unreachable!(), + }; + + // First frame: standard BEN RLE → XBEN full frame. + let max_val_bits = reader.read_u8()?; + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + let first_count = reader.read_u16::()?; + + let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); + encoded.push(XBEN_TWODELTA_FULL_TAG); + encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for &(value, len) in &runs { + encoded.extend_from_slice(&value.to_be_bytes()); + encoded.extend_from_slice(&len.to_be_bytes()); + } + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&first_count.to_be_bytes())?; + + let mut sample_count = first_count as usize; + let spinner = Spinner::new("Encoding line"); + spinner.set_count(sample_count as u64); + + // Delta frames: unpack bitpacked run lengths and buffer into chunks. + loop { + let pair_a = match reader.read_u16::() { + Ok(v) => v, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + }; + let pair_b = reader.read_u16::()?; + let delta_max_len_bits = reader.read_u8()?; + let delta_n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; delta_n_bytes as usize]; + reader.read_exact(&mut payload)?; + let count = reader.read_u16::()?; + + let (pair, run_lengths) = match BenEncodeFrame::from_parts( + (pair_a, pair_b), + delta_max_len_bits, + payload, + count, + ) { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), + }; + + chunk_buffer.push(BufferedDeltaFrame { + pair, + run_lengths, + count, + }); + + if chunk_buffer.len() >= chunk_size { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + } + + sample_count += count as usize; + spinner.set_count(sample_count as u64); + } + + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + Ok(()) + } + + /// Crate-private direct ingest entry point. + /// + /// Standard/MkvChain accept bannered or bannerless input; TwoDelta + /// requires a banner. + pub(super) fn ingest_ben_stream(&mut self, mut reader: impl BufRead) -> io::Result<()> { + let peek = reader.fill_buf()?; + let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); + + let variant = self.variant(); + + if has_banner { + if variant == BenVariant::TwoDelta { + reader.consume(BANNER_LEN); + return self.translate_ben_twodelta_to_xben(reader); + } + reader.consume(BANNER_LEN); + } + + if variant == BenVariant::TwoDelta { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN translation requires a BEN stream with its banner", + )); + } + + ben_to_ben32_lines(&mut reader, &mut self.encoder, variant) + } +} + +fn flush_mkv_pending( + encoder: &mut XzEncoder, + pending_assignment: &mut Option>, + pending_count: &mut u16, +) -> io::Result<()> { + let pending = match pending_assignment.take() { + Some(p) => p, + None => return Ok(()), + }; + let count = *pending_count; + *pending_count = 0; + let encoded = encode_ben32_assignments(&pending)?; + encoder.write_all(&encoded)?; + encoder.write_all(&count.to_be_bytes())?; + Ok(()) +} + +fn flush_twodelta_initial( + encoder: &mut XzEncoder, + pending_initial_full_assignment: &mut Option>, + pending_initial_full_count: &mut u16, + previous_assignment: &mut Vec, + previous_masks: &mut HashMap>, +) -> io::Result<()> { + let pending = match pending_initial_full_assignment.take() { + Some(p) => p, + None => return Ok(()), + }; + let count = *pending_initial_full_count; + *pending_initial_full_count = 0; + + for (idx, &val) in pending.iter().enumerate() { + previous_masks.entry(val).or_default().push(idx); + } + let encoded = encode_xben_twodelta_full_frame(&pending); + encoder.write_all(&encoded)?; + encoder.write_all(&count.to_be_bytes())?; + *previous_assignment = pending; + Ok(()) +} + +fn flush_chunk_inner( + encoder: &mut XzEncoder, + chunk_buffer: &mut Vec, +) -> io::Result<()> { + if chunk_buffer.is_empty() { + return Ok(()); + } + + let n = chunk_buffer.len() as u32; + encoder.write_all(&[XBEN_TWODELTA_CHUNK_TAG])?; + encoder.write_all(&n.to_be_bytes())?; + + for frame in chunk_buffer.iter() { + encoder.write_all(&frame.pair.0.to_be_bytes())?; + encoder.write_all(&frame.pair.1.to_be_bytes())?; + } + for frame in chunk_buffer.iter() { + encoder.write_all(&frame.count.to_be_bytes())?; + } + for frame in chunk_buffer.iter() { + encoder.write_all(&(frame.run_lengths.len() as u32).to_be_bytes())?; + } + for frame in chunk_buffer.iter() { + for &rl in &frame.run_lengths { + encoder.write_all(&rl.to_be_bytes())?; + } + } + + chunk_buffer.clear(); + Ok(()) +} + +pub(crate) fn twodelta_repeat_buffered_frame( + assignment: &[u16], + count: u16, +) -> io::Result { + let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; + Ok(BufferedDeltaFrame { + pair, + run_lengths, + count, + }) +} diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index d4052a0..301a892 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -1,17 +1,28 @@ use crate::io::reader::BenStreamReader; -use crate::io::writer::XZAssignmentWriter; +use crate::io::writer::BenStreamWriter; use crate::BenVariant; use std::io::Cursor; use xz2::write::XzEncoder; +/// Build a `BenStreamWriter` over an explicit single-thread XZ encoder so +/// the resulting xben byte stream is deterministic and small. +fn build_xben_writer<'a>( + out: &'a mut Vec, + variant: BenVariant, + chunk_size: Option, +) -> BenStreamWriter<&'a mut Vec> { + let encoder = XzEncoder::new(out, 1); + BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size).unwrap() +} + fn roundtrip_xben(assignments: &[Vec], variant: BenVariant) -> Vec> { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + let mut writer = build_xben_writer(&mut xben, variant, None); for a in assignments { writer.write_assignment(a.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); reader.map(|r| r.unwrap().0).collect() @@ -20,11 +31,11 @@ fn roundtrip_xben(assignments: &[Vec], variant: BenVariant) -> Vec fn roundtrip_xben_counts(assignments: &[Vec], variant: BenVariant) -> Vec<(Vec, u16)> { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, variant).unwrap(); + let mut writer = build_xben_writer(&mut xben, variant, None); for a in assignments { writer.write_assignment(a.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); reader.map(|r| r.unwrap()).collect() @@ -121,13 +132,11 @@ fn writer_twodelta_chunk_size_1() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(1); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(1)); for a in &assignments { writer.write_assignment(a.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); @@ -142,13 +151,11 @@ fn writer_twodelta_chunk_size_larger_than_stream() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(1_000_000); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(1_000_000)); for a in &assignments { writer.write_assignment(a.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); @@ -181,13 +188,10 @@ fn writer_twodelta_u16_max_value_in_assignment() { } } -// ── BEN AssignmentWriter TwoDelta repeat frame ────────────────────── +// ── BEN BenStreamWriter TwoDelta repeat frame ──────────────────────── #[test] fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { - use crate::io::reader::BenStreamReader; - use crate::io::writer::AssignmentWriter; - // Assignment with 3 distinct values exercises the `continue` skip path // inside `twodelta_repeat_frame` for values outside the picked pair. let assign = vec![1u16, 2, 3, 1, 2]; @@ -195,10 +199,11 @@ fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for _ in 0..n { w.write_assignment(assign.clone()).unwrap(); } + w.finish().unwrap(); } let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); @@ -214,14 +219,14 @@ fn writer_twodelta_write_json_value() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); writer .write_json_value(json!({"assignment": [1, 2, 1, 2]})) .unwrap(); writer .write_json_value(json!({"assignment": [2, 1, 2, 1]})) .unwrap(); + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); @@ -234,8 +239,7 @@ fn writer_twodelta_write_json_value() { fn writer_finish_is_idempotent() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); writer.write_assignment(vec![1u16, 2, 3, 4]).unwrap(); writer.finish().unwrap(); writer.finish().unwrap(); @@ -245,10 +249,10 @@ fn writer_finish_is_idempotent() { assert_eq!(results, vec![vec![1u16, 2, 3, 4]]); } -// ── write_ben_file translation ──────────────────────────────────────── +// ── ingest_ben_stream translation ───────────────────────────────────── #[test] -fn writer_write_ben_file_standard_roundtrip() { +fn writer_ingest_ben_stream_standard_roundtrip() { use crate::codec::encode::encode_jsonl_to_ben; use std::io::BufReader; @@ -260,10 +264,9 @@ fn writer_write_ben_file_standard_roundtrip() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::Standard).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::Standard, None); writer - .write_ben_file(BufReader::new(ben.as_slice())) + .ingest_ben_stream(BufReader::new(ben.as_slice())) .unwrap(); writer.finish().unwrap(); } @@ -274,7 +277,7 @@ fn writer_write_ben_file_standard_roundtrip() { } #[test] -fn writer_write_ben_file_mkv_roundtrip() { +fn writer_ingest_ben_stream_mkv_roundtrip() { use crate::codec::encode::encode_jsonl_to_ben; use std::io::BufReader; @@ -287,10 +290,9 @@ fn writer_write_ben_file_mkv_roundtrip() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::MkvChain, None); writer - .write_ben_file(BufReader::new(ben.as_slice())) + .ingest_ben_stream(BufReader::new(ben.as_slice())) .unwrap(); writer.finish().unwrap(); } @@ -302,26 +304,25 @@ fn writer_write_ben_file_mkv_roundtrip() { } #[test] -fn writer_write_ben_file_twodelta_roundtrip() { - use crate::io::writer::AssignmentWriter; +fn writer_ingest_ben_stream_twodelta_roundtrip() { use std::io::BufReader; let assignments = vec![vec![1u16, 2, 1, 2], vec![1, 1, 2, 2], vec![2, 1, 2, 1]]; let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { w.write_assignment(a.clone()).unwrap(); } + w.finish().unwrap(); } let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); writer - .write_ben_file(BufReader::new(ben.as_slice())) + .ingest_ben_stream(BufReader::new(ben.as_slice())) .unwrap(); writer.finish().unwrap(); } @@ -332,15 +333,14 @@ fn writer_write_ben_file_twodelta_roundtrip() { } #[test] -fn writer_write_ben_file_twodelta_rejects_bannerless() { +fn writer_ingest_ben_stream_twodelta_rejects_bannerless() { use std::io::BufReader; let mut xben = Vec::new(); - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); let no_banner = vec![0u8; 50]; let err = writer - .write_ben_file(BufReader::new(no_banner.as_slice())) + .ingest_ben_stream(BufReader::new(no_banner.as_slice())) .unwrap_err(); assert!( err.to_string().contains("banner") @@ -380,11 +380,11 @@ fn writer_twodelta_anchor_count_overflow_u16max() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); for _ in 0..n { writer.write_assignment(assign.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); @@ -399,14 +399,12 @@ fn writer_twodelta_delta_count_overflow_u16max() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(n_delta + 1); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(n_delta + 1)); writer.write_assignment(anchor.clone()).unwrap(); for _ in 0..n_delta { writer.write_assignment(delta.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); @@ -414,11 +412,10 @@ fn writer_twodelta_delta_count_overflow_u16max() { assert_eq!(total, n_delta + 1); } -// ── TwoDelta translate via write_ben_file with chunk flush ─────────── +// ── TwoDelta translate via ingest_ben_stream with chunk flush ──────── #[test] fn writer_translate_ben_twodelta_chunk_flush() { - use crate::io::writer::AssignmentWriter; use std::io::BufReader; let a = vec![1u16, 1, 2, 2]; @@ -429,20 +426,18 @@ fn writer_translate_ben_twodelta_chunk_flush() { let mut ben = Vec::new(); { - let mut w = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in &assignments { w.write_assignment(a.clone()).unwrap(); } + w.finish().unwrap(); } let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta) - .unwrap() - .with_chunk_size(5); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(5)); writer - .write_ben_file(BufReader::new(ben.as_slice())) + .ingest_ben_stream(BufReader::new(ben.as_slice())) .unwrap(); writer.finish().unwrap(); } @@ -456,12 +451,12 @@ fn writer_translate_ben_twodelta_chunk_flush() { #[test] fn xz_writer_twodelta_too_many_ids_propagates_on_write() { - // Writing a third assignment that changes 3 distinct IDs errors at line 228. + // Writing a third assignment that changes 3 distinct IDs errors at the + // TwoDelta encode boundary. let anchor = vec![1u16, 1, 2, 2]; let invalid = vec![2u16, 3, 1, 3]; // 3 distinct changing ids let mut xben = Vec::new(); - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); writer.write_assignment(anchor).unwrap(); let err = writer.write_assignment(invalid).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -476,11 +471,11 @@ fn writer_mkv_count_overflow_u16max() { let mut xben = Vec::new(); { - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::MkvChain).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::MkvChain, None); for _ in 0..n { writer.write_assignment(assign.clone()).unwrap(); } + writer.finish().unwrap(); } let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); @@ -491,7 +486,7 @@ fn writer_mkv_count_overflow_u16max() { #[test] fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { - use super::assignment_writer::twodelta_repeat_frame; + use super::stream_writer::test_helpers::{twodelta_repeat_buffered_frame, twodelta_repeat_frame}; use std::io; // All-identical-value assignment with 65536 elements: the pair-position @@ -500,14 +495,8 @@ fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { let err = twodelta_repeat_frame(&assign, 1).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); assert!(err.to_string().contains("u16::MAX")); -} - -#[test] -fn twodelta_repeat_buffered_frame_run_exceeds_u16_max_errors() { - use super::xz_assignment_writer::twodelta_repeat_buffered_frame; - use std::io; - let assign = vec![1u16; 65536]; + // The XBEN buffered variant must error the same way. let result = twodelta_repeat_buffered_frame(&assign, 1); let err = result.err().expect("expected error"); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); @@ -518,12 +507,11 @@ fn twodelta_repeat_buffered_frame_run_exceeds_u16_max_errors() { fn translate_twodelta_non_eof_read_error_propagates() { use std::io::{self, Read}; - // write_ben_file in TwoDelta mode calls translate_ben_twodelta_to_xben. + // ingest_ben_stream in TwoDelta mode calls translate_ben_twodelta_to_xben. // After reading the anchor frame it loops reading delta frames; a // non-EOF error on pair_a (first u16 read in the loop) must propagate. let mut xben = Vec::new(); - let encoder = XzEncoder::new(&mut xben, 1); - let mut writer = XZAssignmentWriter::new(encoder, BenVariant::TwoDelta).unwrap(); + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); // Banner (17 bytes) + minimal anchor frame: // max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1 @@ -538,6 +526,286 @@ fn translate_twodelta_non_eof_read_error_propagates() { } let reader = std::io::BufReader::new(input.as_slice().chain(ErrorAfterEof)); - let err = writer.write_ben_file(reader).unwrap_err(); + let err = writer.ingest_ben_stream(reader).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } + +// ── BEN write_frame mixing with write_assignment ──────────────────── + +#[test] +fn ben_write_frame_then_write_assignment_mixed_standard() { + let a = vec![1u16, 2, 3]; + let b = vec![4u16, 5, 6]; + + let mut ben = Vec::new(); + { + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::Standard).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.write_frame(b.clone(), 3).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); + let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); + assert_eq!(total, 2 + 3 + 1); +} + +#[test] +fn ben_write_frame_then_write_assignment_mixed_mkv() { + let a = vec![1u16, 2, 3]; + let b = vec![4u16, 5, 6]; + + let mut ben = Vec::new(); + { + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.write_frame(b.clone(), 3).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); + let records: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = records.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 6); + // Expect three counted frames: (a, 2), (b, 3), (a, 1). + assert_eq!(records.len(), 3); + assert_eq!(records[0], (a.clone(), 2)); + assert_eq!(records[1], (b.clone(), 3)); + assert_eq!(records[2], (a.clone(), 1)); +} + +#[test] +fn ben_write_frame_zero_count_is_noop_and_does_not_flush() { + // write_assignment(a); write_frame(b, 0); write_assignment(a) should + // act like two adjacent write_assignment(a) calls — no inserted + // frame boundary. + let a = vec![1u16, 2, 3]; + let b = vec![4u16, 5, 6]; + + let mut ben = Vec::new(); + { + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.write_frame(b.clone(), 0).unwrap(); + w.write_assignment(a.clone()).unwrap(); + w.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); + let records: Vec<_> = reader.map(|r| r.unwrap()).collect(); + assert_eq!(records.len(), 1); + assert_eq!(records[0], (a.clone(), 2)); +} + +#[test] +fn ben_twodelta_first_call_write_frame_emits_anchor_with_count() { + let a = vec![1u16, 2, 1, 2]; + let mut ben = Vec::new(); + { + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + w.write_frame(a.clone(), 3).unwrap(); + w.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); + let records: Vec<_> = reader.map(|r| r.unwrap()).collect(); + let total: usize = records.iter().map(|(_, c)| *c as usize).sum(); + assert_eq!(total, 3); + for (v, _) in &records { + assert_eq!(v, &a); + } +} + +#[test] +fn ben_twodelta_write_frame_updates_previous_assignment_for_next_delta() { + let a = vec![1u16, 2, 1, 2]; + let b = vec![2u16, 1, 2, 1]; + let mut ben = Vec::new(); + { + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + w.write_frame(a.clone(), 3).unwrap(); + w.write_assignment(b.clone()).unwrap(); + w.finish().unwrap(); + } + // Round-trip must reproduce the inputs, which proves the delta against + // the emitted anchor was encoded correctly. + let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); + let mut samples: Vec> = Vec::new(); + reader + .for_each_assignment(|assignment, count| { + for _ in 0..count { + samples.push(assignment.to_vec()); + } + Ok(true) + }) + .unwrap(); + assert_eq!(samples.len(), 4); + for v in &samples[..3] { + assert_eq!(v, &a); + } + assert_eq!(&samples[3], &b); +} + +#[test] +fn write_frame_on_xben_returns_invalid_input() { + let mut xben = Vec::new(); + let mut writer = build_xben_writer(&mut xben, BenVariant::MkvChain, None); + let err = writer.write_frame(vec![1u16, 2, 3], 1).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +// ── finish + body-state guards ─────────────────────────────────────── + +#[test] +fn xben_finish_emits_complete_stream_before_drop() { + let mut xben = Vec::new(); + let mut writer = build_xben_writer(&mut xben, BenVariant::MkvChain, None); + writer.write_assignment(vec![1u16, 2, 3]).unwrap(); + writer.finish().unwrap(); + // Repeated finish after success returns Ok. + writer.finish().unwrap(); + drop(writer); + + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![vec![1u16, 2, 3]]); +} + +#[test] +fn write_methods_after_finish_return_invalid_input() { + let mut ben = Vec::new(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); + w.write_assignment(vec![1u16, 2, 3]).unwrap(); + w.finish().unwrap(); + let err = w.write_assignment(vec![4u16, 5, 6]).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = w.write_frame(vec![4u16, 5, 6], 1).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = w + .write_json_value(serde_json::json!({"assignment": [4, 5, 6]})) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +#[test] +fn write_frame_after_finish_with_zero_count_still_returns_invalid_input() { + // Pin guard ordering: finished/wrong-mode checks happen before the + // zero-count no-op. + let mut ben = Vec::new(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); + w.finish().unwrap(); + let err = w.write_frame(vec![1u16, 2, 3], 0).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +#[test] +fn write_frame_zero_count_on_xben_returns_invalid_input() { + // Guard ordering: wrong-mode check happens before zero-count no-op. + let mut xben = Vec::new(); + let mut w = build_xben_writer(&mut xben, BenVariant::MkvChain, None); + let err = w.write_frame(vec![1u16, 2, 3], 0).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +#[test] +fn ingest_ben_stream_lifecycle_terminal_for_sample_writes() { + use crate::codec::encode::encode_jsonl_to_ben; + use std::io::BufReader; + + let jsonl = r#"{"assignment":[1,2,3],"sample":1} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::Standard).unwrap(); + + let mut xben = Vec::new(); + let mut writer = build_xben_writer(&mut xben, BenVariant::Standard, None); + writer + .ingest_ben_stream(BufReader::new(ben.as_slice())) + .unwrap(); + + // Subsequent sample writes must be rejected; finish() still succeeds. + let err = writer.write_assignment(vec![1u16, 2, 3]).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = writer + .write_json_value(serde_json::json!({"assignment": [1, 2, 3]})) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = writer + .ingest_ben_stream(BufReader::new(b"".as_slice())) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + + writer.finish().unwrap(); +} + +#[test] +fn ingest_ben_stream_rejects_non_fresh_writer() { + use std::io::BufReader; + + let mut xben = Vec::new(); + let mut writer = build_xben_writer(&mut xben, BenVariant::Standard, None); + writer.write_assignment(vec![1u16, 2, 3]).unwrap(); + let err = writer + .ingest_ben_stream(BufReader::new(b"".as_slice())) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +#[test] +fn ingest_ben_stream_rejects_ben_mode_writer() { + use std::io::BufReader; + + let mut ben = Vec::new(); + let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::Standard).unwrap(); + let err = w + .ingest_ben_stream(BufReader::new(b"".as_slice())) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} + +// ── Fail-hard gate: poisoning on encode pipeline error ─────────────── + +#[test] +fn ben_writer_failed_state_after_underlying_writer_error() { + // The banner write happens during construction; constructor failure + // bypasses WriterState entirely. To exercise the post-construction + // poisoning path we wrap a buffer that accepts only the 17 banner + // bytes and errors on subsequent writes. + struct FailAfterN { + buf: Vec, + n: usize, + } + impl std::io::Write for FailAfterN { + fn write(&mut self, b: &[u8]) -> std::io::Result { + if self.buf.len() + b.len() > self.n { + return Err(std::io::Error::other("boom")); + } + self.buf.extend_from_slice(b); + Ok(b.len()) + } + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + let mut w = BenStreamWriter::for_ben( + FailAfterN { + buf: Vec::new(), + n: 17, + }, + BenVariant::MkvChain, + ) + .unwrap(); + // First call buffers the assignment as pending; no IO yet. + w.write_assignment(vec![1u16, 2, 3]).unwrap(); + // Second call with a different assignment triggers a flush, which + // must fail and poison the writer. + let err = w.write_assignment(vec![4u16, 5, 6]).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::Other); + let err = w.write_assignment(vec![1u16, 2, 3]).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = w.write_frame(vec![1u16, 2, 3], 1).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + let err = w.finish().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); +} diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index c97164a..3e02226 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -1,7 +1,7 @@ use std::io; -pub(super) const XBEN_TWODELTA_FULL_TAG: u8 = 0; -pub(super) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; +pub(crate) const XBEN_TWODELTA_FULL_TAG: u8 = 0; +pub(crate) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// Default number of delta frames per columnar chunk in XBEN TwoDelta. pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; @@ -12,7 +12,7 @@ pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; /// Used by both the BEN and XBEN writers to construct the body of a TwoDelta /// "repeat" frame: each writer wraps the result in its own frame type. Returns /// an `InvalidInput` error if any run exceeds `u16::MAX` in length. -pub(super) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16), Vec)> { +pub(crate) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16), Vec)> { let first = assignment.first().copied().unwrap_or(0); let second = assignment .iter() diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index dcb7855..c4d8c01 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -14,7 +14,7 @@ use std::io::{self, Result}; /// /// Returns a `Vec` of assignment values, or an error if the field is /// missing, not an array, or contains values that do not fit in a `u16`. -pub(super) fn parse_json_assignment(data: Value) -> Result> { +pub(crate) fn parse_json_assignment(data: Value) -> Result> { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidData, @@ -57,7 +57,7 @@ pub(super) fn parse_json_assignment(data: Value) -> Result> { /// # Returns /// /// Returns the encoded frame as a byte vector. -pub(super) fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { +pub(crate) fn encode_xben_twodelta_full_frame(assignments: &[u16]) -> Vec { let runs = assign_to_rle(assignments); let mut bytes = Vec::with_capacity(1 + 4 + runs.len() * 4); bytes.push(XBEN_TWODELTA_FULL_TAG); diff --git a/ben/src/io/writer/xz_assignment_writer.rs b/ben/src/io/writer/xz_assignment_writer.rs deleted file mode 100644 index 4103862..0000000 --- a/ben/src/io/writer/xz_assignment_writer.rs +++ /dev/null @@ -1,410 +0,0 @@ -use super::frames::BufferedDeltaFrame; -use super::twodelta::{ - twodelta_repeat_runs, DEFAULT_TWODELTA_CHUNK_SIZE, XBEN_TWODELTA_CHUNK_TAG, - XBEN_TWODELTA_FULL_TAG, -}; -use super::utils::{encode_xben_twodelta_full_frame, parse_json_assignment}; -use crate::codec::decode::decode_ben_line; -use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; -use crate::codec::translate::ben_to_ben32_lines; -use crate::codec::BenEncodeFrame; -use crate::format::banners::{banner_for_variant, has_known_banner_prefix, BANNER_LEN}; -use crate::progress::Spinner; -use crate::BenVariant; -use byteorder::{BigEndian, ReadBytesExt}; -use serde_json::Value; -use std::collections::HashMap; -use std::io::{self, BufRead, Read, Result, Write}; -use xz2::write::XzEncoder; - -/// A struct to make the writing of XBEN files easier and more ergonomic. -pub struct XZAssignmentWriter { - encoder: XzEncoder, - previous_assignment: Vec, - previous_masks: HashMap>, - pending_assignment: Option>, - count: u16, - variant: BenVariant, - chunk_size: usize, - chunk_buffer: Vec, - complete: bool, -} - -impl XZAssignmentWriter { - /// Encode and write the pending assignment with the accumulated count. - /// - /// For TwoDelta, builds the initial masks and writes the full frame followed - /// by the count. For MkvChain, encodes the assignment and appends the count. - /// This is a no-op when no assignment is pending. - fn flush_pending_frame(&mut self) -> Result<()> { - let pending = match self.pending_assignment.take() { - Some(p) => p, - None => return Ok(()), - }; - - // Standard writes each assignment immediately; MkvChain and TwoDelta buffer. - if self.variant == BenVariant::MkvChain { - let encoded = encode_ben32_assignments(&pending)?; - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&self.count.to_be_bytes())?; - } else { - // TwoDelta - for (idx, &val) in pending.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - let encoded = encode_xben_twodelta_full_frame(&pending); - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&self.count.to_be_bytes())?; - } - - self.previous_assignment = pending; - Ok(()) - } - - /// Write all buffered delta frames as a single columnar chunk. - /// - /// The chunk layout groups same-type fields together so XZ's dictionary - /// compression can exploit the resulting byte-level regularity: - /// - /// ```text - /// [chunk_tag=2] [n_frames: u32] - /// [pairs channel: (pair_a u16, pair_b u16) × n_frames] - /// [counts channel: count u16 × n_frames] - /// [run-length counts: n_runs u32 × n_frames] - /// [run-length data: u16 × total_runs] - /// ``` - fn flush_chunk(&mut self) -> Result<()> { - if self.chunk_buffer.is_empty() { - return Ok(()); - } - - let n = self.chunk_buffer.len() as u32; - self.encoder.write_all(&[XBEN_TWODELTA_CHUNK_TAG])?; - self.encoder.write_all(&n.to_be_bytes())?; - - // Pairs channel. - for frame in &self.chunk_buffer { - self.encoder.write_all(&frame.pair.0.to_be_bytes())?; - self.encoder.write_all(&frame.pair.1.to_be_bytes())?; - } - - // Counts channel. - for frame in &self.chunk_buffer { - self.encoder.write_all(&frame.count.to_be_bytes())?; - } - - // Run-length counts channel. - for frame in &self.chunk_buffer { - self.encoder - .write_all(&(frame.run_lengths.len() as u32).to_be_bytes())?; - } - - // Run-length data channel. - for frame in &self.chunk_buffer { - for &rl in &frame.run_lengths { - self.encoder.write_all(&rl.to_be_bytes())?; - } - } - - self.chunk_buffer.clear(); - Ok(()) - } - - /// Create a new XBEN writer around an already-configured XZ encoder. - /// - /// # Arguments - /// - /// * `encoder` - The configured XZ encoder that will receive the ben32 - /// payload. - /// * `variant` - The BEN variant to encode inside the compressed stream. - /// - /// # Returns - /// - /// Returns a new XBEN encoder ready to accept assignments or BEN frames. - pub fn new(mut encoder: XzEncoder, variant: BenVariant) -> io::Result { - encoder.write_all(banner_for_variant(variant))?; - Ok(XZAssignmentWriter { - encoder, - previous_assignment: Vec::new(), - previous_masks: HashMap::new(), - pending_assignment: None, - count: 0, - variant, - chunk_size: DEFAULT_TWODELTA_CHUNK_SIZE, - chunk_buffer: Vec::new(), - complete: false, - }) - } - - /// Set the number of delta frames per columnar chunk. - /// - /// Only affects TwoDelta variant encoding. Larger chunks give XZ more - /// same-type data to compress together; smaller chunks reduce peak memory. - /// - /// # Arguments - /// - /// * `size` - Number of delta frames per chunk. - /// - /// # Returns - /// - /// Returns `self` for method chaining. - pub fn with_chunk_size(mut self, size: usize) -> Self { - self.chunk_size = size.max(1); - self - } - - /// Encode and write a full assignment vector into the compressed XBEN stream. - /// - /// # Arguments - /// - /// * `assign_vec` - The full assignment vector to encode. - /// - /// # Returns - /// - /// Returns `Ok(())` after the assignment has been queued or written. - pub fn write_assignment(&mut self, assign_vec: Vec) -> Result<()> { - match self.variant { - BenVariant::Standard => { - let encoded = encode_ben32_assignments(&assign_vec)?; - self.encoder.write_all(&encoded)?; - self.previous_assignment = assign_vec; - } - BenVariant::MkvChain => { - if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { - if self.count == u16::MAX { - self.flush_pending_frame()?; - self.pending_assignment = Some(assign_vec); - self.count = 1; - return Ok(()); - } - self.count += 1; - return Ok(()); - } - self.flush_pending_frame()?; - self.pending_assignment = Some(assign_vec); - self.count = 1; - } - BenVariant::TwoDelta => { - // First assignment ever: buffer as the initial full frame. - if self.pending_assignment.is_none() && self.previous_assignment.is_empty() { - self.pending_assignment = Some(assign_vec); - self.count = 1; - return Ok(()); - } - // Repeat of the pending initial full frame. - if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { - if self.count == u16::MAX { - self.flush_pending_frame()?; - let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; - self.chunk_buffer.push(repeat); - self.previous_assignment = assign_vec; - return Ok(()); - } - self.count += 1; - return Ok(()); - } - // Repeat of the last delta frame in the current chunk. - if !self.chunk_buffer.is_empty() - && self.previous_assignment.as_slice() == assign_vec.as_slice() - { - if self.chunk_buffer.last().unwrap().count == u16::MAX { - self.flush_chunk()?; - let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; - self.chunk_buffer.push(repeat); - } else { - self.chunk_buffer.last_mut().unwrap().count += 1; - } - return Ok(()); - } - // New distinct assignment: flush the initial full frame if pending. - if self.pending_assignment.is_some() { - self.flush_pending_frame()?; - } - // Encode the delta frame and add it to the chunk buffer. - let frame = encode_twodelta_frame_with_hint( - &self.previous_assignment, - &assign_vec, - None, - Some(&mut self.previous_masks), - None, - )?; - let (pair, run_lengths) = match frame { - BenEncodeFrame::TwoDelta { - pair, - run_length_vector, - .. - } => (pair, run_length_vector), - _ => unreachable!( - "encode_twodelta_frame_with_hint always returns the TwoDelta arm" - ), - }; - self.chunk_buffer.push(BufferedDeltaFrame { - pair, - run_lengths, - count: 1, - }); - self.previous_assignment = assign_vec; - if self.chunk_buffer.len() >= self.chunk_size { - self.flush_chunk()?; - } - } - } - Ok(()) - } - - /// Encode and write a JSON assignment record into the compressed XBEN stream. - /// - /// # Arguments - /// - /// * `data` - A JSON object containing an `assignment` array. - /// - /// # Returns - /// - /// Returns `Ok(())` after the record has been validated and encoded. - pub fn write_json_value(&mut self, data: Value) -> Result<()> { - self.write_assignment(parse_json_assignment(data)?) - } - - /// Flush any buffered state to the underlying XZ encoder. - /// - /// # Returns - /// - /// Returns `Ok(())` once all buffered state has been flushed. - pub fn finish(&mut self) -> Result<()> { - if self.complete { - return Ok(()); - } - self.flush_pending_frame()?; - self.flush_chunk()?; - self.complete = true; - Ok(()) - } - - /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without - /// materializing full assignment vectors. - /// - /// The first frame (standard BEN RLE) is decoded to RLE runs and written as - /// an XBEN full frame. Subsequent delta frames have their bitpacked run - /// lengths unpacked and written as XBEN delta frames with raw u16 runs. - /// This avoids O(N) assignment reconstruction per frame entirely. - /// - /// # Arguments - /// - /// * `reader` - The BEN TwoDelta stream positioned after the banner. - /// - /// # Returns - /// - /// Returns `Ok(())` after the stream has been fully translated. - fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> Result<()> { - // First frame: standard BEN RLE → XBEN full frame. - let max_val_bits = reader.read_u8()?; - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - let first_count = reader.read_u16::()?; - - let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); - encoded.push(XBEN_TWODELTA_FULL_TAG); - encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); - for &(value, len) in &runs { - encoded.extend_from_slice(&value.to_be_bytes()); - encoded.extend_from_slice(&len.to_be_bytes()); - } - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&first_count.to_be_bytes())?; - - let mut sample_count = first_count as usize; - let spinner = Spinner::new("Encoding line"); - spinner.set_count(sample_count as u64); - - // Delta frames: unpack bitpacked run lengths and buffer into chunks. - loop { - let pair_a = match reader.read_u16::() { - Ok(v) => v, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, - Err(e) => return Err(e), - }; - let pair_b = reader.read_u16::()?; - let delta_max_len_bits = reader.read_u8()?; - let delta_n_bytes = reader.read_u32::()?; - - let mut payload = vec![0u8; delta_n_bytes as usize]; - reader.read_exact(&mut payload)?; - let count = reader.read_u16::()?; - - // Unpack bitpacked run lengths via the frame layer's TwoDelta - // constructor, then peel out the fields we need for buffering. - let (pair, run_lengths) = match BenEncodeFrame::from_parts( - (pair_a, pair_b), - delta_max_len_bits, - payload, - count, - ) { - BenEncodeFrame::TwoDelta { - pair, - run_length_vector, - .. - } => (pair, run_length_vector), - _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), - }; - - self.chunk_buffer.push(BufferedDeltaFrame { - pair, - run_lengths, - count, - }); - - if self.chunk_buffer.len() >= self.chunk_size { - self.flush_chunk()?; - } - - sample_count += count as usize; - spinner.set_count(sample_count as u64); - } - - self.flush_chunk()?; - - Ok(()) - } - - pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> { - let peek = reader.fill_buf()?; - let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); - - if has_banner { - if self.variant == BenVariant::TwoDelta { - reader.consume(BANNER_LEN); - return self.translate_ben_twodelta_to_xben(reader); - } - reader.consume(BANNER_LEN); - } - - if self.variant == BenVariant::TwoDelta { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN translation requires a BEN stream with its banner", - )); - } - - ben_to_ben32_lines(&mut reader, &mut self.encoder, self.variant) - } -} - -pub(super) fn twodelta_repeat_buffered_frame( - assignment: &[u16], - count: u16, -) -> io::Result { - let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; - Ok(BufferedDeltaFrame { - pair, - run_lengths, - count, - }) -} - -impl Drop for XZAssignmentWriter { - /// Flush any buffered XBEN state during drop. - fn drop(&mut self) { - let _ = self.finish(); - } -} diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 5202472..e819ec6 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -14,8 +14,7 @@ use crate::codec::BenEncodeFrame; use crate::format::banners::{variant_from_banner, BANNER_LEN}; use crate::format::FormatError; use crate::io::reader::BenStreamReader; -use crate::io::writer::frame_writer::FrameWriter; -use crate::io::writer::AssignmentWriter; +use crate::io::writer::BenStreamWriter; use crate::progress::Spinner; use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; @@ -225,7 +224,7 @@ fn can_use_first_seen_fast_path( /// counted output frames, Standard targets receive `count` one-sample frames /// because Standard cannot encode repetition counts. With /// [`RunPolicy::CollapseAdjacentEqualAssignments`], the existing -/// [`AssignmentWriter`] merging path is used. +/// [`BenStreamWriter`] merging path is used. fn relabel_via_decoder( reader: R, writer: W, @@ -243,7 +242,7 @@ where match run_policy { RunPolicy::CollapseAdjacentEqualAssignments => { - let mut encoder = AssignmentWriter::new(writer, target_variant)?; + let mut encoder = BenStreamWriter::for_ben(writer, target_variant)?; decoder.for_each_assignment(|assignment, count| { if max_samples.is_some_and(|limit| sample_number >= limit) { return Ok(false); @@ -268,7 +267,7 @@ where encoder.finish()?; } RunPolicy::PreserveFrameBoundaries => { - let mut writer = FrameWriter::new(writer, target_variant)?; + let mut writer = BenStreamWriter::for_ben(writer, target_variant)?; decoder.for_each_assignment(|assignment, count| { if max_samples.is_some_and(|limit| sample_number >= limit) { return Ok(false); @@ -287,6 +286,7 @@ where spinner.set_count(sample_number as u64); Ok(true) })?; + writer.finish()?; } } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 1c730b4..782f27a 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -1023,7 +1023,7 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { // Build a valid 2-sample TwoDelta BEN file, then corrupt the delta frame. let mut ben: Vec = Vec::new(); { - let mut writer = crate::io::writer::AssignmentWriter::new(&mut ben, BenVariant::TwoDelta) + let mut writer = crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta) .unwrap(); writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); @@ -1043,7 +1043,7 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { fn test_relabel_ben_file_with_map_twodelta_malformed_frame_error_propagates() { let mut ben: Vec = Vec::new(); { - let mut writer = crate::io::writer::AssignmentWriter::new(&mut ben, BenVariant::TwoDelta) + let mut writer = crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta) .unwrap(); writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index 9c0d853..bcf2eb7 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -8,7 +8,7 @@ use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::codec::encode::encode_jsonl_to_ben; use binary_ensemble::format::banners::{MKVCHAIN_BEN_BANNER, TWODELTA_BEN_BANNER}; use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; -use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::BenVariant; use std::io::{self, Cursor}; @@ -606,11 +606,11 @@ mod mkvchain { mod twodelta { use super::*; - /// Encode via `AssignmentWriter` so we control the exact frame layout. + /// Encode via `BenStreamWriter` so we control the exact frame layout. fn encode_twodelta(assignments: &[Vec]) -> Vec { let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for a in assignments { writer.write_assignment(a.clone()).unwrap(); } diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 0dde453..dd3b635 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -17,7 +17,7 @@ use binary_ensemble::format::banners::{ use binary_ensemble::io::reader::{ BenStreamFrameReader, BenStreamReader, DecodeFrame, DecoderInitError, }; -use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::json::graph::{ sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, }; @@ -532,7 +532,7 @@ fn xben_decoder_reads_variant_from_banner_twodelta() { #[test] fn ben_encoder_writes_correct_banner_standard() { let mut out = Vec::new(); - let encoder = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let encoder = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); drop(encoder); assert!(out.starts_with(STANDARD_BEN_BANNER)); } @@ -540,7 +540,7 @@ fn ben_encoder_writes_correct_banner_standard() { #[test] fn ben_encoder_writes_correct_banner_mkvchain() { let mut out = Vec::new(); - let encoder = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); + let encoder = BenStreamWriter::for_ben(&mut out, BenVariant::MkvChain).unwrap(); drop(encoder); assert!(out.starts_with(MKVCHAIN_BEN_BANNER)); } @@ -548,7 +548,7 @@ fn ben_encoder_writes_correct_banner_mkvchain() { #[test] fn ben_encoder_writes_correct_banner_twodelta() { let mut out = Vec::new(); - let encoder = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); + let encoder = BenStreamWriter::for_ben(&mut out, BenVariant::TwoDelta).unwrap(); drop(encoder); assert!(out.starts_with(TWODELTA_BEN_BANNER)); } @@ -558,7 +558,7 @@ fn ben_encoder_standard_single_assignment_round_trip() { let assignment = vec![1u16, 2, 3, 3, 2, 1]; let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.finish().unwrap(); } @@ -573,7 +573,7 @@ fn ben_encoder_standard_single_assignment_round_trip() { fn ben_encoder_finish_is_idempotent() { let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::MkvChain).unwrap(); enc.write_assignment(vec![1u16, 2]).unwrap(); enc.finish().unwrap(); let len_after_first_finish = enc.finish().unwrap(); // second call @@ -590,7 +590,7 @@ fn ben_encoder_write_json_value_valid_input() { let data = json!({"assignment": [1, 2, 3], "sample": 1}); let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); enc.write_json_value(data).unwrap(); enc.finish().unwrap(); } @@ -602,7 +602,7 @@ fn ben_encoder_write_json_value_valid_input() { fn ben_encoder_write_json_value_missing_assignment_field_errors() { let data = json!({"sample": 1}); // no "assignment" let mut out = Vec::new(); - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!( result.is_err(), @@ -615,7 +615,7 @@ fn ben_encoder_write_json_value_value_too_large_errors() { // 65536 doesn't fit in u16. let data = json!({"assignment": [65536], "sample": 1}); let mut out = Vec::new(); - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!(result.is_err(), "expected error for value out of u16 range"); } @@ -624,7 +624,7 @@ fn ben_encoder_write_json_value_value_too_large_errors() { fn ben_encoder_write_json_value_negative_value_errors() { let data = json!({"assignment": [-1], "sample": 1}); let mut out = Vec::new(); - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); let result = enc.write_json_value(data); assert!( result.is_err(), @@ -638,7 +638,7 @@ fn ben_encoder_standard_identical_assignments_still_written() { let assignment = vec![2u16, 2, 2]; let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::Standard).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); @@ -656,7 +656,7 @@ fn ben_encoder_mkv_identical_assignments_deduplicated() { let assignment = vec![2u16, 2, 2]; let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::MkvChain).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::MkvChain).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); enc.write_assignment(assignment.clone()).unwrap(); @@ -677,7 +677,7 @@ fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { let next = vec![2u16, 2, 1, 1, 2, 1]; // all 1s→2s and 2s→1s let mut out = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut out, BenVariant::TwoDelta).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::TwoDelta).unwrap(); enc.write_assignment(base.clone()).unwrap(); enc.write_assignment(next.clone()).unwrap(); enc.finish().unwrap(); diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 0be2e6d..e7430b6 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -11,7 +11,7 @@ use binary_ensemble::io::reader::{ build_frame_iter, count_samples_from_file, BenStreamReader, BenWireFormat, DecodeFrame, DecoderInitError, SubsampleFrameDecoder, }; -use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::ops::extract::extract_assignment_ben; use binary_ensemble::BenVariant; @@ -647,7 +647,7 @@ fn benencoder_finish_flushes_once() { let mut ben_vec = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut ben_vec, BenVariant::MkvChain).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut ben_vec, BenVariant::MkvChain).unwrap(); for line in lines.lines() { let v: serde_json::Value = serde_json::from_str(line).unwrap(); enc.write_json_value(v).unwrap(); @@ -929,7 +929,7 @@ fn xz_mt_params_are_capped_and_safe() { fn ben_encoder_write_assignment_path_roundtrips() { let mut ben = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut ben, BenVariant::Standard).unwrap(); enc.write_assignment(vec![9u16, 9, 2, 2, 2]).unwrap(); enc.finish().unwrap(); } @@ -1010,30 +1010,24 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { } #[test] -fn xben_encoder_write_ben_file_without_banner_path_roundtrips() { - let mut payload_only = Vec::new(); +fn encode_ben_to_xben_round_trips_through_decode() { + let mut ben_input = Vec::new(); { - let mut enc = AssignmentWriter::new(&mut payload_only, BenVariant::Standard).unwrap(); + let mut enc = BenStreamWriter::for_ben(&mut ben_input, BenVariant::Standard).unwrap(); enc.write_assignment(vec![5u16, 5, 7]).unwrap(); enc.finish().unwrap(); } - let payload_only = payload_only[17..].to_vec(); let mut xz = Vec::new(); - { - let mt = xz2::stream::MtStreamBuilder::new() - .threads(1) - .preset(0) - .block_size(0) - .encoder() - .unwrap(); - let encoder = xz2::write::XzEncoder::new_stream(&mut xz, mt); - let mut xben = - binary_ensemble::io::writer::XZAssignmentWriter::new(encoder, BenVariant::Standard) - .unwrap(); - xben.write_ben_file(BufReader::new(payload_only.as_slice())) - .unwrap(); - } + binary_ensemble::codec::encode::encode_ben_to_xben( + BufReader::new(ben_input.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let mut ben = Vec::new(); decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut ben).unwrap(); @@ -1353,7 +1347,7 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { let mut ben = Vec::new(); { - let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for assignment in &assignments { encoder.write_assignment(assignment.clone()).unwrap(); } @@ -1385,7 +1379,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { let mut ben = Vec::new(); { - let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(first.clone()).unwrap(); encoder.write_assignment(second).unwrap(); @@ -1408,7 +1402,7 @@ fn twodelta_first_frame_carries_repeat_trailer() { #[test] fn twodelta_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); encoder.write_assignment(vec![1u16, 3, 2, 4]).unwrap(); let err = encoder.finish().err().unwrap(); @@ -1418,7 +1412,7 @@ fn twodelta_rejects_non_pair_transition() { #[test] fn twodelta_write_json_value_rejects_non_pair_transition() { let mut ben = Vec::new(); - let mut encoder = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); encoder .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) .unwrap(); diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 3d15934..c8fd96b 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -18,7 +18,7 @@ use binary_ensemble::io::bundle::writer::{ }; use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::BenStreamReader; -use binary_ensemble::io::writer::AssignmentWriter; +use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; use std::cell::RefCell; use std::collections::HashMap; @@ -213,7 +213,7 @@ fn standard_rle_splits_assignment_run_longer_than_u16_max() { let assignment = vec![7u16; u16::MAX as usize + 1]; let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::Standard).unwrap(); writer.write_assignment(assignment.clone()).unwrap(); writer.finish().unwrap(); } @@ -227,7 +227,7 @@ fn mkvchain_writer_splits_repetition_count_longer_than_u16_max() { let sample = vec![1u16, 2, 2, 1]; let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::MkvChain).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); for _ in 0..(u16::MAX as usize + 1) { writer.write_assignment(sample.clone()).unwrap(); } @@ -247,7 +247,7 @@ fn twodelta_writer_splits_repetition_count_longer_than_u16_max() { let sample = vec![1u16, 1, 2, 2]; let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::TwoDelta).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); for _ in 0..(u16::MAX as usize + 1) { writer.write_assignment(sample.clone()).unwrap(); } @@ -377,7 +377,7 @@ fn xz_compress_propagates_input_reader_errors() { fn relabel_map_out_of_range_old_indices_error_cleanly() { let mut ben = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut ben, BenVariant::Standard).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::Standard).unwrap(); writer.write_assignment(vec![10, 20]).unwrap(); writer.finish().unwrap(); } @@ -510,7 +510,7 @@ fn zero_count_frames_are_rejected() { fn seeded_malformed_ben_bytes_do_not_panic() { let mut valid_standard = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut valid_standard, BenVariant::Standard).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut valid_standard, BenVariant::Standard).unwrap(); writer.write_assignment(vec![1, 1, 2, 3]).unwrap(); writer.write_assignment(vec![3, 3, 2, 1]).unwrap(); writer.finish().unwrap(); @@ -518,7 +518,7 @@ fn seeded_malformed_ben_bytes_do_not_panic() { let mut valid_mkv = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut valid_mkv, BenVariant::MkvChain).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut valid_mkv, BenVariant::MkvChain).unwrap(); writer.write_assignment(vec![4, 4, 5]).unwrap(); writer.write_assignment(vec![4, 4, 5]).unwrap(); writer.write_assignment(vec![5, 4, 4]).unwrap(); @@ -527,7 +527,7 @@ fn seeded_malformed_ben_bytes_do_not_panic() { let mut valid_twodelta = Vec::new(); { - let mut writer = AssignmentWriter::new(&mut valid_twodelta, BenVariant::TwoDelta).unwrap(); + let mut writer = BenStreamWriter::for_ben(&mut valid_twodelta, BenVariant::TwoDelta).unwrap(); writer.write_assignment(vec![1, 1, 2, 2]).unwrap(); writer.write_assignment(vec![1, 2, 1, 2]).unwrap(); writer.write_assignment(vec![2, 2, 1, 1]).unwrap(); From 6c2a9044b4a4d61d5fd4fc2d56b6392252822b2e Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 18:14:53 -0600 Subject: [PATCH 093/221] add known asset kind enum for bendl --- ben/src/cli/bendl/append.rs | 26 ++++++------------- ben/src/cli/bendl/create.rs | 26 ++++++------------- ben/src/cli/bendl/helpers.rs | 40 +++++++++++++++++++++++------ ben/src/io/bundle/format.rs | 32 +++++++++++++++++++++++ ben/src/io/bundle/writer.rs | 49 ++++++++++++++++++++++++++++++++++-- 5 files changed, 128 insertions(+), 45 deletions(-) diff --git a/ben/src/cli/bendl/append.rs b/ben/src/cli/bendl/append.rs index 6ac77b6..0a09ff3 100644 --- a/ben/src/cli/bendl/append.rs +++ b/ben/src/cli/bendl/append.rs @@ -1,8 +1,6 @@ use super::args::{AppendArgs, NamedAsset}; -use super::helpers::append_file_asset; -use crate::io::bundle::format::{ - ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, -}; +use super::helpers::{append_custom_file_asset, append_known_file_asset}; +use crate::io::bundle::format::KnownAssetKind; use crate::io::bundle::writer::BendlAppender; use crate::io::bundle::AddAssetOptions; use std::fs::OpenOptions; @@ -18,10 +16,9 @@ pub(super) fn run_append(args: AppendArgs) -> Result<(), String> { let mut added = 0usize; if let Some(ref path) = args.metadata { - append_file_asset( + append_known_file_asset( &mut appender, - ASSET_TYPE_METADATA, - "metadata.json", + KnownAssetKind::Metadata, path, AddAssetOptions::defaults().json(), )?; @@ -33,27 +30,20 @@ pub(super) fn run_append(args: AppendArgs) -> Result<(), String> { } else { AddAssetOptions::defaults().json() }; - append_file_asset(&mut appender, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + append_known_file_asset(&mut appender, KnownAssetKind::Graph, path, opts)?; added += 1; } if let Some(ref path) = args.node_permutation_map { - append_file_asset( + append_known_file_asset( &mut appender, - ASSET_TYPE_NODE_PERMUTATION_MAP, - "node_permutation_map.json", + KnownAssetKind::NodePermutationMap, path, AddAssetOptions::defaults().json(), )?; added += 1; } for NamedAsset { name, path } in &args.assets { - append_file_asset( - &mut appender, - ASSET_TYPE_CUSTOM, - name, - path, - AddAssetOptions::defaults(), - )?; + append_custom_file_asset(&mut appender, name, path, AddAssetOptions::defaults())?; added += 1; } diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs index 094306b..021525f 100644 --- a/ben/src/cli/bendl/create.rs +++ b/ben/src/cli/bendl/create.rs @@ -1,9 +1,7 @@ use super::args::{CreateArgs, NamedAsset}; -use super::helpers::{add_file_asset, format_from_path}; +use super::helpers::{add_custom_file_asset, add_known_file_asset, format_from_path}; use crate::cli::common::check_overwrite; -use crate::io::bundle::format::{ - ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, -}; +use crate::io::bundle::format::KnownAssetKind; use crate::io::bundle::{AddAssetOptions, BendlWriter}; use crate::io::reader::subsample::count_samples_from_file; use crate::io::reader::BenWireFormat; @@ -31,10 +29,9 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { // Add singleton assets first, in canonical order. if let Some(ref path) = args.metadata { - add_file_asset( + add_known_file_asset( &mut writer, - ASSET_TYPE_METADATA, - "metadata.json", + KnownAssetKind::Metadata, path, AddAssetOptions::defaults().json(), )?; @@ -45,25 +42,18 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { } else { AddAssetOptions::defaults().json() }; - add_file_asset(&mut writer, ASSET_TYPE_GRAPH, "graph.json", path, opts)?; + add_known_file_asset(&mut writer, KnownAssetKind::Graph, path, opts)?; } if let Some(ref path) = args.node_permutation_map { - add_file_asset( + add_known_file_asset( &mut writer, - ASSET_TYPE_NODE_PERMUTATION_MAP, - "node_permutation_map.json", + KnownAssetKind::NodePermutationMap, path, AddAssetOptions::defaults().json(), )?; } for NamedAsset { name, path } in &args.assets { - add_file_asset( - &mut writer, - ASSET_TYPE_CUSTOM, - name, - path, - AddAssetOptions::defaults(), - )?; + add_custom_file_asset(&mut writer, name, path, AddAssetOptions::defaults())?; } // Stream phase: copy bytes from the input file directly into the diff --git a/ben/src/cli/bendl/helpers.rs b/ben/src/cli/bendl/helpers.rs index 88762c2..f9bd8a8 100644 --- a/ben/src/cli/bendl/helpers.rs +++ b/ben/src/cli/bendl/helpers.rs @@ -1,4 +1,4 @@ -use crate::io::bundle::format::AssignmentFormat; +use crate::io::bundle::format::{AssignmentFormat, KnownAssetKind}; use crate::io::bundle::writer::BendlAppender; use crate::io::bundle::{AddAssetOptions, BendlWriteError, BendlWriter}; use std::io::{Read, Seek, Write}; @@ -16,30 +16,56 @@ pub(super) fn format_from_path(path: &Path) -> Result } } -pub(super) fn add_file_asset( +pub(super) fn add_known_file_asset( + writer: &mut BendlWriter, + kind: KnownAssetKind, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + let name = kind.standardized_name(); + writer + .add_known_asset(kind, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} + +pub(super) fn add_custom_file_asset( writer: &mut BendlWriter, - asset_type: u16, name: &str, path: &Path, options: AddAssetOptions, ) -> Result<(), String> { let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; writer - .add_asset(asset_type, name, &bytes, options) + .add_custom_asset(name, &bytes, options) + .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) +} + +pub(super) fn append_known_file_asset< + W: Read + Write + Seek + crate::io::bundle::writer::BendlTruncate, +>( + appender: &mut BendlAppender, + kind: KnownAssetKind, + path: &Path, + options: AddAssetOptions, +) -> Result<(), String> { + let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; + let name = kind.standardized_name(); + appender + .add_known_asset(kind, &bytes, options) .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) } -pub(super) fn append_file_asset< +pub(super) fn append_custom_file_asset< W: Read + Write + Seek + crate::io::bundle::writer::BendlTruncate, >( appender: &mut BendlAppender, - asset_type: u16, name: &str, path: &Path, options: AddAssetOptions, ) -> Result<(), String> { let bytes = std::fs::read(path).map_err(|e| format!("failed to read {path:?}: {e}"))?; appender - .add_asset(asset_type, name, &bytes, options) + .add_custom_asset(name, &bytes, options) .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) } diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index c7fefac..23ccaec 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -104,6 +104,38 @@ pub fn standardized_name_for(asset_type: u16) -> Option<&'static str> { } } +/// One of the known singleton asset types reserved by the bundle format. +/// +/// Each variant carries a fixed `asset_type` integer and a fixed +/// standardized name. Custom assets (writer-chosen name, multiple allowed) +/// are not represented here. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum KnownAssetKind { + Metadata, + Graph, + NodePermutationMap, +} + +impl KnownAssetKind { + /// The asset-type integer reserved for this kind in the bundle format. + pub fn asset_type(self) -> u16 { + match self { + Self::Metadata => ASSET_TYPE_METADATA, + Self::Graph => ASSET_TYPE_GRAPH, + Self::NodePermutationMap => ASSET_TYPE_NODE_PERMUTATION_MAP, + } + } + + /// The standardized filename reserved for this kind. + pub fn standardized_name(self) -> &'static str { + match self { + Self::Metadata => STANDARDIZED_NAME_METADATA, + Self::Graph => STANDARDIZED_NAME_GRAPH, + Self::NodePermutationMap => STANDARDIZED_NAME_NODE_PERMUTATION_MAP, + } + } +} + /// Return whether a given asset type should default to xz compression /// when the writer is not given an explicit compression option. pub fn default_compresses_by_type(asset_type: u16) -> bool { diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 2151ac8..50078f9 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -33,8 +33,9 @@ use xz2::write::XzEncoder; use super::format::{ standardized_name_for, default_compresses_by_type, encode_directory, read_directory, - AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_JSON, - ASSET_FLAG_XZ, FINALIZED_YES, DEFAULT_XZ_PRESET, HEADER_SIZE, + AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, + ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, FINALIZED_YES, DEFAULT_XZ_PRESET, + HEADER_SIZE, }; /// Ability to truncate an underlying seekable target to a given length. @@ -260,6 +261,28 @@ impl BendlWriter { ) } + /// Add one of the known singleton assets, using its reserved asset-type + /// integer and standardized name automatically. + pub fn add_known_asset( + &mut self, + kind: KnownAssetKind, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + self.add_asset(kind.asset_type(), kind.standardized_name(), payload, options) + } + + /// Add a custom (writer-named) asset. The asset-type is set to + /// [`ASSET_TYPE_CUSTOM`] automatically. + pub fn add_custom_asset( + &mut self, + name: &str, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + self.add_asset(ASSET_TYPE_CUSTOM, name, payload, options) + } + /// Transition from the asset phase into the stream phase and return /// a mutable reference to the inner writer so the caller can /// directly write the embedded BEN/XBEN payload. @@ -722,6 +745,28 @@ impl BendlAppender { ) } + /// Append one of the known singleton assets, using its reserved + /// asset-type integer and standardized name automatically. + pub fn add_known_asset( + &mut self, + kind: KnownAssetKind, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + self.add_asset(kind.asset_type(), kind.standardized_name(), payload, options) + } + + /// Append a custom (writer-named) asset. The asset-type is set to + /// [`ASSET_TYPE_CUSTOM`] automatically. + pub fn add_custom_asset( + &mut self, + name: &str, + payload: &[u8], + options: AddAssetOptions, + ) -> Result<(), BendlWriteError> { + self.add_asset(ASSET_TYPE_CUSTOM, name, payload, options) + } + /// Commit all pending appends. /// /// This compresses any buffered payloads that need it (entirely in From 92f2b3e0223afc11ac5f8f50f8f2b03ae88066cb Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 18:22:40 -0600 Subject: [PATCH 094/221] add in an XBEN variant --- ben/src/codec/decode/xz.rs | 10 ++--- ben/src/codec/translate/errors.rs | 9 ----- ben/src/codec/translate/mod.rs | 50 ++++++++++--------------- ben/src/codec/translate/tests.rs | 43 ++++++++++++--------- ben/src/io/writer/stream_writer/xben.rs | 17 +++++---- ben/src/lib.rs | 47 +++++++++++++++++++++++ 6 files changed, 107 insertions(+), 69 deletions(-) diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index fbc9cc7..56207af 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -4,7 +4,7 @@ use crate::format::FormatError; use crate::io::reader::BenStreamReader; use crate::io::writer::BenStreamWriter; use crate::progress::Spinner; -use crate::BenVariant; +use crate::{BenVariant, XBenVariant}; use std::io::{self, BufRead, BufReader, Read, Write}; use xz2::read::XzDecoder; @@ -30,14 +30,14 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: return Err(e); } - let variant = match variant_from_banner(&first_buffer) { + let variant: XBenVariant = match variant_from_banner(&first_buffer) { Some(BenVariant::Standard) => { writer.write_all(banner_for_variant(BenVariant::Standard))?; - BenVariant::Standard + XBenVariant::Standard } Some(BenVariant::MkvChain) => { writer.write_all(banner_for_variant(BenVariant::MkvChain))?; - BenVariant::MkvChain + XBenVariant::MkvChain } Some(BenVariant::TwoDelta) => { let mut xben = BenStreamReader::from_xben_decompressed( @@ -78,7 +78,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let mut last_valid_assignment = 0; // TwoDelta was dispatched before this loop and returned early. - if variant == BenVariant::Standard { + if variant == XBenVariant::Standard { for i in (3..overflow.len()).step_by(4) { if overflow[i - 3..=i] == [0, 0, 0, 0] { last_valid_assignment = i + 1; diff --git a/ben/src/codec/translate/errors.rs b/ben/src/codec/translate/errors.rs index 11f8b1e..ddd2c0d 100644 --- a/ben/src/codec/translate/errors.rs +++ b/ben/src/codec/translate/errors.rs @@ -13,12 +13,6 @@ pub enum TranslateError { )] Ben32MissingTerminator { actual: [u8; 4], offset: usize }, - #[error( - "TwoDelta BEN streams cannot be translated to ben32; \ - use BenStreamWriter/BenStreamReader for TwoDelta compressed I/O" - )] - TwoDeltaUnsupported, - #[error("IO error: {0}")] Io(#[from] io::Error), } @@ -27,9 +21,6 @@ impl From for io::Error { fn from(e: TranslateError) -> Self { match e { TranslateError::Io(e) => e, - TranslateError::TwoDeltaUnsupported => { - io::Error::new(io::ErrorKind::Unsupported, e.to_string()) - } other => io::Error::new(io::ErrorKind::InvalidData, other), } } diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index d6b740d..b67efe3 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -15,15 +15,15 @@ use std::io::{self, Read, Write}; use crate::codec::decode::decode_ben_line; use crate::codec::BenEncodeFrame; use crate::progress::Spinner; -use crate::BenVariant; +use crate::{BenVariant, XBenVariant}; /// Convert a single ben32 frame into a BEN frame payload. /// /// # Arguments /// /// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator. -/// * `variant` - The BEN variant. Determines whether the resulting BEN frame -/// embeds a trailing repetition count. +/// * `variant` - The BEN32-supporting variant. Determines whether the resulting +/// BEN frame embeds a trailing repetition count. /// * `count` - The repetition count for `MkvChain`. Ignored for `Standard`. /// /// # Returns @@ -31,7 +31,7 @@ use crate::BenVariant; /// Returns the encoded BEN frame payload and header. fn ben32_to_ben_line( ben32_vec: Vec, - variant: BenVariant, + variant: XBenVariant, count: u16, ) -> io::Result> { let mut buffer = [0u8; 4]; @@ -64,25 +64,22 @@ fn ben32_to_ben_line( })); } - Ok(BenEncodeFrame::from_rle(ben32_rle, variant, Some(count)).into_bytes()) + Ok(BenEncodeFrame::from_rle(ben32_rle, BenVariant::from(variant), Some(count)).into_bytes()) } /// Translate a stream of ben32 frames into BEN frames. /// /// This is primarily used while decoding XBEN, where the compressed payload is -/// stored in ben32 form. -/// -/// Only the [`Standard`](BenVariant::Standard) and -/// [`MkvChain`](BenVariant::MkvChain) variants are supported. -/// TwoDelta streams use a different compressed layout and do not pass through -/// ben32; see the module-level documentation for details. +/// stored in ben32 form. Parameterised by [`XBenVariant`] so TwoDelta is +/// excluded at compile time; TwoDelta streams use a different compressed +/// layout and do not pass through ben32 (see the module-level documentation). /// /// # Arguments /// /// * `reader` - The ben32 input stream. /// * `writer` - The destination for the translated BEN frames. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each ben32 frame. +/// * `variant` - The BEN32-supporting variant, used to determine whether +/// repetition counts follow each ben32 frame. /// /// # Returns /// @@ -90,7 +87,7 @@ fn ben32_to_ben_line( pub fn ben32_to_ben_lines( mut reader: R, mut writer: W, - variant: BenVariant, + variant: XBenVariant, ) -> io::Result<()> { 'outer: loop { let mut ben32_vec: Vec = Vec::new(); @@ -103,7 +100,7 @@ pub fn ben32_to_ben_lines( Ok(()) => { ben32_vec.extend(ben32_read_buff); if ben32_read_buff == [0u8; 4] { - if variant == BenVariant::MkvChain { + if variant == XBenVariant::MkvChain { n_reps = reader.read_u16::()?; } break 'inner; @@ -160,20 +157,16 @@ fn ben_to_ben32_line( /// Translate a BEN stream into ben32 frames. /// /// This is the format used inside XBEN after the outer XZ compression layer is -/// removed. -/// -/// Only the [`Standard`](BenVariant::Standard) and -/// [`MkvChain`](BenVariant::MkvChain) variants are supported. -/// Passing [`TwoDelta`](BenVariant::TwoDelta) returns an error. TwoDelta -/// streams use a separate columnar layout and bypass ben32 entirely; see -/// the module-level documentation for details. +/// removed. Parameterised by [`XBenVariant`] so TwoDelta is excluded at compile +/// time; TwoDelta streams use a separate columnar layout and bypass ben32 +/// entirely (see the module-level documentation). /// /// # Arguments /// /// * `reader` - The BEN input stream without its 17-byte file banner. /// * `writer` - The destination for the translated ben32 frames. -/// * `variant` - The BEN variant, used to determine whether repetition counts -/// follow each translated frame. +/// * `variant` - The BEN32-supporting variant, used to determine whether +/// repetition counts follow each translated frame. /// /// # Returns /// @@ -181,7 +174,7 @@ fn ben_to_ben32_line( pub fn ben_to_ben32_lines( mut reader: R, mut writer: W, - variant: BenVariant, + variant: XBenVariant, ) -> io::Result<()> { let mut sample_number = 1usize; let spinner = Spinner::new("Encoding line"); @@ -203,13 +196,13 @@ pub fn ben_to_ben32_lines( spinner.set_count(sample_number as u64); match variant { - BenVariant::Standard => { + XBenVariant::Standard => { sample_number += 1; let ben32_vec = ben_to_ben32_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; writer.write_all(&ben32_vec)?; } - BenVariant::MkvChain => { + XBenVariant::MkvChain => { let ben32_vec = ben_to_ben32_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; @@ -218,9 +211,6 @@ pub fn ben_to_ben32_lines( writer.write_all(&ben32_vec)?; writer.write_all(&n_reps.to_be_bytes())?; } - BenVariant::TwoDelta => { - return Err(io::Error::from(TranslateError::TwoDeltaUnsupported)); - } } } diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index aca135c..a4c5162 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -1,7 +1,7 @@ use super::*; use crate::codec::encode::{encode_ben32_line, encode_jsonl_to_ben}; use crate::util::rle::rle_to_vec; -use crate::BenVariant; +use crate::{BenVariant, XBenVariant}; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; use rand_distr::{Distribution, Uniform}; @@ -31,7 +31,7 @@ fn translate_ben32_to_ben_file(mut reader: R, mut writer: W) } writer.write_all(b"STANDARD BEN FILE")?; - ben32_to_ben_lines(reader, writer, BenVariant::Standard) + ben32_to_ben_lines(reader, writer, XBenVariant::Standard) } fn translate_ben_to_ben32_file(mut reader: R, mut writer: W) -> io::Result<()> { @@ -46,7 +46,7 @@ fn translate_ben_to_ben32_file(mut reader: R, mut writer: W) } writer.write_all(b"STANDARD BEN FILE")?; - ben_to_ben32_lines(reader, writer, BenVariant::Standard) + ben_to_ben32_lines(reader, writer, XBenVariant::Standard) } #[test] @@ -274,13 +274,13 @@ fn test_ben_to_ben32_lines_non_eof_error_on_frame_boundary() { }; let mut output = Vec::new(); - let err = ben_to_ben32_lines(reader, &mut output, BenVariant::Standard).unwrap_err(); + let err = ben_to_ben32_lines(reader, &mut output, XBenVariant::Standard).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } #[test] fn test_ben32_to_ben_line_rejects_invalid_length() { - let err = ben32_to_ben_line(vec![1, 2, 3], BenVariant::Standard, 0).unwrap_err(); + let err = ben32_to_ben_line(vec![1, 2, 3], XBenVariant::Standard, 0).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), @@ -290,7 +290,7 @@ fn test_ben32_to_ben_line_rejects_invalid_length() { #[test] fn test_ben32_to_ben_line_rejects_missing_terminator() { - let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1], BenVariant::Standard, 0).unwrap_err(); + let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1], XBenVariant::Standard, 0).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), @@ -305,7 +305,7 @@ fn test_ben32_to_ben_lines_preserves_mkv_counts() { ]; let mut output = Vec::new(); - ben32_to_ben_lines(&input[..], &mut output, BenVariant::MkvChain).unwrap(); + ben32_to_ben_lines(&input[..], &mut output, XBenVariant::MkvChain).unwrap(); let count = u16::from_be_bytes([output[output.len() - 2], output[output.len() - 1]]); assert_eq!(count, 5); @@ -337,7 +337,7 @@ fn test_ben_to_ben32_lines_propagates_non_eof_read_errors() { reads: 0, }, &mut output, - BenVariant::Standard, + XBenVariant::Standard, ) .unwrap_err(); @@ -355,7 +355,7 @@ fn test_ben32_to_ben_lines_propagates_non_eof_read_errors() { } } - let err = ben32_to_ben_lines(BoomReader, Vec::new(), BenVariant::Standard).unwrap_err(); + let err = ben32_to_ben_lines(BoomReader, Vec::new(), XBenVariant::Standard).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); assert_eq!(err.to_string(), "boom"); } @@ -371,22 +371,29 @@ fn test_ben_to_ben32_lines_mkv_roundtrip() { encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); let mut ben32 = Vec::new(); - ben_to_ben32_lines(&ben[17..], &mut ben32, BenVariant::MkvChain).unwrap(); + ben_to_ben32_lines(&ben[17..], &mut ben32, XBenVariant::MkvChain).unwrap(); let mut round = Vec::new(); - ben32_to_ben_lines(ben32.as_slice(), &mut round, BenVariant::MkvChain).unwrap(); + ben32_to_ben_lines(ben32.as_slice(), &mut round, XBenVariant::MkvChain).unwrap(); assert_eq!(round, ben[17..]); } #[test] -fn test_ben_to_ben32_lines_rejects_twodelta() { - let ben_data = vec![2, 3, 0, 0, 0, 2, 0xAB, 0xCD]; - let mut output = Vec::new(); - let err = - ben_to_ben32_lines(ben_data.as_slice(), &mut output, BenVariant::TwoDelta).unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::Unsupported); - assert!(err.to_string().contains("TwoDelta")); +fn test_xben_variant_try_from_rejects_twodelta() { + use crate::TwoDeltaNotXBenError; + assert_eq!( + XBenVariant::try_from(BenVariant::Standard).unwrap(), + XBenVariant::Standard + ); + assert_eq!( + XBenVariant::try_from(BenVariant::MkvChain).unwrap(), + XBenVariant::MkvChain + ); + assert_eq!( + XBenVariant::try_from(BenVariant::TwoDelta).unwrap_err(), + TwoDeltaNotXBenError + ); } #[test] diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index c9593e0..3e4655c 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -323,14 +323,17 @@ impl XBenInner { reader.consume(BANNER_LEN); } - if variant == BenVariant::TwoDelta { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "TwoDelta XBEN translation requires a BEN stream with its banner", - )); - } + let xben_variant = match crate::XBenVariant::try_from(variant) { + Ok(v) => v, + Err(_) => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta XBEN translation requires a BEN stream with its banner", + )); + } + }; - ben_to_ben32_lines(&mut reader, &mut self.encoder, variant) + ben_to_ben32_lines(&mut reader, &mut self.encoder, xben_variant) } } diff --git a/ben/src/lib.rs b/ben/src/lib.rs index c36bf1c..afa1056 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -55,3 +55,50 @@ pub enum BenVariant { /// Store delta-encoded frames for improved compression of correlated samples. TwoDelta, } + +/// The subset of [`BenVariant`] values that pass through the BEN32 intermediate +/// wire format (see `docs/glossary.md`). +/// +/// `TwoDelta` streams use a separate XBEN columnar layout and are intentionally +/// excluded; functions parameterised by `XBenVariant` cannot be called for +/// TwoDelta at compile time. Convert with `From for BenVariant` +/// or `TryFrom for XBenVariant`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum XBenVariant { + Standard, + MkvChain, +} + +impl From for BenVariant { + fn from(v: XBenVariant) -> Self { + match v { + XBenVariant::Standard => BenVariant::Standard, + XBenVariant::MkvChain => BenVariant::MkvChain, + } + } +} + +/// Returned by `TryFrom for XBenVariant` when the input is +/// `TwoDelta`, which has no BEN32 representation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TwoDeltaNotXBenError; + +impl std::fmt::Display for TwoDeltaNotXBenError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("TwoDelta has no BEN32 representation; use the XBEN columnar layout instead") + } +} + +impl std::error::Error for TwoDeltaNotXBenError {} + +impl TryFrom for XBenVariant { + type Error = TwoDeltaNotXBenError; + + fn try_from(v: BenVariant) -> Result { + match v { + BenVariant::Standard => Ok(XBenVariant::Standard), + BenVariant::MkvChain => Ok(XBenVariant::MkvChain), + BenVariant::TwoDelta => Err(TwoDeltaNotXBenError), + } + } +} From aa185d8fa7cc80f1c79fe78f2eb6415fc42bdb39 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 21:16:24 -0600 Subject: [PATCH 095/221] update pcompress translation to be more consistent --- ben/src/cli/pcben/args.rs | 46 +++++ ben/src/cli/pcben/mod.rs | 267 ++------------------------ ben/src/cli/pcben/modes/ben_to_pc.rs | 40 ++++ ben/src/cli/pcben/modes/mod.rs | 9 + ben/src/cli/pcben/modes/pc_to_ben.rs | 40 ++++ ben/src/cli/pcben/modes/pc_to_xben.rs | 40 ++++ ben/src/cli/pcben/paths.rs | 43 +++++ ben/src/cli/pcben/tests.rs | 9 +- ben/src/cli/pcben/translate.rs | 89 +++++++++ 9 files changed, 329 insertions(+), 254 deletions(-) create mode 100644 ben/src/cli/pcben/args.rs create mode 100644 ben/src/cli/pcben/modes/ben_to_pc.rs create mode 100644 ben/src/cli/pcben/modes/mod.rs create mode 100644 ben/src/cli/pcben/modes/pc_to_ben.rs create mode 100644 ben/src/cli/pcben/modes/pc_to_xben.rs create mode 100644 ben/src/cli/pcben/paths.rs create mode 100644 ben/src/cli/pcben/translate.rs diff --git a/ben/src/cli/pcben/args.rs b/ben/src/cli/pcben/args.rs new file mode 100644 index 0000000..13fb568 --- /dev/null +++ b/ben/src/cli/pcben/args.rs @@ -0,0 +1,46 @@ +//! `pcben` CLI argument definitions. + +use clap::{Parser, ValueEnum}; + +#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] +/// Defines the mode of operation. +pub(super) enum Mode { + /// Convert BEN into PCOMPRESS. + BenToPc, + /// Convert PCOMPRESS into BEN. + PcToBen, + /// Convert PCOMPRESS into XBEN. + PcToXben, +} + +#[derive(Parser, Debug)] +#[command( + name = "Conversion tool for BEN and PCOMPRESS formats", + about = "This is a CLI tool that allows for the conversion between BEN and PCOMPRESS formats.", + version +)] +/// Defines the command line arguments accepted by the program. +pub(super) struct Args { + /// Mode to run the program in + #[arg(short, long, value_enum)] + pub(super) mode: Mode, + /// Input file to read from. + #[arg(short, long)] + pub(super) input_file: Option, + /// Output file to write to. Optional. + /// If not provided, the output file will be determined + /// based on the input file and the mode of operation. + #[arg(short, long)] + pub(super) output_file: Option, + /// If the output file already exists, this flag + /// will cause the program to overwrite it without + /// asking the user for confirmation. + #[arg(short = 'w', long)] + pub(super) overwrite: bool, + /// Enables verbose printing for the CLI. Optional. + #[arg(short, long)] + pub(super) verbose: bool, + /// Suppress in-place progress spinners. Trace logging is unaffected. + #[arg(short = 'q', long)] + pub(super) quiet: bool, +} diff --git a/ben/src/cli/pcben/mod.rs b/ben/src/cli/pcben/mod.rs index 517fbd9..9a0e850 100644 --- a/ben/src/cli/pcben/mod.rs +++ b/ben/src/cli/pcben/mod.rs @@ -1,58 +1,17 @@ -use crate::cli::common::{check_overwrite, set_quiet, set_verbose, CliError, CliResult}; -use crate::io::reader::BenStreamReader; -use crate::io::writer::BenStreamWriter; -use crate::BenVariant; -use clap::{Parser, ValueEnum}; -use pipe::pipe; -use serde_json::json; -use std::{ - fs::File, - io::{self, BufRead, BufReader, BufWriter, Read, Write}, -}; -use xz2::write::XzEncoder; +//! `pcben` CLI: convert between BEN/XBEN and the foreign PCOMPRESS format. -#[derive(Parser, Debug, Clone, ValueEnum, PartialEq)] -/// Defines the mode of operation. -enum Mode { - /// Convert BEN into PCOMPRESS. - BenToPc, - /// Convert PCOMPRESS into BEN. - PcToBen, - /// Convert PCOMPRESS into XBEN. - PcToXben, -} +mod args; +mod modes; +mod paths; +mod translate; -#[derive(Parser, Debug)] -#[command( - name = "Conversion tool for BEN and PCOMPRESS formats", - about = "This is a CLI tool that allows for the conversion between BEN and PCOMPRESS formats.", - version -)] -/// Defines the command line arguments accepted by the program. -struct Args { - /// Mode to run the program in - #[arg(short, long, value_enum)] - mode: Mode, - /// Input file to read from. - #[arg(short, long)] - input_file: Option, - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. - #[arg(short, long)] - output_file: Option, - /// If the output file already exists, this flag - /// will cause the program to overwrite it without - /// asking the user for confirmation. - #[arg(short = 'w', long)] - overwrite: bool, - /// Enables verbose printing for the CLI. Optional. - #[arg(short, long)] - verbose: bool, - /// Suppress in-place progress spinners. Trace logging is unaffected. - #[arg(short = 'q', long)] - quiet: bool, -} +#[cfg(test)] +mod tests; + +use args::{Args, Mode}; + +use crate::cli::common::{set_quiet, set_verbose, CliResult}; +use clap::Parser; /// Parse CLI arguments and execute the selected `pcben` conversion. pub fn run() -> CliResult { @@ -61,204 +20,8 @@ pub fn run() -> CliResult { set_quiet(args.quiet); match args.mode { - Mode::BenToPc => { - tracing::trace!("Converting BEN to PCOMPRESS"); - - let ben_reader: Box = match args.input_file.as_ref() { - Some(file) => Box::new(BufReader::new(File::open(file)?)), - None => Box::new(io::stdin()), - }; - - let mut pcompress_writer: BufWriter> = match resolved_output_path( - Mode::BenToPc, - args.input_file.as_deref(), - args.output_file.as_deref(), - args.overwrite, - )? { - Some(file) => BufWriter::new(Box::new(File::create(file)?)), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - - let _ = std::thread::spawn(move || -> io::Result<()> { - assignment_decode_ben(ben_reader, pipe_writer) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - pcompress::encode::encode(&mut buf_pipe_reader, &mut pcompress_writer, false); - Ok(()) - } - Mode::PcToBen => { - tracing::trace!("Converting PCOMPRESS to BEN"); - - let mut pcompress_reader: BufReader> = match args - .input_file - .as_ref() - { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), - None => BufReader::new(Box::new(io::stdin())), - }; - - let mut ben_writer: BufWriter> = match resolved_output_path( - Mode::PcToBen, - args.input_file.as_deref(), - args.output_file.as_deref(), - args.overwrite, - )? { - Some(file) => BufWriter::new(Box::new(File::create(file)?)), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - let mut buf_pipe_writer = BufWriter::new(pipe_writer); - - let _ = std::thread::spawn(move || { - pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) - } - Mode::PcToXben => { - tracing::trace!("Converting PCOMPRESS to XBEN"); - - let mut pcompress_reader: BufReader> = match args - .input_file - .as_ref() - { - Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), - None => BufReader::new(Box::new(io::stdin())), - }; - - let mut ben_writer: BufWriter> = match resolved_output_path( - Mode::PcToXben, - args.input_file.as_deref(), - args.output_file.as_deref(), - args.overwrite, - )? { - Some(file) => BufWriter::new(Box::new(File::create(file)?)), - None => BufWriter::new(Box::new(io::stdout())), - }; - - let (pipe_reader, pipe_writer) = pipe(); - let mut buf_pipe_writer = BufWriter::new(pipe_writer); - - let _ = std::thread::spawn(move || { - pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) - }); - - let mut buf_pipe_reader = BufReader::new(pipe_reader); - assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) - } + Mode::BenToPc => modes::ben_to_pc::run(args), + Mode::PcToBen => modes::pc_to_ben::run(args), + Mode::PcToXben => modes::pc_to_xben::run(args), } } - -/// Resolve the output file path for a `pcben` mode. -fn resolved_output_path( - mode: Mode, - input_file: Option<&str>, - output_file: Option<&str>, - overwrite: bool, -) -> io::Result> { - let Some(path) = output_file - .map(ToOwned::to_owned) - .or_else(|| input_file.map(|input| derive_output_path(mode, input))) - else { - return Ok(None); - }; - - check_overwrite(&path, overwrite)?; - Ok(Some(path)) -} - -/// Derive the default output file name for a `pcben` conversion mode. -fn derive_output_path(mode: Mode, input_file: &str) -> String { - match mode { - Mode::BenToPc => input_file - .strip_suffix(".ben") - .map(|prefix| format!("{prefix}.pcompress")) - .unwrap_or_else(|| format!("{input_file}.pcompress")), - Mode::PcToBen => input_file - .strip_suffix(".pcompress") - .or_else(|| input_file.strip_suffix(".pc")) - .map(|prefix| format!("{prefix}.ben")) - .unwrap_or_else(|| format!("{input_file}.ben")), - Mode::PcToXben => input_file - .strip_suffix(".pcompress") - .or_else(|| input_file.strip_suffix(".pc")) - .map(|prefix| format!("{prefix}.xben")) - .unwrap_or_else(|| format!("{input_file}.xben")), - } -} - -/// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. -fn assignment_decode_ben(mut reader: R, mut writer: W) -> io::Result<()> { - let ben_reader = BenStreamReader::from_ben(&mut reader)?; - let mut line = String::new(); - - for result in ben_reader { - match result { - Ok((assignment, count)) => { - render_zero_based_assignment_line(&assignment, &mut line); - for _ in 0..count { - writeln!(writer, "{line}")?; - } - } - Err(e) => return Err(e), - } - } - - Ok(()) -} - -/// Render a BEN assignment vector as a zero-based JSON array for PCOMPRESS. -fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { - output.clear(); - output.push('['); - for (idx, value) in assignment.iter().enumerate() { - if idx > 0 { - output.push(','); - } - output.push_str(&value.saturating_sub(1).to_string()); - } - output.push(']'); -} - -/// Read zero-based assignment vectors and encode them as BEN. -fn assignment_encode_ben(reader: R, writer: W) -> io::Result<()> { - let mut ben_writer = BenStreamWriter::for_ben(writer, BenVariant::MkvChain)?; - - for line in reader.lines() { - let assignment: Vec = serde_json::from_str::>(&line.unwrap()) - .unwrap() - .into_iter() - .map(|x| x as u16 + 1) - .collect(); - ben_writer.write_assignment(assignment)?; - } - ben_writer.finish()?; - Ok(()) -} - -/// Read zero-based assignment vectors and encode them as XBEN. -fn assignment_encode_xben(reader: R, writer: W) -> io::Result<()> { - let encoder = XzEncoder::new(writer, 9); - let mut xben_writer = - BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None)?; - - for line in reader.lines() { - let assignment: Vec = serde_json::from_str::>(&line.unwrap()) - .unwrap() - .into_iter() - .map(|x| x as u16 + 1) - .collect(); - xben_writer.write_json_value(json!({ "assignment": assignment }))?; - } - xben_writer.finish()?; - - Ok(()) -} - -#[cfg(test)] -mod tests; diff --git a/ben/src/cli/pcben/modes/ben_to_pc.rs b/ben/src/cli/pcben/modes/ben_to_pc.rs new file mode 100644 index 0000000..b53e70b --- /dev/null +++ b/ben/src/cli/pcben/modes/ben_to_pc.rs @@ -0,0 +1,40 @@ +//! `pcben --mode ben-to-pc` handler. + +use super::super::args::{Args, Mode}; +use super::super::paths::resolved_output_path; +use super::super::translate::assignment_decode_ben; + +use crate::cli::common::CliResult; +use pipe::pipe; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, Read}; + +/// Execute the `ben-to-pc` sub-mode. +pub(in crate::cli::pcben) fn run(args: Args) -> CliResult { + tracing::trace!("Converting BEN to PCOMPRESS"); + + let ben_reader: Box = match args.input_file.as_ref() { + Some(file) => Box::new(BufReader::new(File::open(file)?)), + None => Box::new(io::stdin()), + }; + + let mut pcompress_writer: BufWriter> = match resolved_output_path( + Mode::BenToPc, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file)?)), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + + let _ = std::thread::spawn(move || -> io::Result<()> { + assignment_decode_ben(ben_reader, pipe_writer) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + pcompress::encode::encode(&mut buf_pipe_reader, &mut pcompress_writer, false); + Ok(()) +} diff --git a/ben/src/cli/pcben/modes/mod.rs b/ben/src/cli/pcben/modes/mod.rs new file mode 100644 index 0000000..d5a78dc --- /dev/null +++ b/ben/src/cli/pcben/modes/mod.rs @@ -0,0 +1,9 @@ +//! Per-mode handlers for the `pcben` CLI. +//! +//! The dispatcher in `super::run` matches on the parsed `Mode` enum and +//! forwards to one of these handlers. Splitting one handler per file keeps +//! each mode under ~40 lines and makes them individually testable. + +pub(super) mod ben_to_pc; +pub(super) mod pc_to_ben; +pub(super) mod pc_to_xben; diff --git a/ben/src/cli/pcben/modes/pc_to_ben.rs b/ben/src/cli/pcben/modes/pc_to_ben.rs new file mode 100644 index 0000000..594da2b --- /dev/null +++ b/ben/src/cli/pcben/modes/pc_to_ben.rs @@ -0,0 +1,40 @@ +//! `pcben --mode pc-to-ben` handler. + +use super::super::args::{Args, Mode}; +use super::super::paths::resolved_output_path; +use super::super::translate::assignment_encode_ben; + +use crate::cli::common::{CliError, CliResult}; +use pipe::pipe; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, Read}; + +/// Execute the `pc-to-ben` sub-mode. +pub(in crate::cli::pcben) fn run(args: Args) -> CliResult { + tracing::trace!("Converting PCOMPRESS to BEN"); + + let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), + None => BufReader::new(Box::new(io::stdin())), + }; + + let mut ben_writer: BufWriter> = match resolved_output_path( + Mode::PcToBen, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file)?)), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + let mut buf_pipe_writer = BufWriter::new(pipe_writer); + + let _ = std::thread::spawn(move || { + pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + assignment_encode_ben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) +} diff --git a/ben/src/cli/pcben/modes/pc_to_xben.rs b/ben/src/cli/pcben/modes/pc_to_xben.rs new file mode 100644 index 0000000..e4d3915 --- /dev/null +++ b/ben/src/cli/pcben/modes/pc_to_xben.rs @@ -0,0 +1,40 @@ +//! `pcben --mode pc-to-xben` handler. + +use super::super::args::{Args, Mode}; +use super::super::paths::resolved_output_path; +use super::super::translate::assignment_encode_xben; + +use crate::cli::common::{CliError, CliResult}; +use pipe::pipe; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, Read}; + +/// Execute the `pc-to-xben` sub-mode. +pub(in crate::cli::pcben) fn run(args: Args) -> CliResult { + tracing::trace!("Converting PCOMPRESS to XBEN"); + + let mut pcompress_reader: BufReader> = match args.input_file.as_ref() { + Some(file) => BufReader::new(Box::new(BufReader::new(File::open(file)?))), + None => BufReader::new(Box::new(io::stdin())), + }; + + let mut ben_writer: BufWriter> = match resolved_output_path( + Mode::PcToXben, + args.input_file.as_deref(), + args.output_file.as_deref(), + args.overwrite, + )? { + Some(file) => BufWriter::new(Box::new(File::create(file)?)), + None => BufWriter::new(Box::new(io::stdout())), + }; + + let (pipe_reader, pipe_writer) = pipe(); + let mut buf_pipe_writer = BufWriter::new(pipe_writer); + + let _ = std::thread::spawn(move || { + pcompress::decode::decode(&mut pcompress_reader, &mut buf_pipe_writer, 0, false) + }); + + let mut buf_pipe_reader = BufReader::new(pipe_reader); + assignment_encode_xben(&mut buf_pipe_reader, &mut ben_writer).map_err(CliError::from) +} diff --git a/ben/src/cli/pcben/paths.rs b/ben/src/cli/pcben/paths.rs new file mode 100644 index 0000000..ba0c8b3 --- /dev/null +++ b/ben/src/cli/pcben/paths.rs @@ -0,0 +1,43 @@ +//! Output-path resolution helpers for the `pcben` CLI. + +use super::args::Mode; +use crate::cli::common::check_overwrite; +use std::io; + +/// Resolve the output file path for a `pcben` mode. +pub(super) fn resolved_output_path( + mode: Mode, + input_file: Option<&str>, + output_file: Option<&str>, + overwrite: bool, +) -> io::Result> { + let Some(path) = output_file + .map(ToOwned::to_owned) + .or_else(|| input_file.map(|input| derive_output_path(mode, input))) + else { + return Ok(None); + }; + + check_overwrite(&path, overwrite)?; + Ok(Some(path)) +} + +/// Derive the default output file name for a `pcben` conversion mode. +pub(super) fn derive_output_path(mode: Mode, input_file: &str) -> String { + match mode { + Mode::BenToPc => input_file + .strip_suffix(".ben") + .map(|prefix| format!("{prefix}.pcompress")) + .unwrap_or_else(|| format!("{input_file}.pcompress")), + Mode::PcToBen => input_file + .strip_suffix(".pcompress") + .or_else(|| input_file.strip_suffix(".pc")) + .map(|prefix| format!("{prefix}.ben")) + .unwrap_or_else(|| format!("{input_file}.ben")), + Mode::PcToXben => input_file + .strip_suffix(".pcompress") + .or_else(|| input_file.strip_suffix(".pc")) + .map(|prefix| format!("{prefix}.xben")) + .unwrap_or_else(|| format!("{input_file}.xben")), + } +} diff --git a/ben/src/cli/pcben/tests.rs b/ben/src/cli/pcben/tests.rs index 1b75321..800513e 100644 --- a/ben/src/cli/pcben/tests.rs +++ b/ben/src/cli/pcben/tests.rs @@ -1,8 +1,13 @@ -use super::*; +use super::args::{Args, Mode}; +use super::paths::{derive_output_path, resolved_output_path}; +use super::translate::{ + assignment_decode_ben, assignment_encode_ben, assignment_encode_xben, +}; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; use crate::codec::encode::encode_jsonl_to_ben; +use crate::BenVariant; use clap::{CommandFactory, Parser}; -use std::io::{BufReader, Cursor}; +use std::io::{self, BufReader, Cursor}; #[test] fn clap_metadata_uses_package_version() { diff --git a/ben/src/cli/pcben/translate.rs b/ben/src/cli/pcben/translate.rs new file mode 100644 index 0000000..27db926 --- /dev/null +++ b/ben/src/cli/pcben/translate.rs @@ -0,0 +1,89 @@ +//! BEN ↔ PCOMPRESS assignment translation helpers. +//! +//! PCOMPRESS uses zero-based district ids; BEN uses one-based. These +//! helpers bridge the two conventions so the per-mode handlers can be +//! kept short. + +use crate::io::reader::BenStreamReader; +use crate::io::writer::BenStreamWriter; +use crate::BenVariant; +use serde_json::json; +use std::io::{self, BufRead, Read, Write}; +use xz2::write::XzEncoder; + +/// Decode BEN and emit one zero-based assignment vector per line for PCOMPRESS. +pub(super) fn assignment_decode_ben( + mut reader: R, + mut writer: W, +) -> io::Result<()> { + let ben_reader = BenStreamReader::from_ben(&mut reader)?; + let mut line = String::new(); + + for result in ben_reader { + match result { + Ok((assignment, count)) => { + render_zero_based_assignment_line(&assignment, &mut line); + for _ in 0..count { + writeln!(writer, "{line}")?; + } + } + Err(e) => return Err(e), + } + } + + Ok(()) +} + +/// Render a BEN assignment vector as a zero-based JSON array for PCOMPRESS. +fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { + output.clear(); + output.push('['); + for (idx, value) in assignment.iter().enumerate() { + if idx > 0 { + output.push(','); + } + output.push_str(&value.saturating_sub(1).to_string()); + } + output.push(']'); +} + +/// Read zero-based assignment vectors and encode them as BEN. +pub(super) fn assignment_encode_ben( + reader: R, + writer: W, +) -> io::Result<()> { + let mut ben_writer = BenStreamWriter::for_ben(writer, BenVariant::MkvChain)?; + + for line in reader.lines() { + let assignment: Vec = serde_json::from_str::>(&line.unwrap()) + .unwrap() + .into_iter() + .map(|x| x as u16 + 1) + .collect(); + ben_writer.write_assignment(assignment)?; + } + ben_writer.finish()?; + Ok(()) +} + +/// Read zero-based assignment vectors and encode them as XBEN. +pub(super) fn assignment_encode_xben( + reader: R, + writer: W, +) -> io::Result<()> { + let encoder = XzEncoder::new(writer, 9); + let mut xben_writer = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None)?; + + for line in reader.lines() { + let assignment: Vec = serde_json::from_str::>(&line.unwrap()) + .unwrap() + .into_iter() + .map(|x| x as u16 + 1) + .collect(); + xben_writer.write_json_value(json!({ "assignment": assignment }))?; + } + xben_writer.finish()?; + + Ok(()) +} From 22632408c5fab61bcfa01d7faaaede9782bb16c9 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 21:48:33 -0600 Subject: [PATCH 096/221] Update bundle protocol --- ben-py/src/encode/encoder.rs | 221 ++++++--------- ben-py/src/encode/helpers.rs | 10 - ben-py/src/encode/mod.rs | 1 - ben-py/src/encode/types.rs | 41 --- ben/src/cli/ben/bundle.rs | 30 +- ben/src/cli/ben/tests.rs | 16 +- ben/src/cli/bendl/create.rs | 12 +- ben/src/cli/bendl/tests.rs | 10 +- ben/src/io/bundle/mod.rs | 2 +- ben/src/io/bundle/tests/writer.rs | 377 +++++++++++++++++-------- ben/src/io/bundle/writer.rs | 247 ++++++++++------ ben/src/io/writer/stream_writer/mod.rs | 65 ++++- ben/src/test_utils.rs | 6 +- ben/tests/test_stress_edges.rs | 6 +- 14 files changed, 599 insertions(+), 445 deletions(-) delete mode 100644 ben-py/src/encode/types.rs diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 0ee0d9b..ab91837 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -1,23 +1,52 @@ -use super::helpers::{parse_graph_input, xz_compress}; -use super::types::{OutputMode, SharedFileSlot, SharedFileWriter}; +use super::helpers::parse_graph_input; use crate::common::{open_output, parse_variant}; -use binary_ensemble::io::bundle::format::{ - encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_FLAG_JSON, - ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, STANDARDIZED_NAME_GRAPH, FINALIZED_YES, HEADER_SIZE, +use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; +use binary_ensemble::io::bundle::{ + AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, }; use binary_ensemble::io::writer::BenStreamWriter; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; -use std::cell::RefCell; -use std::io::{Seek, SeekFrom, Write}; +use std::fs::File; +use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -use std::rc::Rc; + +/// Per-call encoder state. The bundle path threads ownership of the +/// underlying file through `BendlWriter` → `BendlStreamSession` → +/// `BenStreamWriter`, so when `close()` runs we walk the chain back +/// from `BenStreamWriter::finish_into_inner` (returning the session) +/// to `BendlStreamSession::finish_into_writer` (returning the bundle +/// writer) to `BendlWriter::finish` (returning the buffered file). +enum EncoderState { + /// Plain `.ben` file path: writes directly to a buffered file with + /// no bundle framing. + BenOnly(BenStreamWriter>), + /// `.bendl` bundle path: the session owns the buffered file and the + /// `BenStreamWriter` writes through it. `sample_count` is tracked + /// alongside so it can be plumbed into `finish_into_writer` at + /// `close()` time. + BundleStreaming { + writer: BenStreamWriter>>, + sample_count: i64, + }, +} #[pyclass(name = "BenEncoder", unsendable)] pub struct PyBenEncoder { - file: Option, - encoder: Option>, - mode: OutputMode, + state: Option, +} + +impl PyBenEncoder { + fn map_bundle_err(err: BendlWriteError) -> PyErr { + match err { + BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + other => PyException::new_err(format!("{other}")), + } + } + + fn map_io_err(err: io::Error) -> PyErr { + PyIOError::new_err(format!("{err}")) + } } #[pymethods] @@ -69,88 +98,62 @@ impl PyBenEncoder { } let buf = open_output(&file_path, overwrite)?; - let file: SharedFileSlot = Rc::new(RefCell::new(buf)); - let mode = if ben_file_only { - OutputMode::BenOnly + let state = if ben_file_only { + EncoderState::BenOnly( + BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?, + ) } else { - let graph_bytes = match graph { - Some(obj) => Some(parse_graph_input(py, &obj)?), - None => None, - }; - - // Write a provisional bundle header and any graph asset before - // the assignment stream begins. - let mut header = BendlHeader::provisional(AssignmentFormat::Ben, HEADER_SIZE as u64); - let mut entries: Vec = Vec::new(); - { - let mut slot = file.borrow_mut(); - slot.seek(SeekFrom::Start(0)) - .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; - header.write_to(&mut *slot).map_err(|e| { - PyIOError::new_err(format!("Failed to write bundle header: {e}")) - })?; - - if let Some(bytes) = graph_bytes { - let compressed = xz_compress(&bytes).map_err(|e| { - PyIOError::new_err(format!("Failed to xz-compress graph asset: {e}")) - })?; - let payload_offset = slot.stream_position().map_err(|e| { - PyIOError::new_err(format!("Failed to query output position: {e}")) - })?; - slot.write_all(&compressed).map_err(|e| { - PyIOError::new_err(format!("Failed to write graph asset payload: {e}")) - })?; - entries.push(BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: STANDARDIZED_NAME_GRAPH.to_string(), - payload_offset, - payload_len: compressed.len() as u64, - checksum: None, - }); - } + // Bundle path. Add the optional graph asset before opening + // the stream session — the bundle writer auto-compresses + // graphs (default_compresses_by_type), so we hand it raw + // JSON bytes and let it apply the XZ flag. + let mut writer = + BendlWriter::new(buf, AssignmentFormat::Ben).map_err(Self::map_io_err)?; + if let Some(graph_obj) = graph { + let raw = parse_graph_input(py, &graph_obj)?; + writer + .add_known_asset( + KnownAssetKind::Graph, + &raw, + AddAssetOptions::defaults().json(), + ) + .map_err(Self::map_bundle_err)?; } - - let stream_start = file - .borrow_mut() - .stream_position() - .map_err(|e| PyIOError::new_err(format!("Failed to query output position: {e}")))?; - header.stream_offset = stream_start; - - OutputMode::Bundle { - header, - entries, - stream_start, + let session = writer + .into_stream_session() + .map_err(Self::map_bundle_err)?; + let writer = BenStreamWriter::for_ben(session, ben_var).map_err(Self::map_io_err)?; + EncoderState::BundleStreaming { + writer, sample_count: 0, } }; - // Construct the BenStreamWriter on a clone of the shared slot. - // This writes the BEN banner as its first action, which in the - // bundle case becomes the first byte of the stream region. - let encoder = BenStreamWriter::for_ben(SharedFileWriter(Rc::clone(&file)), ben_var) - .map_err(|e| PyIOError::new_err(format!("Failed to create encoder: {e}")))?; - - Ok(PyBenEncoder { - file: Some(file), - encoder: Some(encoder), - mode, - }) + Ok(Self { state: Some(state) }) } /// Encode a single assignment and append it to the output stream. #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { - let enc = self - .encoder + let state = self + .state .as_mut() .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; - enc.write_assignment(assignment) - .map_err(|e| PyIOError::new_err(format!("Failed to encode assignment: {e}")))?; - if let OutputMode::Bundle { sample_count, .. } = &mut self.mode { - *sample_count += 1; + match state { + EncoderState::BenOnly(w) => { + w.write_assignment(assignment).map_err(Self::map_io_err)?; + } + EncoderState::BundleStreaming { + writer, + sample_count, + } => { + writer + .write_assignment(assignment) + .map_err(Self::map_io_err)?; + *sample_count += 1; + } } Ok(()) } @@ -158,61 +161,21 @@ impl PyBenEncoder { /// Flush the assignment stream and, for bundle output, patch the /// header and write the trailing directory. Idempotent. fn close(&mut self) -> PyResult<()> { - // Finish the assignment stream and drop the inner encoder so its - // Rc handle to the shared file slot is released. - if let Some(mut enc) = self.encoder.take() { - enc.finish().map_err(|e| { - PyIOError::new_err(format!("Failed to flush encoder when closing: {e}")) - })?; - drop(enc); - } - - let file = match self.file.take() { - Some(f) => f, - None => return Ok(()), + let Some(state) = self.state.take() else { + return Ok(()); }; - - match &mut self.mode { - OutputMode::BenOnly => { - file.borrow_mut() - .flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + match state { + EncoderState::BenOnly(writer) => { + let mut buf = writer.finish_into_inner().map_err(Self::map_io_err)?; + buf.flush().map_err(Self::map_io_err)?; } - OutputMode::Bundle { - header, - entries, - stream_start, + EncoderState::BundleStreaming { + writer, sample_count, } => { - let mut slot = file.borrow_mut(); - let stream_end = slot.stream_position().map_err(|e| { - PyIOError::new_err(format!("Failed to query output position: {e}")) - })?; - let stream_len = stream_end.saturating_sub(*stream_start); - - let directory_offset = stream_end; - let directory_bytes = encode_directory(entries).map_err(|e| { - PyException::new_err(format!("Failed to encode bundle directory: {e}")) - })?; - slot.write_all(&directory_bytes).map_err(|e| { - PyIOError::new_err(format!("Failed to write bundle directory: {e}")) - })?; - let directory_len = directory_bytes.len() as u64; - - header.stream_offset = *stream_start; - header.stream_len = stream_len; - header.directory_offset = directory_offset; - header.directory_len = directory_len; - header.sample_count = *sample_count; - header.finalized = FINALIZED_YES; - - slot.seek(SeekFrom::Start(0)) - .map_err(|e| PyIOError::new_err(format!("Failed to seek output: {e}")))?; - header.write_to(&mut *slot).map_err(|e| { - PyIOError::new_err(format!("Failed to patch bundle header: {e}")) - })?; - slot.flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + let session = writer.finish_into_inner().map_err(Self::map_io_err)?; + let bundle = session.finish_into_writer(sample_count); + bundle.finish().map_err(Self::map_bundle_err)?; } } Ok(()) diff --git a/ben-py/src/encode/helpers.rs b/ben-py/src/encode/helpers.rs index 125a319..17335e3 100644 --- a/ben-py/src/encode/helpers.rs +++ b/ben-py/src/encode/helpers.rs @@ -1,17 +1,7 @@ -use binary_ensemble::io::bundle::format::DEFAULT_XZ_PRESET; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyDict, PyList}; -use std::io::{self, Write}; use std::path::PathBuf; -use xz2::write::XzEncoder; - -/// xz-compress a byte slice with the bundle's default preset. -pub(super) fn xz_compress(bytes: &[u8]) -> io::Result> { - let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); - encoder.write_all(bytes)?; - encoder.finish() -} /// Normalize a user-supplied graph argument into raw UTF-8 JSON bytes. /// diff --git a/ben-py/src/encode/mod.rs b/ben-py/src/encode/mod.rs index 57e6ae6..a02afc1 100644 --- a/ben-py/src/encode/mod.rs +++ b/ben-py/src/encode/mod.rs @@ -3,7 +3,6 @@ mod encoder; mod helpers; mod py_funcs; -mod types; pub use encoder::PyBenEncoder; pub use py_funcs::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; diff --git a/ben-py/src/encode/types.rs b/ben-py/src/encode/types.rs deleted file mode 100644 index 19600d1..0000000 --- a/ben-py/src/encode/types.rs +++ /dev/null @@ -1,41 +0,0 @@ -use binary_ensemble::io::bundle::format::{BendlDirectoryEntry, BendlHeader}; -use std::cell::RefCell; -use std::fs::File; -use std::io::{self, BufWriter, Write}; -use std::rc::Rc; - -/// Handle to the underlying output file shared between the live -/// `BenStreamWriter` and the `PyBenEncoder` that owns it. Needed so the -/// encoder can reach the buffered file after the inner assignment writer -/// has finished, in order to patch the bundle header and write the -/// trailing directory. -pub(super) type SharedFileSlot = Rc>>; - -/// Wrapper around a shared buffered file that implements `Write`. The -/// `BenStreamWriter` holds one of these and delegates every write into -/// the shared slot. -pub(super) struct SharedFileWriter(pub SharedFileSlot); - -impl Write for SharedFileWriter { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.0.borrow_mut().write(buf) - } - - fn flush(&mut self) -> io::Result<()> { - self.0.borrow_mut().flush() - } -} - -/// Output container produced by `PyBenEncoder`. -pub(super) enum OutputMode { - /// Plain `.ben` file: just the assignment stream, no header or directory. - BenOnly, - /// `.bendl` bundle: provisional header up front, optional graph asset, - /// then the assignment stream, then a directory written at close time. - Bundle { - header: BendlHeader, - entries: Vec, - stream_start: u64, - sample_count: i64, - }, -} diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index b63d827..4f3f825 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -64,18 +64,16 @@ pub(super) fn run_encode_bundle_with_graph( let sample_count = count_jsonl_lines(input_path)?; let out_file = File::create(out_path)?; - let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) + let bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let mut session = bendl_writer + .into_stream_session() .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; { - let mut handle = bendl_writer - .begin_stream() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; let reader = BufReader::new(File::open(input_path)?); - encode_jsonl_to_ben(reader, &mut handle, variant)?; - handle - .finish(sample_count) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + encode_jsonl_to_ben(reader, &mut session, variant)?; } + let bendl_writer = session.finish_into_writer(sample_count); bendl_writer .finish() .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; @@ -111,17 +109,17 @@ pub(super) fn run_xencode_bundle_with_graph( }; let out_file = File::create(out_path)?; - let mut bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) + let bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + let mut session = bendl_writer + .into_stream_session() .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; { - let mut handle = bendl_writer - .begin_stream() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; let reader = BufReader::new(File::open(input_path)?); if from_ben { encode_ben_to_xben( reader, - &mut handle, + &mut session, n_threads, compression_level, chunk_size, @@ -130,7 +128,7 @@ pub(super) fn run_xencode_bundle_with_graph( } else { encode_jsonl_to_xben( reader, - &mut handle, + &mut session, variant, n_threads, compression_level, @@ -138,10 +136,8 @@ pub(super) fn run_xencode_bundle_with_graph( block_size, )?; } - handle - .finish(sample_count) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; } + let bendl_writer = session.finish_into_writer(sample_count); bendl_writer .finish() .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index 285f748..e322bd2 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -330,8 +330,10 @@ fn append_graph_asset_adds_graph_to_bundle() { // Build a minimal finalized .bendl in memory, write to temp file. let mut buf: Vec = Vec::new(); { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); } let bendl_path = unique_path("append_graph.bendl"); @@ -450,8 +452,10 @@ fn append_graph_asset_errors_on_missing_graph_file() { let mut buf: Vec = Vec::new(); { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); } let bendl_path = unique_path("err_graph.bendl"); @@ -525,7 +529,9 @@ fn append_graph_asset_errors_when_bundle_already_has_graph() { AddAssetOptions::defaults().json(), ) .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); } let bendl_path = unique_path("dup_graph.bendl"); diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs index 021525f..e2c5447 100644 --- a/ben/src/cli/bendl/create.rs +++ b/ben/src/cli/bendl/create.rs @@ -58,19 +58,17 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { // Stream phase: copy bytes from the input file directly into the // bundle's stream region. This preserves the exact BEN/XBEN bytes. + let mut session = writer + .into_stream_session() + .map_err(|e| format!("failed to open stream region: {e}"))?; { - let mut handle = writer - .begin_stream() - .map_err(|e| format!("failed to open stream region: {e}"))?; let mut input = BufReader::new( File::open(&args.input).map_err(|e| format!("failed to open {:?}: {e}", args.input))?, ); - io::copy(&mut input, &mut handle) + io::copy(&mut input, &mut session) .map_err(|e| format!("failed to copy assignment stream: {e}"))?; - handle - .finish(sample_count) - .map_err(|e| format!("failed to close stream region: {e}"))?; } + let writer = session.finish_into_writer(sample_count); writer .finish() diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index 1925508..f5d0573 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -9,7 +9,7 @@ use crate::io::bundle::format::AssignmentFormat; use crate::io::bundle::{BendlReader, BendlWriter}; use crate::test_utils::{sample_bendl_bytes, unique_path}; use clap::Parser; -use std::io::{BufReader, Cursor}; +use std::io::{BufReader, Cursor, Write}; use std::path::PathBuf; use std::time::{SystemTime, UNIX_EPOCH}; @@ -110,7 +110,9 @@ fn run_inspect_xben_format_and_checksum_flag() { }, ) .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); let path = unique_path("inspect_xben.bendl"); std::fs::write(&path, &buf).unwrap(); @@ -389,7 +391,9 @@ fn run_extract_asset_by_name() { AddAssetOptions::defaults(), ) .unwrap(); - writer.write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); let bendl = unique_path("extract_asset.bendl"); std::fs::write(&bendl, &buf).unwrap(); diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index 57a1cd9..fc8d712 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -23,5 +23,5 @@ mod tests; pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; pub use writer::{ - AddAssetOptions, BendlStreamHandle, BendlWriteError, BendlWriter, BundleAssignmentStreamCtx, + AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, BundleAssignmentStreamCtx, }; diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index e449192..cd8628a 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -1,4 +1,4 @@ -use std::io::{self, Cursor, Read, Write}; +use std::io::{self, Cursor, Read, Seek, Write}; use crate::io::bundle::format::{ AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, @@ -15,6 +15,18 @@ fn make_buffer() -> Cursor> { Cursor::new(Vec::new()) } +/// Test helper: replicate the deleted `BendlWriter::write_stream_bytes` +/// using the owned-session chain. Used purely to keep test bodies short. +fn write_stream_bytes_via_session( + writer: BendlWriter>>, + bytes: &[u8], + sample_count: i64, +) -> BendlWriter>> { + let mut session = writer.into_stream_session().unwrap(); + session.write_all(bytes).unwrap(); + session.finish_into_writer(sample_count) +} + #[test] fn minimal_bundle_round_trip_through_reader() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); @@ -22,7 +34,7 @@ fn minimal_bundle_round_trip_through_reader() { .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", br#"{"note":"hello"}"#) .unwrap(); let stream_bytes = b"STANDARD BEN FILE\x00\x01fake".to_vec(); - writer.write_stream_bytes(&stream_bytes, 7).unwrap(); + let writer = write_stream_bytes_via_session(writer, &stream_bytes, 7); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -56,9 +68,7 @@ fn graph_asset_is_compressed_by_default() { writer .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -89,9 +99,7 @@ fn graph_asset_can_be_forced_raw() { AddAssetOptions::defaults().json().raw(), ) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -153,12 +161,10 @@ fn writer_rejects_duplicate_custom_name() { #[test] fn writer_rejects_asset_added_after_stream_begins() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - { - let mut handle = writer.begin_stream().unwrap(); - handle.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); - handle.finish(1).unwrap(); - } + // After a session has been finished, the writer is in `StreamWritten` + // and `add_*_asset` rejects further additions with `AssetsAfterStream`. + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let err = writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap_err(); @@ -185,9 +191,7 @@ fn finalized_directory_lives_at_eof() { writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); @@ -214,7 +218,7 @@ fn build_base_bundle() -> (Vec, (u64, u64)) { .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"version\":1}") .unwrap(); let stream = b"STANDARD BEN FILE\x00\x01\x02\x03\x04\x05stream bytes"; - writer.write_stream_bytes(stream, 3).unwrap(); + let writer = write_stream_bytes_via_session(writer, stream, 3); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); @@ -327,9 +331,7 @@ fn append_rejects_duplicate_custom_name_without_touching_file() { AddAssetOptions::defaults(), ) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let bundle = writer.finish().unwrap().into_inner(); let bundle_before = bundle.clone(); @@ -505,8 +507,8 @@ fn write_ben_stream_round_trips_through_assignment_reader() { vec![1, 1, 1, 1, 2, 2], ]; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = writer .write_ben_stream(BenVariant::MkvChain, |ctx| { for s in &samples { ctx.write_assignment(s.clone())?; @@ -545,8 +547,8 @@ fn write_xben_stream_round_trips_through_assignment_reader() { vec![1, 1, 2, 3, 4, 4], ]; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + let writer = writer .write_xben_stream(BenVariant::MkvChain, |ctx| { for s in &samples { ctx.write_assignment(s.clone())?; @@ -584,7 +586,7 @@ fn write_ben_stream_alongside_front_loaded_asset() { writer .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) .unwrap(); - writer + let writer = writer .write_ben_stream(BenVariant::Standard, |ctx| { for s in &samples { ctx.write_assignment(s.clone())?; @@ -623,8 +625,8 @@ fn write_ben_stream_alongside_front_loaded_asset() { fn open_assignment_reader_rejects_mismatched_format() { use crate::BenVariant; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = writer .write_ben_stream(BenVariant::Standard, |ctx| { ctx.write_assignment(vec![0, 1])?; Ok(()) @@ -658,60 +660,24 @@ fn fully_empty_bundle_finalizes_and_round_trips() { } #[test] -fn begin_stream_twice_returns_wrong_state_error() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - { - let handle = match writer.begin_stream() { - Ok(h) => h, - Err(_) => panic!("first begin_stream must succeed"), - }; - // Drop the handle without calling finish() — the writer is - // now stuck in the Streaming state. - drop(handle); - } - let err = writer - .begin_stream() - .err() - .expect("second begin_stream must fail"); - assert!(matches!(err, BendlWriteError::WrongState { .. })); -} - -#[test] -fn finish_from_streaming_state_errors() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - match writer.begin_stream() { - Ok(handle) => drop(handle), - Err(_) => panic!("begin_stream must succeed"), - } - // Intentionally leave the writer in the Streaming state. - let err = writer.finish().unwrap_err(); - assert!(matches!( - err, - BendlWriteError::WrongState { - found: "Streaming", - .. - } - )); -} - -#[test] -fn begin_stream_after_stream_written_returns_wrong_state() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); - // Writer is now in StreamWritten state; begin_stream must fail. - let err = writer - .begin_stream() - .err() - .expect("begin_stream after StreamWritten must fail"); - assert!(matches!( - err, - BendlWriteError::WrongState { +fn into_stream_session_after_stream_written_returns_wrong_state() { + // Regression fixture for the `into_stream_session` guard: a writer + // that has already finished one stream phase must reject a second + // attempt to enter the stream phase. Without this guard, a chained + // `into_stream_session → finish_into_writer → into_stream_session` + // would silently overwrite `header.stream_offset` and corrupt the + // bundle. This is the only runtime fixture for that guard. + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); + // Writer is now in StreamWritten state; into_stream_session must fail. + match writer.into_stream_session() { + Err(BendlWriteError::WrongState { + expected: "Assets", found: "StreamWritten", - .. - } - )); + }) => {} + Err(other) => panic!("expected WrongState, got {other:?}"), + Ok(_) => panic!("into_stream_session after StreamWritten must fail"), + } } #[test] @@ -730,9 +696,7 @@ fn stress_many_custom_assets_round_trip() { ) .unwrap(); } - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -800,9 +764,7 @@ fn append_does_not_disturb_front_loaded_asset_bytes() { writer .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let bundle = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(bundle.clone())).unwrap(); @@ -868,9 +830,7 @@ fn writer_asset_with_checksum_round_trips_through_reader() { }, ) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -881,12 +841,10 @@ fn writer_asset_with_checksum_round_trips_through_reader() { #[test] fn finished_writer_rejects_further_operations() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); // `finish` consumes `self`, which is itself the protection — there - // is no way to call add_asset/begin_stream afterwards. + // is no way to call add_asset / into_stream_session afterwards. let buf = writer.finish().unwrap().into_inner(); // The resulting buffer is a valid finalized bundle. let reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -928,9 +886,7 @@ fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00fake", 1) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let buf = writer.finish().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.assets().len(), 1); @@ -975,15 +931,16 @@ fn append_rejects_duplicate_name_across_existing_and_pending() { fn write_ben_stream_closure_error_short_circuits_finalize() { use crate::BenVariant; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let err = writer - .write_ben_stream(BenVariant::Standard, |_ctx| { - Err(io::Error::new(io::ErrorKind::Other, "boom")) - }) - .unwrap_err(); - match err { - BendlWriteError::Io(e) => assert_eq!(e.kind(), io::ErrorKind::Other), - other => panic!("expected Io(Other), got {other:?}"), + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + // BendlWriter doesn't implement Debug, so destructure via match + // rather than `.unwrap_err()`. + let result = writer.write_ben_stream(BenVariant::Standard, |_ctx| { + Err(io::Error::new(io::ErrorKind::Other, "boom")) + }); + match result { + Ok(_) => panic!("expected closure error to short-circuit"), + Err(BendlWriteError::Io(e)) => assert_eq!(e.kind(), io::ErrorKind::Other), + Err(other) => panic!("expected Io(Other), got {other:?}"), } } @@ -1049,9 +1006,7 @@ fn randomized_round_trip_many_custom_assets() { // assignment-complete. let sample_count: i64 = rng.random_range(0..=20); let fake_stream = b"STANDARD BEN FILE\x00\x01\x02payload".to_vec(); - writer - .write_stream_bytes(&fake_stream, sample_count) - .unwrap(); + let writer = write_stream_bytes_via_session(writer, &fake_stream, sample_count); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -1196,8 +1151,8 @@ fn write_ben_stream_json_value_and_sample_count() { use crate::BenVariant; use serde_json::json; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = writer .write_ben_stream(BenVariant::Standard, |ctx| { assert_eq!(ctx.sample_count(), 0); ctx.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1}))?; @@ -1222,8 +1177,8 @@ fn write_xben_stream_json_value() { use crate::BenVariant; use serde_json::json; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + let writer = writer .write_xben_stream(BenVariant::Standard, |ctx| { ctx.write_json_value(json!({"assignment": [10, 20], "sample": 1}))?; ctx.write_json_value(json!({"assignment": [30, 40], "sample": 2}))?; @@ -1240,13 +1195,16 @@ fn write_xben_stream_json_value() { assert_eq!(decoded, vec![vec![10, 20], vec![30, 40]]); } -// ── BendlStreamHandle: flush ───────────────────────────────────── +// ── BendlStreamSession: flush ──────────────────────────────────── #[test] -fn stream_handle_flush_succeeds() { - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let mut handle = writer.begin_stream().unwrap(); - handle.flush().unwrap(); +fn stream_session_flush_succeeds() { + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.flush().unwrap(); + // Discard the session — it would warn on Drop, but the test runner + // does not assert on log output, so this is fine for unit tests. + let _ = session.finish_into_writer(0); } // ── BendlAppender: checksum flag ──────────────────────────────── @@ -1298,8 +1256,8 @@ fn appender_rejects_bundle_with_trailing_directory_bytes() { fn finish_from_finished_state_errors() { use crate::BenVariant; - let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - writer + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer = writer .write_ben_stream(BenVariant::Standard, |ctx| { ctx.write_assignment(vec![1, 2])?; Ok(()) @@ -1311,3 +1269,180 @@ fn finish_from_finished_state_errors() { let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); assert!(reader.is_finalized()); } + +// ── Plan verification tests ────────────────────────────────────── + +/// Verification #4 from the plan: bundle byte-equivalence between the +/// closure-based `write_ben_stream` and the explicit +/// `into_stream_session` → `finish_into_writer` chain. +#[test] +fn bundle_byte_equivalent_via_closure_and_explicit_session_for_ben() { + use crate::io::writer::BenStreamWriter; + use crate::BenVariant; + + let samples: Vec> = vec![ + vec![0, 0, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], + vec![0, 1, 1, 1, 2, 2], + vec![1, 1, 1, 1, 2, 2], + ]; + + // Path A: closure-based. + let writer_a = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let writer_a = writer_a + .write_ben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf_a = writer_a.finish().unwrap().into_inner(); + + // Path B: explicit session. + let writer_b = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut session = writer_b.into_stream_session().unwrap(); + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::MkvChain).unwrap(); + for s in &samples { + ben.write_assignment(s.clone()).unwrap(); + } + ben.finish().unwrap(); + drop(ben); + let writer_b = session.finish_into_writer(samples.len() as i64); + let buf_b = writer_b.finish().unwrap().into_inner(); + + assert_eq!( + buf_a, buf_b, + "closure path and explicit session path must produce identical bundle bytes" + ); +} + +/// Verification #7: dropping a `BendlStreamSession` mid-flight must +/// leave the bundle on disk unfinalized (no directory written, header +/// `finalized != FINALIZED_YES`). +#[test] +fn bundle_streaming_session_drop_leaves_unfinalized() { + let mut buf: Vec = Vec::new(); + { + let writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00partial").unwrap(); + // Drop without finish_into_writer. + drop(session); + } + + // The bundle on disk has a provisional header, no directory. + let header = BendlHeader::read_from(&mut Cursor::new(&buf)).unwrap(); + assert_eq!( + header.finalized, FINALIZED_NO, + "dropped session must leave the bundle unfinalized" + ); +} + +/// Verification #8: bundle XBEN compression gate. Two paths should +/// produce identical bundle bytes — the closure helper +/// `write_xben_stream`, and an explicit session that wraps the bundle +/// preset xz encoder around `for_xben_with_encoder`. +#[test] +fn bundle_xben_byte_equivalent_closure_and_explicit_encoder() { + use crate::io::bundle::format::DEFAULT_XZ_PRESET; + use crate::io::writer::BenStreamWriter; + use crate::BenVariant; + use xz2::write::XzEncoder; + + let samples: Vec> = vec![ + vec![0, 1, 2, 3, 4, 5], + vec![1, 1, 2, 3, 4, 5], + vec![1, 1, 2, 3, 4, 4], + ]; + + // Path A: closure. + let writer_a = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + let writer_a = writer_a + .write_xben_stream(BenVariant::MkvChain, |ctx| { + for s in &samples { + ctx.write_assignment(s.clone())?; + } + Ok(()) + }) + .unwrap(); + let buf_a = writer_a.finish().unwrap().into_inner(); + + // Path B: explicit session + XzEncoder built with the bundle preset. + let writer_b = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); + let mut session = writer_b.into_stream_session().unwrap(); + { + let encoder = XzEncoder::new(&mut session, DEFAULT_XZ_PRESET); + let mut xben = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None).unwrap(); + for s in &samples { + xben.write_assignment(s.clone()).unwrap(); + } + xben.finish().unwrap(); + } + let writer_b = session.finish_into_writer(samples.len() as i64); + let buf_b = writer_b.finish().unwrap().into_inner(); + + assert_eq!( + buf_a, buf_b, + "XBEN closure path and explicit-encoder path must produce identical bundle bytes" + ); +} + +/// Verification #9: `BendlStreamSession::write` must increment its +/// internal byte counter by the returned write count, not by the +/// requested buffer length, so partial writes are accounted correctly +/// and the finalized header's `stream_len` matches the actual byte +/// count of the stream region. +#[test] +fn stream_session_partial_writes_account_returned_bytes() { + use std::io::{self, Cursor as IoCursor, SeekFrom}; + + /// Inner writer that always reports `cap` bytes written per call, + /// regardless of the buffer length, but writes the matching prefix. + struct ShortWriter { + cursor: IoCursor>, + cap: usize, + } + + impl Write for ShortWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let n = buf.len().min(self.cap); + self.cursor.write_all(&buf[..n])?; + Ok(n) + } + + fn flush(&mut self) -> io::Result<()> { + self.cursor.flush() + } + } + + impl Seek for ShortWriter { + fn seek(&mut self, pos: SeekFrom) -> io::Result { + self.cursor.seek(pos) + } + } + + let inner = ShortWriter { + cursor: IoCursor::new(Vec::new()), + cap: 3, + }; + let writer = BendlWriter::new(inner, AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + + // Drive a few partial writes; total written should equal the sum + // of the returned `n` from each call. + let mut total_returned: u64 = 0; + for _ in 0..5 { + let n = session.write(b"hello world").unwrap(); + total_returned += n as u64; + } + assert_eq!(session.bytes_written(), total_returned); + + // Finalize and confirm `stream_len` in the patched header agrees. + let writer = session.finish_into_writer(0); + let final_inner = writer.finish().unwrap(); + let mut bundle_buf = final_inner.cursor.into_inner(); + let header = BendlHeader::read_from(&mut Cursor::new(&mut bundle_buf)).unwrap(); + assert_eq!(header.stream_len, total_returned); +} diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 50078f9..9713433 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -6,17 +6,19 @@ //! [header] [asset payloads] [assignment stream] [directory] //! ``` //! -//! The writer operates in three logical phases: +//! The writer operates in three logical phases, expressed via owned +//! typestate transitions: //! //! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero //! or more times. Each call writes the (optionally xz-compressed) //! payload to the file and records its absolute offset and length in //! an in-memory entry list. -//! 2. **stream phase** — the caller invokes [`BendlWriter::begin_stream`] -//! to enter the stream region. The returned handle wraps the raw -//! underlying writer so the caller can plumb it into -//! [`crate::io::writer::BenStreamWriter`]. When the stream is complete -//! the caller records the sample count via [`BendlWriter::end_stream`]. +//! 2. **stream phase** — the caller invokes +//! [`BendlWriter::into_stream_session`] to consume the writer and +//! obtain a [`BendlStreamSession`] that owns the underlying writer +//! and implements `Write`. When the stream is complete the caller +//! calls [`BendlStreamSession::finish_into_writer`] to recover the +//! [`BendlWriter`] in the `StreamWritten` state. //! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing //! directory and patches the header. //! @@ -121,27 +123,25 @@ enum WriterState { /// No assets have been written yet, but the provisional header is /// already in place and the writer is positioned just after it. Assets, - /// `begin_stream` has been called; the caller is responsible for - /// writing the embedded BEN/XBEN payload before calling `end_stream`. - Streaming, - /// `end_stream` has completed; the writer is ready for `finish`. + /// A stream session has been finished and the writer is ready for + /// [`BendlWriter::finish`]. The streaming phase itself is expressed + /// in the type system via [`BendlStreamSession`] and is therefore + /// not observable in this enum. StreamWritten { stream_len: u64, sample_count: i64 }, - /// `finish` has been called. No further operations are permitted. - Finished, } impl BendlWriter { /// Create a new writer by writing a provisional header at offset 0. /// /// The assignment stream will begin immediately after the asset - /// payload region — [`BendlWriter::begin_stream`] computes the - /// exact offset at the moment it is called, so asset writes that - /// happen between `new` and `begin_stream` push the stream out as - /// expected. + /// payload region — [`BendlWriter::into_stream_session`] computes + /// the exact offset at the moment it is called, so asset writes + /// that happen between `new` and `into_stream_session` push the + /// stream out as expected. pub fn new(mut inner: W, assignment_format: AssignmentFormat) -> io::Result { inner.seek(SeekFrom::Start(0))?; // stream_offset in the provisional header is patched at - // begin_stream time; start it just after the header. + // into_stream_session time; start it just after the header. let header = BendlHeader::provisional(assignment_format, HEADER_SIZE as u64); header.write_to(&mut inner)?; @@ -283,48 +283,50 @@ impl BendlWriter { self.add_asset(ASSET_TYPE_CUSTOM, name, payload, options) } - /// Transition from the asset phase into the stream phase and return - /// a mutable reference to the inner writer so the caller can - /// directly write the embedded BEN/XBEN payload. + /// Consume the writer and transition into the stream phase. /// - /// Once this method has been called, no further assets may be added. - /// The caller is responsible for calling [`BendlWriter::end_stream`] - /// when the payload is complete. - pub fn begin_stream(&mut self) -> Result, BendlWriteError> { - if self.state != WriterState::Assets { - return Err(BendlWriteError::WrongState { - expected: "Assets", - found: if matches!(self.state, WriterState::Streaming) { - "Streaming" - } else { - "StreamWritten" - }, - }); + /// The returned [`BendlStreamSession`] owns the underlying writer + /// and implements `Write`, so it can be plumbed into a + /// [`crate::io::writer::BenStreamWriter`] (or written to directly). + /// When the stream is complete the caller calls + /// [`BendlStreamSession::finish_into_writer`] to recover ownership + /// of a [`BendlWriter`] in the `StreamWritten` state, ready for + /// [`BendlWriter::finish`]. + /// + /// Returns [`BendlWriteError::WrongState`] when called on a writer + /// that has already produced a stream (e.g. via a prior + /// `finish_into_writer`); this guard prevents a second + /// `into_stream_session` from silently overwriting + /// `header.stream_offset` and corrupting the bundle. + pub fn into_stream_session( + mut self, + ) -> Result, BendlWriteError> { + match self.state { + WriterState::Assets => {} + WriterState::StreamWritten { .. } => { + return Err(BendlWriteError::WrongState { + expected: "Assets", + found: "StreamWritten", + }); + } } let stream_offset = self.inner.seek(SeekFrom::Current(0))?; self.header.stream_offset = stream_offset; - self.state = WriterState::Streaming; - Ok(BendlStreamHandle { - parent: self, + Ok(BendlStreamSession { + inner: Some(self.inner), + parent: Some(ParentState { + header: self.header, + entries: self.entries, + names: self.names, + singleton_types: self.singleton_types, + }), start_offset: stream_offset, + bytes_written: 0, }) } - /// Directly write the whole stream region from an in-memory byte - /// slice. This is a convenience for tests and for tools that already - /// have the encoded stream bytes on hand. - pub fn write_stream_bytes( - &mut self, - bytes: &[u8], - sample_count: i64, - ) -> Result<(), BendlWriteError> { - let mut handle = self.begin_stream()?; - handle.write_all(bytes).map_err(BendlWriteError::Io)?; - handle.finish(sample_count) - } - /// Open a BEN assignment stream backed by a /// [`crate::io::writer::BenStreamWriter`] and invoke `f` with a /// context that can encode assignments into it. @@ -333,19 +335,21 @@ impl BendlWriter { /// calls the closure makes and records that count as the bundle's /// authoritative `sample_count` when the stream is finalized. The /// closure is free to short-circuit by returning an error, in which - /// case the stream phase is abandoned and the error is propagated. + /// case the stream phase is abandoned, the error is propagated, and + /// the partially-written bundle is unrecoverable through this API + /// (no continuation path on the closure-error branch). pub fn write_ben_stream( - &mut self, + self, variant: crate::BenVariant, f: F, - ) -> Result<(), BendlWriteError> + ) -> Result where F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, { - let mut handle = self.begin_stream()?; + let mut session = self.into_stream_session()?; let mut sample_count: i64 = 0; { - let writer_ref: &mut dyn Write = &mut handle; + let writer_ref: &mut dyn Write = &mut session; let mut ben = crate::io::writer::BenStreamWriter::for_ben(writer_ref, variant)?; { @@ -357,7 +361,7 @@ impl BendlWriter { } ben.finish()?; } - handle.finish(sample_count) + Ok(session.finish_into_writer(sample_count)) } /// Open an XBEN assignment stream backed by a @@ -373,17 +377,17 @@ impl BendlWriter { /// MT-stream defaults — bundle assignment streams are intentionally /// single-threaded with a milder preset. pub fn write_xben_stream( - &mut self, + self, variant: crate::BenVariant, f: F, - ) -> Result<(), BendlWriteError> + ) -> Result where F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, { - let mut handle = self.begin_stream()?; + let mut session = self.into_stream_session()?; let mut sample_count: i64 = 0; { - let writer_ref: &mut dyn Write = &mut handle; + let writer_ref: &mut dyn Write = &mut session; let encoder = xz2::write::XzEncoder::new(writer_ref, DEFAULT_XZ_PRESET); let mut xben = crate::io::writer::BenStreamWriter::for_xben_with_encoder( encoder, variant, None, @@ -397,27 +401,25 @@ impl BendlWriter { } xben.finish()?; } - handle.finish(sample_count) + Ok(session.finish_into_writer(sample_count)) } /// Write the trailing directory, patch the header, and return the /// underlying writer. pub fn finish(mut self) -> Result { - if matches!(self.state, WriterState::Streaming) { - return Err(BendlWriteError::WrongState { - expected: "StreamWritten", - found: "Streaming", - }); - } - let (stream_len, sample_count) = - if let WriterState::StreamWritten { stream_len, sample_count } = self.state { - (stream_len, sample_count) - } else { - // Assets state: no stream written; treat as empty stream. + let (stream_len, sample_count) = match self.state { + WriterState::StreamWritten { + stream_len, + sample_count, + } => (stream_len, sample_count), + WriterState::Assets => { + // No stream written; treat as empty stream located just + // after the asset region. let stream_offset = self.inner.seek(SeekFrom::Current(0))?; self.header.stream_offset = stream_offset; (0, 0) - }; + } + }; // Position at end of stream (== start of directory). let directory_offset = self.header.stream_offset + stream_len; @@ -442,44 +444,101 @@ impl BendlWriter { // Flush explicitly; some writers (files) are not flushed on drop. self.inner.flush()?; - self.state = WriterState::Finished; Ok(self.inner) } } -/// Mutable handle to the stream region held by a [`BendlWriter`]. +/// Internal state of a [`BendlWriter`] that has been temporarily moved +/// into a [`BendlStreamSession`]. Stored as a single struct so +/// `finish_into_writer` can rebuild the writer with one move. +struct ParentState { + header: BendlHeader, + entries: Vec, + names: HashSet, + singleton_types: HashSet, +} + +/// Owned stream-phase session. Holds the underlying writer and the +/// parent [`BendlWriter`]'s in-memory state across the streaming phase, +/// implements `Write` so it can be plumbed into a +/// [`crate::io::writer::BenStreamWriter`], and exposes +/// [`Self::finish_into_writer`] to hand ownership back as a +/// [`BendlWriter`] in the `StreamWritten` state. /// -/// The handle implements `Write` so it can be wrapped in -/// `BenStreamWriter::for_ben(handle, variant)` or -/// `BenStreamWriter::for_xben_with_encoder(encoder, variant, ...)` directly. -pub struct BendlStreamHandle<'a, W: Write + Seek> { - parent: &'a mut BendlWriter, +/// `inner` and `parent` are wrapped in `Option` so `finish_into_writer` +/// can `take()` them without partial-moving out of a `Drop` type. The +/// [`Drop`] impl emits a `tracing::warn!` if the session is dropped +/// without `finish_into_writer`, since that leaves the bundle on disk +/// unfinalized. +pub struct BendlStreamSession { + inner: Option, + parent: Option, start_offset: u64, + bytes_written: u64, } -impl<'a, W: Write + Seek> BendlStreamHandle<'a, W> { - /// Record the sample count and transition the writer out of the - /// stream phase. Call this after the embedded BEN/XBEN payload has - /// been written. - pub fn finish(self, sample_count: i64) -> Result<(), BendlWriteError> { - let end = self.parent.inner.seek(SeekFrom::Current(0))?; - let stream_len = end.saturating_sub(self.start_offset); - self.parent.state = WriterState::StreamWritten { - stream_len, - sample_count, - }; - Ok(()) +impl BendlStreamSession { + /// Number of bytes written into the stream region so far. Pure + /// counter — no I/O, no `&mut` required. + pub fn bytes_written(&self) -> u64 { + self.bytes_written + } + + /// Offset (in the underlying writer) at which the stream region + /// began, recorded at session-construction time. + pub fn start_offset(&self) -> u64 { + self.start_offset + } + + /// End the stream phase and return ownership of a [`BendlWriter`] + /// in the `StreamWritten` state, ready for [`BendlWriter::finish`]. + /// + /// Infallible: the body is `take()` + arithmetic + struct + /// construction with no I/O. Once this method returns, the + /// session's [`Drop`] impl observes `inner.is_none()` and skips + /// the warn. + pub fn finish_into_writer(mut self, sample_count: i64) -> BendlWriter { + let inner = self.inner.take().expect("session has not been finished"); + let parent = self.parent.take().expect("session has not been finished"); + BendlWriter { + inner, + header: parent.header, + entries: parent.entries, + names: parent.names, + singleton_types: parent.singleton_types, + state: WriterState::StreamWritten { + stream_len: self.bytes_written, + sample_count, + }, + } } } -impl<'a, W: Write + Seek> Write for BendlStreamHandle<'a, W> { +impl Write for BendlStreamSession { fn write(&mut self, buf: &[u8]) -> io::Result { - self.parent.inner.write(buf) + let inner = self.inner.as_mut().expect("session has not been finished"); + let n = inner.write(buf)?; + self.bytes_written += n as u64; + Ok(n) } fn flush(&mut self) -> io::Result<()> { - self.parent.inner.flush() + self.inner + .as_mut() + .expect("session has not been finished") + .flush() + } +} + +impl Drop for BendlStreamSession { + fn drop(&mut self) { + if self.inner.is_some() { + tracing::warn!( + "BendlStreamSession dropped without finish_into_writer; \ + bundle on disk is unfinalized" + ); + } } } diff --git a/ben/src/io/writer/stream_writer/mod.rs b/ben/src/io/writer/stream_writer/mod.rs index 8be87f0..7c7b21b 100644 --- a/ben/src/io/writer/stream_writer/mod.rs +++ b/ben/src/io/writer/stream_writer/mod.rs @@ -40,7 +40,11 @@ use xben::XBenInner; /// boundary per call. Calling `write_frame` on an XBEN writer returns /// `InvalidInput`. pub struct BenStreamWriter { - inner: BenStreamInner, + /// Wrapped in `Option` so [`Self::finish_into_inner`] can `take()` it + /// without partial-moving out of a `Drop` type. All other access + /// sites unwrap with `.expect("inner present")` — only the consuming + /// `finish_into_inner` ever leaves it `None`. + inner: Option>, state: WriterState, /// Tracks whether any sample-writing or direct-ingest operation has /// touched the writer. `ingest_ben_stream` requires this to be `false`. @@ -69,7 +73,7 @@ impl BenStreamWriter { pub fn for_ben(mut writer: W, variant: BenVariant) -> io::Result { writer.write_all(banner_for_variant(variant))?; Ok(Self { - inner: BenStreamInner::Ben(BenState::new(writer, variant)), + inner: Some(BenStreamInner::Ben(BenState::new(writer, variant))), state: WriterState::Open, body_started: false, }) @@ -103,7 +107,9 @@ impl BenStreamWriter { .unwrap_or(super::twodelta::DEFAULT_TWODELTA_CHUNK_SIZE) .max(1); Ok(Self { - inner: BenStreamInner::XBen(Box::new(XBenInner::new(encoder, variant, chunk_size))), + inner: Some(BenStreamInner::XBen(Box::new(XBenInner::new( + encoder, variant, chunk_size, + )))), state: WriterState::Open, body_started: false, }) @@ -111,7 +117,7 @@ impl BenStreamWriter { /// The BEN variant of this stream. pub fn variant(&self) -> BenVariant { - match &self.inner { + match self.inner.as_ref().expect("inner present") { BenStreamInner::Ben(b) => b.variant, BenStreamInner::XBen(x) => x.variant(), } @@ -119,7 +125,7 @@ impl BenStreamWriter { /// The wire format (BEN vs XBEN) of this stream. pub fn wire_format(&self) -> BenWireFormat { - match &self.inner { + match self.inner.as_ref().expect("inner present") { BenStreamInner::Ben(_) => BenWireFormat::Ben, BenStreamInner::XBen(_) => BenWireFormat::XBen, } @@ -138,7 +144,7 @@ impl BenStreamWriter { } self.body_started = true; - let result = match &mut self.inner { + let result = match self.inner.as_mut().expect("inner present") { BenStreamInner::Ben(b) => b.write_assignment(assign_vec), BenStreamInner::XBen(x) => x.write_assignment(assign_vec), }; @@ -161,7 +167,7 @@ impl BenStreamWriter { } WriterState::Open => {} } - let ben = match &mut self.inner { + let ben = match self.inner.as_mut().expect("inner present") { BenStreamInner::Ben(b) => b, BenStreamInner::XBen(_) => { return Err(invalid_input("write_frame is plain-BEN-only")); @@ -191,7 +197,7 @@ impl BenStreamWriter { let new_assign = parse_json_assignment(data)?; // From here on, we are in the stateful encode path. self.body_started = true; - let result = match &mut self.inner { + let result = match self.inner.as_mut().expect("inner present") { BenStreamInner::Ben(b) => b.write_assignment(new_assign), BenStreamInner::XBen(x) => x.write_assignment(new_assign), }; @@ -213,7 +219,7 @@ impl BenStreamWriter { } WriterState::Open => {} } - let xben = match &mut self.inner { + let xben = match self.inner.as_mut().expect("inner present") { BenStreamInner::Ben(_) => { return Err(invalid_input("ingest_ben_stream requires XBEN mode")); } @@ -253,7 +259,7 @@ impl BenStreamWriter { WriterState::Open | WriterState::BodyClosed => {} } - let result: io::Result<()> = match &mut self.inner { + let result: io::Result<()> = match self.inner.as_mut().expect("inner present") { BenStreamInner::Ben(b) => { if self.state == WriterState::Open { b.flush_pending_frame() @@ -285,11 +291,48 @@ impl BenStreamWriter { } } } + + /// Consume the writer, flush any buffered state, finalize the + /// underlying compressed stream when present (XBEN), and return the + /// underlying `W`. + /// + /// Unlike `std::io::BufWriter::into_inner`, this method's name is + /// intentionally `finish_into_inner` because errors from the BEN + /// flush or the consuming `XzEncoder::finish()` can still lose + /// access to the inner writer. Returns `InvalidInput` if the writer + /// is in `Failed`. Accepted from `Open`, `BodyClosed`, and + /// `Complete`; the `Complete` path simply extracts the inner writer + /// after prior finalization. + pub fn finish_into_inner(mut self) -> io::Result { + let state = self.state; + match state { + WriterState::Failed => return Err(invalid_input("writer was poisoned")), + WriterState::Open | WriterState::BodyClosed | WriterState::Complete => {} + } + let inner = self.inner.take().expect("inner present"); + match inner { + BenStreamInner::Ben(mut b) => { + if state == WriterState::Open { + b.flush_pending_frame()?; + } + Ok(b.writer) + } + BenStreamInner::XBen(boxed) => { + let mut x = *boxed; + if state == WriterState::Open { + x.flush()?; + } + x.encoder.finish() + } + } + } } impl Drop for BenStreamWriter { fn drop(&mut self) { - if matches!(self.state, WriterState::Open | WriterState::BodyClosed) { + if self.inner.is_some() + && matches!(self.state, WriterState::Open | WriterState::BodyClosed) + { let _ = self.finish(); } } diff --git a/ben/src/test_utils.rs b/ben/src/test_utils.rs index 0946860..be16f4e 100644 --- a/ben/src/test_utils.rs +++ b/ben/src/test_utils.rs @@ -65,8 +65,10 @@ pub fn sample_ben_bytes(jsonl: &[u8], variant: BenVariant) -> Vec { pub fn sample_bendl_bytes(stream: &[u8], format: AssignmentFormat) -> Vec { let mut buf: Vec = Vec::new(); { - let mut writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); - writer.write_stream_bytes(stream, 1).unwrap(); + let writer = BendlWriter::new(Cursor::new(&mut buf), format).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(stream).unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap(); } buf diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index c8fd96b..559a3d9 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -180,9 +180,9 @@ fn tiny_bendl_bundle() -> Vec { AddAssetOptions::defaults().raw(), ) .unwrap(); - writer - .write_stream_bytes(b"STANDARD BEN FILE\x00\x01\x02", 1) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00\x01\x02").unwrap(); + let writer = session.finish_into_writer(1); writer.finish().unwrap().into_inner() } From 899e27050478fe25d51b97b26f07ca17aa8938af Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 9 May 2026 23:03:10 -0600 Subject: [PATCH 097/221] clean up bendl stream api --- ben/src/io/bundle/mod.rs | 4 +- ben/src/io/bundle/tests/writer.rs | 256 +++++++++--------------------- ben/src/io/bundle/writer.rs | 128 --------------- 3 files changed, 80 insertions(+), 308 deletions(-) diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index fc8d712..1794cd1 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -22,6 +22,4 @@ pub mod writer; mod tests; pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; -pub use writer::{ - AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, BundleAssignmentStreamCtx, -}; +pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index cd8628a..cbb1f6d 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -1,15 +1,18 @@ -use std::io::{self, Cursor, Read, Seek, Write}; +use std::io::{Cursor, Read, Seek, Write}; + +use xz2::write::XzEncoder; use crate::io::bundle::format::{ AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, - BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, + BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, }; use crate::io::bundle::reader::BendlReader; -use crate::io::reader::BenWireFormat; use crate::io::bundle::writer::{ AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter, }; +use crate::io::reader::BenWireFormat; +use crate::io::writer::BenStreamWriter; fn make_buffer() -> Cursor> { Cursor::new(Vec::new()) @@ -497,7 +500,7 @@ fn append_rejects_conflicting_pending_additions() { // -------- Phase 4: assignment-stream integration tests -------- #[test] -fn write_ben_stream_round_trips_through_assignment_reader() { +fn bundle_ben_stream_round_trips_through_assignment_reader() { use crate::BenVariant; let samples: Vec> = vec![ @@ -508,14 +511,15 @@ fn write_ben_stream_round_trips_through_assignment_reader() { ]; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let writer = writer - .write_ben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::MkvChain).unwrap(); + for s in &samples { + ben.write_assignment(s.clone()).unwrap(); + } + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(samples.len() as i64); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -537,7 +541,7 @@ fn write_ben_stream_round_trips_through_assignment_reader() { } #[test] -fn write_xben_stream_round_trips_through_assignment_reader() { +fn bundle_xben_stream_round_trips_through_assignment_reader() { use crate::BenVariant; let samples: Vec> = vec![ @@ -548,14 +552,17 @@ fn write_xben_stream_round_trips_through_assignment_reader() { ]; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - let writer = writer - .write_xben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let encoder = XzEncoder::new(&mut session, DEFAULT_XZ_PRESET); + let mut xben = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None).unwrap(); + for s in &samples { + xben.write_assignment(s.clone()).unwrap(); + } + xben.finish().unwrap(); + } + let writer = session.finish_into_writer(samples.len() as i64); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -576,7 +583,7 @@ fn write_xben_stream_round_trips_through_assignment_reader() { } #[test] -fn write_ben_stream_alongside_front_loaded_asset() { +fn bundle_ben_stream_alongside_front_loaded_asset() { use crate::BenVariant; let graph = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#; @@ -586,14 +593,15 @@ fn write_ben_stream_alongside_front_loaded_asset() { writer .add_json_asset(ASSET_TYPE_GRAPH, "graph.json", graph) .unwrap(); - let writer = writer - .write_ben_stream(BenVariant::Standard, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::Standard).unwrap(); + for s in &samples { + ben.write_assignment(s.clone()).unwrap(); + } + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(samples.len() as i64); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -622,16 +630,17 @@ fn write_ben_stream_alongside_front_loaded_asset() { } #[test] -fn open_assignment_reader_rejects_mismatched_format() { +fn open_assignment_reader_reports_ben_wire_format() { use crate::BenVariant; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let writer = writer - .write_ben_stream(BenVariant::Standard, |ctx| { - ctx.write_assignment(vec![0, 1])?; - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::Standard).unwrap(); + ben.write_assignment(vec![0, 1]).unwrap(); + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(1); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -927,23 +936,6 @@ fn append_rejects_duplicate_name_across_existing_and_pending() { assert!(reader.find_asset_by_name("blob").is_some()); } -#[test] -fn write_ben_stream_closure_error_short_circuits_finalize() { - use crate::BenVariant; - - let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - // BendlWriter doesn't implement Debug, so destructure via match - // rather than `.unwrap_err()`. - let result = writer.write_ben_stream(BenVariant::Standard, |_ctx| { - Err(io::Error::new(io::ErrorKind::Other, "boom")) - }); - match result { - Ok(_) => panic!("expected closure error to short-circuit"), - Err(BendlWriteError::Io(e)) => assert_eq!(e.kind(), io::ErrorKind::Other), - Err(other) => panic!("expected Io(Other), got {other:?}"), - } -} - // ----------------------------------------------------------------------- // Randomized / stress tests // ----------------------------------------------------------------------- @@ -1147,21 +1139,21 @@ fn randomized_append_sequence_preserves_all_prior_entries() { // ── write_json_value and sample_count coverage ────────────────── #[test] -fn write_ben_stream_json_value_and_sample_count() { +fn bundle_ben_stream_json_value_and_caller_sample_count() { use crate::BenVariant; use serde_json::json; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let writer = writer - .write_ben_stream(BenVariant::Standard, |ctx| { - assert_eq!(ctx.sample_count(), 0); - ctx.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1}))?; - assert_eq!(ctx.sample_count(), 1); - ctx.write_json_value(json!({"assignment": [4, 5, 6], "sample": 2}))?; - assert_eq!(ctx.sample_count(), 2); - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::Standard).unwrap(); + ben.write_json_value(json!({"assignment": [1, 2, 3], "sample": 1})) + .unwrap(); + ben.write_json_value(json!({"assignment": [4, 5, 6], "sample": 2})) + .unwrap(); + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(2); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -1173,18 +1165,23 @@ fn write_ben_stream_json_value_and_sample_count() { } #[test] -fn write_xben_stream_json_value() { +fn bundle_xben_stream_json_value() { use crate::BenVariant; use serde_json::json; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - let writer = writer - .write_xben_stream(BenVariant::Standard, |ctx| { - ctx.write_json_value(json!({"assignment": [10, 20], "sample": 1}))?; - ctx.write_json_value(json!({"assignment": [30, 40], "sample": 2}))?; - Ok(()) - }) - .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let encoder = XzEncoder::new(&mut session, DEFAULT_XZ_PRESET); + let mut xben = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::Standard, None).unwrap(); + xben.write_json_value(json!({"assignment": [10, 20], "sample": 1})) + .unwrap(); + xben.write_json_value(json!({"assignment": [30, 40], "sample": 2})) + .unwrap(); + xben.finish().unwrap(); + } + let writer = session.finish_into_writer(2); let buf = writer.finish().unwrap().into_inner(); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -1253,70 +1250,25 @@ fn appender_rejects_bundle_with_trailing_directory_bytes() { // ── finalize from wrong state ─────────────────────────────────── #[test] -fn finish_from_finished_state_errors() { +fn finish_after_assignment_stream_produces_finalized_bundle() { use crate::BenVariant; let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let writer = writer - .write_ben_stream(BenVariant::Standard, |ctx| { - ctx.write_assignment(vec![1, 2])?; - Ok(()) - }) - .unwrap(); - // First finish succeeds + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::Standard).unwrap(); + ben.write_assignment(vec![1, 2]).unwrap(); + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(1); let buf = writer.finish().unwrap(); - // Verify the result is usable let reader = BendlReader::open(Cursor::new(buf.into_inner())).unwrap(); assert!(reader.is_finalized()); + assert_eq!(reader.sample_count(), Some(1)); } // ── Plan verification tests ────────────────────────────────────── -/// Verification #4 from the plan: bundle byte-equivalence between the -/// closure-based `write_ben_stream` and the explicit -/// `into_stream_session` → `finish_into_writer` chain. -#[test] -fn bundle_byte_equivalent_via_closure_and_explicit_session_for_ben() { - use crate::io::writer::BenStreamWriter; - use crate::BenVariant; - - let samples: Vec> = vec![ - vec![0, 0, 1, 1, 2, 2], - vec![0, 1, 1, 1, 2, 2], - vec![0, 1, 1, 1, 2, 2], - vec![1, 1, 1, 1, 2, 2], - ]; - - // Path A: closure-based. - let writer_a = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let writer_a = writer_a - .write_ben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); - let buf_a = writer_a.finish().unwrap().into_inner(); - - // Path B: explicit session. - let writer_b = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let mut session = writer_b.into_stream_session().unwrap(); - let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::MkvChain).unwrap(); - for s in &samples { - ben.write_assignment(s.clone()).unwrap(); - } - ben.finish().unwrap(); - drop(ben); - let writer_b = session.finish_into_writer(samples.len() as i64); - let buf_b = writer_b.finish().unwrap().into_inner(); - - assert_eq!( - buf_a, buf_b, - "closure path and explicit session path must produce identical bundle bytes" - ); -} - /// Verification #7: dropping a `BendlStreamSession` mid-flight must /// leave the bundle on disk unfinalized (no directory written, header /// `finalized != FINALIZED_YES`). @@ -1339,56 +1291,6 @@ fn bundle_streaming_session_drop_leaves_unfinalized() { ); } -/// Verification #8: bundle XBEN compression gate. Two paths should -/// produce identical bundle bytes — the closure helper -/// `write_xben_stream`, and an explicit session that wraps the bundle -/// preset xz encoder around `for_xben_with_encoder`. -#[test] -fn bundle_xben_byte_equivalent_closure_and_explicit_encoder() { - use crate::io::bundle::format::DEFAULT_XZ_PRESET; - use crate::io::writer::BenStreamWriter; - use crate::BenVariant; - use xz2::write::XzEncoder; - - let samples: Vec> = vec![ - vec![0, 1, 2, 3, 4, 5], - vec![1, 1, 2, 3, 4, 5], - vec![1, 1, 2, 3, 4, 4], - ]; - - // Path A: closure. - let writer_a = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - let writer_a = writer_a - .write_xben_stream(BenVariant::MkvChain, |ctx| { - for s in &samples { - ctx.write_assignment(s.clone())?; - } - Ok(()) - }) - .unwrap(); - let buf_a = writer_a.finish().unwrap().into_inner(); - - // Path B: explicit session + XzEncoder built with the bundle preset. - let writer_b = BendlWriter::new(make_buffer(), AssignmentFormat::Xben).unwrap(); - let mut session = writer_b.into_stream_session().unwrap(); - { - let encoder = XzEncoder::new(&mut session, DEFAULT_XZ_PRESET); - let mut xben = - BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None).unwrap(); - for s in &samples { - xben.write_assignment(s.clone()).unwrap(); - } - xben.finish().unwrap(); - } - let writer_b = session.finish_into_writer(samples.len() as i64); - let buf_b = writer_b.finish().unwrap().into_inner(); - - assert_eq!( - buf_a, buf_b, - "XBEN closure path and explicit-encoder path must produce identical bundle bytes" - ); -} - /// Verification #9: `BendlStreamSession::write` must increment its /// internal byte counter by the returned write count, not by the /// requested buffer length, so partial writes are accounted correctly diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 9713433..6a817a2 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -327,83 +327,6 @@ impl BendlWriter { }) } - /// Open a BEN assignment stream backed by a - /// [`crate::io::writer::BenStreamWriter`] and invoke `f` with a - /// context that can encode assignments into it. - /// - /// The context tracks how many `write_assignment` / `write_json_value` - /// calls the closure makes and records that count as the bundle's - /// authoritative `sample_count` when the stream is finalized. The - /// closure is free to short-circuit by returning an error, in which - /// case the stream phase is abandoned, the error is propagated, and - /// the partially-written bundle is unrecoverable through this API - /// (no continuation path on the closure-error branch). - pub fn write_ben_stream( - self, - variant: crate::BenVariant, - f: F, - ) -> Result - where - F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, - { - let mut session = self.into_stream_session()?; - let mut sample_count: i64 = 0; - { - let writer_ref: &mut dyn Write = &mut session; - let mut ben = - crate::io::writer::BenStreamWriter::for_ben(writer_ref, variant)?; - { - let mut ctx = BundleAssignmentStreamCtx { - writer: &mut ben, - sample_count: &mut sample_count, - }; - f(&mut ctx)?; - } - ben.finish()?; - } - Ok(session.finish_into_writer(sample_count)) - } - - /// Open an XBEN assignment stream backed by a - /// [`crate::io::writer::BenStreamWriter`] and invoke `f` with a - /// context that can encode assignments into it. - /// - /// The closure sees the same counting [`BundleAssignmentStreamCtx`] - /// type used by [`BendlWriter::write_ben_stream`], so callers can be - /// written to be generic over the assignment container. - /// - /// The XBEN encoder uses bundle compression preset - /// [`crate::io::bundle::format::DEFAULT_XZ_PRESET`], not the codec's - /// MT-stream defaults — bundle assignment streams are intentionally - /// single-threaded with a milder preset. - pub fn write_xben_stream( - self, - variant: crate::BenVariant, - f: F, - ) -> Result - where - F: FnOnce(&mut BundleAssignmentStreamCtx<'_>) -> io::Result<()>, - { - let mut session = self.into_stream_session()?; - let mut sample_count: i64 = 0; - { - let writer_ref: &mut dyn Write = &mut session; - let encoder = xz2::write::XzEncoder::new(writer_ref, DEFAULT_XZ_PRESET); - let mut xben = crate::io::writer::BenStreamWriter::for_xben_with_encoder( - encoder, variant, None, - )?; - { - let mut ctx = BundleAssignmentStreamCtx { - writer: &mut xben, - sample_count: &mut sample_count, - }; - f(&mut ctx)?; - } - xben.finish()?; - } - Ok(session.finish_into_writer(sample_count)) - } - /// Write the trailing directory, patch the header, and return the /// underlying writer. pub fn finish(mut self) -> Result { @@ -542,57 +465,6 @@ impl Drop for BendlStreamSession { } } -/// Bundle-private adapter that hides the concrete `BenStreamWriter` -/// behind two methods, so [`BundleAssignmentStreamCtx`] can stay non-generic -/// without forcing the public API to expose the writer's `W` parameter or -/// to grow a second lifetime. -trait BundleAssignmentSink { - fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()>; - fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()>; -} - -impl BundleAssignmentSink for crate::io::writer::BenStreamWriter { - fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { - crate::io::writer::BenStreamWriter::write_assignment(self, assign_vec) - } - - fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { - crate::io::writer::BenStreamWriter::write_json_value(self, data) - } -} - -/// Closure-side handle passed to [`BendlWriter::write_ben_stream`] and -/// [`BendlWriter::write_xben_stream`]. -/// -/// Exposes the usual assignment-writing methods while also counting -/// samples so the bundle's header can be patched with an authoritative -/// `sample_count` at stream finalization. -pub struct BundleAssignmentStreamCtx<'a> { - writer: &'a mut dyn BundleAssignmentSink, - sample_count: &'a mut i64, -} - -impl<'a> BundleAssignmentStreamCtx<'a> { - /// Encode one assignment vector and bump the sample counter. - pub fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { - self.writer.write_assignment(assign_vec)?; - *self.sample_count += 1; - Ok(()) - } - - /// Encode one JSON assignment record and bump the sample counter. - pub fn write_json_value(&mut self, data: serde_json::Value) -> io::Result<()> { - self.writer.write_json_value(data)?; - *self.sample_count += 1; - Ok(()) - } - - /// Number of samples the closure has written so far. - pub fn sample_count(&self) -> i64 { - *self.sample_count - } -} - /// Errors produced by the bundle writer. #[derive(Debug, Error)] pub enum BendlWriteError { From 7a7ef6c65ecaafdec1380d5517ee829bcf843b36 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 18 May 2026 16:28:55 -0600 Subject: [PATCH 098/221] add in checksum logic --- Cargo.lock | 25 ++ ben/Cargo.toml | 6 + ben/src/cli/bendl/extract.rs | 3 +- ben/src/cli/bendl/tests.rs | 10 +- ben/src/io/bundle/error.rs | 154 ++++++++ ben/src/io/bundle/format.rs | 59 ++- ben/src/io/bundle/mod.rs | 2 + ben/src/io/bundle/reader.rs | 353 ++++++++++++++++- ben/src/io/bundle/tests/format.rs | 65 +++- ben/src/io/bundle/tests/reader.rs | 607 +++++++++++++++++++++++++++--- ben/src/io/bundle/tests/writer.rs | 33 +- ben/src/io/bundle/writer.rs | 125 +++--- 12 files changed, 1293 insertions(+), 149 deletions(-) create mode 100644 ben/src/io/bundle/error.rs diff --git a/Cargo.lock b/Cargo.lock index 3fc2f64..4a5b184 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -117,6 +117,7 @@ version = "1.0.0" dependencies = [ "byteorder", "clap 4.5.48", + "crc32c", "indicatif", "lipsum", "pcompress", @@ -264,6 +265,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -977,6 +987,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -1035,6 +1054,12 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" diff --git a/ben/Cargo.toml b/ben/Cargo.toml index 228f492..6ba6ded 100755 --- a/ben/Cargo.toml +++ b/ben/Cargo.toml @@ -17,6 +17,12 @@ name = "binary_ensemble" [dependencies] byteorder = "1.5.0" clap = { version = "^4.5.2", features = ["derive"] } +# CRC32C (Castagnoli polynomial) integrity checks on bendl payloads and the +# assignment stream. The `crc32c` crate is single-purpose, hardware-accelerated +# on SSE 4.2 / ARMv8 crypto extensions, and cannot be accidentally configured +# to compute IEEE CRC-32 (unlike `crc32fast`, whose CRC32C path lives behind a +# feature flag). +crc32c = "0.6" indicatif = "0.17" pcompress = "1.0.7" petgraph = "0.8.3" diff --git a/ben/src/cli/bendl/extract.rs b/ben/src/cli/bendl/extract.rs index 1a0b73f..52eff65 100644 --- a/ben/src/cli/bendl/extract.rs +++ b/ben/src/cli/bendl/extract.rs @@ -39,7 +39,8 @@ pub(super) fn run_extract(args: ExtractArgs) -> Result<(), String> { let mut asset = reader .asset_reader(&entry) .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; - io::copy(&mut asset, &mut out).map_err(|e| format!("failed to copy asset bytes: {e}"))?; + io::copy(&mut asset, &mut out) + .map_err(|e| format!("failed to copy asset {name:?} bytes: {e}"))?; } out.flush().map_err(|e| format!("flush failed: {e}"))?; diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index f5d0573..b6e4fa4 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -95,8 +95,9 @@ fn run_inspect_xben_format_and_checksum_flag() { use crate::io::bundle::format::ASSET_TYPE_CUSTOM; use crate::io::bundle::AddAssetOptions; - // Build a .bendl with a checksum asset so the flag_parts checksum - // branch is exercised. + // Every library-written asset carries ASSET_FLAG_CHECKSUM, so any + // add_asset call exercises the checksum flag_parts branch in + // `run_inspect`. let mut buf: Vec = Vec::new(); let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Xben).unwrap(); writer @@ -104,10 +105,7 @@ fn run_inspect_xben_format_and_checksum_flag() { ASSET_TYPE_CUSTOM, "checksummed", b"data", - AddAssetOptions { - checksum: Some(vec![0xAB, 0xCD]), - ..AddAssetOptions::defaults() - }, + AddAssetOptions::defaults().raw(), ) .unwrap(); let mut session = writer.into_stream_session().unwrap(); diff --git a/ben/src/io/bundle/error.rs b/ben/src/io/bundle/error.rs new file mode 100644 index 0000000..e941708 --- /dev/null +++ b/ben/src/io/bundle/error.rs @@ -0,0 +1,154 @@ +//! Read-side error types for `.bendl` bundles. +//! +//! [`BendlReadError`] is the canonical error type for high-level BENDL +//! convenience APIs (anything that returns an owned value: `asset_bytes`, +//! reader constructors that consume internally, etc.). Returned `Read` +//! / iterator / stream-wrapper values keep their native `io::Result` +//! surface; checksum failures on those paths are carried as +//! `io::ErrorKind::InvalidData` with an inner [`ChecksumError`] that +//! callers can downcast. +//! +//! Variant discipline is held at the wrap site, not by the type system: +//! `Io(io::Error)` and `Decode(io::Error)` carry the same payload type, +//! so a future refactor could accidentally wrap a decoder-runtime error +//! as `Io(_)` and the type system would not notice. The error-discipline +//! tests pin which variant fires for each representative read path. + +use std::fmt; +use std::io; + +use thiserror::Error; + +use super::format::BendlFormatError; +use crate::io::reader::DecoderInitError; + +/// Identifies which checksummed region a [`ChecksumError`] refers to. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ChecksumTarget { + /// A directory-entry asset, identified by name. + Asset(String), + /// The assignment stream (only one per bundle). + Stream, +} + +impl fmt::Display for ChecksumTarget { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ChecksumTarget::Asset(name) => write!(f, "asset {name:?}"), + ChecksumTarget::Stream => write!(f, "assignment stream"), + } + } +} + +/// Checksum-specific failures from BENDL reader APIs. +/// +/// Variant precedence is scoped per checksum domain, not global: +/// +/// - **Asset checksum:** `Unavailable` > `Mismatch`. The directory entry +/// is authoritative regardless of bundle finalization, so +/// `verify_asset_checksum` never returns `BundleIncomplete`. +/// - **Stream checksum:** `BundleIncomplete` > `Unavailable` > `Mismatch`. +/// The stream's stored CRC depends on `stream_len` being authoritative, +/// which only holds after finalization, so an unfinalized bundle +/// short-circuits to `BundleIncomplete` before the flag is inspected. +#[derive(Debug, Error)] +pub enum ChecksumError { + /// The computed CRC32C did not match the stored value. + #[error( + "checksum mismatch for {target}: computed 0x{computed:08x}, expected 0x{expected:08x}" + )] + Mismatch { + /// Which region failed verification. + target: ChecksumTarget, + /// CRC32C computed by the reader over the on-disk bytes. + computed: u32, + /// CRC32C stored in the bundle. + expected: u32, + }, + + /// The relevant checksum-presence flag (`ASSET_FLAG_CHECKSUM` on a + /// directory entry, or the stream-level equivalent on the header) + /// was clear; there is no stored checksum to verify against. The + /// library writer always sets these flags, so this fires only for + /// foreign or hand-built bytes. + #[error("checksum is unavailable for {target}")] + Unavailable { + /// Which region lacks a stored checksum. + target: ChecksumTarget, + }, + + /// The bundle is not finalized, so the stored checksum is not + /// authoritative. Stream-only: asset-checksum APIs never produce + /// this variant because directory entries are authoritative + /// regardless of bundle finalization. + #[error("bundle is unfinalized; {target} checksum is not authoritative yet")] + BundleIncomplete { + /// Which region's checksum is not yet authoritative. + target: ChecksumTarget, + }, +} + +/// High-level error returned by BENDL convenience APIs that consume +/// internally before producing an owned value. +/// +/// See [`crate::io::bundle::reader::BendlReader`] for the variant rules +/// per API. The variant discipline is held at the wrap site (where each +/// underlying error becomes a `BendlReadError`); the type system alone +/// cannot prevent a future refactor from mis-wrapping a codec error as +/// `Io` or a header parse failure as `DecoderInit`. The "variant +/// discipline" tests pin which variant fires for each representative +/// read path. +#[derive(Debug, Error)] +pub enum BendlReadError { + /// Underlying I/O failure at the bundle layer (seek, range read, + /// filesystem). Never used to carry codec or checksum failures. + #[error("IO error: {0}")] + Io(io::Error), + + /// A format-layer error. Reserved for higher-level APIs that wrap + /// an `open` failure or for future lazy-validation paths; normal + /// post-open accessors should not produce this from + /// header/directory structure. + #[error("bundle format error: {0}")] + Format(BendlFormatError), + + /// Checksum verification failed, was unavailable, or could not be + /// authoritatively performed. + #[error("checksum error: {0}")] + Checksum(#[from] ChecksumError), + + /// A BEN/XBEN decoder rejected the embedded stream banner. + #[error("decoder init error: {0}")] + DecoderInit(DecoderInitError), + + /// A codec error raised while a BEN/XBEN/xz decoder was already + /// running (malformed compressed payload, malformed assignment + /// stream, etc.). + #[error("decode error: {0}")] + Decode(io::Error), +} + +impl From for BendlReadError { + fn from(e: io::Error) -> Self { + BendlReadError::Io(e) + } +} + +impl From for BendlReadError { + fn from(e: BendlFormatError) -> Self { + // BendlFormatError already carries an `Io` arm; unwrap it so + // that ordinary I/O failures at the format layer surface as + // `BendlReadError::Io` rather than getting buried inside a + // synthetic `Format` wrap. + match e { + BendlFormatError::Io(io) => BendlReadError::Io(io), + other => BendlReadError::Format(other), + } + } +} + +impl From for BendlReadError { + fn from(e: DecoderInitError) -> Self { + BendlReadError::DecoderInit(e) + } +} diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 23ccaec..bc190f4 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -148,8 +148,21 @@ pub const ASSET_FLAG_JSON: u16 = 1 << 0; /// directory field refers to the compressed size on disk. pub const ASSET_FLAG_XZ: u16 = 1 << 1; /// Asset flag bit: the entry carries a trailing checksum. +/// +/// When set, the trailing checksum is exactly four little-endian bytes +/// containing a CRC32C (Castagnoli polynomial) over the **on-disk +/// payload bytes** (`payload_offset..payload_offset + payload_len`). +/// For an xz-compressed asset the CRC is over the compressed bytes, +/// not the decompressed content — verification happens before +/// decompression. Library writer paths always set this flag with +/// `checksum_len == [`ASSET_CHECKSUM_LEN`]`; readers reject any entry +/// where the flag and `checksum_len` are inconsistent (see +/// [`BendlDirectoryEntry::read_from`]). pub const ASSET_FLAG_CHECKSUM: u16 = 1 << 2; +/// On-disk byte width of an asset-payload CRC32C. +pub const ASSET_CHECKSUM_LEN: u32 = 4; + /// Default xz preset level used when compressing asset payloads. /// /// Level 6 matches the `xz` CLI's own default and `xz2::XzEncoder::new`'s @@ -356,7 +369,21 @@ impl BendlDirectoryEntry { // header[6..8] reserved; ignored let payload_offset = u64::from_le_bytes(header[8..16].try_into().unwrap()); let payload_len = u64::from_le_bytes(header[16..24].try_into().unwrap()); - let checksum_len = u32::from_le_bytes(header[24..28].try_into().unwrap()) as usize; + let checksum_len_raw = u32::from_le_bytes(header[24..28].try_into().unwrap()); + + // Reject (flag, checksum_len) inconsistencies before allocating anything. + let flag_set = asset_flags & ASSET_FLAG_CHECKSUM != 0; + match (flag_set, checksum_len_raw) { + (true, ASSET_CHECKSUM_LEN) => {} + (false, 0) => {} + _ => { + return Err(BendlFormatError::InconsistentChecksumMetadata { + flag_set, + checksum_len: checksum_len_raw, + }); + } + } + let checksum_len = checksum_len_raw as usize; let mut name_buf = vec![0u8; name_len]; reader.read_exact(&mut name_buf)?; @@ -379,6 +406,23 @@ impl BendlDirectoryEntry { checksum, }) } + + /// Return the stored CRC32C as a `u32`, if and only if the entry carries a valid checksum + /// (flag set, 4 bytes). + /// + /// This is the canonical accessor for verification code. Returns `None` for entries with + /// `ASSET_FLAG_CHECKSUM` clear; entries where the flag and length are inconsistent are + /// rejected at read time and so cannot reach this method. + pub fn checksum_u32(&self) -> Option { + if self.asset_flags & ASSET_FLAG_CHECKSUM == 0 { + return None; + } + let bytes = self.checksum.as_deref()?; + if bytes.len() != ASSET_CHECKSUM_LEN as usize { + return None; + } + Some(u32::from_le_bytes(bytes.try_into().ok()?)) + } } // --------------------------------------------------------------------------- @@ -460,6 +504,18 @@ pub enum BendlFormatError { #[error("malformed directory: {0}")] MalformedDirectory(String), + /// A directory entry's `ASSET_FLAG_CHECKSUM` bit and `checksum_len` disagree. The wire format + /// requires `flag set iff checksum_len == 4` and `flag clear iff checksum_len == 0`. + #[error( + "inconsistent checksum metadata: ASSET_FLAG_CHECKSUM={flag_set}, checksum_len={checksum_len}" + )] + InconsistentChecksumMetadata { + /// Whether the entry had the `ASSET_FLAG_CHECKSUM` bit set. + flag_set: bool, + /// The trailing-checksum length the entry actually declared. + checksum_len: u32, + }, + /// An I/O error occurred while reading or writing the format layer. #[error("IO error: {0}")] Io(#[from] io::Error), @@ -473,4 +529,3 @@ impl From for io::Error { } } } - diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index 1794cd1..e037860 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -13,6 +13,7 @@ //! encode/decode helpers. Pure functions over byte buffers; no I/O. //! - [`manifest`] — serde structs for the optional `metadata.json` asset. +pub mod error; pub mod format; pub mod manifest; pub mod reader; @@ -21,5 +22,6 @@ pub mod writer; #[cfg(test)] mod tests; +pub use error::{BendlReadError, ChecksumError, ChecksumTarget}; pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 0fb34f1..ba43f1a 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -7,13 +7,27 @@ //! likewise exposed as a byte range the caller can plumb into a //! [`BenStreamReader`] without this module reinterpreting any BEN/XBEN //! internals. +//! +//! ## Verification surface +//! +//! - [`BendlReader::asset_bytes`] and [`BendlReader::asset_reader`] are +//! **verify-on-touch**: the CRC32C of the on-disk payload bytes is +//! computed as data flows through, and a mismatch is reported at EOF. +//! - [`BendlReader::asset_bytes_unverified`], [`BendlReader::asset_reader_unverified`], +//! and [`BendlReader::asset_payload_reader_unverified`] are the +//! explicit recovery/debug escape hatches; they never surface a +//! [`ChecksumError`]. +//! - [`BendlReader::verify_asset_checksum`] and +//! [`BendlReader::verify_all_asset_checksums`] are explicit raw-bytes +//! verifiers (no decoding) that do not return decoded payload bytes. use std::io::{self, Read, Seek, SeekFrom, Take}; use xz2::read::XzDecoder; +use super::error::{BendlReadError, ChecksumError, ChecksumTarget}; use super::format::{ - standardized_name_for, read_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, + read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_XZ, }; use crate::io::reader::{BenStreamReader, BenWireFormat}; @@ -163,27 +177,108 @@ impl BendlReader { } } - /// Read the fully-decoded bytes of an asset by directory entry. + /// Read the fully-decoded bytes of an asset by directory entry, verifying its CRC32C before + /// returning. + /// + /// **Contract:** this is exactly `asset_reader(entry)? then read_to_end` — one behavioral + /// path shared with the streaming API so the two cannot drift apart. Implications: /// - /// If the entry has [`ASSET_FLAG_XZ`] set, the payload is decompressed - /// through `xz2::read::XzDecoder`. Otherwise the bytes are returned - /// as-is. - pub fn asset_bytes(&mut self, entry: &BendlDirectoryEntry) -> io::Result> { + /// - Uncompressed asset, payload byte flipped → the CRC tee observes the mismatch at raw EOF + /// and returns + /// [`BendlReadError::Checksum`]. + /// - Uncompressed asset, stored CRC bytes flipped → same; the tee compares computed-vs-stored + /// at EOF. + /// - xz-compressed asset with broken xz framing → the xz decoder fails before the raw tee + /// reaches EOF; surface is [`BendlReadError::Decode`]. (CRC is over compressed bytes, but + /// the decoder's failure precedes any CRC check.) + /// - xz-compressed asset with intact xz but wrong stored CRC → codec reaches EOF, BENDL-owned + /// wrapper checks CRC, returns [`BendlReadError::Checksum`]. + /// - Entry has `ASSET_FLAG_CHECKSUM` clear (foreign/hand-built bytes; the library writer + /// never produces this) → [`ChecksumError::Unavailable`]. + pub fn asset_bytes(&mut self, entry: &BendlDirectoryEntry) -> Result, BendlReadError> { let mut out = Vec::new(); - self.asset_reader(entry)?.read_to_end(&mut out)?; - Ok(out) + let mut reader = self.asset_reader(entry)?; + match reader.read_to_end(&mut out) { + Ok(_) => Ok(out), + Err(e) => Err(classify_read_error(e, entry)), + } + } + + /// Same as [`Self::asset_bytes`] but skips CRC verification. + /// + /// Never returns [`BendlReadError::Checksum`]. Other variants (I/O, codec) still apply — + /// corrupted xz framing still surfaces as [`BendlReadError::Decode`]. + pub fn asset_bytes_unverified( + &mut self, + entry: &BendlDirectoryEntry, + ) -> Result, BendlReadError> { + let mut out = Vec::new(); + let mut reader = self.asset_reader_unverified(entry)?; + match reader.read_to_end(&mut out) { + Ok(_) => Ok(out), + Err(e) => Err(classify_read_error(e, entry)), + } } - /// Obtain a boxed reader for the decoded contents of an asset. + /// Obtain a boxed reader for the decoded contents of an asset, with CRC32C verification at EOF. /// - /// The returned reader is positioned at the first decoded byte and - /// automatically handles xz decompression when the asset is flagged - /// as compressed. The reader borrows `self`, so only one asset or - /// stream reader may be live at a time. + /// The returned reader is positioned at the first decoded byte and automatically handles xz + /// decompression when the asset is flagged as compressed. The reader borrows `self`, so only + /// one asset or stream reader may be live at a time. + /// + /// Checksum mismatch surfaces from `Read::read` as + /// `io::Error::new(io::ErrorKind::InvalidData, ChecksumError)` on the call that would + /// otherwise return `Ok(0)` at EOF. Early-drop or partial-read callers do **not** observe + /// verification — the reader must be driven to EOF for the CRC to be checked. pub fn asset_reader<'a>( &'a mut self, entry: &BendlDirectoryEntry, - ) -> io::Result> { + ) -> Result, BendlReadError> { + let expected = match entry.checksum_u32() { + Some(c) => c, + None => { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Asset(entry.name.clone()), + })); + } + }; + let target = ChecksumTarget::Asset(entry.name.clone()); + + self.inner.seek(SeekFrom::Start(entry.payload_offset))?; + let raw = (&mut self.inner).take(entry.payload_len); + + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + // Compressed: CRC tee sits *inside* the XzDecoder so the tee accumulates over raw + // compressed bytes; the BENDL-owned wrapper around the decoder finalizes the + // check after the codec reaches its own EOF. + let tee = CrcTeeReader::new(raw); + let decoder = XzDecoder::new(tee); + Ok(Box::new(DecodedVerifyingReader { + decoder, + expected, + target, + state: VerifyState::Reading, + })) + } else { + Ok(Box::new(RawVerifyingReader { + inner: raw, + hasher: 0, + expected, + target, + state: VerifyState::Reading, + })) + } + } + + /// Decoded reader without CRC verification — explicit escape hatch for recovery/debug or + /// `--no-verify` flows. + /// + /// If the asset is xz-flagged the returned bytes are still decompressed; "unverified" only + /// disables the CRC check. + pub fn asset_reader_unverified<'a>( + &'a mut self, + entry: &BendlDirectoryEntry, + ) -> Result, BendlReadError> { self.inner.seek(SeekFrom::Start(entry.payload_offset))?; let raw = (&mut self.inner).take(entry.payload_len); if entry.asset_flags & ASSET_FLAG_XZ != 0 { @@ -193,6 +288,91 @@ impl BendlReader { } } + /// Raw on-disk payload reader without CRC verification — kept + /// distinct from [`Self::asset_reader_unverified`] so that callers + /// doing low-level recovery never accidentally emit decompressed + /// bytes (or, conversely, never accidentally emit compressed bytes + /// expecting raw). + /// + /// For an xz-flagged asset this yields the compressed payload + /// bytes byte-for-byte; for an uncompressed asset it is the same + /// as [`Self::asset_reader_unverified`]. + pub fn asset_payload_reader_unverified<'a>( + &'a mut self, + entry: &BendlDirectoryEntry, + ) -> Result, BendlReadError> { + self.inner.seek(SeekFrom::Start(entry.payload_offset))?; + Ok(Box::new((&mut self.inner).take(entry.payload_len))) + } + + /// Verify the stored CRC32C of a single asset without returning + /// any decoded bytes. + /// + /// The CRC is over the raw on-disk payload bytes; no decoder is + /// invoked, so corrupted xz framing under an intact stored CRC + /// will still report `Ok(())` (or, conversely, an intact xz + /// payload with a corrupted stored CRC will deterministically + /// report [`ChecksumError::Mismatch`]). + pub fn verify_asset_checksum( + &mut self, + entry: &BendlDirectoryEntry, + ) -> Result<(), BendlReadError> { + let expected = match entry.checksum_u32() { + Some(c) => c, + None => { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Asset(entry.name.clone()), + })); + } + }; + + self.inner.seek(SeekFrom::Start(entry.payload_offset))?; + let mut remaining = entry.payload_len; + let mut buf = [0u8; 64 * 1024]; + let mut hasher: u32 = 0; + while remaining > 0 { + let want = remaining.min(buf.len() as u64) as usize; + let n = self.inner.read(&mut buf[..want])?; + if n == 0 { + // Short read against the declared payload length — + // surface as an I/O error so callers can distinguish a + // truncated bundle from a CRC mismatch. + return Err(BendlReadError::Io(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "asset {:?} payload ended {} byte(s) before declared length", + entry.name, remaining + ), + ))); + } + hasher = crc32c::crc32c_append(hasher, &buf[..n]); + remaining -= n as u64; + } + + if hasher != expected { + return Err(BendlReadError::Checksum(ChecksumError::Mismatch { + target: ChecksumTarget::Asset(entry.name.clone()), + computed: hasher, + expected, + })); + } + Ok(()) + } + + /// Verify every asset's CRC in directory order. Returns the + /// **first** mismatch encountered and stops; callers that want a + /// full audit should iterate the directory and call + /// [`Self::verify_asset_checksum`] per entry themselves. + pub fn verify_all_asset_checksums(&mut self) -> Result<(), BendlReadError> { + // Clone the entries so we don't borrow self.directory across + // the seek/read calls on self.inner. + let entries = self.directory.clone(); + for entry in &entries { + self.verify_asset_checksum(entry)?; + } + Ok(()) + } + /// Validate that the loaded directory is well-formed under the /// canonical-name and uniqueness rules. /// @@ -204,7 +384,151 @@ impl BendlReader { pub fn validate_directory(&self) -> Result<(), BundleValidationError> { validate_directory_entries(&self.directory) } +} + +// --------------------------------------------------------------------------- +// Verifying reader plumbing +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum VerifyState { + /// Still feeding bytes from the underlying reader. + Reading, + /// Underlying reader returned EOF and the CRC matched. Subsequent reads return `Ok(0)` + /// (normal EOF). + EofChecked, + /// CRC mismatch was reported to the caller. Subsequent reads return `Ok(0)` so the reader + /// stays well-behaved if the caller re-polls after the error. + Failed, +} + +/// Uncompressed-asset verifying reader: forwards bytes from the bounded payload, accumulates +/// CRC32C as they fly past, and on raw EOF either confirms the checksum or returns +/// [`ChecksumError::Mismatch`] in place of the usual `Ok(0)`. +struct RawVerifyingReader<'a, R: Read + Seek> { + inner: Take<&'a mut R>, + hasher: u32, + expected: u32, + target: ChecksumTarget, + state: VerifyState, +} + +impl Read for RawVerifyingReader<'_, R> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.state { + VerifyState::EofChecked | VerifyState::Failed => return Ok(0), + VerifyState::Reading => {} + } + let n = self.inner.read(buf)?; + if n == 0 { + if self.hasher == self.expected { + self.state = VerifyState::EofChecked; + return Ok(0); + } + let err = ChecksumError::Mismatch { + target: self.target.clone(), + computed: self.hasher, + expected: self.expected, + }; + self.state = VerifyState::Failed; + return Err(io::Error::new(io::ErrorKind::InvalidData, err)); + } + self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); + Ok(n) + } +} +/// CRC accumulator that sits *inside* an [`XzDecoder`] for compressed assets. It must never +/// substitute a checksum error for raw EOF — the codec needs to see the natural `Ok(0)` so it +/// can flush pending output. The post-decoder wrapper ([`DecodedVerifyingReader`]) +/// inspects this struct's accumulated hash after codec EOF. +struct CrcTeeReader { + inner: R, + hasher: u32, +} + +impl CrcTeeReader { + fn new(inner: R) -> Self { + Self { inner, hasher: 0 } + } +} + +impl Read for CrcTeeReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let n = self.inner.read(buf)?; + if n > 0 { + self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); + } + Ok(n) + } +} + +/// Verifying wrapper around an `XzDecoder>`. Lets the codec observe normal raw +/// EOF before finalizing the CRC check at the decoded layer. +struct DecodedVerifyingReader<'a, R: Read + Seek> { + decoder: XzDecoder>>, + expected: u32, + target: ChecksumTarget, + state: VerifyState, +} + +impl Read for DecodedVerifyingReader<'_, R> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.state { + VerifyState::EofChecked | VerifyState::Failed => return Ok(0), + VerifyState::Reading => {} + } + let n = self.decoder.read(buf)?; + if n == 0 { + let computed = self.decoder.get_ref().hasher; + if computed == self.expected { + self.state = VerifyState::EofChecked; + return Ok(0); + } + let err = ChecksumError::Mismatch { + target: self.target.clone(), + computed, + expected: self.expected, + }; + self.state = VerifyState::Failed; + return Err(io::Error::new(io::ErrorKind::InvalidData, err)); + } + Ok(n) + } +} + +/// Map a `read_to_end`-time `io::Error` (or any `Read`-derived `io::Error`) into the right +/// [`BendlReadError`] variant. +/// +/// The wrap discipline is held here, in one place: a `ChecksumError`-bearing `io::Error` becomes +/// [`BendlReadError::Checksum`]; everything else fans out into `Io` vs `Decode` according to +/// context. Codec-runtime errors from xz/BEN go to [`BendlReadError::Decode`] when the entry is +/// xz-flagged; raw payload errors stay `Io`. +fn classify_read_error(err: io::Error, entry: &BendlDirectoryEntry) -> BendlReadError { + if err.get_ref().is_some_and(|e| e.is::()) { + match err + .into_inner() + .map(|boxed| boxed.downcast::()) + { + Some(Ok(boxed)) => return BendlReadError::Checksum(*boxed), + Some(Err(other)) => { + // Downcast failed unexpectedly — reconstruct an io::Error + // around the still-boxed payload so we don't lose context. + return BendlReadError::Io(io::Error::new(io::ErrorKind::InvalidData, other)); + } + None => { + return BendlReadError::Io(io::Error::new( + io::ErrorKind::InvalidData, + "checksum error with no payload", + )); + } + } + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + BendlReadError::Decode(err) + } else { + BendlReadError::Io(err) + } } pub(crate) fn validate_directory_entries( @@ -262,4 +586,3 @@ pub enum BundleValidationError { found: String, }, } - diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index cc03c79..38d8d0a 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -122,22 +122,77 @@ fn directory_entry_round_trip_no_checksum() { #[test] fn directory_entry_round_trip_with_checksum() { + // ASSET_FLAG_CHECKSUM ⇒ exactly four bytes of CRC32C. let entry = BendlDirectoryEntry { asset_type: ASSET_TYPE_CUSTOM, asset_flags: ASSET_FLAG_CHECKSUM, name: "custom_blob".to_string(), payload_offset: 2048, payload_len: 512, - checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE]), + checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF]), }; let bytes = entry.to_bytes().unwrap(); let mut cursor = &bytes[..]; let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); assert_eq!(decoded, entry); - assert_eq!( - decoded.checksum.unwrap(), - vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE] - ); + assert_eq!(decoded.checksum.as_deref(), Some(&[0xDE, 0xAD, 0xBE, 0xEF][..])); + assert_eq!(decoded.checksum_u32(), Some(0xEFBEADDE)); +} + +#[test] +fn directory_entry_rejects_flag_set_with_wrong_checksum_len() { + // Construct entry bytes by hand: flag bit set but checksum_len == 6. + let mut entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_CHECKSUM, + name: "x".to_string(), + payload_offset: 0, + payload_len: 0, + checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF]), + }; + let mut bytes = entry.to_bytes().unwrap(); + // Patch checksum_len at bytes 24..28 to claim 6 (also append two + // bytes so we don't crash on short read in the negative path). + bytes[24..28].copy_from_slice(&6u32.to_le_bytes()); + bytes.extend_from_slice(&[0x00, 0x00]); // pad to declared len + entry.checksum = Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x00]); + let mut cursor = &bytes[..]; + let err = BendlDirectoryEntry::read_from(&mut cursor).unwrap_err(); + assert!(matches!( + err, + BendlFormatError::InconsistentChecksumMetadata { + flag_set: true, + checksum_len: 6, + } + )); +} + +#[test] +fn directory_entry_rejects_flag_clear_with_nonzero_checksum_len() { + // Construct entry bytes with flag clear but checksum_len == 4. + let mut entry = BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "x".to_string(), + payload_offset: 0, + payload_len: 0, + checksum: None, + }; + let mut bytes = entry.to_bytes().unwrap(); + // The encoded bytes have checksum_len == 0 and no trailing checksum + // bytes; patch checksum_len to 4 and append four bytes. + bytes[24..28].copy_from_slice(&4u32.to_le_bytes()); + bytes.extend_from_slice(&[0xAA, 0xBB, 0xCC, 0xDD]); + entry.checksum = Some(vec![0xAA, 0xBB, 0xCC, 0xDD]); + let mut cursor = &bytes[..]; + let err = BendlDirectoryEntry::read_from(&mut cursor).unwrap_err(); + assert!(matches!( + err, + BendlFormatError::InconsistentChecksumMetadata { + flag_set: false, + checksum_len: 4, + } + )); } #[test] diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 01a98bc..d9c486e 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -4,14 +4,26 @@ use xz2::write::XzEncoder; use crate::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, - ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, - ASSET_TYPE_NODE_PERMUTATION_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, - FINALIZED_YES, HEADER_SIZE, + ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, + ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, + BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, }; use crate::io::bundle::reader::{ validate_directory_entries, BendlReader, BundleAssignmentReaderError, BundleValidationError, }; +/// Stamp a valid CRC32C and `ASSET_FLAG_CHECKSUM` onto a hand-built +/// directory entry whose on-disk payload bytes are `payload`. Use this +/// in test fixtures so the entry round-trips through the verify-on-touch +/// reader APIs. Tests that want to exercise the foreign-bundle / +/// clear-flag path build entries directly with the flag clear and +/// `checksum: None`. +fn with_crc(mut entry: BendlDirectoryEntry, payload: &[u8]) -> BendlDirectoryEntry { + entry.asset_flags |= ASSET_FLAG_CHECKSUM; + entry.checksum = Some(crc32c::crc32c(payload).to_le_bytes().to_vec()); + entry +} + /// Build a complete in-memory finalized bundle with two assets: /// an xz-compressed `graph.json` and a raw custom blob, followed by /// a fake BEN stream and a trailing directory. @@ -49,22 +61,28 @@ fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { let directory_offset = bundle.len() as u64; let entries = vec![ - BendlDirectoryEntry { - asset_type: ASSET_TYPE_GRAPH, - asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, - name: "graph.json".to_string(), - payload_offset: graph_offset, - payload_len: compressed_graph.len() as u64, - checksum: None, - }, - BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: "custom.bin".to_string(), - payload_offset: custom_offset, - payload_len: custom_blob.len() as u64, - checksum: None, - }, + with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: graph_offset, + payload_len: compressed_graph.len() as u64, + checksum: None, + }, + &compressed_graph, + ), + with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "custom.bin".to_string(), + payload_offset: custom_offset, + payload_len: custom_blob.len() as u64, + checksum: None, + }, + &custom_blob, + ), ]; let directory_bytes = encode_directory(&entries).unwrap(); bundle.extend_from_slice(&directory_bytes); @@ -259,14 +277,17 @@ fn build_basic_finalized_bundle() -> Vec { // Directory at EOF with one entry. let directory_offset = bytes.len() as u64; - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_METADATA, - asset_flags: ASSET_FLAG_JSON, - name: "metadata.json".to_string(), - payload_offset: metadata_offset, - payload_len: metadata_payload.len() as u64, - checksum: None, - }]; + let entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_METADATA, + asset_flags: ASSET_FLAG_JSON, + name: "metadata.json".to_string(), + payload_offset: metadata_offset, + payload_len: metadata_payload.len() as u64, + checksum: None, + }, + &metadata_payload, + )]; let directory = encode_directory(&entries).unwrap(); bytes.extend_from_slice(&directory); let directory_len = directory.len() as u64; @@ -588,14 +609,17 @@ fn stress_thousand_custom_assets_round_trip() { .collect(); let offset = bytes.len() as u64; bytes.extend_from_slice(&payload); - entries.push(BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: 0, - name: format!("blob-{i:04}.bin"), - payload_offset: offset, - payload_len: payload.len() as u64, - checksum: None, - }); + entries.push(with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: format!("blob-{i:04}.bin"), + payload_offset: offset, + payload_len: payload.len() as u64, + checksum: None, + }, + &payload, + )); expected.push(payload); } @@ -648,14 +672,17 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { let stream_offset = bytes.len() as u64; let directory_offset = bytes.len() as u64; - let entries = vec![BendlDirectoryEntry { - asset_type: ASSET_TYPE_CUSTOM, - asset_flags: ASSET_FLAG_XZ, - name: "broken.xz".to_string(), - payload_offset, - payload_len: bad_payload.len() as u64, - checksum: None, - }]; + let entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_XZ, + name: "broken.xz".to_string(), + payload_offset, + payload_len: bad_payload.len() as u64, + checksum: None, + }, + &bad_payload, + )]; let directory = encode_directory(&entries).unwrap(); bytes.extend_from_slice(&directory); @@ -782,3 +809,497 @@ fn validate_directory_rejects_wrong_canonical_name() { _ => panic!("expected WrongCanonicalName, got {err:?}"), } } + +// ===================================================================== +// Asset CRC32C verification +// ===================================================================== +// +// These tests pin the verify-on-touch contract for directory-entry +// assets. The structural split is: +// +// - explicit verifier (`verify_asset_checksum`) vs implicit +// verifier (`asset_bytes` / `asset_reader`), +// - uncompressed vs xz-compressed assets, +// - stored-checksum corruption vs payload corruption (vs xz-framing +// corruption for compressed assets). +// +// The unverified APIs (`*_unverified`) are pinned in matching tests to +// ensure they NEVER surface a `ChecksumError` (codec errors are still +// permitted). + +use crate::io::bundle::error::{BendlReadError, ChecksumError, ChecksumTarget}; + +/// Build a finalized bundle with exactly one uncompressed asset whose +/// payload bytes are `payload`. Returns `(bundle_bytes, asset_name, +/// directory_offset, payload_offset)` for hand-patching tests. +fn make_single_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, u64, u64) { + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(payload); + + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: name.to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: None, + }, + payload, + )]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + (bytes, name.to_string(), directory_offset, payload_offset) +} + +/// Build a finalized bundle whose only asset is `payload` stored xz- +/// compressed. The stored CRC is over the **compressed** bytes (CRC is +/// pre-decompression). Returns +/// `(bundle_bytes, name, compressed_payload, directory_offset, payload_offset)`. +fn make_single_xz_asset_bundle( + name: &str, + payload: &[u8], +) -> (Vec, String, Vec, u64, u64) { + let mut encoder = XzEncoder::new(Vec::new(), 6); + encoder.write_all(payload).unwrap(); + let compressed = encoder.finish().unwrap(); + + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&compressed); + + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_XZ, + name: name.to_string(), + payload_offset, + payload_len: compressed.len() as u64, + checksum: None, + }, + &compressed, + )]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + ( + bytes, + name.to_string(), + compressed, + directory_offset, + payload_offset, + ) +} + +/// Locate the offset of an asset's stored CRC32C bytes inside a +/// hand-built single-asset bundle. Assumes the directory starts at +/// `directory_offset`, the entry count is one, and the entry's +/// `checksum_len` is 4 (the only legal value when the flag is set). +fn stored_checksum_offset(directory_offset: u64, name: &str) -> usize { + // directory layout: [u32 count][entry][...] + // entry layout: [28-byte header][name bytes][checksum bytes] + let entry_start = directory_offset as usize + 4; + entry_start + 28 + name.len() +} + +// ----- Explicit verify_asset_checksum ------------------------------- + +#[test] +fn verify_asset_checksum_uncompressed_passes_on_intact_bundle() { + let (bytes, name, _, _) = make_single_asset_bundle("blob", b"hello world"); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + reader.verify_asset_checksum(&entry).unwrap(); +} + +#[test] +fn verify_asset_checksum_uncompressed_corrupt_stored_crc_returns_mismatch() { + let (mut bytes, name, dir_off, _) = make_single_asset_bundle("blob", b"hello world"); + let crc_off = stored_checksum_offset(dir_off, &name); + bytes[crc_off] ^= 0xFF; // flip stored checksum + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.verify_asset_checksum(&entry).unwrap_err(); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { ref target, .. }) + if matches!(target, ChecksumTarget::Asset(n) if n == &name) + )); +} + +#[test] +fn verify_asset_checksum_uncompressed_corrupt_payload_byte_returns_mismatch() { + let (mut bytes, name, _, payload_off) = make_single_asset_bundle("blob", b"hello world"); + bytes[payload_off as usize] ^= 0x01; // flip first payload byte + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.verify_asset_checksum(&entry).unwrap_err(); + assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); +} + +#[test] +fn verify_asset_checksum_xz_corrupt_stored_crc_returns_mismatch_no_decoder() { + // The explicit verifier reads raw bytes — no XzDecoder is invoked, + // so even an intact compressed payload reports `Mismatch` + // deterministically when only the stored CRC has been corrupted. + let (mut bytes, name, _, dir_off, _) = + make_single_xz_asset_bundle("blob.xz", b"some compressible content"); + let crc_off = stored_checksum_offset(dir_off, &name); + bytes[crc_off] ^= 0xFF; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.verify_asset_checksum(&entry).unwrap_err(); + assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); +} + +#[test] +fn verify_asset_checksum_xz_corrupt_payload_returns_mismatch_no_decoder() { + // Verifier is over raw bytes — a payload flip that breaks xz framing + // still surfaces as Mismatch, NOT a decoder error, because the + // explicit verifier never invokes the decoder. + let (mut bytes, name, compressed, _, payload_off) = + make_single_xz_asset_bundle("blob.xz", b"some compressible content"); + assert!(compressed.len() > 5); + bytes[payload_off as usize + 5] ^= 0xFF; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.verify_asset_checksum(&entry).unwrap_err(); + assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); +} + +#[test] +fn verify_asset_checksum_returns_unavailable_when_flag_clear() { + // Hand-build a foreign bundle whose entry has the flag clear. + let payload = b"orphan".to_vec(); + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, // explicitly NO checksum flag + name: "noflag".to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("noflag").cloned().unwrap(); + let err = reader.verify_asset_checksum(&entry).unwrap_err(); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Asset(_), + }) + )); + // The unverified path can still read the bytes. + let got = reader.asset_bytes_unverified(&entry).unwrap(); + assert_eq!(got, payload); +} + +// ----- Verify-on-touch via asset_bytes ------------------------------ + +#[test] +fn asset_bytes_uncompressed_corrupt_payload_returns_checksum_mismatch() { + let (mut bytes, name, _, payload_off) = make_single_asset_bundle("blob", b"hello world"); + bytes[payload_off as usize] ^= 0x01; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); +} + +#[test] +fn asset_bytes_unverified_uncompressed_returns_corrupted_bytes_no_check() { + let (mut bytes, name, _, payload_off) = make_single_asset_bundle("blob", b"hello world"); + bytes[payload_off as usize] ^= 0x01; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let got = reader.asset_bytes_unverified(&entry).unwrap(); + // The bytes returned are the corrupted bytes; we do not assert + // exact content, only that the operation succeeded — the + // *_unverified contract is that ChecksumError NEVER fires. + assert_eq!(got.len(), b"hello world".len()); +} + +#[test] +fn asset_bytes_xz_corrupt_stored_crc_returns_checksum_mismatch() { + // xz framing intact, but stored CRC is wrong. The codec reaches EOF + // cleanly first and then the BENDL-owned wrapper reports + // `ChecksumError::Mismatch`. + let (mut bytes, name, _, dir_off, _) = + make_single_xz_asset_bundle("blob.xz", b"some compressible content"); + let crc_off = stored_checksum_offset(dir_off, &name); + bytes[crc_off] ^= 0xFF; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); +} + +#[test] +fn asset_bytes_xz_corrupt_framing_returns_decode_error_not_checksum() { + // Payload flip breaks xz framing — the decoder fails before the + // CRC tee reaches raw EOF, so the variant is + // `BendlReadError::Decode`, not `BendlReadError::Checksum`. + let (mut bytes, name, compressed, _, payload_off) = + make_single_xz_asset_bundle("blob.xz", b"some compressible content"); + assert!(compressed.len() > 5); + bytes[payload_off as usize + 5] ^= 0xFF; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + assert!( + matches!(err, BendlReadError::Decode(_)), + "expected Decode for broken xz framing, got {err:?}" + ); +} + +#[test] +fn asset_bytes_unverified_xz_corrupt_framing_returns_decode_error_never_checksum() { + let (mut bytes, name, compressed, _, payload_off) = + make_single_xz_asset_bundle("blob.xz", b"some compressible content"); + assert!(compressed.len() > 5); + bytes[payload_off as usize + 5] ^= 0xFF; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.asset_bytes_unverified(&entry).unwrap_err(); + // Unverified path NEVER surfaces a checksum error; codec errors + // are still allowed. + assert!(!matches!(err, BendlReadError::Checksum(_))); + assert!(matches!(err, BendlReadError::Decode(_))); +} + +#[test] +fn asset_bytes_returns_unavailable_when_flag_clear() { + // Same hand-built foreign bundle as in the verifier test. + let payload = b"orphan".to_vec(); + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "noflag".to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset: directory_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("noflag").cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Unavailable { .. }) + )); +} + +// ----- asset_reader EOF semantics ---------------------------------- + +#[test] +fn asset_reader_uncompressed_surfaces_mismatch_on_final_read() { + // Drive `asset_reader` byte-by-byte and assert the call that + // would otherwise return Ok(0) at EOF returns InvalidData wrapping + // ChecksumError::Mismatch. + let (mut bytes, name, _, payload_off) = make_single_asset_bundle("blob", b"abcdef"); + bytes[payload_off as usize] ^= 0x01; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let mut r = reader.asset_reader(&entry).unwrap(); + let mut buf = [0u8; 1024]; + // Consume bytes until 0 or error. + let mut total_ok = 0usize; + loop { + match r.read(&mut buf) { + Ok(0) => panic!("expected a checksum error at EOF, got Ok(0) after {total_ok} bytes"), + Ok(n) => total_ok += n, + Err(e) => { + assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); + let inner = e + .get_ref() + .and_then(|x| x.downcast_ref::()) + .expect("inner ChecksumError"); + assert!(matches!(inner, ChecksumError::Mismatch { .. })); + break; + } + } + } + assert_eq!(total_ok, b"abcdef".len()); +} + +// ----- Bulk verifier ------------------------------------------------- + +#[test] +fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { + // Build a bundle with two assets, both corrupted. The bulk + // verifier must return the *first* mismatch in directory order + // and stop. Construct manually so we can corrupt independently. + let p1 = b"first".to_vec(); + let p2 = b"second".to_vec(); + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let off1 = bytes.len() as u64; + bytes.extend_from_slice(&p1); + let off2 = bytes.len() as u64; + bytes.extend_from_slice(&p2); + let stream_offset = bytes.len() as u64; + let directory_offset = bytes.len() as u64; + let e1 = with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "first".to_string(), + payload_offset: off1, + payload_len: p1.len() as u64, + checksum: None, + }, + &p1, + ); + let e2 = with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "second".to_string(), + payload_offset: off2, + payload_len: p2.len() as u64, + checksum: None, + }, + &p2, + ); + let directory = encode_directory(&[e1, e2]).unwrap(); + bytes.extend_from_slice(&directory); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + reserved_0: 0, + flags: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + // Corrupt both payloads. + bytes[off1 as usize] ^= 0x01; + bytes[off2 as usize] ^= 0x01; + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = reader.verify_all_asset_checksums().unwrap_err(); + let target = match &err { + BendlReadError::Checksum(ChecksumError::Mismatch { target, .. }) => target.clone(), + other => panic!("expected first-asset Mismatch, got {other:?}"), + }; + assert!(matches!(&target, ChecksumTarget::Asset(n) if n == "first")); +} + +// ----- Polynomial pin ------------------------------------------------ + +#[test] +fn crc32c_polynomial_pin_against_known_vectors() { + // Pin known CRC32C (Castagnoli) values so a future accidental + // swap to IEEE CRC-32 is caught at test time. The IEEE CRC-32 of + // [0x01,0x02,0x03,0x04] is 0xB63CFBCD; the CRC32C value below + // diverges from that, which is the whole point of the pin. + // + // CRC32C("") = 0x00000000 + // CRC32C([1,2,3,4]) = 0x8A2D413B + // CRC32C(b"123456789") = 0xE3069283 (Castagnoli check value) + // + // The Castagnoli check value 0xE3069283 is the canonical CRC32C + // test vector cited in the IEEE 802.3 / SCTP RFC 3720 specs and + // diverges from the IEEE CRC-32 polynomial's check value + // (0xCBF43926). If a future contributor accidentally swaps to + // IEEE CRC-32, this assertion fires. + assert_eq!(crc32c::crc32c(b""), 0x0000_0000); + // 0xE3069283 is the canonical Castagnoli check value + // (CRC32C of ASCII "123456789"); the IEEE CRC-32 polynomial's + // check value over the same input is 0xCBF43926, so any + // accidental swap is caught here. + assert_eq!(crc32c::crc32c(b"123456789"), 0xE306_9283); + // Extra sentinels to broaden the trip-wire. + assert_eq!(crc32c::crc32c(&[0x01, 0x02, 0x03, 0x04]), 0x2930_8CF4); +} diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index cbb1f6d..fa145d7 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -825,18 +825,17 @@ fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { } #[test] -fn writer_asset_with_checksum_round_trips_through_reader() { +fn writer_asset_round_trips_with_auto_computed_crc32c() { + // Every asset gets ASSET_FLAG_CHECKSUM with a 4-byte CRC32C of the + // on-disk payload bytes (post-compression for xz-flagged assets). + let payload = b"hello".to_vec(); let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let checksum = vec![0x01, 0x02, 0x03, 0x04]; writer .add_asset( ASSET_TYPE_CUSTOM, "with_checksum", - b"hello", - AddAssetOptions { - checksum: Some(checksum.clone()), - ..AddAssetOptions::defaults() - }, + &payload, + AddAssetOptions::defaults().raw(), ) .unwrap(); let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); @@ -844,8 +843,14 @@ fn writer_asset_with_checksum_round_trips_through_reader() { let reader = BendlReader::open(Cursor::new(buf)).unwrap(); let entry = reader.find_asset_by_name("with_checksum").cloned().unwrap(); - assert_eq!(entry.checksum, Some(checksum)); assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); + let expected_crc = crc32c::crc32c(&payload); + assert_eq!(entry.checksum_u32(), Some(expected_crc)); + assert_eq!( + entry.checksum, + Some(expected_crc.to_le_bytes().to_vec()), + "stored checksum is the little-endian CRC32C" + ); } #[test] @@ -1207,26 +1212,24 @@ fn stream_session_flush_succeeds() { // ── BendlAppender: checksum flag ──────────────────────────────── #[test] -fn appender_commit_with_checksum_sets_checksum_flag() { +fn appender_commit_auto_computes_crc32c_on_pending_assets() { let (bundle, _) = build_base_bundle(); + let payload = b"payload".to_vec(); let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); appender .add_asset( ASSET_TYPE_CUSTOM, "checksummed", - b"payload", - AddAssetOptions { - checksum: Some(vec![0xAB, 0xCD]), - ..AddAssetOptions::defaults() - }, + &payload, + AddAssetOptions::defaults().raw(), ) .unwrap(); let buf = appender.commit().unwrap().into_inner(); let reader = BendlReader::open(Cursor::new(buf)).unwrap(); let entry = reader.find_asset_by_name("checksummed").unwrap(); - assert_eq!(entry.checksum, Some(vec![0xAB, 0xCD])); assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); + assert_eq!(entry.checksum_u32(), Some(crc32c::crc32c(&payload))); } // ── BendlAppender: trailing directory bytes ────────────────────── diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 6a817a2..9cc6b55 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -6,26 +6,22 @@ //! [header] [asset payloads] [assignment stream] [directory] //! ``` //! -//! The writer operates in three logical phases, expressed via owned -//! typestate transitions: +//! The writer operates in three logical phases, expressed via owned typestate transitions: //! -//! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero -//! or more times. Each call writes the (optionally xz-compressed) -//! payload to the file and records its absolute offset and length in -//! an in-memory entry list. -//! 2. **stream phase** — the caller invokes -//! [`BendlWriter::into_stream_session`] to consume the writer and -//! obtain a [`BendlStreamSession`] that owns the underlying writer -//! and implements `Write`. When the stream is complete the caller -//! calls [`BendlStreamSession::finish_into_writer`] to recover the -//! [`BendlWriter`] in the `StreamWritten` state. -//! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing -//! directory and patches the header. +//! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero or more times. Each +//! call writes the (optionally xz-compressed) payload to the file and records its absolute +//! offset and length in an in-memory entry list. +//! 2. **stream phase** — the caller invokes [`BendlWriter::into_stream_session`] to consume the +//! writer and obtain a [`BendlStreamSession`] that owns the underlying writer and implements +//! `Write`. When the stream is complete the caller calls +//! [`BendlStreamSession::finish_into_writer`] to recover the [`BendlWriter`] in the +//! `StreamWritten` state. +//! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing directory and patches the +//! header. //! -//! The writer requires `Write + Seek` because the header is patched -//! twice: once with the stream offset (implicitly, by having reserved -//! its slot at construction) and once with the finalized stream length, -//! sample count, directory offset, directory length, and `complete` flag. +//! The writer requires `Write + Seek` because the header is patched twice: once with the stream +//! offset (implicitly, by having reserved its slot at construction) and once with the finalized +//! stream length, sample count, directory offset, directory length, and `complete` flag. use std::collections::HashSet; use std::io::{self, Read, Seek, SeekFrom, Write}; @@ -34,17 +30,16 @@ use thiserror::Error; use xz2::write::XzEncoder; use super::format::{ - standardized_name_for, default_compresses_by_type, encode_directory, read_directory, + default_compresses_by_type, encode_directory, read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, - ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, FINALIZED_YES, DEFAULT_XZ_PRESET, - HEADER_SIZE, + ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, DEFAULT_XZ_PRESET, + FINALIZED_YES, HEADER_SIZE, }; /// Ability to truncate an underlying seekable target to a given length. /// -/// This is not part of `std::io`, so `BendlAppender` takes a trait bound -/// that abstracts it and is implemented below for `std::fs::File` and -/// `std::io::Cursor>`. +/// This is not part of `std::io`, so `BendlAppender` takes a trait bound that abstracts it and is +/// implemented below for `std::fs::File` and `std::io::Cursor>`. pub trait BendlTruncate { /// Truncate or extend the underlying target to exactly `len` bytes. fn truncate_at(&mut self, len: u64) -> io::Result<()>; @@ -67,6 +62,11 @@ impl BendlTruncate for std::io::Cursor> { } /// Options passed alongside each [`BendlWriter::add_asset`] call. +/// +/// There is no "checksum opt-in/opt-out" knob: every asset written through the library carries a +/// CRC32C of its on-disk payload bytes, computed automatically by the writer. A future +/// recovery/debug writer that needs to emit unchecked assets must be an explicitly named +/// `*_unverified` API and excluded from normal write paths. #[derive(Debug, Clone, Default)] pub struct AddAssetOptions { /// Compression override. `None` means "follow the default policy for @@ -76,10 +76,6 @@ pub struct AddAssetOptions { /// Whether the decoded payload is UTF-8 JSON. Adds the /// [`ASSET_FLAG_JSON`] bit to the entry's flags. pub is_json: bool, - /// Optional trailing checksum bytes to store in the directory entry. - /// When set, [`crate::io::bundle::format::ASSET_FLAG_CHECKSUM`] is - /// applied automatically. - pub checksum: Option>, } impl AddAssetOptions { @@ -215,17 +211,19 @@ impl BendlWriter { payload.to_vec() }; - // Flags. - let mut asset_flags: u16 = 0; + // CRC32C over the on-disk payload bytes. For compressed assets this is the compressed + // bytes (verification happens before decompression). See ASSET_FLAG_CHECKSUM for the + // wire-format pin. + let crc = crc32c::crc32c(&payload_bytes); + let checksum_bytes = crc.to_le_bytes().to_vec(); + + let mut asset_flags: u16 = ASSET_FLAG_CHECKSUM; if options.is_json { asset_flags |= ASSET_FLAG_JSON; } if compress { asset_flags |= ASSET_FLAG_XZ; } - if options.checksum.is_some() { - asset_flags |= crate::io::bundle::format::ASSET_FLAG_CHECKSUM; - } // Write at current file position. let payload_offset = self.inner.seek(SeekFrom::Current(0))?; @@ -240,7 +238,7 @@ impl BendlWriter { name: name.to_string(), payload_offset, payload_len, - checksum: options.checksum, + checksum: Some(checksum_bytes), }); Ok(()) @@ -261,19 +259,24 @@ impl BendlWriter { ) } - /// Add one of the known singleton assets, using its reserved asset-type - /// integer and standardized name automatically. + /// Add one of the known singleton assets, using its reserved asset-type integer and + /// standardized name automatically. pub fn add_known_asset( &mut self, kind: KnownAssetKind, payload: &[u8], options: AddAssetOptions, ) -> Result<(), BendlWriteError> { - self.add_asset(kind.asset_type(), kind.standardized_name(), payload, options) + self.add_asset( + kind.asset_type(), + kind.standardized_name(), + payload, + options, + ) } - /// Add a custom (writer-named) asset. The asset-type is set to - /// [`ASSET_TYPE_CUSTOM`] automatically. + /// Add a custom (writer-named) asset. The asset-type is set to [`ASSET_TYPE_CUSTOM`] + /// automatically. pub fn add_custom_asset( &mut self, name: &str, @@ -285,22 +288,17 @@ impl BendlWriter { /// Consume the writer and transition into the stream phase. /// - /// The returned [`BendlStreamSession`] owns the underlying writer - /// and implements `Write`, so it can be plumbed into a - /// [`crate::io::writer::BenStreamWriter`] (or written to directly). - /// When the stream is complete the caller calls - /// [`BendlStreamSession::finish_into_writer`] to recover ownership - /// of a [`BendlWriter`] in the `StreamWritten` state, ready for + /// The returned [`BendlStreamSession`] owns the underlying writer and implements `Write`, so + /// it can be plumbed into a [`crate::io::writer::BenStreamWriter`] (or written to directly). + /// When the stream is complete the caller calls [`BendlStreamSession::finish_into_writer`] + /// to recover ownership of a [`BendlWriter`] in the `StreamWritten` state, ready for /// [`BendlWriter::finish`]. /// - /// Returns [`BendlWriteError::WrongState`] when called on a writer - /// that has already produced a stream (e.g. via a prior - /// `finish_into_writer`); this guard prevents a second - /// `into_stream_session` from silently overwriting - /// `header.stream_offset` and corrupting the bundle. - pub fn into_stream_session( - mut self, - ) -> Result, BendlWriteError> { + /// Returns [`BendlWriteError::WrongState`] when called on a writer that has already produced + /// a stream (e.g. via a prior `finish_into_writer`); this guard prevents a second + /// `into_stream_session` from silently overwriting `header.stream_offset` and corrupting the + /// bundle. + pub fn into_stream_session(mut self) -> Result, BendlWriteError> { match self.state { WriterState::Assets => {} WriterState::StreamWritten { .. } => { @@ -369,7 +367,6 @@ impl BendlWriter { Ok(self.inner) } - } /// Internal state of a [`BendlWriter`] that has been temporarily moved @@ -555,7 +552,6 @@ struct PendingAsset { /// Resolved compression decision: `true` means compress, `false` means raw. compress: bool, is_json: bool, - checksum: Option>, } impl BendlAppender { @@ -656,7 +652,6 @@ impl BendlAppender { raw_payload: payload.to_vec(), compress, is_json: options.is_json, - checksum: options.checksum, }); Ok(()) } @@ -684,7 +679,12 @@ impl BendlAppender { payload: &[u8], options: AddAssetOptions, ) -> Result<(), BendlWriteError> { - self.add_asset(kind.asset_type(), kind.standardized_name(), payload, options) + self.add_asset( + kind.asset_type(), + kind.standardized_name(), + payload, + options, + ) } /// Append a custom (writer-named) asset. The asset-type is set to @@ -733,23 +733,24 @@ impl BendlAppender { asset.raw_payload }; - let mut asset_flags: u16 = 0; + // CRC32C over on-disk payload bytes (compressed if XZ). + let crc = crc32c::crc32c(&bytes); + let checksum_bytes = crc.to_le_bytes().to_vec(); + + let mut asset_flags: u16 = ASSET_FLAG_CHECKSUM; if asset.is_json { asset_flags |= ASSET_FLAG_JSON; } if asset.compress { asset_flags |= ASSET_FLAG_XZ; } - if asset.checksum.is_some() { - asset_flags |= crate::io::bundle::format::ASSET_FLAG_CHECKSUM; - } encoded.push(EncodedPending { asset_type: asset.asset_type, name: asset.name, bytes, asset_flags, - checksum: asset.checksum, + checksum: Some(checksum_bytes), }); } From 1ce870ce8471012155d77509c372cbb2eed90f8d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 18 May 2026 16:33:25 -0600 Subject: [PATCH 099/221] formatting pass --- ben-py/src/decode/decoder.rs | 160 ++++++-------- ben-py/src/decode/helpers.rs | 29 +-- ben-py/src/decode/py_funcs.rs | 21 +- ben-py/src/decode/types.rs | 12 +- ben-py/src/encode/encoder.rs | 62 +++--- ben-py/src/encode/helpers.rs | 8 +- ben-py/src/lib.rs | 5 +- ben/src/cli/ben/args.rs | 93 ++++---- ben/src/cli/ben/bundle.rs | 18 +- ben/src/cli/ben/modes/encode.rs | 3 +- ben/src/cli/ben/modes/mod.rs | 6 +- ben/src/cli/ben/modes/xdecode.rs | 3 +- ben/src/cli/ben/modes/xencode.rs | 3 +- ben/src/cli/ben/paths.rs | 13 +- ben/src/cli/ben/tests.rs | 2 +- ben/src/cli/bendl/args.rs | 15 +- ben/src/cli/bendl/create.rs | 8 +- ben/src/cli/bendl/mod.rs | 10 +- ben/src/cli/bendl/tests.rs | 62 +++--- ben/src/cli/common/error.rs | 12 +- ben/src/cli/common/mod.rs | 33 ++- ben/src/cli/pcben/args.rs | 8 +- ben/src/cli/pcben/modes/mod.rs | 6 +- ben/src/cli/pcben/tests.rs | 16 +- ben/src/cli/pcben/translate.rs | 5 +- ben/src/cli/reben/args.rs | 14 +- ben/src/cli/reben/tests.rs | 88 +++----- ben/src/codec/decode/ben.rs | 28 +-- ben/src/codec/decode/ben32.rs | 8 +- ben/src/codec/decode/jsonl.rs | 36 +-- ben/src/codec/decode/mod.rs | 3 +- ben/src/codec/decode/path.rs | 12 +- ben/src/codec/decode/tests/mkvchain.rs | 39 ++-- ben/src/codec/decode/tests/mod.rs | 16 +- ben/src/codec/decode/tests/standard.rs | 22 +- ben/src/codec/decode/tests/twodelta.rs | 47 ++-- ben/src/codec/decode/twodelta.rs | 20 +- ben/src/codec/decode/xz.rs | 3 +- ben/src/codec/encode/ben.rs | 6 +- ben/src/codec/encode/jsonl.rs | 26 +-- ben/src/codec/encode/path.rs | 19 +- ben/src/codec/encode/tests.rs | 54 +++-- ben/src/codec/encode/twodelta.rs | 159 +++++++------- ben/src/codec/encode/xz.rs | 72 +++--- ben/src/codec/frames/decode.rs | 82 +++---- ben/src/codec/frames/encode.rs | 66 +++--- ben/src/codec/frames/mod.rs | 15 +- ben/src/codec/frames/tests.rs | 24 +- ben/src/codec/translate/mod.rs | 42 ++-- ben/src/codec/translate/tests.rs | 15 +- ben/src/io/bundle/error.rs | 84 +++---- ben/src/io/bundle/format.rs | 75 +++---- ben/src/io/bundle/manifest.rs | 22 +- ben/src/io/bundle/mod.rs | 15 +- ben/src/io/bundle/reader.rs | 176 +++++++-------- ben/src/io/bundle/tests/format.rs | 17 +- ben/src/io/bundle/tests/manifest.rs | 3 +- ben/src/io/bundle/tests/reader.rs | 229 +++++++++----------- ben/src/io/bundle/tests/writer.rs | 144 ++++++------ ben/src/io/bundle/writer.rs | 194 ++++++++--------- ben/src/io/reader/stream_reader/ben.rs | 10 +- ben/src/io/reader/stream_reader/frames.rs | 23 +- ben/src/io/reader/stream_reader/mod.rs | 80 +++---- ben/src/io/reader/stream_reader/xben.rs | 15 +- ben/src/io/reader/subsample.rs | 33 ++- ben/src/io/reader/tests.rs | 152 +++++++------ ben/src/io/writer/options.rs | 13 +- ben/src/io/writer/stream_writer/ben.rs | 28 +-- ben/src/io/writer/stream_writer/mod.rs | 110 +++++----- ben/src/io/writer/stream_writer/xben.rs | 13 +- ben/src/io/writer/tests.rs | 53 +++-- ben/src/io/writer/twodelta.rs | 9 +- ben/src/io/writer/utils.rs | 8 +- ben/src/json/graph/errors.rs | 3 +- ben/src/json/graph/mlc.rs | 161 ++++++-------- ben/src/json/graph/mod.rs | 31 ++- ben/src/json/graph/nx_formats.rs | 14 +- ben/src/json/graph/petxgraph/mod.rs | 20 +- ben/src/json/graph/petxgraph/nx_convert.rs | 44 ++-- ben/src/json/graph/petxgraph/permutation.rs | 16 +- ben/src/json/graph/petxgraph/sort.rs | 20 +- ben/src/json/graph/rcm.rs | 28 +-- ben/src/json/graph/tests/test_algos.rs | 23 +- ben/src/json/graph/tests/test_io.rs | 22 +- ben/src/lib.rs | 32 ++- ben/src/logging.rs | 8 +- ben/src/ops/extract/mod.rs | 4 +- ben/src/ops/extract/tests.rs | 12 +- ben/src/ops/relabel/mod.rs | 64 +++--- ben/src/ops/relabel/permutation.rs | 5 +- ben/src/ops/relabel/tests.rs | 118 +++++----- ben/src/progress/mod.rs | 26 +-- ben/src/test_utils.rs | 28 +-- ben/tests/common/mod.rs | 10 +- ben/tests/test_assignment_reader.rs | 67 +++--- ben/tests/test_coverage.rs | 226 ++++++++++--------- ben/tests/test_impls_pipeline.rs | 70 +++--- ben/tests/test_pipeline.rs | 6 +- ben/tests/test_stress_edges.rs | 31 +-- rustfmt.toml | 3 + 100 files changed, 1916 insertions(+), 2211 deletions(-) create mode 100644 rustfmt.toml diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 76cecac..2bb7a40 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -21,9 +21,9 @@ pub struct PyBenDecoder { path: PathBuf, mode: DecoderMode, backend: DecoderBackend, - /// Lazily-constructed frame iterator. We defer construction so opening - /// a bundle whose stream is empty or truncated still succeeds — only - /// methods that actually walk the stream need a live iterator. + /// Lazily-constructed frame iterator. We defer construction so opening a bundle whose stream is + /// empty or truncated still succeeds — only methods that actually walk the stream need a live + /// iterator. iter: Option, current_assignment: Option>, remaining_count: u16, @@ -36,23 +36,22 @@ pub struct PyBenDecoder { impl PyBenDecoder { /// Open a decoder on a `.ben`, `.xben`, or `.bendl` file. /// - /// The file's leading bytes are sniffed to decide whether it is a - /// bundle. When the file is a `.bendl`, the bundle's header decides - /// the BEN/XBEN format and the `mode` argument is ignored; when the - /// file is a plain stream, `mode` selects between the BEN and XBEN - /// readers and defaults to `"ben"`. + /// The file's leading bytes are sniffed to decide whether it is a bundle. When the file is a + /// `.bendl`, the bundle's header decides the BEN/XBEN format and the `mode` argument is + /// ignored; when the file is a plain stream, `mode` selects between the BEN and XBEN readers + /// and defaults to `"ben"`. /// /// # Arguments /// /// * `file_path` - Path to the input file. - /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain - /// streams; bundles use `assignment_format` from the header. + /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain streams; bundles use + /// `assignment_format` from the header. #[new] #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] fn new(py: Python<'_>, file_path: PathBuf, mode: &str) -> PyResult { - // Validate the mode string up front so "Unknown mode" is reported - // regardless of whether the file exists or turns out to be a bundle. + // Validate the mode string up front so "Unknown mode" is reported regardless of whether the + // file exists or turns out to be a bundle. let parsed_mode = DecoderMode::parse(mode)?; let is_bundle = detect_is_bundle(&file_path).map_err(|e| { PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) @@ -69,18 +68,15 @@ impl PyBenDecoder { )) })?; let fmt = reader.assignment_format().ok_or_else(|| { - PyException::new_err( - "Bundle header has an unrecognized assignment_format field.", - ) + PyException::new_err("Bundle header has an unrecognized assignment_format field.") })?; let derived_mode = DecoderMode::from_assignment_format(fmt); - let (stream_offset, stream_len) = - reader.assignment_stream_range().map_err(|e| { - PyException::new_err(format!( - "Failed to determine stream region in {}: {e}", - file_path.display() - )) - })?; + let (stream_offset, stream_len) = reader.assignment_stream_range().map_err(|e| { + PyException::new_err(format!( + "Failed to determine stream region in {}: {e}", + file_path.display() + )) + })?; let state = BundleState { reader, stream_offset, @@ -92,11 +88,10 @@ impl PyBenDecoder { warn_xben_startup(py)?; } - // Iter construction is deferred: opening a bundle with an - // empty or truncated stream is legal (incomplete or zero-sample - // finalized bundles), and metadata methods like - // `count_samples`, `asset_names`, and `extract_stream` don't - // need a live iterator. Iteration paths build it on demand. + // Iter construction is deferred: opening a bundle with an empty or truncated stream is + // legal (incomplete or zero-sample finalized bundles), and metadata methods like + // `count_samples`, `asset_names`, and `extract_stream` don't need a live iterator. + // Iteration paths build it on demand. Ok(Self { path: file_path, mode: derived_mode, @@ -112,10 +107,9 @@ impl PyBenDecoder { if matches!(parsed_mode, DecoderMode::XBen) { warn_xben_startup(py)?; } - // For plain streams, opening the file as a BEN/XBEN reader is - // the only way to learn the variant — keep eager construction - // so we surface a malformed-banner error at open time, matching - // the documented behaviour of `BenDecoder("…", mode="ben")`. + // For plain streams, opening the file as a BEN/XBEN reader is the only way to learn the + // variant — keep eager construction so we surface a malformed-banner error at open + // time, matching the documented behaviour of `BenDecoder("…", mode="ben")`. let iter = build_plain_iter(&file_path, parsed_mode)?; Ok(Self { path: file_path, @@ -131,12 +125,12 @@ impl PyBenDecoder { } } - /// Return `self` as an iterator, rebuilding the underlying frame - /// walker so iteration can be restarted. + /// Return `self` as an iterator, rebuilding the underlying frame walker so iteration can be + /// restarted. /// - /// Calling `iter(dec)` (or using `for x in dec: …`) more than once - /// is supported: each call reopens the stream region from the start - /// and, if a subsample selection is active, reapplies it. + /// Calling `iter(dec)` (or using `for x in dec: …`) more than once is supported: each call + /// reopens the stream region from the start and, if a subsample selection is active, reapplies + /// it. fn __iter__(mut slf: PyRefMut) -> PyResult> { slf.current_assignment = None; slf.remaining_count = 0; @@ -169,10 +163,9 @@ impl PyBenDecoder { let a = slf.current_assignment.as_ref().unwrap().clone(); return Ok(Some(a)); } - // Build the iterator on first use (e.g. when iteration begins - // without an explicit `__iter__` call). For bundle backends with - // empty/truncated streams this is where the BEN-banner-required - // error surfaces, instead of at `BenDecoder(...)` construction. + // Build the iterator on first use (e.g. when iteration begins without an explicit + // `__iter__` call). For bundle backends with empty/truncated streams this is where the + // BEN-banner-required error surfaces, instead of at `BenDecoder(...)` construction. if slf.iter.is_none() { let path = slf.path.clone(); let mode = slf.mode; @@ -218,12 +211,10 @@ impl PyBenDecoder { #[pyo3(text_signature = "(self)")] fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - // Always reports the total number of samples in the source file, - // even after `subsample_*` has been applied. We deliberately do - // not touch `len_hint` here: when a subsample selection is - // active, `len_hint` tracks the filtered count that `__len__` - // should return, and clobbering it would break `len(dec)` after - // a `count_samples()` call. + // Always reports the total number of samples in the source file, even after `subsample_*` + // has been applied. We deliberately do not touch `len_hint` here: when a subsample + // selection is active, `len_hint` tracks the filtered count that `__len__` should return, + // and clobbering it would break `len(dec)` after a `count_samples()` call. ensure_base_len(&mut slf, py) } @@ -234,11 +225,10 @@ impl PyBenDecoder { py: Python<'_>, ) -> PyResult> { if !indices.iter().is_sorted() { - // We need to sort and deduplicate the indices - // This is a bit annoying, but it is necessary to ensure that we can - // efficiently iterate over the underlying data. - // We use unstable sort because we don't care about the order of equal elements - // and it is faster than stable sort. + // We need to sort and deduplicate the indices This is a bit annoying, but it is + // necessary to ensure that we can efficiently iterate over the underlying data. We use + // unstable sort because we don't care about the order of equal elements and it is + // faster than stable sort. let warnings = py.import("warnings")?; let kwargs = PyDict::new(py); // kwargs.set_item("stacklevel", 2)?; @@ -330,27 +320,24 @@ impl PyBenDecoder { // --------------------------------------------------------------------- // Bundle-inspection surface. // - // These methods only make sense when the decoder was opened on a - // `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear - // error pointing the user at the right tool. + // These methods only make sense when the decoder was opened on a `.bendl` file; on a plain + // `.ben`/`.xben` stream they raise a clear error pointing the user at the right tool. // --------------------------------------------------------------------- - /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a - /// plain `.ben`/`.xben` stream (`False`). + /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a plain `.ben`/`.xben` + /// stream (`False`). #[pyo3(text_signature = "(self)")] fn is_bundle(&self) -> bool { self.backend.is_bundle() } - /// Return the container format of the underlying assignment stream - /// as `"ben"` or `"xben"`. + /// Return the container format of the underlying assignment stream as `"ben"` or `"xben"`. #[pyo3(text_signature = "(self)")] fn assignment_format(&self) -> &'static str { self.mode.as_str() } - /// Return the bundle's format version as a `(major, minor)` tuple. - /// Errors on plain streams. + /// Return the bundle's format version as a `(major, minor)` tuple. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn version(&self) -> PyResult<(u16, u16)> { let state = self.require_bundle("version()")?; @@ -358,16 +345,14 @@ impl PyBenDecoder { Ok((h.major_version, h.minor_version)) } - /// Whether the bundle was successfully finalized. Errors on plain - /// streams. + /// Whether the bundle was successfully finalized. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn is_complete(&self) -> PyResult { let state = self.require_bundle("is_complete()")?; Ok(state.reader.is_finalized()) } - /// Names of every entry in the bundle's directory, in directory - /// order. Errors on plain streams. + /// Names of every entry in the bundle's directory, in directory order. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn asset_names(&self) -> PyResult> { let state = self.require_bundle("asset_names()")?; @@ -379,9 +364,8 @@ impl PyBenDecoder { .collect()) } - /// Return the full bundle directory as a list of dicts with keys - /// `name`, `type`, `offset`, `len`, and `flags` (a list of string - /// tags). Errors on plain streams. + /// Return the full bundle directory as a list of dicts with keys `name`, `type`, `offset`, + /// `len`, and `flags` (a list of string tags). Errors on plain streams. #[pyo3(text_signature = "(self)")] fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { let state = self.require_bundle("list_assets()")?; @@ -409,8 +393,8 @@ impl PyBenDecoder { Ok(out) } - /// Read the (decoded) bytes of a named asset as a Python `bytes` - /// object. Errors on plain streams. + /// Read the (decoded) bytes of a named asset as a Python `bytes` object. Errors on plain + /// streams. #[pyo3(text_signature = "(self, name, /)")] fn read_asset_bytes(&mut self, name: &str) -> PyResult> { let state = self.require_bundle_mut("read_asset_bytes()")?; @@ -425,9 +409,8 @@ impl PyBenDecoder { .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) } - /// Parse a JSON asset into a Python object (dict, list, …). Errors - /// on plain streams and when the asset does not exist or is not - /// valid UTF-8 / JSON. + /// Parse a JSON asset into a Python object (dict, list, …). Errors on plain streams and when + /// the asset does not exist or is not valid UTF-8 / JSON. #[pyo3(text_signature = "(self, name, /)")] fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { let bytes = self.read_asset_bytes(name)?; @@ -438,9 +421,8 @@ impl PyBenDecoder { Ok(parsed.into()) } - /// Read the bundle's `graph.json` asset as a parsed JSON object. - /// Returns `None` if the bundle does not carry a graph asset. Errors - /// on plain streams. + /// Read the bundle's `graph.json` asset as a parsed JSON object. Returns `None` if the bundle + /// does not carry a graph asset. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { { @@ -452,8 +434,8 @@ impl PyBenDecoder { Ok(Some(self.read_json_asset(py, "graph.json")?)) } - /// Read the bundle's `metadata.json` asset as a parsed JSON object, - /// or `None` if absent. Errors on plain streams. + /// Read the bundle's `metadata.json` asset as a parsed JSON object, or `None` if absent. Errors + /// on plain streams. #[pyo3(text_signature = "(self)")] fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { { @@ -469,8 +451,8 @@ impl PyBenDecoder { Ok(Some(self.read_json_asset(py, "metadata.json")?)) } - /// Read the bundle's `node_permutation_map.json` asset as a parsed - /// JSON object, or `None` if absent. Errors on plain streams. + /// Read the bundle's `node_permutation_map.json` asset as a parsed JSON object, or `None` if + /// absent. Errors on plain streams. #[pyo3(text_signature = "(self)")] fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { { @@ -486,10 +468,9 @@ impl PyBenDecoder { Ok(Some(self.read_json_asset(py, "node_permutation_map.json")?)) } - /// Copy the embedded assignment stream region verbatim to - /// `out_path`. The resulting file can be opened directly with - /// `PyBenDecoder(out_path, mode=dec.assignment_format())`. - /// Errors on plain streams. + /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be + /// opened directly with `PyBenDecoder(out_path, mode=dec.assignment_format())`. Errors on plain + /// streams. #[pyo3(signature = (out_path, overwrite=false))] #[pyo3(text_signature = "(self, out_path, overwrite=False)")] fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { @@ -528,8 +509,8 @@ impl PyBenDecoder { } impl PyBenDecoder { - /// Borrow the bundle state or raise a clear Python error explaining - /// that the decoder was opened on a plain stream. + /// Borrow the bundle state or raise a clear Python error explaining that the decoder was opened + /// on a plain stream. fn require_bundle(&self, op: &str) -> PyResult<&BundleState> { match &self.backend { DecoderBackend::Bundle(state) => Ok(state), @@ -587,11 +568,10 @@ fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { - // Prefer the authoritative sample_count carried in the - // bundle header, which is set for finalized bundles and is - // O(1). Fall back to scanning the stream region when the - // header has no count (unfinalized append target, or a - // header byte we cannot interpret). + // Prefer the authoritative sample_count carried in the bundle header, which is set for + // finalized bundles and is O(1). Fall back to scanning the stream region when the + // header has no count (unfinalized append target, or a header byte we cannot + // interpret). if let Some(n) = state.reader.sample_count() { if n >= 0 { n as usize diff --git a/ben-py/src/decode/helpers.rs b/ben-py/src/decode/helpers.rs index df9696d..eb35c2d 100644 --- a/ben-py/src/decode/helpers.rs +++ b/ben-py/src/decode/helpers.rs @@ -2,8 +2,8 @@ use super::types::{BundleState, DecoderBackend, DecoderMode, DynIter}; use crate::common::open_input; use binary_ensemble::io::bundle::format::BENDL_MAGIC; use binary_ensemble::io::reader::{ - build_frame_iter, build_frame_iter_from_reader, count_samples_from_frame_iter, - BenStreamReader, BenWireFormat, + build_frame_iter, build_frame_iter_from_reader, count_samples_from_frame_iter, BenStreamReader, + BenWireFormat, }; use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; @@ -28,8 +28,7 @@ pub(super) fn warn_xben_startup(py: Python<'_>) -> PyResult<()> { Ok(()) } -/// Sniff the first 8 bytes of a file and decide whether it starts with -/// the `BENDL` magic. +/// Sniff the first 8 bytes of a file and decide whether it starts with the `BENDL` magic. pub(super) fn detect_is_bundle(path: &Path) -> io::Result { let mut file = File::open(path)?; let mut magic = [0u8; 8]; @@ -64,9 +63,8 @@ pub(super) fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult PyResult { let format = mode.wire_format(); match backend { - DecoderBackend::Plain => { - build_frame_iter(&path.to_path_buf(), format).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - path.display() - )) - }) - } + DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), format).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + path.display() + )) + }), DecoderBackend::Bundle(state) => { let reader = open_bundle_stream_reader(path, state)?; build_frame_iter_from_reader(reader, format).map_err(|e| { diff --git a/ben-py/src/decode/py_funcs.rs b/ben-py/src/decode/py_funcs.rs index 9a90fd0..5b45bf9 100644 --- a/ben-py/src/decode/py_funcs.rs +++ b/ben-py/src/decode/py_funcs.rs @@ -1,7 +1,6 @@ use crate::common::{open_input, open_output, validate_input_output_paths}; use binary_ensemble::codec::decode::{ - decode_ben_to_jsonl as core_decode_ben_to_jsonl, - decode_xben_to_ben as core_decode_xben_to_ben, + decode_ben_to_jsonl as core_decode_ben_to_jsonl, decode_xben_to_ben as core_decode_xben_to_ben, decode_xben_to_jsonl as core_decode_xben_to_jsonl, }; use pyo3::exceptions::PyIOError; @@ -11,11 +10,7 @@ use std::path::PathBuf; #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decode_xben_to_ben( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { +pub fn decode_xben_to_ben(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> PyResult<()> { validate_input_output_paths(&in_file, &out_file)?; let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; @@ -34,11 +29,7 @@ pub fn decode_xben_to_ben( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decode_xben_to_jsonl( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { +pub fn decode_xben_to_jsonl(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> PyResult<()> { validate_input_output_paths(&in_file, &out_file)?; let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; @@ -57,11 +48,7 @@ pub fn decode_xben_to_jsonl( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] -pub fn decode_ben_to_jsonl( - in_file: PathBuf, - out_file: PathBuf, - overwrite: bool, -) -> PyResult<()> { +pub fn decode_ben_to_jsonl(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> PyResult<()> { validate_input_output_paths(&in_file, &out_file)?; let reader = open_input(&in_file)?; let writer = open_output(&out_file, overwrite)?; diff --git a/ben-py/src/decode/types.rs b/ben-py/src/decode/types.rs index e826fc3..07c5004 100644 --- a/ben-py/src/decode/types.rs +++ b/ben-py/src/decode/types.rs @@ -49,9 +49,8 @@ impl DecoderMode { /// Cached bundle state for a decoder opened on a `.bendl` file. /// -/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC -/// inspection and asset-read calls without disturbing the iterator (which -/// reads the stream region through a separate file handle). +/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC inspection and asset-read calls +/// without disturbing the iterator (which reads the stream region through a separate file handle). pub(super) struct BundleState { pub reader: BendlReader>, pub stream_offset: u64, @@ -72,10 +71,9 @@ impl DecoderBackend { /// Stored form of the most recently installed subsampling selection. /// -/// The iterator is single-pass, so to support restarting iteration -/// (e.g. `for x in dec: ... ; for x in dec: ...`) the decoder remembers -/// the active selection and rebuilds a fresh frame decoder on every -/// call to `__iter__`. +/// The iterator is single-pass, so to support restarting iteration (e.g. +/// `for x in dec: ... ; for x in dec: ...`) the decoder remembers the active selection and rebuilds +/// a fresh frame decoder on every call to `__iter__`. #[derive(Clone)] pub(super) enum ActiveSelection { None, diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index ab91837..70689ab 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -11,20 +11,17 @@ use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -/// Per-call encoder state. The bundle path threads ownership of the -/// underlying file through `BendlWriter` → `BendlStreamSession` → -/// `BenStreamWriter`, so when `close()` runs we walk the chain back -/// from `BenStreamWriter::finish_into_inner` (returning the session) -/// to `BendlStreamSession::finish_into_writer` (returning the bundle -/// writer) to `BendlWriter::finish` (returning the buffered file). +/// Per-call encoder state. The bundle path threads ownership of the underlying file through +/// `BendlWriter` → `BendlStreamSession` → `BenStreamWriter`, so when `close()` runs we walk the +/// chain back from `BenStreamWriter::finish_into_inner` (returning the session) to +/// `BendlStreamSession::finish_into_writer` (returning the bundle writer) to `BendlWriter::finish` +/// (returning the buffered file). enum EncoderState { - /// Plain `.ben` file path: writes directly to a buffered file with - /// no bundle framing. + /// Plain `.ben` file path: writes directly to a buffered file with no bundle framing. BenOnly(BenStreamWriter>), - /// `.bendl` bundle path: the session owns the buffered file and the - /// `BenStreamWriter` writes through it. `sample_count` is tracked - /// alongside so it can be plumbed into `finish_into_writer` at - /// `close()` time. + /// `.bendl` bundle path: the session owns the buffered file and the `BenStreamWriter` writes + /// through it. `sample_count` is tracked alongside so it can be plumbed into + /// `finish_into_writer` at `close()` time. BundleStreaming { writer: BenStreamWriter>>, sample_count: i64, @@ -51,24 +48,22 @@ impl PyBenEncoder { #[pymethods] impl PyBenEncoder { - /// Open a new encoder. The default output is a `.bendl` bundle with - /// an embedded assignment stream and an optional embedded graph; set - /// `ben_file_only=True` to emit a plain `.ben` file instead. + /// Open a new encoder. The default output is a `.bendl` bundle with an embedded assignment + /// stream and an optional embedded graph; set `ben_file_only=True` to emit a plain `.ben` file + /// instead. /// /// # Arguments /// /// * `file_path` - Output path. Must not exist unless `overwrite=True`. /// * `overwrite` - Replace an existing file at `file_path`. - /// * `variant` - BEN variant for the assignment stream (`"standard"`, - /// `"mkv_chain"`, or `"twodelta"`). - /// * `graph` - Optional graph to embed as the `graph.json` asset when - /// writing a bundle. Accepts a `pathlib.Path` / `str` path, a - /// `bytes` object containing UTF-8 JSON, a Python `dict` / `list` - /// that will be serialized with `json.dumps`, or a file-like object - /// with a `.read()` method. Passing a graph alongside - /// `ben_file_only=True` is an error. - /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no - /// bundle framing. Defaults to `False`. + /// * `variant` - BEN variant for the assignment stream (`"standard"`, `"mkv_chain"`, or + /// `"twodelta"`). + /// * `graph` - Optional graph to embed as the `graph.json` asset when writing a bundle. Accepts + /// a `pathlib.Path` / `str` path, a `bytes` object containing UTF-8 JSON, a Python `dict` / + /// `list` that will be serialized with `json.dumps`, or a file-like object with a `.read()` + /// method. Passing a graph alongside `ben_file_only=True` is an error. + /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no bundle framing. Defaults to + /// `False`. #[new] #[pyo3(signature = ( file_path, @@ -100,13 +95,10 @@ impl PyBenEncoder { let buf = open_output(&file_path, overwrite)?; let state = if ben_file_only { - EncoderState::BenOnly( - BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?, - ) + EncoderState::BenOnly(BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?) } else { - // Bundle path. Add the optional graph asset before opening - // the stream session — the bundle writer auto-compresses - // graphs (default_compresses_by_type), so we hand it raw + // Bundle path. Add the optional graph asset before opening the stream session — the + // bundle writer auto-compresses graphs (default_compresses_by_type), so we hand it raw // JSON bytes and let it apply the XZ flag. let mut writer = BendlWriter::new(buf, AssignmentFormat::Ben).map_err(Self::map_io_err)?; @@ -120,9 +112,7 @@ impl PyBenEncoder { ) .map_err(Self::map_bundle_err)?; } - let session = writer - .into_stream_session() - .map_err(Self::map_bundle_err)?; + let session = writer.into_stream_session().map_err(Self::map_bundle_err)?; let writer = BenStreamWriter::for_ben(session, ben_var).map_err(Self::map_io_err)?; EncoderState::BundleStreaming { writer, @@ -158,8 +148,8 @@ impl PyBenEncoder { Ok(()) } - /// Flush the assignment stream and, for bundle output, patch the - /// header and write the trailing directory. Idempotent. + /// Flush the assignment stream and, for bundle output, patch the header and write the trailing + /// directory. Idempotent. fn close(&mut self) -> PyResult<()> { let Some(state) = self.state.take() else { return Ok(()); diff --git a/ben-py/src/encode/helpers.rs b/ben-py/src/encode/helpers.rs index 17335e3..99bf379 100644 --- a/ben-py/src/encode/helpers.rs +++ b/ben-py/src/encode/helpers.rs @@ -9,8 +9,8 @@ use std::path::PathBuf; /// /// - `dict` / `list`: serialized via `json.dumps`. /// - `bytes` / `bytearray`: used verbatim. -/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): -/// `.read()` is called and the result is coerced to bytes. +/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): `.read()` is called and +/// the result is coerced to bytes. /// - `pathlib.Path` or `str`: treated as a filesystem path to read. pub(super) fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { // Dict / list → json.dumps. @@ -29,8 +29,8 @@ pub(super) fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyRes return Ok(b); } - // File-like: must have .read(). Check before str/path, since a plain - // `str` / `Path` has no `.read()` attribute and will fall through. + // File-like: must have .read(). Check before str/path, since a plain `str` / `Path` has no + // `.read()` attribute and will fall through. if obj.hasattr("read")? { let data = obj.call_method0("read")?; if let Ok(b) = data.downcast::() { diff --git a/ben-py/src/lib.rs b/ben-py/src/lib.rs index 642db94..5ac526f 100755 --- a/ben-py/src/lib.rs +++ b/ben-py/src/lib.rs @@ -11,10 +11,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_function(wrap_pyfunction!(crate::decode::decode_ben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::decode::decode_xben_to_ben, m)?)?; - m.add_function(wrap_pyfunction!( - crate::decode::decode_xben_to_jsonl, - m - )?)?; + m.add_function(wrap_pyfunction!(crate::decode::decode_xben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_ben, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_xben, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_ben_to_xben, m)?)?; diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index d02c625..bcb7462 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -16,8 +16,7 @@ pub(super) enum CliVariant { /// Resolve the BEN variant from the CLI flags. /// -/// `--variant` takes precedence over `--save-all`. -/// If neither is given, defaults to MkvChain. +/// `--variant` takes precedence over `--save-all`. If neither is given, defaults to MkvChain. pub(super) fn resolve_variant(variant: Option, save_all: bool) -> BenVariant { match variant { Some(CliVariant::Standard) => BenVariant::Standard, @@ -61,53 +60,44 @@ pub(super) struct Args { /// Input file to read from. #[arg()] pub input_file: Option, - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. + /// Output file to write to. Optional. If not provided, the output file will be determined based + /// on the input file and the mode of operation. #[arg(short, long)] pub output_file: Option, - /// The standard behaviour is to try and derive the output file - /// name from the input file name. If this flag is set, then this - /// logic is ignored and the output is printed to stdout. - /// This flag is considered a higher priority than - /// the output_file flag, so if both are present, the output - /// will be printed to stdout. + /// The standard behaviour is to try and derive the output file name from the input file name. + /// If this flag is set, then this logic is ignored and the output is printed to stdout. This + /// flag is considered a higher priority than the output_file flag, so if both are present, the + /// output will be printed to stdout. #[arg(short, long)] pub print: bool, /// Sample number to extract. Optional. #[arg(short = 'n', long)] pub sample_number: Option, - /// If input and output files are not provided, - /// then this tells the x-encode, x-decode, and decode modes - /// that the expected formats are BEN and XBEN + /// If input and output files are not provided, then this tells the x-encode, x-decode, and + /// decode modes that the expected formats are BEN and XBEN #[arg(short = 'b', long)] pub ben_and_xben: bool, - /// If input and output files are not provided, - /// then this tells the x-encode and x-decode modes + /// If input and output files are not provided, then this tells the x-encode and x-decode modes /// that the expected formats are JSONL and XBEN #[arg(short = 'J', long)] pub jsonl_and_xben: bool, - /// If the input and output files are not provided, - /// then this tells the decode mode that the expected - /// formats are JSONL and BEN + /// If the input and output files are not provided, then this tells the decode mode that the + /// expected formats are JSONL and BEN #[arg(short = 'j', long)] pub jsonl_and_ben: bool, - /// When saving a file in the BEN format, the deault is to have - /// an assignment vector saved followed by the number of repetitions - /// of that assignment vector (this is useful for Markov chian methods - /// like ReCom). This flag will cause the program to forgo the repetition - /// count and just save all of the assignment vectors as they are encountered. - /// Equivalent to `--variant standard`. Ignored if `--variant` is set. + /// When saving a file in the BEN format, the deault is to have an assignment vector saved + /// followed by the number of repetitions of that assignment vector (this is useful for Markov + /// chian methods like ReCom). This flag will cause the program to forgo the repetition count + /// and just save all of the assignment vectors as they are encountered. Equivalent to + /// `--variant standard`. Ignored if `--variant` is set. #[arg(short = 'a', long)] pub save_all: bool, - /// BEN variant to use when encoding. - /// Possible values: standard, mkvchain, twodelta. - /// Defaults to mkvchain if neither this nor --save-all is given. - /// Takes precedence over --save-all when both are provided. + /// BEN variant to use when encoding. Possible values: standard, mkvchain, twodelta. Defaults to + /// mkvchain if neither this nor --save-all is given. Takes precedence over --save-all when both + /// are provided. #[arg(short = 't', long, value_enum)] pub variant: Option, - /// If the output file already exists, this flag - /// will cause the program to overwrite it without + /// If the output file already exists, this flag will cause the program to overwrite it without /// asking the user for confirmation. #[arg(short = 'w', long)] pub overwrite: bool, @@ -117,37 +107,32 @@ pub(super) struct Args { /// Suppress in-place progress spinners. Trace logging is unaffected. #[arg(short = 'q', long)] pub quiet: bool, - /// Number of threads the XZ encoder may use during x-encode and - /// xz-compress. Defaults to 1 (single-threaded). Pass an explicit - /// value to fan compression out across worker threads; values larger - /// than the host's available parallelism are silently clamped down. - /// `-1` is a sentinel meaning "use every available core" (sklearn - /// convention). See also `--xz-block-size`, which controls how much - /// input each thread gets before it can start compressing. + /// Number of threads the XZ encoder may use during x-encode and xz-compress. Defaults to 1 + /// (single-threaded). Pass an explicit value to fan compression out across worker threads; + /// values larger than the host's available parallelism are silently clamped down. `-1` is a + /// sentinel meaning "use every available core" (sklearn convention). See also + /// `--xz-block-size`, which controls how much input each thread gets before it can start + /// compressing. #[arg(short = 'c', long, allow_hyphen_values = true)] pub n_cpus: Option, - /// When running x-encoder, this flag will deterimine the level of compression to use. - /// By default, the highest level of compression will be used. - /// Valid values are 0-9, where 0 is no compression and 9 is the highest level of compression. + /// When running x-encoder, this flag will deterimine the level of compression to use. By + /// default, the highest level of compression will be used. Valid values are 0-9, where 0 is no + /// compression and 9 is the highest level of compression. #[arg(short = 'l', long)] pub compression_level: Option, - /// Number of TwoDelta delta frames per columnar chunk in XBEN encoding. - /// Only affects TwoDelta variant. Larger chunks improve XZ compression. - /// Default is 10,000. + /// Number of TwoDelta delta frames per columnar chunk in XBEN encoding. Only affects TwoDelta + /// variant. Larger chunks improve XZ compression. Default is 10,000. #[arg(long)] pub chunk_size: Option, - /// Per-block size in bytes for the multithreaded XZ encoder. - /// liblzma needs a non-zero block size to actually fan compression - /// out across worker threads; smaller blocks scale parallelism better - /// at a slight compression-ratio cost. Defaults to 16 MiB when - /// `--n-cpus > 1`, or 0 (liblzma auto, ~192 MiB at preset 9) for - /// single-thread runs. + /// Per-block size in bytes for the multithreaded XZ encoder. liblzma needs a non-zero block + /// size to actually fan compression out across worker threads; smaller blocks scale parallelism + /// better at a slight compression-ratio cost. Defaults to 16 MiB when `--n-cpus > 1`, or 0 + /// (liblzma auto, ~192 MiB at preset 9) for single-thread runs. #[arg(long)] pub xz_block_size: Option, - /// Embed a graph JSON asset alongside the assignment stream and emit - /// the result as a `.bendl` bundle. The graph is added after the - /// assignment stream has been fully written. Only applies to the - /// `encode` and `x-encode` modes. + /// Embed a graph JSON asset alongside the assignment stream and emit the result as a `.bendl` + /// bundle. The graph is added after the assignment stream has been fully written. Only applies + /// to the `encode` and `x-encode` modes. #[arg(long)] pub graph: Option, } diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 4f3f825..47eb6da 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -10,9 +10,9 @@ use std::fs::{File, OpenOptions}; use std::io::{self, BufReader, Result}; use std::path::Path; -/// After a finalized `.bendl` has been written, reopen it in append mode -/// and attach the graph asset in-place. This runs *after* the stream has -/// finished, which is why we print "Adding graph..." at this point. +/// After a finalized `.bendl` has been written, reopen it in append mode and attach the graph asset +/// in-place. This runs *after* the stream has finished, which is why we print "Adding graph..." at +/// this point. pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { eprintln!("Adding graph..."); let graph_bytes = std::fs::read(graph_path).map_err(|e| { @@ -44,16 +44,16 @@ pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<() Ok(()) } -/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at -/// `out_path` and then append the graph as a post-stream asset. +/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at `out_path` and then append +/// the graph as a post-stream asset. pub(super) fn run_encode_bundle_with_graph( input_path: &Path, out_path: &str, variant: BenVariant, graph_path: &Path, ) -> Result<()> { - // Validate the graph file is readable before we do any real work, - // so a bad --graph path doesn't leave a half-written bundle behind. + // Validate the graph file is readable before we do any real work, so a bad --graph path doesn't + // leave a half-written bundle behind. std::fs::metadata(graph_path).map_err(|e| { io::Error::new( io::ErrorKind::Other, @@ -81,8 +81,8 @@ pub(super) fn run_encode_bundle_with_graph( append_graph_asset(out_path, graph_path) } -/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` -/// bundle at `out_path` and then append the graph as a post-stream asset. +/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` bundle at `out_path` and +/// then append the graph as a post-stream asset. #[allow(clippy::too_many_arguments)] pub(super) fn run_xencode_bundle_with_graph( input_path: &Path, diff --git a/ben/src/cli/ben/modes/encode.rs b/ben/src/cli/ben/modes/encode.rs index 48f9c78..4687f43 100644 --- a/ben/src/cli/ben/modes/encode.rs +++ b/ben/src/cli/ben/modes/encode.rs @@ -12,8 +12,7 @@ use std::path::Path; pub(in crate::cli::ben) fn run(args: Args) -> CliResult { tracing::trace!("Running in encode mode"); - // --graph path: produce a .bendl bundle with the BEN stream - // plus a post-stream graph asset. + // --graph path: produce a .bendl bundle with the BEN stream plus a post-stream graph asset. if let Some(graph_path) = args.graph.as_ref() { let in_file = args.input_file.as_ref().ok_or_else(|| { CliError::other("--graph requires an input file (stdin not supported).") diff --git a/ben/src/cli/ben/modes/mod.rs b/ben/src/cli/ben/modes/mod.rs index adb90dc..bbdb363 100644 --- a/ben/src/cli/ben/modes/mod.rs +++ b/ben/src/cli/ben/modes/mod.rs @@ -1,8 +1,8 @@ //! Per-mode handlers for the `ben` CLI. //! -//! The dispatcher in `super::run` matches on the parsed `Mode` enum and -//! forwards to one of these handlers. Splitting one handler per file keeps -//! each mode under ~40 lines and makes them individually testable. +//! The dispatcher in `super::run` matches on the parsed `Mode` enum and forwards to one of these +//! handlers. Splitting one handler per file keeps each mode under ~40 lines and makes them +//! individually testable. pub(super) mod decode; pub(super) mod encode; diff --git a/ben/src/cli/ben/modes/xdecode.rs b/ben/src/cli/ben/modes/xdecode.rs index 436f298..2a3dabe 100644 --- a/ben/src/cli/ben/modes/xdecode.rs +++ b/ben/src/cli/ben/modes/xdecode.rs @@ -13,8 +13,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { let reader = open_reader(args.input_file.as_deref()); let writer = match args.input_file.as_ref() { Some(file) if !args.print => { - let path = - decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite)?; + let path = decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite)?; open_derived_writer(path) } _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs index e99fa5f..ab40173 100644 --- a/ben/src/cli/ben/modes/xencode.rs +++ b/ben/src/cli/ben/modes/xencode.rs @@ -23,8 +23,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { } } - // --graph path: produce a .bendl bundle with the XBEN stream - // plus a post-stream graph asset. + // --graph path: produce a .bendl bundle with the XBEN stream plus a post-stream graph asset. if let Some(graph_path) = args.graph.as_ref() { let in_file = args.input_file.as_ref().ok_or_else(|| { CliError::other("--graph requires an input file (stdin not supported).") diff --git a/ben/src/cli/ben/paths.rs b/ben/src/cli/ben/paths.rs index 85a2ff7..58249c2 100644 --- a/ben/src/cli/ben/paths.rs +++ b/ben/src/cli/ben/paths.rs @@ -15,9 +15,8 @@ pub(super) type DynWriter = Box; /// * `input_file_name` - The input file path supplied by the user. /// * `output_file_name` - An optional explicit output path. /// * `overwrite` - Whether to skip overwrite prompting. -/// * `with_graph` - When true, the output is a `.bendl` bundle instead -/// of a bare `.ben`/`.xben` stream, so the derived extension is -/// `.bendl` regardless of `mode`. +/// * `with_graph` - When true, the output is a `.bendl` bundle instead of a bare `.ben`/`.xben` +/// stream, so the derived extension is `.bendl` regardless of `mode`. /// /// # Returns /// @@ -65,8 +64,7 @@ pub(super) fn encode_setup( /// /// * `in_file_name` - The input file path supplied by the user. /// * `out_file_name` - An optional explicit output path. -/// * `full_decode` - Whether the decode should go all the way to JSONL instead -/// of stopping at BEN. +/// * `full_decode` - Whether the decode should go all the way to JSONL instead of stopping at BEN. /// * `overwrite` - Whether to skip overwrite prompting. /// /// # Returns @@ -165,9 +163,8 @@ pub(super) fn open_derived_writer(path: String) -> DynWriter { Box::new(BufWriter::new(File::create(path).unwrap())) } -/// Count the number of non-empty lines in a JSONL file. Used to populate -/// the bundle header's `sample_count` when wrapping a stream encode in a -/// `.bendl` container. +/// Count the number of non-empty lines in a JSONL file. Used to populate the bundle header's +/// `sample_count` when wrapping a stream encode in a `.bendl` container. pub(super) fn count_jsonl_lines(path: &Path) -> io::Result { let file = File::open(path)?; let reader = BufReader::new(file); diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index e322bd2..03bfe8e 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -1,5 +1,5 @@ -use super::args::{Args, CliVariant, Mode}; use super::args::resolve_variant; +use super::args::{Args, CliVariant, Mode}; use super::bundle::{ append_graph_asset, run_encode_bundle_with_graph, run_xencode_bundle_with_graph, }; diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs index 3d4aa85..c10acdf 100644 --- a/ben/src/cli/bendl/args.rs +++ b/ben/src/cli/bendl/args.rs @@ -58,15 +58,15 @@ pub(super) enum Command { #[derive(Parser, Debug)] pub(super) struct CreateArgs { - /// Path to the `.ben` or `.xben` assignment stream to embed. - /// File extension chooses the container format. + /// Path to the `.ben` or `.xben` assignment stream to embed. File extension chooses the + /// container format. #[arg(short = 'i', long)] pub input: PathBuf, /// Destination `.bendl` path. #[arg(short = 'o', long)] pub output: PathBuf, - /// Optional `graph.json` asset path. Will be stored under the - /// standardized name `graph.json` and xz-compressed by default. + /// Optional `graph.json` asset path. Will be stored under the standardized name `graph.json` + /// and xz-compressed by default. #[arg(long)] pub graph: Option, /// Optional `metadata.json` asset path. Stored under standardized name. @@ -99,12 +99,11 @@ pub(super) struct ExtractArgs { /// Output file path for the extracted bytes. #[arg(short = 'o', long)] pub output: PathBuf, - /// Extract the embedded assignment stream region verbatim. Mutually - /// exclusive with `--asset`. + /// Extract the embedded assignment stream region verbatim. Mutually exclusive with `--asset`. #[arg(long, conflicts_with = "asset")] pub stream: bool, - /// Name of the asset to extract (e.g. `graph.json`). If the asset is - /// xz-compressed, the extracted file contains the decompressed bytes. + /// Name of the asset to extract (e.g. `graph.json`). If the asset is xz-compressed, the + /// extracted file contains the decompressed bytes. #[arg(long)] pub asset: Option, /// Overwrite the output file if it already exists. diff --git a/ben/src/cli/bendl/create.rs b/ben/src/cli/bendl/create.rs index e2c5447..075775d 100644 --- a/ben/src/cli/bendl/create.rs +++ b/ben/src/cli/bendl/create.rs @@ -16,8 +16,8 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { ) .map_err(|e| format!("{e}"))?; - // Count samples up front so we can patch the header at finalize time. - // This pre-scan is O(stream size); the second pass streams bytes directly. + // Count samples up front so we can patch the header at finalize time. This pre-scan is + // O(stream size); the second pass streams bytes directly. let sample_count: i64 = count_samples_from_file(&args.input, BenWireFormat::from(format)) .map_err(|e| format!("failed to count samples in {:?}: {e}", args.input))? as i64; @@ -56,8 +56,8 @@ pub(super) fn run_create(args: CreateArgs) -> Result<(), String> { add_custom_file_asset(&mut writer, name, path, AddAssetOptions::defaults())?; } - // Stream phase: copy bytes from the input file directly into the - // bundle's stream region. This preserves the exact BEN/XBEN bytes. + // Stream phase: copy bytes from the input file directly into the bundle's stream region. This + // preserves the exact BEN/XBEN bytes. let mut session = writer .into_stream_session() .map_err(|e| format!("failed to open stream region: {e}"))?; diff --git a/ben/src/cli/bendl/mod.rs b/ben/src/cli/bendl/mod.rs index f75a08e..62bbdb5 100644 --- a/ben/src/cli/bendl/mod.rs +++ b/ben/src/cli/bendl/mod.rs @@ -2,13 +2,11 @@ //! //! Exposes four subcommands: //! -//! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional -//! asset files into a finalized `.bendl` bundle. +//! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional asset files into a +//! finalized `.bendl` bundle. //! - `inspect` — print the header and directory of a `.bendl` file. -//! - `extract` — copy the embedded stream region or a named asset out -//! of a bundle to disk. -//! - `append` — add new asset files to an already-finalized bundle -//! without rewriting the stream. +//! - `extract` — copy the embedded stream region or a named asset out of a bundle to disk. +//! - `append` — add new asset files to an already-finalized bundle without rewriting the stream. mod append; mod args; diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index b6e4fa4..f9884fb 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -54,7 +54,10 @@ fn run_create_with_relabel_map_and_custom_asset() { // Must end in .ben so format_from_path recognises it. let p = std::env::temp_dir().join(format!( "bendl-create-relabel-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() )); let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; let mut b = Vec::new(); @@ -82,7 +85,9 @@ fn run_create_with_relabel_map_and_custom_asset() { run_create(args).unwrap(); let reader = BendlReader::open(BufReader::new(std::fs::File::open(&out).unwrap())).unwrap(); - assert!(reader.find_asset_by_name("node_permutation_map.json").is_some()); + assert!(reader + .find_asset_by_name("node_permutation_map.json") + .is_some()); assert!(reader.find_asset_by_name("myblob").is_some()); for p in [&ben, &relabel, &custom, &out] { @@ -95,9 +100,8 @@ fn run_inspect_xben_format_and_checksum_flag() { use crate::io::bundle::format::ASSET_TYPE_CUSTOM; use crate::io::bundle::AddAssetOptions; - // Every library-written asset carries ASSET_FLAG_CHECKSUM, so any - // add_asset call exercises the checksum flag_parts branch in - // `run_inspect`. + // Every library-written asset carries ASSET_FLAG_CHECKSUM, so any add_asset call exercises the + // checksum flag_parts branch in `run_inspect`. let mut buf: Vec = Vec::new(); let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Xben).unwrap(); writer @@ -135,8 +139,7 @@ fn run_append_no_assets_is_noop() { }; run_append(args).unwrap(); // File should be unchanged (bundle is still valid). - let reader = - BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); assert!(reader.is_finalized()); let _ = std::fs::remove_file(&bendl); } @@ -159,10 +162,11 @@ fn run_append_with_metadata_and_relabel_map() { }; run_append(args).unwrap(); - let reader = - BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); assert!(reader.find_asset_by_name("metadata.json").is_some()); - assert!(reader.find_asset_by_name("node_permutation_map.json").is_some()); + assert!(reader + .find_asset_by_name("node_permutation_map.json") + .is_some()); for p in [&bendl, &meta, &relabel] { let _ = std::fs::remove_file(p); @@ -174,7 +178,10 @@ fn run_create_with_graph_raw_flag() { let ben = { let p = std::env::temp_dir().join(format!( "bendl-create-raw-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() )); let jsonl = b"{\"assignment\":[1,2],\"sample\":1}\n"; let mut b = Vec::new(); @@ -212,15 +219,15 @@ fn run_inspect_unknown_format_and_no_sample_count() { BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, HEADER_SIZE, }; - // Build a header with an unknown assignment format byte and - // finalized=0 so sample_count() returns None. + // Build a header with an unknown assignment format byte and finalized=0 so sample_count() + // returns None. let mut header = [0u8; HEADER_SIZE]; header[0..8].copy_from_slice(&BENDL_MAGIC); header[8..10].copy_from_slice(&BENDL_MAJOR_VERSION.to_le_bytes()); header[10..12].copy_from_slice(&BENDL_MINOR_VERSION.to_le_bytes()); header[12] = FINALIZED_NO; header[13] = 0xFF; // unknown format byte - // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 + // stream_offset = HEADER_SIZE, stream_len = 0, sample_count = -1 let stream_offset = HEADER_SIZE as u64; header[40..48].copy_from_slice(&stream_offset.to_le_bytes()); let sample_count: i64 = -1; @@ -251,8 +258,7 @@ fn run_append_with_graph_raw_and_graph_asset() { }; run_append(args).unwrap(); - let reader = - BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); + let reader = BendlReader::open(BufReader::new(std::fs::File::open(&bendl).unwrap())).unwrap(); assert!(reader.find_asset_by_name("graph.json").is_some()); for p in [&bendl, &graph] { @@ -262,13 +268,8 @@ fn run_append_with_graph_raw_and_graph_asset() { #[test] fn run_extract_rejects_missing_stream_and_asset() { - let args = ExtractArgs::try_parse_from([ - "extract", - "--output", - "/tmp/out.bin", - "bundle.bendl", - ]) - .unwrap(); + let args = ExtractArgs::try_parse_from(["extract", "--output", "/tmp/out.bin", "bundle.bendl"]) + .unwrap(); let err = run_extract(args).unwrap_err(); assert!(err.contains("either --stream or --asset")); } @@ -278,7 +279,10 @@ fn run_create_errors_on_missing_metadata_file() { let ben = { let p = std::env::temp_dir().join(format!( "bendl-err-meta-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() )); let jsonl = b"{\"assignment\":[1],\"sample\":1}\n"; let mut b = Vec::new(); @@ -308,7 +312,10 @@ fn run_create_errors_on_missing_relabel_map_file() { let ben = { let p = std::env::temp_dir().join(format!( "bendl-err-relabel-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() )); let mut b = Vec::new(); encode_jsonl_to_ben( @@ -342,7 +349,10 @@ fn run_create_errors_on_missing_custom_asset_file() { let ben = { let p = std::env::temp_dir().join(format!( "bendl-err-custom-{}.ben", - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() )); let mut b = Vec::new(); encode_jsonl_to_ben( diff --git a/ben/src/cli/common/error.rs b/ben/src/cli/common/error.rs index 3942a57..eb6cf14 100644 --- a/ben/src/cli/common/error.rs +++ b/ben/src/cli/common/error.rs @@ -1,9 +1,9 @@ //! Error type used by the top-level `run()` functions of every CLI binary. //! -//! The shape is intentionally narrow: a few specific variants for cases where -//! a caller (or test) might want to match the error type, plus an `Other` -//! catch-all that preserves the older `Result<(), String>` ergonomic so the -//! existing per-command runners still propagate cleanly via `?`. +//! The shape is intentionally narrow: a few specific variants for cases where a caller (or test) +//! might want to match the error type, plus an `Other` catch-all that preserves the older +//! `Result<(), String>` ergonomic so the existing per-command runners still propagate cleanly via +//! `?`. use std::fmt; use std::io; @@ -16,8 +16,8 @@ pub enum CliError { Io(io::Error), /// The output path already existed and the user declined to overwrite. OverwriteRefused(PathBuf), - /// A free-form error message. Used as a catch-all so existing - /// `Result<(), String>` runners still flow through unchanged. + /// A free-form error message. Used as a catch-all so existing `Result<(), String>` runners + /// still flow through unchanged. Other(String), } diff --git a/ben/src/cli/common/mod.rs b/ben/src/cli/common/mod.rs index 6f07b0e..2969f8d 100644 --- a/ben/src/cli/common/mod.rs +++ b/ben/src/cli/common/mod.rs @@ -9,9 +9,8 @@ static QUIET: AtomicBool = AtomicBool::new(false); /// Configure tracing for CLI execution. /// -/// When `verbose` is set and the user has not already provided `RUST_LOG`, the -/// default log filter is elevated to `trace`. The tracing subscriber is then -/// initialized exactly once for the process. +/// When `verbose` is set and the user has not already provided `RUST_LOG`, the default log filter +/// is elevated to `trace`. The tracing subscriber is then initialized exactly once for the process. /// /// # Arguments /// @@ -29,8 +28,8 @@ pub fn set_verbose(verbose: bool) { /// Suppress in-place progress spinners for this process. /// -/// Independent of [`set_verbose`]: trace logging is gated by `RUST_LOG`, -/// while spinners are gated by this flag plus stderr TTY detection. +/// Independent of [`set_verbose`]: trace logging is gated by `RUST_LOG`, while spinners are gated +/// by this flag plus stderr TTY detection. /// /// # Arguments /// @@ -52,25 +51,24 @@ pub fn is_quiet() -> bool { QUIET.load(Ordering::Relaxed) } -/// Decide whether overwriting an output path should proceed, given the -/// state observed by the caller. +/// Decide whether overwriting an output path should proceed, given the state observed by the +/// caller. /// -/// This is the pure half of [`check_overwrite`]: it does no I/O, so it can -/// be unit-tested by enumerating the four reachable states (file missing / -/// `overwrite` flag set / user said yes / user said anything else). +/// This is the pure half of [`check_overwrite`]: it does no I/O, so it can be unit-tested by +/// enumerating the four reachable states (file missing / `overwrite` flag set / user said yes / +/// user said anything else). /// /// # Arguments /// /// * `file_exists` - Whether the candidate output path already exists. /// * `overwrite` - Whether the caller passed `--overwrite` to skip prompting. -/// * `response` - The line the user typed in response to the overwrite -/// prompt, or `None` if no prompt was issued. +/// * `response` - The line the user typed in response to the overwrite prompt, or `None` if no +/// prompt was issued. /// /// # Returns /// -/// Returns `true` when the caller may safely overwrite; `false` when the -/// user (or the absence of a yes-response) indicates the operation should -/// be aborted. +/// Returns `true` when the caller may safely overwrite; `false` when the user (or the absence of a +/// yes-response) indicates the operation should be aborted. pub(crate) fn check_overwrite_pure( file_exists: bool, overwrite: bool, @@ -87,9 +85,8 @@ pub(crate) fn check_overwrite_pure( /// Confirm whether an existing output path may be overwritten. /// -/// If `overwrite` is `false` and the destination already exists, the user is -/// prompted on stdin. An `AlreadyExists` error is returned when the user -/// declines. +/// If `overwrite` is `false` and the destination already exists, the user is prompted on stdin. An +/// `AlreadyExists` error is returned when the user declines. /// /// # Arguments /// diff --git a/ben/src/cli/pcben/args.rs b/ben/src/cli/pcben/args.rs index 13fb568..7c11fed 100644 --- a/ben/src/cli/pcben/args.rs +++ b/ben/src/cli/pcben/args.rs @@ -27,13 +27,11 @@ pub(super) struct Args { /// Input file to read from. #[arg(short, long)] pub(super) input_file: Option, - /// Output file to write to. Optional. - /// If not provided, the output file will be determined - /// based on the input file and the mode of operation. + /// Output file to write to. Optional. If not provided, the output file will be determined based + /// on the input file and the mode of operation. #[arg(short, long)] pub(super) output_file: Option, - /// If the output file already exists, this flag - /// will cause the program to overwrite it without + /// If the output file already exists, this flag will cause the program to overwrite it without /// asking the user for confirmation. #[arg(short = 'w', long)] pub(super) overwrite: bool, diff --git a/ben/src/cli/pcben/modes/mod.rs b/ben/src/cli/pcben/modes/mod.rs index d5a78dc..76cb286 100644 --- a/ben/src/cli/pcben/modes/mod.rs +++ b/ben/src/cli/pcben/modes/mod.rs @@ -1,8 +1,8 @@ //! Per-mode handlers for the `pcben` CLI. //! -//! The dispatcher in `super::run` matches on the parsed `Mode` enum and -//! forwards to one of these handlers. Splitting one handler per file keeps -//! each mode under ~40 lines and makes them individually testable. +//! The dispatcher in `super::run` matches on the parsed `Mode` enum and forwards to one of these +//! handlers. Splitting one handler per file keeps each mode under ~40 lines and makes them +//! individually testable. pub(super) mod ben_to_pc; pub(super) mod pc_to_ben; diff --git a/ben/src/cli/pcben/tests.rs b/ben/src/cli/pcben/tests.rs index 800513e..7edc867 100644 --- a/ben/src/cli/pcben/tests.rs +++ b/ben/src/cli/pcben/tests.rs @@ -1,8 +1,6 @@ use super::args::{Args, Mode}; use super::paths::{derive_output_path, resolved_output_path}; -use super::translate::{ - assignment_decode_ben, assignment_encode_ben, assignment_encode_xben, -}; +use super::translate::{assignment_decode_ben, assignment_encode_ben, assignment_encode_xben}; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; use crate::codec::encode::encode_jsonl_to_ben; use crate::BenVariant; @@ -118,11 +116,10 @@ fn assignment_encode_xben_offsets_values_and_writes_xben() { #[test] fn assignment_decode_ben_iterator_error_propagates() { - // Provides a valid BEN banner so BenStreamReader::from_ben succeeds, - // then returns a non-EOF error on the next read so the iterator - // fires the Err(e) => return Err(e) arm (line 204). - use std::io::Read; + // Provides a valid BEN banner so BenStreamReader::from_ben succeeds, then returns a non-EOF + // error on the next read so the iterator fires the Err(e) => return Err(e) arm (line 204). use crate::format::banners::STANDARD_BEN_BANNER; + use std::io::Read; struct BannerThenError { banner: &'static [u8], @@ -141,7 +138,10 @@ fn assignment_decode_ben_iterator_error_propagates() { } } - let reader = BannerThenError { banner: STANDARD_BEN_BANNER, pos: 0 }; + let reader = BannerThenError { + banner: STANDARD_BEN_BANNER, + pos: 0, + }; let mut out = Vec::new(); let err = assignment_decode_ben(reader, &mut out).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); diff --git a/ben/src/cli/pcben/translate.rs b/ben/src/cli/pcben/translate.rs index 27db926..b86ff1c 100644 --- a/ben/src/cli/pcben/translate.rs +++ b/ben/src/cli/pcben/translate.rs @@ -1,8 +1,7 @@ //! BEN ↔ PCOMPRESS assignment translation helpers. //! -//! PCOMPRESS uses zero-based district ids; BEN uses one-based. These -//! helpers bridge the two conventions so the per-mode handlers can be -//! kept short. +//! PCOMPRESS uses zero-based district ids; BEN uses one-based. These helpers bridge the two +//! conventions so the per-mode handlers can be kept short. use crate::io::reader::BenStreamReader; use crate::io::writer::BenStreamWriter; diff --git a/ben/src/cli/reben/args.rs b/ben/src/cli/reben/args.rs index 56a13fe..8c8021b 100644 --- a/ben/src/cli/reben/args.rs +++ b/ben/src/cli/reben/args.rs @@ -53,19 +53,17 @@ pub(super) struct Args { /// Topology-based ordering method to use instead of a key sort. #[arg(long, value_enum)] pub ordering: Option, - /// Shape file to use for sorting the BEN file. Only needed - /// in BEN mode when a map is not provided. + /// Shape file to use for sorting the BEN file. Only needed in BEN mode when a map is not + /// provided. #[arg(short, long)] pub shape_file: Option, /// Map file to use for relabeling the BEN file. #[arg(short = 'p', long)] pub map_file: Option, - /// Mode to run the program in (either JSON or BEN). - /// The JSON mode will sort a JSON file by a given key or graph-ordering - /// method. The BEN mode will relabel a BEN file according to a map file - /// or a graph-ordering request (which also requires a dual-graph file). If no - /// map file or key is provided, the BEN mode will canonicalize - /// the assignment vectors in the BEN file. + /// Mode to run the program in (either JSON or BEN). The JSON mode will sort a JSON file by a + /// given key or graph-ordering method. The BEN mode will relabel a BEN file according to a map + /// file or a graph-ordering request (which also requires a dual-graph file). If no map file or + /// key is provided, the BEN mode will canonicalize the assignment vectors in the BEN file. #[arg(short, long)] pub mode: Mode, /// Only relabel the first `n` expanded samples in BEN mode. diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 407b165..1953f8d 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -106,7 +106,14 @@ fn parse_ben_mode_output_variant_args() { #[test] fn run_json_mode_rejects_n_items() { let args = Args::try_parse_from([ - "reben", "x.json", "--mode", "json", "--key", "k", "--n-items", "5", + "reben", + "x.json", + "--mode", + "json", + "--key", + "k", + "--n-items", + "5", ]) .unwrap(); let err = run_json_mode(args).unwrap_err(); @@ -115,14 +122,7 @@ fn run_json_mode_rejects_n_items() { #[test] fn run_ben_mode_rejects_convert_only_without_variant() { - let args = Args::try_parse_from([ - "reben", - "x.ben", - "--mode", - "ben", - "--convert-only", - ]) - .unwrap(); + let args = Args::try_parse_from(["reben", "x.ben", "--mode", "ben", "--convert-only"]).unwrap(); let err = run_ben_mode(args).unwrap_err(); assert!(err.contains("--output-variant")); } @@ -162,8 +162,7 @@ fn to_ben_variant_covers_standard() { #[test] fn relabeling_label_errors_on_both_key_and_ordering() { - let err = - relabeling_label(Some("k"), Some(&OrderingMethod::MultiLevelCluster)).unwrap_err(); + let err = relabeling_label(Some("k"), Some(&OrderingMethod::MultiLevelCluster)).unwrap_err(); assert!(err.contains("not both")); } @@ -213,17 +212,9 @@ fn run_json_mode_with_ordering_derives_output_name() { .unwrap(); let result = run_json_mode(args); // Clean up derived output file. - let derived = shape - .to_str() - .unwrap() - .trim_end_matches(".json") - .to_owned() + let derived = shape.to_str().unwrap().trim_end_matches(".json").to_owned() + "_sorted_by_reverse-cuthill-mckee_map.json"; - let derived2 = shape - .to_str() - .unwrap() - .trim_end_matches(".json") - .to_owned() + let derived2 = shape.to_str().unwrap().trim_end_matches(".json").to_owned() + "_sorted_by_reverse-cuthill-mckee.jsonl.ben"; let _ = fs::remove_file(&derived); let _ = fs::remove_file(&derived2); @@ -326,8 +317,8 @@ fn run_ben_mode_with_output_variant_and_n_items() { #[test] fn run_ben_mode_with_shape_file_and_ordering() { - // Covers the shape_file + ordering path. - // Creates a map from the shape file ordering, then relabels the BEN. + // Covers the shape_file + ordering path. Creates a map from the shape file ordering, then + // relabels the BEN. let input = write_temp_ben("shape_order_input.jsonl.ben"); let shape = unique_path("shape_order_shape.json"); fs::write( @@ -351,17 +342,9 @@ fn run_ben_mode_with_shape_file_and_ordering() { .unwrap(); let result = run_ben_mode(args); // Clean up the map file the function derives automatically. - let map = shape - .to_str() - .unwrap() - .trim_end_matches(".json") - .to_owned() + let map = shape.to_str().unwrap().trim_end_matches(".json").to_owned() + "_sorted_by_reverse-cuthill-mckee_map.json"; - let sorted_json = shape - .to_str() - .unwrap() - .trim_end_matches(".json") - .to_owned() + let sorted_json = shape.to_str().unwrap().trim_end_matches(".json").to_owned() + "_sorted_by_reverse-cuthill-mckee.json"; let _ = fs::remove_file(&map); let _ = fs::remove_file(&sorted_json); @@ -493,28 +476,30 @@ fn read_node_permutation_map_file_rejects_non_integer_index() { let _ = fs::remove_file(&map_path); } -/// Pin today's behavior when a JSON map has two old indices targeting the -/// same new index: `HashMap::insert` overwrites the prior `(new, old)` entry, -/// shrinking the inverted map. The remaining slots no longer cover -/// `0..=max_key` contiguously, so the relabel driver returns -/// `NonContiguousMap` from `dense_permutation`. This is reachable from valid -/// JSON because `serde_json` retains the last value when the input has -/// duplicate JSON keys, and even with unique keys two distinct old indices -/// can target the same new index. +/// Pin today's behavior when a JSON map has two old indices targeting the same new index: +/// `HashMap::insert` overwrites the prior `(new, old)` entry, shrinking the inverted map. The +/// remaining slots no longer cover `0..=max_key` contiguously, so the relabel driver returns +/// `NonContiguousMap` from `dense_permutation`. This is reachable from valid JSON because +/// `serde_json` retains the last value when the input has duplicate JSON keys, and even with unique +/// keys two distinct old indices can target the same new index. #[test] fn read_node_permutation_map_file_duplicate_new_index_creates_gap() { use crate::ops::relabel::{relabel_ben_file, RelabelOptions}; let map_path = unique_path("dup_new_index_map.json"); - // old→new: {0→1, 1→1, 2→2}. Inverted: {1: 1 (overwrites 0), 2: 2}. - // Slot 0 is missing in the inverted map, so dense_permutation rejects. + // old→new: {0→1, 1→1, 2→2}. Inverted: {1: 1 (overwrites 0), 2: 2}. Slot 0 is missing in the + // inverted map, so dense_permutation rejects. fs::write( &map_path, b"{\"node_permutation_old_to_new\":{\"0\":1,\"1\":1,\"2\":2}}", ) .unwrap(); let (map, _label) = read_node_permutation_map_file(map_path.to_str().unwrap()).unwrap(); - assert_eq!(map.len(), 2, "duplicate new index must overwrite, shrinking the map"); + assert_eq!( + map.len(), + 2, + "duplicate new index must overwrite, shrinking the map" + ); // Build a tiny BEN file to drive the relabel through dense_permutation. let mut ben = Vec::new(); @@ -553,13 +538,7 @@ fn read_node_permutation_map_file_rejects_non_integer_value() { #[test] fn run_ben_mode_canonicalize_derives_output_name() { let input = write_temp_ben("canon.jsonl.ben"); - let args = Args::try_parse_from([ - "reben", - input.to_str().unwrap(), - "--mode", - "ben", - ]) - .unwrap(); + let args = Args::try_parse_from(["reben", input.to_str().unwrap(), "--mode", "ben"]).unwrap(); let result = run_ben_mode(args); let derived = input .to_str() @@ -585,12 +564,7 @@ fn run_ben_mode_with_output_variant_derives_name() { ]) .unwrap(); let result = run_ben_mode(args); - let derived = input - .to_str() - .unwrap() - .trim_end_matches(".ben") - .to_owned() - + "_standard.ben"; + let derived = input.to_str().unwrap().trim_end_matches(".ben").to_owned() + "_standard.ben"; let _ = fs::remove_file(&derived); fs::remove_file(&input).unwrap(); result.unwrap(); diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 63b5d25..b6116c3 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -2,21 +2,19 @@ use std::io::{self, Read}; /// Decode a single BEN frame payload into run-length encoded assignments. /// -/// This function expects only the packed payload bytes for one BEN frame, not -/// the leading per-frame BEN header. +/// This function expects only the packed payload bytes for one BEN frame, not the leading per-frame +/// BEN header. /// /// # Arguments /// -/// * `reader` - A reader positioned at the packed payload bytes for a single -/// BEN frame. +/// * `reader` - A reader positioned at the packed payload bytes for a single BEN frame. /// * `max_val_bits` - The number of bits used to encode each label value. /// * `max_len_bits` - The number of bits used to encode each run length. /// * `n_bytes` - The number of payload bytes to read from `reader`. /// /// # Returns /// -/// Returns the decoded run-length encoded assignment vector as `(value, count)` -/// pairs. +/// Returns the decoded run-length encoded assignment vector as `(value, count)` pairs. pub fn decode_ben_line( mut reader: R, max_val_bits: u8, @@ -74,9 +72,9 @@ pub fn decode_ben_line( len_set = false; } - // The while condition guarantees enough bits for a complete (val, len) pair. - // len_set is always false on entry (reset by the outer for body above), - // so we extract len unconditionally. + // The while condition guarantees enough bits for a complete (val, len) pair. len_set is + // always false on entry (reset by the outer for body above), so we extract len + // unconditionally. while n_bits_in_buff >= max_val_bits as u16 + max_len_bits as u16 { if !val_set { val = (buffer >> (32 - max_val_bits)) as u16; @@ -105,19 +103,17 @@ mod tests { #[test] fn decode_ben_line_skips_zero_length_run() { - // max_val_bits=1, max_len_bits=1, 1 byte payload = 0x80. - // Bit layout: [val=1][len=0] → run with len=0 is not pushed. + // max_val_bits=1, max_len_bits=1, 1 byte payload = 0x80. Bit layout: [val=1][len=0] → run + // with len=0 is not pushed. let result = decode_ben_line(Cursor::new(&[0x80u8]), 1, 1, 1).unwrap(); assert!(result.is_empty()); } #[test] fn decode_ben_line_partial_bits_skip_val_len_check() { - // max_val_bits=8, max_len_bits=8 → each run requires 2 bytes. - // After byte 1: val_set=true, len_set=false → `if val_set && len_set` - // is false (the `}` closing that block is the false-path counter in - // LLVM coverage). - // After byte 2: both set → run (1, 3) is pushed. + // max_val_bits=8, max_len_bits=8 → each run requires 2 bytes. After byte 1: val_set=true, + // len_set=false → `if val_set && len_set` is false (the `}` closing that block is the + // false-path counter in LLVM coverage). After byte 2: both set → run (1, 3) is pushed. let result = decode_ben_line(Cursor::new(&[0x01u8, 0x03u8]), 8, 8, 2).unwrap(); assert_eq!(result, vec![(1u16, 3u16)]); } diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs index e630300..86f74c5 100644 --- a/ben/src/codec/decode/ben32.rs +++ b/ben/src/codec/decode/ben32.rs @@ -5,8 +5,8 @@ use std::io::{self, BufRead, Write}; /// Decode a single ben32 frame into an assignment vector and repetition count. /// -/// This helper is crate-private because ben32 is an implementation detail of -/// XBEN, but it underpins both the stream decoders and the translation logic. +/// This helper is crate-private because ben32 is an implementation detail of XBEN, but it underpins +/// both the stream decoders and the translation logic. /// /// # Arguments /// @@ -59,8 +59,8 @@ pub(crate) fn decode_ben32_line( /// /// * `reader` - The ben32 input stream. /// * `writer` - The destination for the JSONL output. -/// * `starting_sample` - The 0-based sample offset that should be added to the -/// emitted sample numbers. +/// * `starting_sample` - The 0-based sample offset that should be added to the emitted sample +/// numbers. /// * `variant` - The BEN variant used to interpret repetition counts. /// /// # Returns diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index eab31bf..f9c68c5 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -10,14 +10,13 @@ use xz2::read::XzDecoder; /// Decode a BEN stream into JSONL assignment records. /// -/// Each decoded sample is written as a JSON object containing an `assignment` -/// vector and a 1-based `sample` index. +/// Each decoded sample is written as a JSON object containing an `assignment` vector and a 1-based +/// `sample` index. /// /// # Arguments /// /// * `reader` - The input BEN stream, including the 17-byte BEN banner. -/// * `writer` - The destination that will receive one JSON object per decoded -/// sample. +/// * `writer` - The destination that will receive one JSON object per decoded sample. /// /// # Returns /// @@ -32,8 +31,7 @@ pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Resul /// # Arguments /// /// * `reader` - The compressed XBEN input stream. -/// * `writer` - The destination that will receive one JSON object per decoded -/// sample. +/// * `writer` - The destination that will receive one JSON object per decoded sample. /// /// # Returns /// @@ -146,17 +144,29 @@ mod tests { // Build a valid Standard XBEN stream. let jsonl = b"{\"assignment\":[1,2,3],\"sample\":1}\n"; let mut xben = Vec::new(); - encode_jsonl_to_xben(jsonl.as_slice(), &mut xben, BenVariant::Standard, Some(1), Some(1), None, None) - .unwrap(); + encode_jsonl_to_xben( + jsonl.as_slice(), + &mut xben, + BenVariant::Standard, + Some(1), + Some(1), + None, + None, + ) + .unwrap(); - // Use a read-only File as the writer — writing to it fails with a - // permission error, which propagates through the jsonl_decode_ben32 - // call at line 128 of this file. No custom Write impl needed. - let nonce = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + // Use a read-only File as the writer — writing to it fails with a permission error, which + // propagates through the jsonl_decode_ben32 call at line 128 of this file. No custom Write + // impl needed. + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); let path = std::env::temp_dir().join(format!("xben-ro-{nonce}.tmp")); std::fs::write(&path, b"").unwrap(); let ro_file = std::fs::File::open(&path).unwrap(); // read-only - // Writing to a read-only file fails — the exact error kind varies by OS. + // Writing to a read-only file fails — + // the exact error kind varies by OS. let err = decode_xben_to_jsonl(BufReader::new(xben.as_slice()), ro_file).unwrap_err(); assert!(err.kind() != io::ErrorKind::UnexpectedEof); let _ = std::fs::remove_file(path); diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index 664ac33..4b64f12 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -13,7 +13,8 @@ pub use ben::decode_ben_line; pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub use path::{ - decode_ben_to_jsonl_path, decode_xben_to_ben_path, decode_xben_to_jsonl_path, xz_decompress_path, + decode_ben_to_jsonl_path, decode_xben_to_ben_path, decode_xben_to_jsonl_path, + xz_decompress_path, }; pub(crate) use twodelta::apply_twodelta_runs_to_assignment; pub use twodelta::decode_twodelta_frame; diff --git a/ben/src/codec/decode/path.rs b/ben/src/codec/decode/path.rs index e424c65..117b097 100644 --- a/ben/src/codec/decode/path.rs +++ b/ben/src/codec/decode/path.rs @@ -1,9 +1,9 @@ //! Path-based convenience wrappers around the streaming decoders. //! -//! Each wrapper opens a buffered reader on the input and a buffered writer on -//! the output, then delegates to the corresponding streaming function. The -//! wrappers exist so that CLI dispatch and library consumers do not have to -//! repeat the `BufReader`/`BufWriter`/`File` plumbing at every callsite. +//! Each wrapper opens a buffered reader on the input and a buffered writer on the output, then +//! delegates to the corresponding streaming function. The wrappers exist so that CLI dispatch and +//! library consumers do not have to repeat the `BufReader`/`BufWriter`/`File` plumbing at every +//! callsite. use std::fs::File; use std::io::{BufReader, BufWriter, Result}; @@ -53,6 +53,6 @@ mod tests { let _ = std::fs::remove_file(&out); } - // The happy-path round-trip tests for these decoders live alongside the - // matching encoders in `super::super::encode::path::tests`. + // The happy-path round-trip tests for these decoders live alongside the matching encoders in + // `super::super::encode::path::tests`. } diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs index e320a34..5d56864 100644 --- a/ben/src/codec/decode/tests/mkvchain.rs +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -52,14 +52,13 @@ fn decode_ben_to_jsonl_count_three_expands_to_three_lines() { #[test] fn decode_ben_to_jsonl_sample_numbers_continue_across_frames() { - // Frame 1: [1,1,1,1,2,3,3,3] count=2 → samples 1,2 - // Frame 2: [23] count=3 → samples 3,4,5 + // Frame 1: [1,1,1,1,2,3,3,3] count=2 → samples 1,2 Frame 2: [23] count=3 → samples 3,4,5 let mut ben = b"MKVCHAIN BEN FILE".to_vec(); ben.extend_from_slice(FRAME_HEADER); ben.extend_from_slice(FRAME_PAYLOAD); ben.extend_from_slice(&2u16.to_be_bytes()); - // Frame for assignment [23]: max_val_bits=5, max_len_bits=1, n_bytes=1 - // payload 0b101111_00 = bits 10111_1 → val=10111=23, len=1=1 + // Frame for assignment [23]: max_val_bits=5, max_len_bits=1, n_bytes=1 payload 0b101111_00 = + // bits 10111_1 → val=10111=23, len=1=1 ben.extend_from_slice(&[5, 1, 0, 0, 0, 1, 0b101111_00]); ben.extend_from_slice(&3u16.to_be_bytes()); @@ -77,8 +76,8 @@ fn decode_ben_to_jsonl_sample_numbers_continue_across_frames() { #[test] fn decode_ben_to_jsonl_16bit_value_with_count() { - // Frame bytes from test_jsonl_decode_ben_16_bit_val (assignment [1,1,1,1,512,3,3,3]) - // with count=2 appended. + // Frame bytes from test_jsonl_decode_ben_16_bit_val (assignment [1,1,1,1,512,3,3,3]) with + // count=2 appended. let mut ben = b"MKVCHAIN BEN FILE".to_vec(); ben.extend_from_slice(&[10, 3, 0, 0, 0, 5]); ben.extend_from_slice(&[ @@ -140,8 +139,7 @@ fn jsonl_decode_ben32_mkvchain_count_five_expands_correctly() { #[test] fn jsonl_decode_ben32_mkvchain_two_records_correct_sample_numbers() { - // Record 1: [23] count=2 → samples 1,2 - // Record 2: [1,2,3,4] count=1 → sample 3 + // Record 1: [23] count=2 → samples 1,2 Record 2: [1,2,3,4] count=1 → sample 3 let mut input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; input.extend_from_slice(&2u16.to_be_bytes()); input.extend_from_slice(&[0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 0, 0, 0]); @@ -180,7 +178,7 @@ fn decode_xben_to_ben_mkvchain_roundtrip() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -206,7 +204,7 @@ fn decode_xben_to_jsonl_mkvchain_count_expands() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -238,16 +236,23 @@ fn decode_xben_to_jsonl_rejects_mkvchain_partial_overflow() { let mut xz = Vec::new(); let mut inner = b"MKVCHAIN BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); assert!(out.is_empty()); } -// ─── decode_ben_to_jsonl — byte-level frame encoding counterparts ────── -// These mirror the Standard tests in standard.rs exactly, differing only in -// the MKVCHAIN banner and the trailing u16 BE count field appended to each frame. +// ─── decode_ben_to_jsonl — byte-level frame encoding counterparts ────── These mirror the Standard +// tests in standard.rs exactly, differing only in the MKVCHAIN banner and the trailing u16 BE count +// field appended to each frame. #[test] fn decode_ben_to_jsonl_exact() { @@ -502,9 +507,9 @@ fn decode_ben_to_jsonl_three_frames() { assert_eq!(out, expected.as_bytes()); } -// ─── jsonl_decode_ben32 — byte-level counterparts ───────────────────── -// Each Standard ben32 record has [pairs...][0,0,0,0] terminator. -// Each MkvChain ben32 record appends a u16 BE count after the terminator. +// ─── jsonl_decode_ben32 — byte-level counterparts ───────────────────── Each Standard ben32 record +// has [pairs...][0,0,0,0] terminator. Each MkvChain ben32 record appends a u16 BE count after the +// terminator. #[test] fn jsonl_decode_ben32_16bit_val() { diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 3df836f..87ffe64 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -40,7 +40,7 @@ fn decode_xben_to_ben_twodelta_roundtrip() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -83,7 +83,7 @@ fn decode_xben_to_jsonl_twodelta() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -248,7 +248,7 @@ fn decode_xben_to_ben_twodelta_with_repeated_assignments() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -291,11 +291,8 @@ fn xz_compress_direct_test() { assert!(!out.is_empty()); let mut decompressed = Vec::new(); - crate::codec::decode::xz_decompress( - std::io::BufReader::new(out.as_slice()), - &mut decompressed, - ) - .unwrap(); + crate::codec::decode::xz_decompress(std::io::BufReader::new(out.as_slice()), &mut decompressed) + .unwrap(); assert_eq!(decompressed, data); } @@ -305,7 +302,8 @@ fn encode_ben_to_xben_rejects_invalid_banner() { let garbage = b"GARBAGE BANNER!!!extra_padding"; let mut out = Vec::new(); - let err = encode_ben_to_xben(garbage.as_slice(), &mut out, Some(1), Some(1), None, None).unwrap_err(); + let err = + encode_ben_to_xben(garbage.as_slice(), &mut out, Some(1), Some(1), None, None).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } diff --git a/ben/src/codec/decode/tests/standard.rs b/ben/src/codec/decode/tests/standard.rs index 1b139b3..34c8dba 100644 --- a/ben/src/codec/decode/tests/standard.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -299,7 +299,7 @@ fn test_decode_xben_to_ben_rejects_invalid_inner_header() { &mut xz, Some(1), Some(0), - None, + None, ) .unwrap(); @@ -315,7 +315,7 @@ fn test_decode_xben_to_jsonl_rejects_invalid_inner_header() { &mut xz, Some(1), Some(0), - None, + None, ) .unwrap(); @@ -328,7 +328,14 @@ fn test_decode_xben_to_ben_handles_partial_overflow_without_frame() { let mut xz = Vec::new(); let mut inner = b"STANDARD BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); let mut out = Vec::new(); decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap(); @@ -340,7 +347,14 @@ fn test_decode_xben_to_jsonl_handles_partial_overflow_without_frame() { let mut xz = Vec::new(); let mut inner = b"STANDARD BEN FILE".to_vec(); inner.extend_from_slice(&[1, 2, 3]); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index 8c0c35c..2de7a85 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -30,10 +30,9 @@ fn expected_line(assignment: &[u16], sample: usize) -> String { #[test] fn apply_runs_basic_two_position_swap() { - // prev: [1,2,1,2], run_lengths=[2,2] starting with value 1 - // → first 2 pair positions get value 1, next 2 get value 2 - // pair positions (where val is 1 or 2): 0,1,2,3 - // run 1 (len=2, val=1): pos 0,1 → 1,1; run 2 (len=2, val=2): pos 2,3 → 2,2 + // prev: [1,2,1,2], run_lengths=[2,2] starting with value 1 → first 2 pair positions get value + // 1, next 2 get value 2 pair positions (where val is 1 or 2): 0,1,2,3 run 1 (len=2, val=1): pos + // 0,1 → 1,1; run 2 (len=2, val=2): pos 2,3 → 2,2 let prev = vec![1u16, 2, 1, 2]; let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[2, 2]).unwrap(); assert_eq!(result, vec![1, 1, 2, 2]); @@ -41,10 +40,8 @@ fn apply_runs_basic_two_position_swap() { #[test] fn apply_runs_non_pair_positions_unchanged() { - // prev: [1,2,3,1,2], pair=(1,2), run_lengths=[2,2] - // pair positions: 0,1,3,4 (index 2 holds value 3 → unchanged) - // run 1 (len=2, val=1): pos 0,1 → 1,1 - // run 2 (len=2, val=2): pos 3,4 → 2,2 + // prev: [1,2,3,1,2], pair=(1,2), run_lengths=[2,2] pair positions: 0,1,3,4 (index 2 holds value + // 3 → unchanged) run 1 (len=2, val=1): pos 0,1 → 1,1 run 2 (len=2, val=2): pos 3,4 → 2,2 let prev = vec![1u16, 2, 3, 1, 2]; let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[2, 2]).unwrap(); assert_eq!(result, vec![1, 1, 3, 2, 2]); @@ -52,8 +49,7 @@ fn apply_runs_non_pair_positions_unchanged() { #[test] fn apply_runs_full_reversal() { - // prev: [1,1,2,2], pair=(2,1), run_lengths=[2,2] - // pair positions: 0,1,2,3; pair.0=2 comes first + // prev: [1,1,2,2], pair=(2,1), run_lengths=[2,2] pair positions: 0,1,2,3; pair.0=2 comes first // run 1 (len=2, val=2): pos 0,1 → 2,2; run 2 (len=2, val=1): pos 2,3 → 1,1 let prev = vec![1u16, 1, 2, 2]; let result = apply_twodelta_runs_to_assignment(prev, (2, 1), &[2, 2]).unwrap(); @@ -62,8 +58,8 @@ fn apply_runs_full_reversal() { #[test] fn apply_runs_exhausted_before_all_positions_covered_errors() { - // prev: [1,2,1], pair=(1,2), run_lengths=[1] — too short - // After consuming run 0 (1 position with value 1), run 1 missing → error + // prev: [1,2,1], pair=(1,2), run_lengths=[1] — too short After consuming run 0 (1 position with + // value 1), run 1 missing → error let prev = vec![1u16, 2, 1]; let err = apply_twodelta_runs_to_assignment(prev, (1, 2), &[1]).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -71,8 +67,8 @@ fn apply_runs_exhausted_before_all_positions_covered_errors() { #[test] fn apply_runs_alternating_single_positions() { - // prev: [1,2,1,2,1], pair=(1,2), run_lengths=[1,1,1,1,1] - // Each pair position flips: run alternates 1,2,1,2,1 + // prev: [1,2,1,2,1], pair=(1,2), run_lengths=[1,1,1,1,1] Each pair position flips: run + // alternates 1,2,1,2,1 let prev = vec![1u16, 2, 1, 2, 1]; let result = apply_twodelta_runs_to_assignment(prev, (1, 2), &[1, 1, 1, 1, 1]).unwrap(); // run[0]=1 → pos0=1; run[1]=1 → pos1=2; run[2]=1 → pos2=1; etc. @@ -91,8 +87,8 @@ fn decode_twodelta_frame_basic() { #[test] fn decode_twodelta_frame_full_swap() { - // pair=(2,1) means run starts with value 2; run_lengths=[2,2] - // prev [1,2,1,2]: pair positions 0,1,2,3 → [2,2,1,1] + // pair=(2,1) means run starts with value 2; run_lengths=[2,2] prev [1,2,1,2]: pair positions + // 0,1,2,3 → [2,2,1,1] let frame = BenEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None); let prev = vec![1u16, 2, 1, 2]; let result = decode_twodelta_frame(prev, &frame).unwrap(); @@ -101,8 +97,8 @@ fn decode_twodelta_frame_full_swap() { #[test] fn decode_twodelta_frame_chain_returns_to_original() { - // Frame 1: (1,2) run=[2,2] applied to [1,2,1,2] → [1,1,2,2] - // Frame 2: (1,2) run=[1,1,1,1] applied to [1,1,2,2] → [1,2,1,2] + // Frame 1: (1,2) run=[2,2] applied to [1,2,1,2] → [1,1,2,2] Frame 2: (1,2) run=[1,1,1,1] + // applied to [1,1,2,2] → [1,2,1,2] let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); let f2 = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None); let initial = vec![1u16, 2, 1, 2]; @@ -214,11 +210,10 @@ fn decode_ben_to_jsonl_twodelta_multiple_repeated_deltas() { assert_eq!(out, expected.as_bytes()); } -// ─── decode_ben_to_jsonl — byte-level anchor frame counterparts ──────── -// The TwoDelta first frame (anchor) is encoded in MkvChain format. These tests -// mirror every byte-level Standard / MkvChain decode_ben_to_jsonl test using the -// TWODELTA banner and the same bit-packed frame bytes, verifying that the anchor -// path decodes the same payload correctly regardless of variant. +// ─── decode_ben_to_jsonl — byte-level anchor frame counterparts ──────── The TwoDelta first frame +// (anchor) is encoded in MkvChain format. These tests mirror every byte-level Standard / MkvChain +// decode_ben_to_jsonl test using the TWODELTA banner and the same bit-packed frame bytes, verifying +// that the anchor path decodes the same payload correctly regardless of variant. #[test] fn decode_ben_to_jsonl_underflow_anchor() { @@ -493,7 +488,7 @@ fn decode_xben_to_jsonl_twodelta_anchor_only() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -518,7 +513,7 @@ fn decode_xben_to_jsonl_twodelta_chain_roundtrip() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -542,7 +537,7 @@ fn decode_xben_to_jsonl_twodelta_with_repetitions() { Some(1), Some(0), None, - None, + None, ) .unwrap(); diff --git a/ben/src/codec/decode/twodelta.rs b/ben/src/codec/decode/twodelta.rs index 8441a27..3b17d85 100644 --- a/ben/src/codec/decode/twodelta.rs +++ b/ben/src/codec/decode/twodelta.rs @@ -4,9 +4,8 @@ use std::io; /// Apply decoded TwoDelta run lengths to produce a new assignment vector. /// -/// Positions in `assignment` that hold either value of `pair` are overwritten -/// according to the alternating run-length encoding. `pair.0` fills the first -/// run, `pair.1` the second, and so on. +/// Positions in `assignment` that hold either value of `pair` are overwritten according to the +/// alternating run-length encoding. `pair.0` fills the first run, `pair.1` the second, and so on. /// /// # Arguments /// @@ -16,8 +15,8 @@ use std::io; /// /// # Returns /// -/// Returns the updated assignment vector, or an error if the run lengths are -/// exhausted before all relevant positions are covered. +/// Returns the updated assignment vector, or an error if the run lengths are exhausted before all +/// relevant positions are covered. pub(crate) fn apply_twodelta_runs_to_assignment( mut assignment: Vec, pair: (u16, u16), @@ -66,17 +65,12 @@ pub(crate) fn apply_twodelta_runs_to_assignment( /// # Arguments /// /// * `previous` - The assignment vector from the preceding frame. -/// * `frame` - A TwoDelta-arm [`BenEncodeFrame`] containing the pair and -/// run-length vector. +/// * `frame` - A TwoDelta-arm [`BenEncodeFrame`] containing the pair and run-length vector. /// /// # Returns /// -/// Returns the updated assignment vector, or an error if `frame` is not the -/// `TwoDelta` arm. -pub fn decode_twodelta_frame( - previous: Vec, - frame: &BenEncodeFrame, -) -> io::Result> { +/// Returns the updated assignment vector, or an error if `frame` is not the `TwoDelta` arm. +pub fn decode_twodelta_frame(previous: Vec, frame: &BenEncodeFrame) -> io::Result> { match frame { BenEncodeFrame::TwoDelta { pair, diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 56207af..5bbf80d 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -10,8 +10,7 @@ use xz2::read::XzDecoder; /// Decode an XBEN stream into an equivalent BEN stream. /// -/// The output begins with the normal BEN banner followed by uncompressed BEN -/// frames. +/// The output begins with the normal BEN banner followed by uncompressed BEN frames. /// /// # Arguments /// diff --git a/ben/src/codec/encode/ben.rs b/ben/src/codec/encode/ben.rs index 691eb9b..3826536 100644 --- a/ben/src/codec/encode/ben.rs +++ b/ben/src/codec/encode/ben.rs @@ -1,8 +1,7 @@ use serde_json::Value; use std::io::{Error, ErrorKind, Result}; -/// Encode a JSON assignment record into the ben32 frame representation used by -/// XBEN streams. +/// Encode a JSON assignment record into the ben32 frame representation used by XBEN streams. /// /// Note: This is a helper function that is only used in the testing suite. /// @@ -12,8 +11,7 @@ use std::io::{Error, ErrorKind, Result}; /// /// # Returns /// -/// Returns the encoded ben32 frame bytes terminated by the four-byte `0` -/// sentinel. +/// Returns the encoded ben32 frame bytes terminated by the four-byte `0` sentinel. #[cfg_attr(not(test), allow(dead_code))] pub(crate) fn encode_ben32_line(data: Value) -> Result> { let json_value_assign_vec = match data["assignment"].as_array() { diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 5fba9aa..59f55c4 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -8,24 +8,23 @@ use xz2::write::XzEncoder; /// Encode JSONL assignment records directly into an XBEN stream. /// -/// Each input line must be a JSON object with an `assignment` array. The output -/// stream begins with the standard BEN banner inside the compressed payload and -/// then stores each assignment in ben32 form. +/// Each input line must be a JSON object with an `assignment` array. The output stream begins with +/// the standard BEN banner inside the compressed payload and then stores each assignment in ben32 +/// form. /// /// # Arguments /// /// * `reader` - A JSONL input stream with one assignment record per line. /// * `writer` - The destination for the compressed XBEN bytes. /// * `variant` - The BEN variant to use inside the XBEN payload. -/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` -/// (single-threaded) when `None`. Values larger than the host's -/// available parallelism are silently clamped down. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` (single-threaded) when `None`. +/// Values larger than the host's available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. -/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for -/// Standard and MkvChain variants. -/// * `block_size` - Optional per-block size in bytes for the MT encoder. -/// `None` defaults to [`crate::codec::encode::xz::XZ_DEFAULT_MT_BLOCK_SIZE`] -/// when threads > 1, or `0` (liblzma auto) for single-thread runs. +/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for Standard and MkvChain +/// variants. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. `None` defaults to +/// [`crate::codec::encode::xz::XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or `0` (liblzma auto) +/// for single-thread runs. /// /// # Returns /// @@ -69,9 +68,8 @@ pub fn encode_jsonl_to_xben( /// Encode JSONL assignment records into an uncompressed BEN file. /// -/// The input is expected to contain one JSON object per line with an -/// `assignment` array. The `sample` field is ignored because BEN sample order is -/// determined by the stream position. +/// The input is expected to contain one JSON object per line with an `assignment` array. The +/// `sample` field is ignored because BEN sample order is determined by the stream position. /// /// # Arguments /// diff --git a/ben/src/codec/encode/path.rs b/ben/src/codec/encode/path.rs index dd1059f..07caea9 100644 --- a/ben/src/codec/encode/path.rs +++ b/ben/src/codec/encode/path.rs @@ -1,9 +1,9 @@ //! Path-based convenience wrappers around the streaming encoders. //! -//! Each wrapper opens a buffered reader on the input and a buffered writer on -//! the output, then delegates to the corresponding streaming function. The -//! wrappers exist so that CLI dispatch and library consumers do not have to -//! repeat the `BufReader`/`BufWriter`/`File` plumbing at every callsite. +//! Each wrapper opens a buffered reader on the input and a buffered writer on the output, then +//! delegates to the corresponding streaming function. The wrappers exist so that CLI dispatch and +//! library consumers do not have to repeat the `BufReader`/`BufWriter`/`File` plumbing at every +//! callsite. use std::fs::File; use std::io::{BufReader, BufWriter, Result}; @@ -151,17 +151,16 @@ mod tests { let xben = unique_path("path-bxb.xben"); let ben_back = unique_path("path-bxb-back.ben"); - std::fs::write( - &jsonl_in, - jsonl_from_assignments(&[vec![1, 2, 3]]), - ) - .unwrap(); + std::fs::write(&jsonl_in, jsonl_from_assignments(&[vec![1, 2, 3]])).unwrap(); encode_jsonl_to_ben_path(&jsonl_in, &ben, BenVariant::Standard).unwrap(); encode_ben_to_xben_path(&ben, &xben, Some(1), Some(1), None, None).unwrap(); decode_xben_to_ben_path(&xben, &ben_back).unwrap(); // Round trip: ben_back should be byte-equivalent to ben (same banner, same content). - assert_eq!(std::fs::read(&ben).unwrap(), std::fs::read(&ben_back).unwrap()); + assert_eq!( + std::fs::read(&ben).unwrap(), + std::fs::read(&ben_back).unwrap() + ); for p in [&jsonl_in, &ben, &xben, &ben_back] { let _ = std::fs::remove_file(p); diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index ce7e07c..98ce749 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -262,8 +262,7 @@ fn test_encode_jsonl_to_ben_len_65535() { #[test] fn test_encode_ben_vec_from_assign_matches_rle_entrypoint() { let assign_vec = vec![4u16, 4, 4, 1, 1, 3, 3, 3, 2]; - let direct = - BenEncodeFrame::from_assignment(assign_vec.clone(), BenVariant::Standard, None); + let direct = BenEncodeFrame::from_assignment(assign_vec.clone(), BenVariant::Standard, None); let via_rle = BenEncodeFrame::from_rle( crate::util::rle::assign_to_rle(assign_vec), BenVariant::Standard, @@ -736,7 +735,7 @@ fn encode_jsonl_to_xben_roundtrip() { Some(1), Some(1), None, - None, + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -755,7 +754,7 @@ fn encode_jsonl_to_xben_with_chunk_size() { Some(1), Some(1), Some(2), - None, + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -772,7 +771,7 @@ fn encode_jsonl_to_xben_invalid_json_errors() { Some(1), Some(1), None, - None, + None, ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -792,7 +791,7 @@ fn encode_jsonl_to_xben_mkv_variant() { Some(1), Some(1), None, - None, + None, ) .unwrap(); assert!(!xben.is_empty()); @@ -938,7 +937,7 @@ fn encode_jsonl_to_xben_roundtrip_verifies_content() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -971,7 +970,7 @@ fn encode_jsonl_to_xben_mkv_verifies_content() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -1008,10 +1007,9 @@ fn twodelta_encode_with_count() { #[test] fn twodelta_encode_run_lengths_correct() { use crate::codec::encode::encode_twodelta_frame; - // prev: [1,1,2,2], next: [2,1,2,1] - // pair positions (1 or 2): 0,1,2,3 - // In next: pos0=2, pos1=1, pos2=2, pos3=1 → runs of (2,1,2,1) = [1,1,1,1] - // pair.0 = value at first pair position in next = 2 + // prev: [1,1,2,2], next: [2,1,2,1] pair positions (1 or 2): 0,1,2,3 In next: pos0=2, pos1=1, + // pos2=2, pos3=1 → runs of (2,1,2,1) = [1,1,1,1] pair.0 = value at first pair position in next + // = 2 let prev = vec![1u16, 1, 2, 2]; let next = vec![2u16, 1, 2, 1]; let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); @@ -1022,9 +1020,8 @@ fn twodelta_encode_run_lengths_correct() { #[test] fn twodelta_encode_run_lengths_with_non_pair_gaps() { use crate::codec::encode::encode_twodelta_frame; - // prev: [1,3,2,3,1], next: [2,3,1,3,2] - // pair=(1,2), pair positions: 0,2,4 (positions with value 1 or 2) - // In next: pos0=2, pos2=1, pos4=2 → runs [1,1,1] + // prev: [1,3,2,3,1], next: [2,3,1,3,2] pair=(1,2), pair positions: 0,2,4 (positions with value + // 1 or 2) In next: pos0=2, pos2=1, pos4=2 → runs [1,1,1] let prev = vec![1u16, 3, 2, 3, 1]; let next = vec![2u16, 3, 1, 3, 2]; let frame = encode_twodelta_frame(&prev, &next, None).unwrap(); @@ -1173,7 +1170,7 @@ fn encode_jsonl_to_xben_twodelta_roundtrip() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -1194,8 +1191,8 @@ fn encode_jsonl_to_xben_twodelta_roundtrip() { fn twodelta_encode_outside_pair_change_errors() { use super::twodelta::encode_twodelta_frame; - // prev=[1,2,3,4], curr=[2,1,3,5] — positions 0,1 swap pair (1,2), - // but position 3 changes from 4→5 which is outside the pair. + // prev=[1,2,3,4], curr=[2,1,3,5] — positions 0,1 swap pair (1,2), but position 3 changes from + // 4→5 which is outside the pair. let prev = vec![1u16, 2, 3, 4]; let curr = vec![2u16, 1, 3, 5]; let err = encode_twodelta_frame(&prev, &curr, None).unwrap_err(); @@ -1284,9 +1281,9 @@ fn twodelta_encode_pair_mask_run_exceeds_u16_max_errors() { use crate::codec::encode::encode_twodelta_frame_with_hint; use std::collections::HashMap; - // 65538 positions: pair positions 0..65537 hold value 1 in prev, one more - // (65537) holds value 2. In curr all pair positions hold value 2, so the - // run of value-2 positions reaches u16::MAX and the encoder must error. + // 65538 positions: pair positions 0..65537 hold value 1 in prev, one more (65537) holds value + // 2. In curr all pair positions hold value 2, so the run of value-2 positions reaches u16::MAX + // and the encoder must error. let mut prev = vec![1u16; 65538]; prev[65537] = 2; let mut curr = vec![2u16; 65538]; @@ -1296,9 +1293,8 @@ fn twodelta_encode_pair_mask_run_exceeds_u16_max_errors() { masks.insert(1, (0..65537_usize).collect()); masks.insert(2, vec![65537_usize]); - let err = - encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) - .unwrap_err(); + let err = encode_twodelta_frame_with_hint(&prev, &curr, Some((1, 2)), Some(&mut masks), None) + .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); assert!(err.to_string().contains("u16::MAX")); } @@ -1307,8 +1303,8 @@ fn twodelta_encode_pair_mask_run_exceeds_u16_max_errors() { fn twodelta_encode_from_scratch_run_exceeds_u16_max_errors() { use crate::codec::encode::encode_twodelta_frame; - // 65538 positions all in pair {1, 2}: first 65537 change 1→2, last 1 changes 2→1. - // The from-scratch encoder hits u16::MAX consecutive positions with value 2 and errors. + // 65538 positions all in pair {1, 2}: first 65537 change 1→2, last 1 changes 2→1. The + // from-scratch encoder hits u16::MAX consecutive positions with value 2 and errors. let mut prev = vec![1u16; 65538]; prev[65537] = 2; let mut curr = vec![2u16; 65538]; @@ -1323,9 +1319,9 @@ fn twodelta_encode_from_scratch_run_exceeds_u16_max_errors() { fn ben32_encode_run_exceeding_u16_max_splits_correctly() { use super::ben::encode_ben32_assignments; - // Build an assignment with 65537 identical values: the run reaches u16::MAX - // (65535) and must be flushed early, then continues with a new run. - // encode_ben32_assignments appends a 4-byte zero sentinel at the end. + // Build an assignment with 65537 identical values: the run reaches u16::MAX (65535) and must be + // flushed early, then continues with a new run. encode_ben32_assignments appends a 4-byte zero + // sentinel at the end. let assign: Vec = vec![7u16; 65537]; let encoded = encode_ben32_assignments(&assign).unwrap(); diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index a3127ae..dab3816 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -3,53 +3,50 @@ use crate::codec::BenEncodeFrame; use std::collections::HashMap; use std::io::{Error, ErrorKind, Result}; -/// Encode a transition between two assignment vectors as a TwoDelta frame, optionally -/// using caller-supplied hints to accelerate encoding. +/// Encode a transition between two assignment vectors as a TwoDelta frame, optionally using +/// caller-supplied hints to accelerate encoding. /// /// # Arguments /// /// * `previous_assignment` - The full assignment vector from the preceding sample. /// * `new_assignment` - The full assignment vector for the sample being encoded. -/// * `delta_pair` - An optional hint asserting which pair of ids is involved in the -/// transition. Must be provided together with `previous_masks`, and the two ids must be distinct. -/// * `previous_masks` - An optional mutable map from district id to the sorted list of positions -/// it occupies in `previous_assignment`. When provided, the map is updated in-place to -/// reflect `new_assignment` before returning. +/// * `delta_pair` - An optional hint asserting which pair of ids is involved in the transition. +/// Must be provided together with `previous_masks`, and the two ids must be distinct. +/// * `previous_masks` - An optional mutable map from district id to the sorted list of positions it +/// occupies in `previous_assignment`. When provided, the map is updated in-place to reflect +/// `new_assignment` before returning. /// /// # Returns /// -/// A `BenEncodeFrame` describing the transition from `previous_assignment` to -/// `new_assignment`. +/// A `BenEncodeFrame` describing the transition from `previous_assignment` to `new_assignment`. /// /// # TwoDelta encoding /// -/// A TwoDelta frame is valid only when every position that changes between -/// `previous_assignment` and `new_assignment` involves exactly two district ids -/// (call them A and B), and no position outside that pair changes. The frame stores -/// the pair and the lengths of alternating runs of A and B over the positions -/// occupied by the pair, ordered by position. The first run always corresponds to -/// whichever id occupies the lowest-indexed position. +/// A TwoDelta frame is valid only when every position that changes between `previous_assignment` +/// and `new_assignment` involves exactly two district ids (call them A and B), and no position +/// outside that pair changes. The frame stores the pair and the lengths of alternating runs of A +/// and B over the positions occupied by the pair, ordered by position. The first run always +/// corresponds to whichever id occupies the lowest-indexed position. /// /// # Hints /// /// Two optional hints can be provided to avoid scanning the full assignment vector: /// -/// - `delta_pair`: The caller asserts that exactly this pair of ids is involved in -/// the transition. Must be provided together with `previous_masks`. The pair must have two -/// distinct ids — passing `(x, x)` is an error. +/// - `delta_pair`: The caller asserts that exactly this pair of ids is involved in the transition. +/// Must be provided together with `previous_masks`. The pair must have two distinct ids — passing +/// `(x, x)` is an error. /// -/// - `previous_masks`: A mutable map from district id to the sorted list of positions it -/// occupies in `previous_assignment`. When provided, the function reads positions -/// directly from the map instead of scanning the assignment vector, and updates -/// the map in-place to reflect `new_assignment` before returning. The previous_masks must -/// cover every id that appears in the pair; a missing or empty entry is an error. +/// - `previous_masks`: A mutable map from district id to the sorted list of positions it occupies +/// in `previous_assignment`. When provided, the function reads positions directly from the map +/// instead of scanning the assignment vector, and updates the map in-place to reflect +/// `new_assignment` before returning. The previous_masks must cover every id that appears in the +/// pair; a missing or empty entry is an error. /// -/// The hints are not independent: `delta_pair` requires `previous_masks`. Providing `previous_masks` -/// without `delta_pair` is allowed — the function will infer the pair from the first -/// differing position and then use the previous_masks from there. +/// The hints are not independent: `delta_pair` requires `previous_masks`. Providing +/// `previous_masks` without `delta_pair` is allowed — the function will infer the pair from the +/// first differing position and then use the previous_masks from there. /// -/// When no hints are provided the function falls back to a full scan of both -/// assignment vectors. +/// When no hints are provided the function falls back to a full scan of both assignment vectors. /// /// # Errors /// @@ -110,12 +107,12 @@ pub(crate) fn encode_twodelta_frame_with_hint( // Ok(BenEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) } -/// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return -/// the pair ordered so that `pair.0` occupies a lower index than `pair.1`. +/// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return the +/// pair ordered so that `pair.0` occupies a lower index than `pair.1`. /// -/// Ordering by first position ensures that the run-length sequence produced during -/// encoding always begins with the id whose positions come first in the assignment -/// vector, which is required for deterministic round-trip decoding. +/// Ordering by first position ensures that the run-length sequence produced during encoding always +/// begins with the id whose positions come first in the assignment vector, which is required for +/// deterministic round-trip decoding. /// /// # Arguments /// @@ -125,7 +122,8 @@ pub(crate) fn encode_twodelta_frame_with_hint( /// # Returns /// /// The pair reordered so that `pair.0` has a smaller first position in the current vector than -/// `pair.1`, or an error if either id is absent from `previous_masks` or has an empty position list. +/// `pair.1`, or an error if either id is absent from `previous_masks` or has an empty position +/// list. fn validate_masks_and_order_pairs_for_twodelta( pair: (u16, u16), masks: &HashMap>, @@ -149,9 +147,9 @@ fn validate_masks_and_order_pairs_for_twodelta( return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.1 })); }; - // Order so that pair.0 is the value the new assignment places at the first - // pair position (the lowest index held by either mask). This guarantees - // run_lengths[0] >= 1 with no leading-zero sentinel. + // Order so that pair.0 is the value the new assignment places at the first pair position (the + // lowest index held by either mask). This guarantees run_lengths[0] >= 1 with no leading-zero + // sentinel. let first_pos = mask_a[0].min(mask_b[0]); if current[first_pos] == pair.0 { Ok((pair.0, pair.1)) @@ -162,28 +160,27 @@ fn validate_masks_and_order_pairs_for_twodelta( /// Build a TwoDelta frame using both a known pair and pre-computed position masks. /// -/// This is the fast path used during recombination-aware encoding, where the caller -/// already knows which two ids are swapping and has maintained a mask for each id. +/// This is the fast path used during recombination-aware encoding, where the caller already knows +/// which two ids are swapping and has maintained a mask for each id. /// /// The function merges the two sorted position lists from `previous_masks` to produce the -/// interleaved sequence of positions, validates that every referenced position in -/// `previous` and `current` belongs to the pair, computes the run lengths over -/// `current`, and then updates `previous_masks` in-place to reflect the new positions of -/// each id in `current`. +/// interleaved sequence of positions, validates that every referenced position in `previous` and +/// `current` belongs to the pair, computes the run lengths over `current`, and then updates +/// `previous_masks` in-place to reflect the new positions of each id in `current`. /// /// # Arguments /// /// * `previous` - The full assignment vector from the preceding sample. /// * `current` - The full assignment vector for the sample being encoded. /// * `delta_pair` - The pair of ids asserted to be involved in the transition. -/// * `previous_masks` - Mutable position mask map for both ids in the pair. Updated in-place -/// to reflect `current` before returning. +/// * `previous_masks` - Mutable position mask map for both ids in the pair. Updated in-place to +/// reflect `current` before returning. /// /// # Returns /// -/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if no -/// position actually changed value (signalling the frame can be deduplicated), or -/// another error if a mask entry is inconsistent with the assignment data. +/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if no position +/// actually changed value (signalling the frame can be deduplicated), or another error if a mask +/// entry is inconsistent with the assignment data. fn construct_twodelta_frame_from_pair_and_mask_hints( previous: &[u16], current: &[u16], @@ -220,9 +217,10 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let mut new_mask_b = Vec::with_capacity(new_capacity); let (mut i, mut j) = (0usize, 0usize); - // pair.0 is guaranteed to equal current[first_pos] by validate_masks_and_order_pairs_for_twodelta, - // so the first iteration always hits the `new_val == run_value` branch and increments - // the count — no special-case initialization needed. + // pair.0 is guaranteed to equal current[first_pos] by + // validate_masks_and_order_pairs_for_twodelta, so the first iteration always hits the + // `new_val == run_value` branch and increments the count — no special-case initialization + // needed. let mut run_value = pair.0; let mut current_mask_count = 0u16; let mut found_assignment_change = false; @@ -289,20 +287,15 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous_masks.insert(pair.0, new_mask_a); previous_masks.insert(pair.1, new_mask_b); - Ok(BenEncodeFrame::from_run_lengths( - pair, - run_lengths, - count, - )) + Ok(BenEncodeFrame::from_run_lengths(pair, run_lengths, count)) } -/// Build a TwoDelta frame using only pre-computed position masks, inferring the pair -/// from the first differing position between `previous` and `current`. +/// Build a TwoDelta frame using only pre-computed position masks, inferring the pair from the first +/// differing position between `previous` and `current`. /// -/// Scans until it finds a position where the two assignments differ, then delegates -/// to `construct_twodelta_frame_from_pair_and_mask_hints` with that pair. If no -/// difference is found the assignments are identical and -/// `BenEncodeError::RepeatedSample` is returned. +/// Scans until it finds a position where the two assignments differ, then delegates to +/// `construct_twodelta_frame_from_pair_and_mask_hints` with that pair. If no difference is found +/// the assignments are identical and `BenEncodeError::RepeatedSample` is returned. /// /// # Arguments /// @@ -313,8 +306,8 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( /// /// # Returns /// -/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if the -/// two assignments are identical. +/// A `BenEncodeFrame` for the transition, or `BenEncodeError::RepeatedSample` if the two +/// assignments are identical. fn construct_twodelta_frame_from_mask_hint( previous: &[u16], current: &[u16], @@ -336,14 +329,13 @@ fn construct_twodelta_frame_from_mask_hint( return Err(Error::from(EncodeError::TwoDeltaIdentical)); } -/// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no -/// hints from the caller. +/// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no hints from the +/// caller. /// -/// Scans to the first changed position to discover the raw pair values, then makes -/// a second pass from position 0 to build run lengths over all pair positions. -/// `enc_pair.0` is determined lazily at the first pair position encountered in the -/// second pass (which may precede the first changed position), guaranteeing -/// `run_lengths[0] >= 1` with no leading zero. +/// Scans to the first changed position to discover the raw pair values, then makes a second pass +/// from position 0 to build run lengths over all pair positions. `enc_pair.0` is determined lazily +/// at the first pair position encountered in the second pass (which may precede the first changed +/// position), guaranteeing `run_lengths[0] >= 1` with no leading zero. /// /// # Arguments /// @@ -352,8 +344,8 @@ fn construct_twodelta_frame_from_mask_hint( /// /// # Returns /// -/// A `BenEncodeFrame` for the transition, or an error if more than two distinct ids -/// appear across all changed positions. +/// A `BenEncodeFrame` for the transition, or an error if more than two distinct ids appear across +/// all changed positions. fn construct_twodelta_frame_from_scratch( previous: &[u16], current: &[u16], @@ -368,9 +360,9 @@ fn construct_twodelta_frame_from_scratch( let (a, b) = (previous[first_change], current[first_change]); - // Scan all positions: build run lengths for pair positions in previous. - // enc_pair ordering is determined lazily at the first pair position encountered: - // curr_val there is enc_pair.0, which may precede first_change for unchanged pair positions. + // Scan all positions: build run lengths for pair positions in previous. enc_pair ordering is + // determined lazily at the first pair position encountered: curr_val there is enc_pair.0, which + // may precede first_change for unchanged pair positions. let mut enc_pair = (0u16, 0u16); let mut enc_pair_known = false; let mut run_lengths: Vec = Vec::new(); @@ -415,13 +407,12 @@ fn construct_twodelta_frame_from_scratch( /// Encode a transition between two assignment vectors as a TwoDelta frame. /// -/// This is the unhinted entry point. It falls back to a full scan of both -/// assignment vectors to discover the pair and compute run lengths. Prefer -/// `encode_twodelta_frame_with_hint` when previous_masks are available, as it avoids -/// the scan entirely. +/// This is the unhinted entry point. It falls back to a full scan of both assignment vectors to +/// discover the pair and compute run lengths. Prefer `encode_twodelta_frame_with_hint` when +/// previous_masks are available, as it avoids the scan entirely. /// -/// The transition is valid only when all changed positions involve exactly two -/// district ids and positions outside that pair remain unchanged. +/// The transition is valid only when all changed positions involve exactly two district ids and +/// positions outside that pair remain unchanged. /// /// # Arguments /// @@ -430,8 +421,8 @@ fn construct_twodelta_frame_from_scratch( /// /// # Returns /// -/// Returns a TwoDelta frame describing the transition, or an error if the -/// transition involves more than two ids or the assignments are identical. +/// Returns a TwoDelta frame describing the transition, or an error if the transition involves more +/// than two ids or the assignments are identical. pub fn encode_twodelta_frame( previous_assignment: impl AsRef<[u16]>, new_assignment: impl AsRef<[u16]>, diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 39c5ecc..2b22247 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -6,34 +6,29 @@ use std::io::{self, BufRead, Cursor, Read, Result, Write}; use xz2::stream::{MtStreamBuilder, Stream}; use xz2::write::XzEncoder; -/// Default per-block size used by the multithreaded XZ encoder when the -/// caller does not pass an explicit `block_size`. +/// Default per-block size used by the multithreaded XZ encoder when the caller does not pass an +/// explicit `block_size`. /// -/// liblzma's `block_size = 0` means "auto" (`3 × dict_size`), which at -/// preset 9 is ~192 MiB — far too coarse for streaming inputs to fan out -/// across worker threads. 16 MiB strikes a balance between scaling -/// thread utilization on medium ensembles and keeping per-block -/// dictionary reuse mostly intact. +/// liblzma's `block_size = 0` means "auto" (`3 × dict_size`), which at preset 9 is ~192 MiB — far +/// too coarse for streaming inputs to fan out across worker threads. 16 MiB strikes a balance +/// between scaling thread utilization on medium ensembles and keeping per-block dictionary reuse +/// mostly intact. pub const XZ_DEFAULT_MT_BLOCK_SIZE: u64 = 16 * 1024 * 1024; /// Resolve `n_threads` against the host's available parallelism. pub(crate) fn resolve_threads(n_threads: Option) -> u32 { - n_threads - .unwrap_or(1) - .min(host_parallelism()) - .max(1) + n_threads.unwrap_or(1).min(host_parallelism()).max(1) } -/// Number of cores reported by `std::thread::available_parallelism`, -/// or `1` if the platform cannot answer. +/// Number of cores reported by `std::thread::available_parallelism`, or `1` if the platform cannot +/// answer. fn host_parallelism() -> u32 { std::thread::available_parallelism() .map(|n| n.get()) .unwrap_or(1) as u32 } -/// Convert a user-supplied signed thread count into the unsigned count -/// the encoder expects. +/// Convert a user-supplied signed thread count into the unsigned count the encoder expects. /// /// CLI and Python users want sklearn-style sentinel semantics: /// @@ -50,13 +45,11 @@ pub fn cpus_from_signed(n: i32) -> u32 { } } -/// Build a multithreaded XZ encoder stream with the project's default -/// `block_size` policy applied. +/// Build a multithreaded XZ encoder stream with the project's default `block_size` policy applied. /// -/// When `block_size` is `Some(n)`, that exact byte count is passed to -/// liblzma. When it is `None`, we default to [`XZ_DEFAULT_MT_BLOCK_SIZE`] -/// for `n_threads > 1` and to `0` (liblzma's "auto") for the single-thread -/// case so single-thread encoding does not pay any block-overhead cost. +/// When `block_size` is `Some(n)`, that exact byte count is passed to liblzma. When it is `None`, +/// we default to [`XZ_DEFAULT_MT_BLOCK_SIZE`] for `n_threads > 1` and to `0` (liblzma's "auto") for +/// the single-thread case so single-thread encoding does not pay any block-overhead cost. pub(crate) fn build_mt_stream( n_threads: u32, level: u32, @@ -78,21 +71,19 @@ pub(crate) fn build_mt_stream( /// Compress an arbitrary byte stream with XZ/LZMA2. /// -/// This is a general-purpose helper used by the XBEN tooling, but it can also -/// be used for plain XZ compression when BEN-specific framing is not needed. +/// This is a general-purpose helper used by the XBEN tooling, but it can also be used for plain XZ +/// compression when BEN-specific framing is not needed. /// /// # Arguments /// /// * `reader` - The input byte stream to compress. /// * `writer` - The destination for the compressed XZ bytes. -/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` -/// (single-threaded) when `None`. Values larger than the host's -/// available parallelism are silently clamped down. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` (single-threaded) when `None`. +/// Values larger than the host's available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. -/// * `block_size` - Optional per-block size in bytes for the MT encoder. -/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or -/// `0` (liblzma auto) for single-thread runs. Smaller blocks improve -/// thread fan-out at a slight compression-ratio cost. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. `None` defaults to +/// [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or `0` (liblzma auto) for single-thread runs. +/// Smaller blocks improve thread fan-out at a slight compression-ratio cost. /// /// # Returns /// @@ -125,22 +116,20 @@ pub fn xz_compress( /// Convert an existing BEN stream into an XBEN stream. /// -/// The input must begin with a BEN banner so that the variant can be preserved -/// in the compressed output. +/// The input must begin with a BEN banner so that the variant can be preserved in the compressed +/// output. /// /// # Arguments /// /// * `reader` - The input BEN stream, including its banner. /// * `writer` - The destination for the compressed XBEN bytes. -/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` -/// (single-threaded) when `None`. Values larger than the host's -/// available parallelism are silently clamped down. +/// * `n_threads` - Optional XZ encoder thread count. Defaults to `1` (single-threaded) when `None`. +/// Values larger than the host's available parallelism are silently clamped down. /// * `compression_level` - Optional XZ compression level in the range `0..=9`. -/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for -/// Standard and MkvChain variants. -/// * `block_size` - Optional per-block size in bytes for the MT encoder. -/// `None` defaults to [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or -/// `0` (liblzma auto) for single-thread runs. +/// * `chunk_size` - Optional TwoDelta columnar chunk size; ignored for Standard and MkvChain +/// variants. +/// * `block_size` - Optional per-block size in bytes for the MT encoder. `None` defaults to +/// [`XZ_DEFAULT_MT_BLOCK_SIZE`] when threads > 1, or `0` (liblzma auto) for single-thread runs. /// /// # Returns /// @@ -167,8 +156,7 @@ pub fn encode_ben_to_xben( actual: check_buffer.to_vec(), }) })?; - let mut ben_encoder = - BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size)?; + let mut ben_encoder = BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size)?; ben_encoder.ingest_ben_stream(Cursor::new(check_buffer).chain(reader))?; ben_encoder.finish()?; Ok(()) diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index eb1bcff..e5243f9 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -3,18 +3,15 @@ use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Read}; -/// One sample's encoded bytes at the frame layer, freshly read from a wire -/// stream. +/// One sample's encoded bytes at the frame layer, freshly read from a wire stream. /// -/// `Standard` and `MkvChain` carry **opaque** bit-packed payload bytes — the -/// runs are not expanded until a caller asks for them. This is what makes -/// frame-level subsampling cheap: the iterator can pull frames at byte level -/// and only the kept frames pay the bit-unpacking cost. +/// `Standard` and `MkvChain` carry **opaque** bit-packed payload bytes — the runs are not expanded +/// until a caller asks for them. This is what makes frame-level subsampling cheap: the iterator can +/// pull frames at byte level and only the kept frames pay the bit-unpacking cost. /// -/// `TwoDelta` is the exception: applying a delta to the previous assignment -/// requires the run-length vector, so the decoder unpacks it eagerly at parse -/// time. This is not a regression; the bytes would have been needed -/// immediately on use anyway. +/// `TwoDelta` is the exception: applying a delta to the previous assignment requires the run-length +/// vector, so the decoder unpacks it eagerly at parse time. This is not a regression; the bytes +/// would have been needed immediately on use anyway. #[derive(Debug, Clone, PartialEq, Eq)] pub enum BenDecodeFrame { /// A `Standard`-variant frame with no trailing repetition count. @@ -41,13 +38,12 @@ pub enum BenDecodeFrame { /// The number of times this frame repeats. count: u16, }, - /// A `TwoDelta`-variant delta frame. Run lengths are eagerly decoded at - /// parse time because applying the delta needs them. + /// A `TwoDelta`-variant delta frame. Run lengths are eagerly decoded at parse time because + /// applying the delta needs them. TwoDelta { /// The pair of district ids encoded in this frame. pair: (u16, u16), - /// The unpacked alternating run lengths over the positions occupied - /// by the pair. + /// The unpacked alternating run lengths over the positions occupied by the pair. run_lengths: Vec, /// The number of times this delta repeats. count: u16, @@ -57,19 +53,15 @@ pub enum BenDecodeFrame { impl BenDecodeFrame { /// Read the next frame in the wire format dictated by `variant`. /// - /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` - /// on success, and `Err` on any I/O or format error. + /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` on success, and + /// `Err` on any I/O or format error. /// - /// Note: in a `TwoDelta` *stream*, the first frame is encoded in - /// `MkvChain` wire format. The caller (e.g. [`BenStreamReader`]) tracks - /// that state and passes [`BenVariant::MkvChain`] for the first frame and - /// [`BenVariant::TwoDelta`] for the rest. + /// Note: in a `TwoDelta` *stream*, the first frame is encoded in `MkvChain` wire format. The + /// caller (e.g. [`BenStreamReader`]) tracks that state and passes [`BenVariant::MkvChain`] for + /// the first frame and [`BenVariant::TwoDelta`] for the rest. /// /// [`BenStreamReader`]: crate::io::reader::BenStreamReader - pub fn from_reader( - reader: &mut impl Read, - variant: BenVariant, - ) -> io::Result> { + pub fn from_reader(reader: &mut impl Read, variant: BenVariant) -> io::Result> { match variant { BenVariant::Standard => Self::read_standard(reader), BenVariant::MkvChain => Self::read_mkv_chain(reader), @@ -144,9 +136,9 @@ impl BenDecodeFrame { let count = reader.read_u16::()?; - // Reuse the encode-side bit unpacker so the unpack logic lives in one - // place; we then drop the resulting BenEncodeFrame's raw_bytes since - // the decode-side TwoDelta arm doesn't keep them. + // Reuse the encode-side bit unpacker so the unpack logic lives in one place; we then drop + // the resulting BenEncodeFrame's raw_bytes since the decode-side TwoDelta arm doesn't keep + // them. let pair = (pair_a, pair_b); let encode_frame = BenEncodeFrame::from_parts(pair, max_len_bits, payload, count); let run_lengths = match encode_frame { @@ -180,20 +172,17 @@ impl BenDecodeFrame { } } - /// Borrow the bit-packed payload bytes for `Standard`/`MkvChain` arms. - /// Returns `None` for `TwoDelta` (which doesn't keep raw bytes after - /// parsing). + /// Borrow the bit-packed payload bytes for `Standard`/`MkvChain` arms. Returns `None` for + /// `TwoDelta` (which doesn't keep raw bytes after parsing). pub fn raw_bytes(&self) -> Option<&[u8]> { match self { - Self::Standard { raw_bytes, .. } | Self::MkvChain { raw_bytes, .. } => { - Some(raw_bytes) - } + Self::Standard { raw_bytes, .. } | Self::MkvChain { raw_bytes, .. } => Some(raw_bytes), Self::TwoDelta { .. } => None, } } - /// The bit width of the largest district id in this frame, or `None` for - /// `TwoDelta` (which doesn't carry one). + /// The bit width of the largest district id in this frame, or `None` for `TwoDelta` + /// (which doesn't carry one). pub fn max_val_bit_count(&self) -> Option { match self { Self::Standard { @@ -206,8 +195,8 @@ impl BenDecodeFrame { } } - /// The bit width of the largest run length, or `None` for `TwoDelta` - /// (whose width sat in the wire format but is not retained on decode). + /// The bit width of the largest run length, or `None` for `TwoDelta` (whose width sat in the + /// wire format but is not retained on decode). pub fn max_len_bit_count(&self) -> Option { match self { Self::Standard { @@ -220,8 +209,7 @@ impl BenDecodeFrame { } } - /// The number of payload bytes for `Standard`/`MkvChain`, or `None` for - /// `TwoDelta`. + /// The number of payload bytes for `Standard`/`MkvChain`, or `None` for `TwoDelta`. pub fn n_bytes(&self) -> Option { match self { Self::Standard { n_bytes, .. } | Self::MkvChain { n_bytes, .. } => Some(*n_bytes), @@ -229,8 +217,7 @@ impl BenDecodeFrame { } } - /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for - /// the snapshot arms. + /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for the snapshot arms. pub fn pair(&self) -> Option<(u16, u16)> { match self { Self::TwoDelta { pair, .. } => Some(*pair), @@ -238,8 +225,8 @@ impl BenDecodeFrame { } } - /// Borrow the alternating run-length vector for a `TwoDelta` frame, or - /// `None` for the snapshot arms. + /// Borrow the alternating run-length vector for a `TwoDelta` frame, or `None` for the snapshot + /// arms. pub fn run_lengths(&self) -> Option<&[u16]> { match self { Self::TwoDelta { run_lengths, .. } => Some(run_lengths), @@ -249,11 +236,12 @@ impl BenDecodeFrame { /// Materialize the frame as a full assignment vector. /// - /// `Standard` and `MkvChain` ignore `prev` (any owned vector is dropped). - /// `TwoDelta` consumes `prev` in place to apply the delta and returns an - /// error if `prev` is `None`. + /// `Standard` and `MkvChain` ignore `prev` (any owned vector is dropped). `TwoDelta` consumes + /// `prev` in place to apply the delta and returns an error if `prev` is `None`. pub fn expand(&self, prev: Option>) -> io::Result> { - use crate::codec::decode::{apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError}; + use crate::codec::decode::{ + apply_twodelta_runs_to_assignment, decode_ben_line, DecodeError, + }; use crate::util::rle::rle_to_vec; use std::io::Cursor; diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs index 5836af4..890126d 100644 --- a/ben/src/codec/frames/encode.rs +++ b/ben/src/codec/frames/encode.rs @@ -4,10 +4,9 @@ use crate::BenVariant; /// One sample's encoded bytes at the frame layer. /// -/// Variants mirror [`BenVariant`]: a stream's variant tag dictates which arm -/// each frame in the stream uses. Encode-side arms carry the source RLE runs -/// (or run-length vector for `TwoDelta`) alongside the serialized `raw_bytes`, -/// because frames on this side are built *from* runs. +/// Variants mirror [`BenVariant`]: a stream's variant tag dictates which arm each frame in the +/// stream uses. Encode-side arms carry the source RLE runs (or run-length vector for `TwoDelta`) +/// alongside the serialized `raw_bytes`, because frames on this side are built *from* runs. #[derive(Debug, Clone, PartialEq, Eq)] pub enum BenEncodeFrame { /// A `Standard`-variant frame. No trailing repetition count on the wire. @@ -38,18 +37,16 @@ pub enum BenEncodeFrame { /// The number of times this frame repeats. count: u16, }, - /// A `TwoDelta`-variant frame: a delta over `pair` with alternating run - /// lengths. Carries a trailing `u16` repetition count. + /// A `TwoDelta`-variant frame: a delta over `pair` with alternating run lengths. Carries a + /// trailing `u16` repetition count. TwoDelta { - /// The pair of district ids encoded in this frame. - /// `pair.0` corresponds to the first run. + /// The pair of district ids encoded in this frame. `pair.0` corresponds to the first run. pair: (u16, u16), /// The number of bits used to encode the maximum run length. max_len_bit_count: u8, /// The number of bytes in the packed payload. n_bytes: u32, - /// The alternating run-length vector over the positions occupied by - /// the pair. + /// The alternating run-length vector over the positions occupied by the pair. run_length_vector: Vec, /// The full serialized TwoDelta frame bytes (header + payload + count). raw_bytes: Vec, @@ -65,8 +62,8 @@ impl BenEncodeFrame { /// /// # Panics /// - /// Panics if `variant` is [`BenVariant::TwoDelta`]; use - /// [`BenEncodeFrame::from_run_lengths`] for that. + /// Panics if `variant` is [`BenVariant::TwoDelta`]; use [`BenEncodeFrame::from_run_lengths`] + /// for that. pub fn from_rle(runs: Vec<(u16, u16)>, variant: BenVariant, count: Option) -> Self { let (max_val, max_len) = runs .iter() @@ -112,8 +109,8 @@ impl BenEncodeFrame { /// /// # Panics /// - /// Panics if `variant` is [`BenVariant::TwoDelta`]; TwoDelta frames cannot - /// be derived from a single assignment vector. + /// Panics if `variant` is [`BenVariant::TwoDelta`]; TwoDelta frames cannot be derived from a + /// single assignment vector. pub fn from_assignment( assignment: impl AsRef<[u16]>, variant: BenVariant, @@ -178,13 +175,11 @@ impl BenEncodeFrame { } } - /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a - /// raw payload. + /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload. /// - /// This is the inverse of [`BenEncodeFrame::from_run_lengths`]: it - /// re-assembles the serialized bytes and decodes the bit-packed payload - /// back into the run-length vector so that both representations are - /// available on the resulting frame. + /// This is the inverse of [`BenEncodeFrame::from_run_lengths`]: it re-assembles the serialized + /// bytes and decodes the bit-packed payload back into the run-length vector so that both + /// representations are available on the resulting frame. pub fn from_parts( pair: (u16, u16), max_len_bit_count: u8, @@ -251,26 +246,20 @@ impl BenEncodeFrame { } } - /// Borrow just the packed payload bytes (the variant-specific region - /// between the frame header and any trailing count). + /// Borrow just the packed payload bytes (the variant-specific region between the frame header + /// and any trailing count). /// /// Returns the payload slice for any well-formed frame. pub fn payload(&self) -> &[u8] { match self { Self::Standard { - n_bytes, - raw_bytes, - .. + n_bytes, raw_bytes, .. } | Self::MkvChain { - n_bytes, - raw_bytes, - .. + n_bytes, raw_bytes, .. } => &raw_bytes[6..6 + *n_bytes as usize], Self::TwoDelta { - n_bytes, - raw_bytes, - .. + n_bytes, raw_bytes, .. } => &raw_bytes[9..9 + *n_bytes as usize], } } @@ -292,8 +281,8 @@ impl BenEncodeFrame { } } - /// The bit width of the largest district id in this frame, or `None` for - /// `TwoDelta` (which doesn't carry one). + /// The bit width of the largest district id in this frame, or `None` for `TwoDelta` + /// (which doesn't carry one). pub fn max_val_bit_count(&self) -> Option { match self { Self::Standard { @@ -330,8 +319,7 @@ impl BenEncodeFrame { } } - /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for - /// the snapshot arms. + /// The pair of district ids encoded by a `TwoDelta` frame, or `None` for the snapshot arms. pub fn pair(&self) -> Option<(u16, u16)> { match self { Self::TwoDelta { pair, .. } => Some(*pair), @@ -339,8 +327,8 @@ impl BenEncodeFrame { } } - /// Borrow the source RLE runs for `Standard` and `MkvChain`, or `None` - /// for `TwoDelta` (which carries `run_length_vector` instead). + /// Borrow the source RLE runs for `Standard` and `MkvChain`, or `None` for `TwoDelta` + /// (which carries `run_length_vector` instead). pub fn runs(&self) -> Option<&[(u16, u16)]> { match self { Self::Standard { runs, .. } | Self::MkvChain { runs, .. } => Some(runs), @@ -348,8 +336,8 @@ impl BenEncodeFrame { } } - /// Borrow the alternating run-length vector for a `TwoDelta` frame, or - /// `None` for the snapshot arms. + /// Borrow the alternating run-length vector for a `TwoDelta` frame, or `None` for the snapshot + /// arms. pub fn run_length_vector(&self) -> Option<&[u16]> { match self { Self::TwoDelta { diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index 489cbe6..4276680 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -1,15 +1,12 @@ //! Frame-layer types — one sample's encoded bytes. //! -//! See `docs/glossary.md` for the encoding-stack layering. This module owns -//! layer 2 (frame). Each direction is a single enum whose arms mirror -//! [`crate::BenVariant`]: +//! See `docs/glossary.md` for the encoding-stack layering. This module owns layer 2 (frame). Each +//! direction is a single enum whose arms mirror [`crate::BenVariant`]: //! -//! - [`BenEncodeFrame`] is built **from** RLE runs (or a pair + run-length -//! vector for the `TwoDelta` arm) and carries the source representation -//! alongside the serialized bytes. -//! - [`BenDecodeFrame`] is built **from** wire bytes and keeps the bit-packed -//! payload opaque on `Standard`/`MkvChain` arms so frame-level subsampling -//! stays cheap (no eager bit-unpacking). +//! - [`BenEncodeFrame`] is built **from** RLE runs (or a pair + run-length vector for the +//! `TwoDelta` arm) and carries the source representation alongside the serialized bytes. +//! - [`BenDecodeFrame`] is built **from** wire bytes and keeps the bit-packed payload opaque on +//! `Standard`/`MkvChain` arms so frame-level subsampling stays cheap (no eager bit-unpacking). mod decode; mod encode; diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index a80645b..25c1036 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -67,7 +67,13 @@ fn unwrap_encode_standard(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u3 max_len_bit_count, n_bytes, raw_bytes, - } => (runs, max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes), + } => ( + runs, + max_val_bit_count, + max_len_bit_count, + n_bytes, + raw_bytes, + ), other => panic!("expected Standard encode arm, got {:?}", other), } } @@ -93,9 +99,7 @@ fn unwrap_encode_mkv(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u32, Ve } } -fn unwrap_encode_twodelta( - frame: BenEncodeFrame, -) -> ((u16, u16), u8, u32, Vec, Vec, u16) { +fn unwrap_encode_twodelta(frame: BenEncodeFrame) -> ((u16, u16), u8, u32, Vec, Vec, u16) { match frame { BenEncodeFrame::TwoDelta { pair, @@ -355,11 +359,9 @@ fn twodelta_from_run_lengths_count_none_defaults_to_one() { fn twodelta_from_run_lengths_then_from_parts_roundtrip() { let original = BenEncodeFrame::from_run_lengths((3, 4), vec![5, 5, 5], Some(2)); let bytes = original.as_slice().to_vec(); - let (pair, max_len_bits, n_bytes, _, _, count) = - unwrap_encode_twodelta(original.clone()); + let (pair, max_len_bits, n_bytes, _, _, count) = unwrap_encode_twodelta(original.clone()); let payload_slice = &bytes[9..9 + n_bytes as usize]; - let rebuilt = - BenEncodeFrame::from_parts(pair, max_len_bits, payload_slice.to_vec(), count); + let rebuilt = BenEncodeFrame::from_parts(pair, max_len_bits, payload_slice.to_vec(), count); let (rb_pair, _, _, rb_runs, _, rb_count) = unwrap_encode_twodelta(rebuilt); assert_eq!(rb_pair, pair); assert_eq!(rb_runs, vec![5, 5, 5]); @@ -619,11 +621,7 @@ fn encode_partial_eq_vec_both_directions() { #[test] fn decode_expand_standard_assignment() { // An assignment of [1, 1, 2, 2, 3] becomes RLE [(1,2),(2,2),(3,1)]. - let encoded = BenEncodeFrame::from_assignment( - &[1u16, 1, 2, 2, 3], - BenVariant::Standard, - None, - ); + let encoded = BenEncodeFrame::from_assignment(&[1u16, 1, 2, 2, 3], BenVariant::Standard, None); let mut cursor = io::Cursor::new(encoded.into_bytes()); let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index b67efe3..01a9729 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -1,10 +1,9 @@ //! Translation helpers between BEN and ben32 representations. //! -//! The ben32 intermediate format is used only by the Standard and MkvChain -//! variants. TwoDelta streams use a separate columnar layout and bypass -//! ben32 entirely — see [`BenStreamWriter`](crate::io::writer::BenStreamWriter) -//! and [`BenStreamReader`](crate::io::reader::BenStreamReader) for the -//! TwoDelta compressed-I/O path. +//! The ben32 intermediate format is used only by the Standard and MkvChain variants. TwoDelta +//! streams use a separate columnar layout and bypass ben32 entirely — see +//! [`BenStreamWriter`](crate::io::writer::BenStreamWriter) and +//! [`BenStreamReader`](crate::io::reader::BenStreamReader) for the TwoDelta compressed-I/O path. mod errors; use errors::TranslateError; @@ -22,18 +21,14 @@ use crate::{BenVariant, XBenVariant}; /// # Arguments /// /// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator. -/// * `variant` - The BEN32-supporting variant. Determines whether the resulting -/// BEN frame embeds a trailing repetition count. +/// * `variant` - The BEN32-supporting variant. Determines whether the resulting BEN frame embeds a +/// trailing repetition count. /// * `count` - The repetition count for `MkvChain`. Ignored for `Standard`. /// /// # Returns /// /// Returns the encoded BEN frame payload and header. -fn ben32_to_ben_line( - ben32_vec: Vec, - variant: XBenVariant, - count: u16, -) -> io::Result> { +fn ben32_to_ben_line(ben32_vec: Vec, variant: XBenVariant, count: u16) -> io::Result> { let mut buffer = [0u8; 4]; let mut ben32_rle: Vec<(u16, u16)> = Vec::new(); @@ -69,17 +64,17 @@ fn ben32_to_ben_line( /// Translate a stream of ben32 frames into BEN frames. /// -/// This is primarily used while decoding XBEN, where the compressed payload is -/// stored in ben32 form. Parameterised by [`XBenVariant`] so TwoDelta is -/// excluded at compile time; TwoDelta streams use a different compressed -/// layout and do not pass through ben32 (see the module-level documentation). +/// This is primarily used while decoding XBEN, where the compressed payload is stored in ben32 +/// form. Parameterised by [`XBenVariant`] so TwoDelta is excluded at compile time; TwoDelta streams +/// use a different compressed layout and do not pass through ben32 (see the module-level +/// documentation). /// /// # Arguments /// /// * `reader` - The ben32 input stream. /// * `writer` - The destination for the translated BEN frames. -/// * `variant` - The BEN32-supporting variant, used to determine whether -/// repetition counts follow each ben32 frame. +/// * `variant` - The BEN32-supporting variant, used to determine whether repetition counts follow +/// each ben32 frame. /// /// # Returns /// @@ -156,17 +151,16 @@ fn ben_to_ben32_line( /// Translate a BEN stream into ben32 frames. /// -/// This is the format used inside XBEN after the outer XZ compression layer is -/// removed. Parameterised by [`XBenVariant`] so TwoDelta is excluded at compile -/// time; TwoDelta streams use a separate columnar layout and bypass ben32 -/// entirely (see the module-level documentation). +/// This is the format used inside XBEN after the outer XZ compression layer is removed. +/// Parameterised by [`XBenVariant`] so TwoDelta is excluded at compile time; TwoDelta streams use a +/// separate columnar layout and bypass ben32 entirely (see the module-level documentation). /// /// # Arguments /// /// * `reader` - The BEN input stream without its 17-byte file banner. /// * `writer` - The destination for the translated ben32 frames. -/// * `variant` - The BEN32-supporting variant, used to determine whether -/// repetition counts follow each translated frame. +/// * `variant` - The BEN32-supporting variant, used to determine whether repetition counts follow +/// each translated frame. /// /// # Returns /// diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index a4c5162..ac0d335 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -236,10 +236,9 @@ fn test_random_translation_ben_to_ben32() { #[test] fn test_ben_to_ben32_lines_non_eof_error_on_frame_boundary() { - // Provide a valid BEN frame followed by a read that errors with a non-EOF - // error at exactly the point where the next frame's first byte would be read. - // This exercises the `return Err(e)` branch (line ~191) in the - // `read_exact → match → Err(e) → not UnexpectedEof` path. + // Provide a valid BEN frame followed by a read that errors with a non-EOF error at exactly the + // point where the next frame's first byte would be read. This exercises the `return Err(e)` + // branch (line ~191) in the `read_exact → match → Err(e) → not UnexpectedEof` path. struct FailOnSecondFrame { data: Vec, pos: usize, @@ -249,7 +248,10 @@ fn test_ben_to_ben32_lines_non_eof_error_on_frame_boundary() { impl Read for FailOnSecondFrame { fn read(&mut self, buf: &mut [u8]) -> io::Result { if self.pos >= self.frame_boundary { - return Err(io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke on boundary")); + return Err(io::Error::new( + io::ErrorKind::BrokenPipe, + "pipe broke on boundary", + )); } let available = (self.frame_boundary - self.pos).min(buf.len()); let end = self.pos + available; @@ -290,7 +292,8 @@ fn test_ben32_to_ben_line_rejects_invalid_length() { #[test] fn test_ben32_to_ben_line_rejects_missing_terminator() { - let err = ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1], XBenVariant::Standard, 0).unwrap_err(); + let err = + ben32_to_ben_line(vec![0, 1, 0, 2, 0, 0, 0, 1], XBenVariant::Standard, 0).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), diff --git a/ben/src/io/bundle/error.rs b/ben/src/io/bundle/error.rs index e941708..5d13569 100644 --- a/ben/src/io/bundle/error.rs +++ b/ben/src/io/bundle/error.rs @@ -1,17 +1,14 @@ //! Read-side error types for `.bendl` bundles. //! -//! [`BendlReadError`] is the canonical error type for high-level BENDL -//! convenience APIs (anything that returns an owned value: `asset_bytes`, -//! reader constructors that consume internally, etc.). Returned `Read` -//! / iterator / stream-wrapper values keep their native `io::Result` -//! surface; checksum failures on those paths are carried as -//! `io::ErrorKind::InvalidData` with an inner [`ChecksumError`] that -//! callers can downcast. +//! [`BendlReadError`] is the canonical error type for high-level BENDL convenience APIs (anything +//! that returns an owned value: `asset_bytes`, reader constructors that consume internally, etc.). +//! Returned `Read` / iterator / stream-wrapper values keep their native `io::Result` surface; +//! checksum failures on those paths are carried as `io::ErrorKind::InvalidData` with an inner +//! [`ChecksumError`] that callers can downcast. //! -//! Variant discipline is held at the wrap site, not by the type system: -//! `Io(io::Error)` and `Decode(io::Error)` carry the same payload type, -//! so a future refactor could accidentally wrap a decoder-runtime error -//! as `Io(_)` and the type system would not notice. The error-discipline +//! Variant discipline is held at the wrap site, not by the type system: `Io(io::Error)` and +//! `Decode(io::Error)` carry the same payload type, so a future refactor could accidentally wrap a +//! decoder-runtime error as `Io(_)` and the type system would not notice. The error-discipline //! tests pin which variant fires for each representative read path. use std::fmt; @@ -44,13 +41,12 @@ impl fmt::Display for ChecksumTarget { /// /// Variant precedence is scoped per checksum domain, not global: /// -/// - **Asset checksum:** `Unavailable` > `Mismatch`. The directory entry -/// is authoritative regardless of bundle finalization, so -/// `verify_asset_checksum` never returns `BundleIncomplete`. -/// - **Stream checksum:** `BundleIncomplete` > `Unavailable` > `Mismatch`. -/// The stream's stored CRC depends on `stream_len` being authoritative, -/// which only holds after finalization, so an unfinalized bundle -/// short-circuits to `BundleIncomplete` before the flag is inspected. +/// - **Asset checksum:** `Unavailable` > `Mismatch`. The directory entry is authoritative +/// regardless of bundle finalization, so `verify_asset_checksum` never returns +/// `BundleIncomplete`. +/// - **Stream checksum:** `BundleIncomplete` > `Unavailable` > `Mismatch`. The stream's stored CRC +/// depends on `stream_len` being authoritative, which only holds after finalization, so an +/// unfinalized bundle short-circuits to `BundleIncomplete` before the flag is inspected. #[derive(Debug, Error)] pub enum ChecksumError { /// The computed CRC32C did not match the stored value. @@ -66,20 +62,18 @@ pub enum ChecksumError { expected: u32, }, - /// The relevant checksum-presence flag (`ASSET_FLAG_CHECKSUM` on a - /// directory entry, or the stream-level equivalent on the header) - /// was clear; there is no stored checksum to verify against. The - /// library writer always sets these flags, so this fires only for - /// foreign or hand-built bytes. + /// The relevant checksum-presence flag (`ASSET_FLAG_CHECKSUM` on a directory entry, or the + /// stream-level equivalent on the header) was clear; there is no stored checksum to verify + /// against. The library writer always sets these flags, so this fires only for foreign or + /// hand-built bytes. #[error("checksum is unavailable for {target}")] Unavailable { /// Which region lacks a stored checksum. target: ChecksumTarget, }, - /// The bundle is not finalized, so the stored checksum is not - /// authoritative. Stream-only: asset-checksum APIs never produce - /// this variant because directory entries are authoritative + /// The bundle is not finalized, so the stored checksum is not authoritative. Stream-only: + /// asset-checksum APIs never produce this variant because directory entries are authoritative /// regardless of bundle finalization. #[error("bundle is unfinalized; {target} checksum is not authoritative yet")] BundleIncomplete { @@ -88,32 +82,28 @@ pub enum ChecksumError { }, } -/// High-level error returned by BENDL convenience APIs that consume -/// internally before producing an owned value. +/// High-level error returned by BENDL convenience APIs that consume internally before producing an +/// owned value. /// -/// See [`crate::io::bundle::reader::BendlReader`] for the variant rules -/// per API. The variant discipline is held at the wrap site (where each -/// underlying error becomes a `BendlReadError`); the type system alone -/// cannot prevent a future refactor from mis-wrapping a codec error as -/// `Io` or a header parse failure as `DecoderInit`. The "variant -/// discipline" tests pin which variant fires for each representative -/// read path. +/// See [`crate::io::bundle::reader::BendlReader`] for the variant rules per API. The variant +/// discipline is held at the wrap site (where each underlying error becomes a `BendlReadError`); +/// the type system alone cannot prevent a future refactor from mis-wrapping a codec error as `Io` +/// or a header parse failure as `DecoderInit`. The "variant discipline" tests pin which variant +/// fires for each representative read path. #[derive(Debug, Error)] pub enum BendlReadError { - /// Underlying I/O failure at the bundle layer (seek, range read, - /// filesystem). Never used to carry codec or checksum failures. + /// Underlying I/O failure at the bundle layer (seek, range read, filesystem). Never used to + /// carry codec or checksum failures. #[error("IO error: {0}")] Io(io::Error), - /// A format-layer error. Reserved for higher-level APIs that wrap - /// an `open` failure or for future lazy-validation paths; normal - /// post-open accessors should not produce this from + /// A format-layer error. Reserved for higher-level APIs that wrap an `open` failure or for + /// future lazy-validation paths; normal post-open accessors should not produce this from /// header/directory structure. #[error("bundle format error: {0}")] Format(BendlFormatError), - /// Checksum verification failed, was unavailable, or could not be - /// authoritatively performed. + /// Checksum verification failed, was unavailable, or could not be authoritatively performed. #[error("checksum error: {0}")] Checksum(#[from] ChecksumError), @@ -121,9 +111,8 @@ pub enum BendlReadError { #[error("decoder init error: {0}")] DecoderInit(DecoderInitError), - /// A codec error raised while a BEN/XBEN/xz decoder was already - /// running (malformed compressed payload, malformed assignment - /// stream, etc.). + /// A codec error raised while a BEN/XBEN/xz decoder was already running (malformed compressed + /// payload, malformed assignment stream, etc.). #[error("decode error: {0}")] Decode(io::Error), } @@ -136,9 +125,8 @@ impl From for BendlReadError { impl From for BendlReadError { fn from(e: BendlFormatError) -> Self { - // BendlFormatError already carries an `Io` arm; unwrap it so - // that ordinary I/O failures at the format layer surface as - // `BendlReadError::Io` rather than getting buried inside a + // BendlFormatError already carries an `Io` arm; unwrap it so that ordinary I/O failures at + // the format layer surface as `BendlReadError::Io` rather than getting buried inside a // synthetic `Format` wrap. match e { BendlFormatError::Io(io) => BendlReadError::Io(io), diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index bc190f4..00c817c 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -1,9 +1,8 @@ //! Binary header and directory definitions for the `.bendl` container. //! -//! This module is the pure format layer: it defines the on-disk byte -//! layout, the associated constants, and the encode/decode helpers that -//! convert between in-memory Rust structs and their on-disk representation. -//! There is no I/O orchestration here — higher layers (`reader`, `writer`) +//! This module is the pure format layer: it defines the on-disk byte layout, the associated +//! constants, and the encode/decode helpers that convert between in-memory Rust structs and their +//! on-disk representation. There is no I/O orchestration here — higher layers (`reader`, `writer`) //! combine these primitives with seekable files. //! //! All multi-byte integers in the `.bendl` format are little-endian. @@ -43,9 +42,8 @@ pub const ASSIGNMENT_FORMAT_XBEN: u8 = 2; /// Container format of the embedded assignment stream. /// -/// The BEN *variant* (`Standard`, `MkvChain`, `TwoDelta`) is carried by -/// the 17-byte banner at the start of the embedded stream and is not -/// duplicated in the bundle header. +/// The BEN *variant* (`Standard`, `MkvChain`, `TwoDelta`) is carried by the 17-byte banner at the +/// start of the embedded stream and is not duplicated in the bundle header. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AssignmentFormat { /// Uncompressed BEN byte stream. @@ -93,8 +91,8 @@ pub const STANDARDIZED_NAME_GRAPH: &str = "graph.json"; /// Standardized name for the `node_permutation_map.json` asset. pub const STANDARDIZED_NAME_NODE_PERMUTATION_MAP: &str = "node_permutation_map.json"; -/// Return the standardized name reserved for a known singleton asset type, -/// or `None` for custom or unknown types. +/// Return the standardized name reserved for a known singleton asset type, or `None` for custom or +/// unknown types. pub fn standardized_name_for(asset_type: u16) -> Option<&'static str> { match asset_type { ASSET_TYPE_METADATA => Some(STANDARDIZED_NAME_METADATA), @@ -106,9 +104,8 @@ pub fn standardized_name_for(asset_type: u16) -> Option<&'static str> { /// One of the known singleton asset types reserved by the bundle format. /// -/// Each variant carries a fixed `asset_type` integer and a fixed -/// standardized name. Custom assets (writer-chosen name, multiple allowed) -/// are not represented here. +/// Each variant carries a fixed `asset_type` integer and a fixed standardized name. Custom assets +/// (writer-chosen name, multiple allowed) are not represented here. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum KnownAssetKind { Metadata, @@ -136,28 +133,25 @@ impl KnownAssetKind { } } -/// Return whether a given asset type should default to xz compression -/// when the writer is not given an explicit compression option. +/// Return whether a given asset type should default to xz compression when the writer is not given +/// an explicit compression option. pub fn default_compresses_by_type(asset_type: u16) -> bool { matches!(asset_type, ASSET_TYPE_GRAPH) } /// Asset flag bit: the decoded payload is UTF-8 JSON. pub const ASSET_FLAG_JSON: u16 = 1 << 0; -/// Asset flag bit: the stored payload is xz-compressed. The `payload_len` -/// directory field refers to the compressed size on disk. +/// Asset flag bit: the stored payload is xz-compressed. The `payload_len` directory field refers to +/// the compressed size on disk. pub const ASSET_FLAG_XZ: u16 = 1 << 1; /// Asset flag bit: the entry carries a trailing checksum. /// -/// When set, the trailing checksum is exactly four little-endian bytes -/// containing a CRC32C (Castagnoli polynomial) over the **on-disk -/// payload bytes** (`payload_offset..payload_offset + payload_len`). -/// For an xz-compressed asset the CRC is over the compressed bytes, -/// not the decompressed content — verification happens before -/// decompression. Library writer paths always set this flag with -/// `checksum_len == [`ASSET_CHECKSUM_LEN`]`; readers reject any entry -/// where the flag and `checksum_len` are inconsistent (see -/// [`BendlDirectoryEntry::read_from`]). +/// When set, the trailing checksum is exactly four little-endian bytes containing a CRC32C +/// (Castagnoli polynomial) over the **on-disk payload bytes** (`payload_offset..payload_offset + +/// payload_len`). For an xz-compressed asset the CRC is over the compressed bytes, not the +/// decompressed content — verification happens before decompression. Library writer paths always +/// set this flag with `checksum_len == [`ASSET_CHECKSUM_LEN`]`; readers reject any entry where the +/// flag and `checksum_len` are inconsistent (see [`BendlDirectoryEntry::read_from`]). pub const ASSET_FLAG_CHECKSUM: u16 = 1 << 2; /// On-disk byte width of an asset-payload CRC32C. @@ -165,8 +159,8 @@ pub const ASSET_CHECKSUM_LEN: u32 = 4; /// Default xz preset level used when compressing asset payloads. /// -/// Level 6 matches the `xz` CLI's own default and `xz2::XzEncoder::new`'s -/// default, and is a reasonable ratio/speed balance for JSON payloads. +/// Level 6 matches the `xz` CLI's own default and `xz2::XzEncoder::new`'s default, and is a +/// reasonable ratio/speed balance for JSON payloads. pub const DEFAULT_XZ_PRESET: u32 = 6; // --------------------------------------------------------------------------- @@ -190,9 +184,8 @@ pub struct BendlHeader { pub reserved_0: u16, /// Bundle-level feature flags. pub flags: u64, - /// Absolute byte offset of the directory table, or `0` if no directory - /// has been written yet. In a finalized bundle the directory lives at - /// the end of the file. + /// Absolute byte offset of the directory table, or `0` if no directory has been written yet. In + /// a finalized bundle the directory lives at the end of the file. pub directory_offset: u64, /// Byte length of the directory table, or `0` if absent. pub directory_len: u64, @@ -200,8 +193,7 @@ pub struct BendlHeader { pub stream_offset: u64, /// Byte length of the assignment stream, or `0` if unfinalized. pub stream_len: u64, - /// Number of expanded samples in the assignment stream, or `-1` if - /// unfinalized. + /// Number of expanded samples in the assignment stream, or `-1` if unfinalized. pub sample_count: i64, } @@ -302,8 +294,8 @@ impl BendlHeader { // Directory entry // --------------------------------------------------------------------------- -/// Fixed-size header at the start of every directory entry, before the -/// variable-length `name` and optional `checksum` bytes. +/// Fixed-size header at the start of every directory entry, before the variable-length `name` and +/// optional `checksum` bytes. pub const DIRECTORY_ENTRY_HEADER_SIZE: usize = 28; /// In-memory representation of a single directory entry. @@ -317,8 +309,8 @@ pub struct BendlDirectoryEntry { pub name: String, /// Absolute file offset of the asset payload. pub payload_offset: u64, - /// Byte length of the asset payload as stored on disk (post-compression - /// when the xz flag is set). + /// Byte length of the asset payload as stored on disk (post-compression when the xz flag is + /// set). pub payload_len: u64, /// Optional trailing checksum bytes. Interpretation depends on flags. pub checksum: Option>, @@ -411,8 +403,8 @@ impl BendlDirectoryEntry { /// (flag set, 4 bytes). /// /// This is the canonical accessor for verification code. Returns `None` for entries with - /// `ASSET_FLAG_CHECKSUM` clear; entries where the flag and length are inconsistent are - /// rejected at read time and so cannot reach this method. + /// `ASSET_FLAG_CHECKSUM` clear; entries where the flag and length are inconsistent are rejected + /// at read time and so cannot reach this method. pub fn checksum_u32(&self) -> Option { if self.asset_flags & ASSET_FLAG_CHECKSUM == 0 { return None; @@ -431,10 +423,9 @@ impl BendlDirectoryEntry { /// Read the full directory table from a `Read` source. /// -/// The source should be positioned at the first byte of the directory -/// table (i.e. at `header.directory_offset`) and is expected to contain -/// exactly `entry_count` entries followed by no trailing bytes within the -/// directory region. +/// The source should be positioned at the first byte of the directory table (i.e. at +/// `header.directory_offset`) and is expected to contain exactly `entry_count` entries followed by +/// no trailing bytes within the directory region. pub fn read_directory( reader: &mut R, ) -> Result, BendlFormatError> { diff --git a/ben/src/io/bundle/manifest.rs b/ben/src/io/bundle/manifest.rs index 1c41e50..f02cdd2 100644 --- a/ben/src/io/bundle/manifest.rs +++ b/ben/src/io/bundle/manifest.rs @@ -1,29 +1,27 @@ //! JSON metadata structs for the optional `metadata.json` asset. //! -//! The authoritative values for `major_version`, `minor_version`, -//! `assignment_format`, `complete`, and the BEN variant all live in the -//! fixed bundle header (or in the embedded stream banner for the variant). -//! The `metadata.json` asset is a best-effort human-readable mirror -//! intended for debugging and tooling; writers should prefer reading the -//! header directly rather than trusting fields in this struct. +//! The authoritative values for `major_version`, `minor_version`, `assignment_format`, `complete`, +//! and the BEN variant all live in the fixed bundle header (or in the embedded stream banner for +//! the variant). The `metadata.json` asset is a best-effort human-readable mirror intended for +//! debugging and tooling; writers should prefer reading the header directly rather than trusting +//! fields in this struct. use serde::{Deserialize, Serialize}; /// Serde representation of the optional `metadata.json` asset. /// -/// Field names mirror the header where possible so that the JSON is -/// easy to cross-reference against the binary layout. +/// Field names mirror the header where possible so that the JSON is easy to cross-reference against +/// the binary layout. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct BendlManifest { /// Incompatible-change version of the bundle format. pub major_version: u16, /// Additive version of the bundle format. pub minor_version: u16, - /// Container format of the embedded assignment stream - /// (`"ben"` or `"xben"`). + /// Container format of the embedded assignment stream (`"ben"` or `"xben"`). pub assignment_format: String, - /// BEN variant (`"standard"`, `"mkv_chain"`, or `"two_delta"`) as - /// carried by the embedded stream's 17-byte banner. + /// BEN variant (`"standard"`, `"mkv_chain"`, or `"two_delta"`) as carried by the embedded + /// stream's 17-byte banner. #[serde(skip_serializing_if = "Option::is_none", default)] pub variant: Option, /// Whether the bundle was finalized successfully. diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index e037860..bb4e642 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -1,16 +1,15 @@ //! `.bendl` single-file dataset container. //! -//! A `.bendl` file is a seekable container that wraps the existing BEN or -//! XBEN assignment stream together with optional front-loaded assets such -//! as a graph JSON, a relabel map, or a metadata blob. The directory table -//! that describes those assets lives at the end of the file so that new -//! assets can be appended to a finalized bundle in O(new asset size + -//! directory size) without rewriting the assignment stream. +//! A `.bendl` file is a seekable container that wraps the existing BEN or XBEN assignment stream +//! together with optional front-loaded assets such as a graph JSON, a relabel map, or a metadata +//! blob. The directory table that describes those assets lives at the end of the file so that new +//! assets can be appended to a finalized bundle in O(new asset size + directory size) without +//! rewriting the assignment stream. //! //! The module is organised as: //! -//! - [`format`] — binary header and directory entry types, constants, and -//! encode/decode helpers. Pure functions over byte buffers; no I/O. +//! - [`format`] — binary header and directory entry types, constants, and encode/decode helpers. +//! Pure functions over byte buffers; no I/O. //! - [`manifest`] — serde structs for the optional `metadata.json` asset. pub mod error; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index ba43f1a..3d377cc 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -1,25 +1,21 @@ //! Read-only inspection of `.bendl` files. //! -//! A [`BendlReader`] parses a bundle's fixed header and (if present) its -//! trailing directory table. It does not read any asset payload bytes -//! until the caller explicitly requests them via [`BendlReader::asset_bytes`] -//! or [`BendlReader::asset_reader`]. The assignment stream region is -//! likewise exposed as a byte range the caller can plumb into a -//! [`BenStreamReader`] without this module reinterpreting any BEN/XBEN -//! internals. +//! A [`BendlReader`] parses a bundle's fixed header and (if present) its trailing directory table. +//! It does not read any asset payload bytes until the caller explicitly requests them via +//! [`BendlReader::asset_bytes`] or [`BendlReader::asset_reader`]. The assignment stream region is +//! likewise exposed as a byte range the caller can plumb into a [`BenStreamReader`] without this +//! module reinterpreting any BEN/XBEN internals. //! //! ## Verification surface //! -//! - [`BendlReader::asset_bytes`] and [`BendlReader::asset_reader`] are -//! **verify-on-touch**: the CRC32C of the on-disk payload bytes is -//! computed as data flows through, and a mismatch is reported at EOF. -//! - [`BendlReader::asset_bytes_unverified`], [`BendlReader::asset_reader_unverified`], -//! and [`BendlReader::asset_payload_reader_unverified`] are the -//! explicit recovery/debug escape hatches; they never surface a -//! [`ChecksumError`]. -//! - [`BendlReader::verify_asset_checksum`] and -//! [`BendlReader::verify_all_asset_checksums`] are explicit raw-bytes -//! verifiers (no decoding) that do not return decoded payload bytes. +//! - [`BendlReader::asset_bytes`] and [`BendlReader::asset_reader`] are **verify-on-touch**: the +//! CRC32C of the on-disk payload bytes is computed as data flows through, and a mismatch is +//! reported at EOF. +//! - [`BendlReader::asset_bytes_unverified`], [`BendlReader::asset_reader_unverified`], and +//! [`BendlReader::asset_payload_reader_unverified`] are the explicit recovery/debug escape +//! hatches; they never surface a [`ChecksumError`]. +//! - [`BendlReader::verify_asset_checksum`] and [`BendlReader::verify_all_asset_checksums`] are +//! explicit raw-bytes verifiers (no decoding) that do not return decoded payload bytes. use std::io::{self, Read, Seek, SeekFrom, Take}; @@ -49,11 +45,10 @@ pub struct BendlReader { } impl BendlReader { - /// Open a `.bendl` file by validating its header and loading the - /// directory table if one exists. + /// Open a `.bendl` file by validating its header and loading the directory table if one exists. /// - /// The underlying reader is left at an unspecified position; callers - /// should seek explicitly before reading asset or stream bytes. + /// The underlying reader is left at an unspecified position; callers should seek explicitly + /// before reading asset or stream bytes. pub fn open(mut inner: R) -> Result { inner.seek(SeekFrom::Start(0))?; let header = BendlHeader::read_from(&mut inner)?; @@ -90,8 +85,8 @@ impl BendlReader { self.header.is_finalized() } - /// The sample count recorded in the header, or `None` if not - /// authoritative (i.e. the bundle is still incomplete). + /// The sample count recorded in the header, or `None` if not authoritative (i.e. the bundle is + /// still incomplete). pub fn sample_count(&self) -> Option { if self.header.is_finalized() { Some(self.header.sample_count) @@ -115,20 +110,18 @@ impl BendlReader { self.directory.iter().find(|e| e.name == name) } - /// Look up the unique directory entry with the given asset type, if - /// any. Singleton types (`metadata.json`, `graph.json`, - /// `node_permutation_map.json`) use this to grab their payload without caring - /// about the standardized name. + /// Look up the unique directory entry with the given asset type, if any. Singleton types + /// (`metadata.json`, `graph.json`, `node_permutation_map.json`) use this to grab their payload + /// without caring about the standardized name. pub fn find_asset_by_type(&self, asset_type: u16) -> Option<&BendlDirectoryEntry> { self.directory.iter().find(|e| e.asset_type == asset_type) } /// Return the byte range occupied by the assignment stream. /// - /// For finalized bundles this is `(stream_offset, stream_len)` as - /// recorded in the header. For incomplete bundles the end of the - /// stream is taken as EOF (or the directory start, if a provisional - /// directory was written). + /// For finalized bundles this is `(stream_offset, stream_len)` as recorded in the header. For + /// incomplete bundles the end of the stream is taken as EOF (or the directory start, if a + /// provisional directory was written). pub fn assignment_stream_range(&mut self) -> io::Result<(u64, u64)> { if self.header.is_finalized() { Ok((self.header.stream_offset, self.header.stream_len)) @@ -143,23 +136,21 @@ impl BendlReader { } } - /// Return a `Take` reader positioned at the start of the assignment - /// stream and limited to its declared length. The caller is expected - /// to wrap the returned reader in a [`BenStreamReader`] (via - /// [`BendlReader::open_assignment_reader`] or directly) as - /// appropriate for [`BendlReader::assignment_format`]. + /// Return a `Take` reader positioned at the start of the assignment stream and limited to its + /// declared length. The caller is expected to wrap the returned reader in a [`BenStreamReader`] + /// (via [`BendlReader::open_assignment_reader`] or directly) as appropriate for + /// [`BendlReader::assignment_format`]. pub fn assignment_stream_reader(&mut self) -> io::Result> { let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; Ok((&mut self.inner).take(len)) } - /// Construct the appropriate assignment decoder for the bundle's - /// declared `assignment_format` and return it as a [`BenStreamReader`] - /// over the bundle's bounded stream region. + /// Construct the appropriate assignment decoder for the bundle's declared `assignment_format` + /// and return it as a [`BenStreamReader`] over the bundle's bounded stream region. /// - /// Returns an error if the header's `assignment_format` field is - /// unrecognized or the embedded banner is malformed. + /// Returns an error if the header's `assignment_format` field is unrecognized or the embedded + /// banner is malformed. pub fn open_assignment_reader( &mut self, ) -> Result>, BundleAssignmentReaderError> { @@ -180,21 +171,20 @@ impl BendlReader { /// Read the fully-decoded bytes of an asset by directory entry, verifying its CRC32C before /// returning. /// - /// **Contract:** this is exactly `asset_reader(entry)? then read_to_end` — one behavioral - /// path shared with the streaming API so the two cannot drift apart. Implications: + /// **Contract:** this is exactly `asset_reader(entry)? then read_to_end` — one behavioral path + /// shared with the streaming API so the two cannot drift apart. Implications: /// /// - Uncompressed asset, payload byte flipped → the CRC tee observes the mismatch at raw EOF - /// and returns - /// [`BendlReadError::Checksum`]. + /// and returns [`BendlReadError::Checksum`]. /// - Uncompressed asset, stored CRC bytes flipped → same; the tee compares computed-vs-stored /// at EOF. /// - xz-compressed asset with broken xz framing → the xz decoder fails before the raw tee - /// reaches EOF; surface is [`BendlReadError::Decode`]. (CRC is over compressed bytes, but - /// the decoder's failure precedes any CRC check.) + /// reaches EOF; surface is [`BendlReadError::Decode`]. (CRC is over compressed bytes, but the + /// decoder's failure precedes any CRC check.) /// - xz-compressed asset with intact xz but wrong stored CRC → codec reaches EOF, BENDL-owned /// wrapper checks CRC, returns [`BendlReadError::Checksum`]. - /// - Entry has `ASSET_FLAG_CHECKSUM` clear (foreign/hand-built bytes; the library writer - /// never produces this) → [`ChecksumError::Unavailable`]. + /// - Entry has `ASSET_FLAG_CHECKSUM` clear (foreign/hand-built bytes; the library writer never + /// produces this) → [`ChecksumError::Unavailable`]. pub fn asset_bytes(&mut self, entry: &BendlDirectoryEntry) -> Result, BendlReadError> { let mut out = Vec::new(); let mut reader = self.asset_reader(entry)?; @@ -227,9 +217,9 @@ impl BendlReader { /// one asset or stream reader may be live at a time. /// /// Checksum mismatch surfaces from `Read::read` as - /// `io::Error::new(io::ErrorKind::InvalidData, ChecksumError)` on the call that would - /// otherwise return `Ok(0)` at EOF. Early-drop or partial-read callers do **not** observe - /// verification — the reader must be driven to EOF for the CRC to be checked. + /// `io::Error::new(io::ErrorKind::InvalidData, ChecksumError)` on the call that would otherwise + /// return `Ok(0)` at EOF. Early-drop or partial-read callers do **not** observe verification — + /// the reader must be driven to EOF for the CRC to be checked. pub fn asset_reader<'a>( &'a mut self, entry: &BendlDirectoryEntry, @@ -249,8 +239,8 @@ impl BendlReader { if entry.asset_flags & ASSET_FLAG_XZ != 0 { // Compressed: CRC tee sits *inside* the XzDecoder so the tee accumulates over raw - // compressed bytes; the BENDL-owned wrapper around the decoder finalizes the - // check after the codec reaches its own EOF. + // compressed bytes; the BENDL-owned wrapper around the decoder finalizes the check + // after the codec reaches its own EOF. let tee = CrcTeeReader::new(raw); let decoder = XzDecoder::new(tee); Ok(Box::new(DecodedVerifyingReader { @@ -288,15 +278,13 @@ impl BendlReader { } } - /// Raw on-disk payload reader without CRC verification — kept - /// distinct from [`Self::asset_reader_unverified`] so that callers - /// doing low-level recovery never accidentally emit decompressed - /// bytes (or, conversely, never accidentally emit compressed bytes - /// expecting raw). + /// Raw on-disk payload reader without CRC verification — kept distinct from + /// [`Self::asset_reader_unverified`] so that callers doing low-level recovery never + /// accidentally emit decompressed bytes (or, conversely, never accidentally emit compressed + /// bytes expecting raw). /// - /// For an xz-flagged asset this yields the compressed payload - /// bytes byte-for-byte; for an uncompressed asset it is the same - /// as [`Self::asset_reader_unverified`]. + /// For an xz-flagged asset this yields the compressed payload bytes byte-for-byte; for an + /// uncompressed asset it is the same as [`Self::asset_reader_unverified`]. pub fn asset_payload_reader_unverified<'a>( &'a mut self, entry: &BendlDirectoryEntry, @@ -305,14 +293,12 @@ impl BendlReader { Ok(Box::new((&mut self.inner).take(entry.payload_len))) } - /// Verify the stored CRC32C of a single asset without returning - /// any decoded bytes. + /// Verify the stored CRC32C of a single asset without returning any decoded bytes. /// - /// The CRC is over the raw on-disk payload bytes; no decoder is - /// invoked, so corrupted xz framing under an intact stored CRC - /// will still report `Ok(())` (or, conversely, an intact xz - /// payload with a corrupted stored CRC will deterministically - /// report [`ChecksumError::Mismatch`]). + /// The CRC is over the raw on-disk payload bytes; no decoder is invoked, so corrupted xz + /// framing under an intact stored CRC will still report `Ok(())` (or, conversely, an intact xz + /// payload with a corrupted stored CRC will deterministically report + /// [`ChecksumError::Mismatch`]). pub fn verify_asset_checksum( &mut self, entry: &BendlDirectoryEntry, @@ -334,9 +320,8 @@ impl BendlReader { let want = remaining.min(buf.len() as u64) as usize; let n = self.inner.read(&mut buf[..want])?; if n == 0 { - // Short read against the declared payload length — - // surface as an I/O error so callers can distinguish a - // truncated bundle from a CRC mismatch. + // Short read against the declared payload length — surface as an I/O error so + // callers can distinguish a truncated bundle from a CRC mismatch. return Err(BendlReadError::Io(io::Error::new( io::ErrorKind::UnexpectedEof, format!( @@ -359,13 +344,12 @@ impl BendlReader { Ok(()) } - /// Verify every asset's CRC in directory order. Returns the - /// **first** mismatch encountered and stops; callers that want a - /// full audit should iterate the directory and call + /// Verify every asset's CRC in directory order. Returns the **first** mismatch encountered and + /// stops; callers that want a full audit should iterate the directory and call /// [`Self::verify_asset_checksum`] per entry themselves. pub fn verify_all_asset_checksums(&mut self) -> Result<(), BendlReadError> { - // Clone the entries so we don't borrow self.directory across - // the seek/read calls on self.inner. + // Clone the entries so we don't borrow self.directory across the seek/read calls on + // self.inner. let entries = self.directory.clone(); for entry in &entries { self.verify_asset_checksum(entry)?; @@ -373,14 +357,13 @@ impl BendlReader { Ok(()) } - /// Validate that the loaded directory is well-formed under the - /// canonical-name and uniqueness rules. + /// Validate that the loaded directory is well-formed under the canonical-name and uniqueness + /// rules. /// - /// Returns [`BundleValidationError`] if any entry violates the rules. - /// This is called automatically by [`BendlReader::open`] when the - /// `strict` constructor is used in tests; in normal reads, the - /// writer is already expected to enforce these rules and a - /// malformed bundle is a program bug somewhere else. + /// Returns [`BundleValidationError`] if any entry violates the rules. This is called + /// automatically by [`BendlReader::open`] when the `strict` constructor is used in tests; in + /// normal reads, the writer is already expected to enforce these rules and a malformed bundle + /// is a program bug somewhere else. pub fn validate_directory(&self) -> Result<(), BundleValidationError> { validate_directory_entries(&self.directory) } @@ -397,13 +380,13 @@ enum VerifyState { /// Underlying reader returned EOF and the CRC matched. Subsequent reads return `Ok(0)` /// (normal EOF). EofChecked, - /// CRC mismatch was reported to the caller. Subsequent reads return `Ok(0)` so the reader - /// stays well-behaved if the caller re-polls after the error. + /// CRC mismatch was reported to the caller. Subsequent reads return `Ok(0)` so the reader stays + /// well-behaved if the caller re-polls after the error. Failed, } -/// Uncompressed-asset verifying reader: forwards bytes from the bounded payload, accumulates -/// CRC32C as they fly past, and on raw EOF either confirms the checksum or returns +/// Uncompressed-asset verifying reader: forwards bytes from the bounded payload, accumulates CRC32C +/// as they fly past, and on raw EOF either confirms the checksum or returns /// [`ChecksumError::Mismatch`] in place of the usual `Ok(0)`. struct RawVerifyingReader<'a, R: Read + Seek> { inner: Take<&'a mut R>, @@ -439,9 +422,9 @@ impl Read for RawVerifyingReader<'_, R> { } /// CRC accumulator that sits *inside* an [`XzDecoder`] for compressed assets. It must never -/// substitute a checksum error for raw EOF — the codec needs to see the natural `Ok(0)` so it -/// can flush pending output. The post-decoder wrapper ([`DecodedVerifyingReader`]) -/// inspects this struct's accumulated hash after codec EOF. +/// substitute a checksum error for raw EOF — the codec needs to see the natural `Ok(0)` so it can +/// flush pending output. The post-decoder wrapper ([`DecodedVerifyingReader`]) inspects this +/// struct's accumulated hash after codec EOF. struct CrcTeeReader { inner: R, hasher: u32, @@ -463,8 +446,8 @@ impl Read for CrcTeeReader { } } -/// Verifying wrapper around an `XzDecoder>`. Lets the codec observe normal raw -/// EOF before finalizing the CRC check at the decoded layer. +/// Verifying wrapper around an `XzDecoder>`. Lets the codec observe normal raw EOF +/// before finalizing the CRC check at the decoded layer. struct DecodedVerifyingReader<'a, R: Read + Seek> { decoder: XzDecoder>>, expected: u32, @@ -512,8 +495,8 @@ fn classify_read_error(err: io::Error, entry: &BendlDirectoryEntry) -> BendlRead { Some(Ok(boxed)) => return BendlReadError::Checksum(*boxed), Some(Err(other)) => { - // Downcast failed unexpectedly — reconstruct an io::Error - // around the still-boxed payload so we don't lose context. + // Downcast failed unexpectedly — reconstruct an io::Error around the still-boxed + // payload so we don't lose context. return BendlReadError::Io(io::Error::new(io::ErrorKind::InvalidData, other)); } None => { @@ -567,8 +550,7 @@ pub enum BundleAssignmentReaderError { Io(#[from] io::Error), } -/// Errors raised when a directory violates the canonical-name or -/// uniqueness rules. +/// Errors raised when a directory violates the canonical-name or uniqueness rules. #[derive(Debug, thiserror::Error)] pub enum BundleValidationError { /// Two entries share the same name. diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index 38d8d0a..731a017 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -135,7 +135,10 @@ fn directory_entry_round_trip_with_checksum() { let mut cursor = &bytes[..]; let decoded = BendlDirectoryEntry::read_from(&mut cursor).unwrap(); assert_eq!(decoded, entry); - assert_eq!(decoded.checksum.as_deref(), Some(&[0xDE, 0xAD, 0xBE, 0xEF][..])); + assert_eq!( + decoded.checksum.as_deref(), + Some(&[0xDE, 0xAD, 0xBE, 0xEF][..]) + ); assert_eq!(decoded.checksum_u32(), Some(0xEFBEADDE)); } @@ -151,8 +154,8 @@ fn directory_entry_rejects_flag_set_with_wrong_checksum_len() { checksum: Some(vec![0xDE, 0xAD, 0xBE, 0xEF]), }; let mut bytes = entry.to_bytes().unwrap(); - // Patch checksum_len at bytes 24..28 to claim 6 (also append two - // bytes so we don't crash on short read in the negative path). + // Patch checksum_len at bytes 24..28 to claim 6 (also append two bytes so we don't crash on + // short read in the negative path). bytes[24..28].copy_from_slice(&6u32.to_le_bytes()); bytes.extend_from_slice(&[0x00, 0x00]); // pad to declared len entry.checksum = Some(vec![0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x00]); @@ -179,8 +182,8 @@ fn directory_entry_rejects_flag_clear_with_nonzero_checksum_len() { checksum: None, }; let mut bytes = entry.to_bytes().unwrap(); - // The encoded bytes have checksum_len == 0 and no trailing checksum - // bytes; patch checksum_len to 4 and append four bytes. + // The encoded bytes have checksum_len == 0 and no trailing checksum bytes; patch checksum_len + // to 4 and append four bytes. bytes[24..28].copy_from_slice(&4u32.to_le_bytes()); bytes.extend_from_slice(&[0xAA, 0xBB, 0xCC, 0xDD]); entry.checksum = Some(vec![0xAA, 0xBB, 0xCC, 0xDD]); @@ -241,8 +244,8 @@ fn empty_directory_table_round_trip() { #[test] fn header_and_directory_entry_header_sizes_are_stable() { - // These sizes are baked into the on-disk format; regressing them - // would silently break existing bundles. + // These sizes are baked into the on-disk format; regressing them would silently break existing + // bundles. assert_eq!(HEADER_SIZE, 64); assert_eq!(DIRECTORY_ENTRY_HEADER_SIZE, 28); } diff --git a/ben/src/io/bundle/tests/manifest.rs b/ben/src/io/bundle/tests/manifest.rs index bfe199a..6d1ab53 100644 --- a/ben/src/io/bundle/tests/manifest.rs +++ b/ben/src/io/bundle/tests/manifest.rs @@ -16,8 +16,7 @@ fn manifest_json_round_trip() { #[test] fn manifest_accepts_missing_variant() { - let json = - r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; + let json = r#"{"major_version":1,"minor_version":0,"assignment_format":"ben","complete":true}"#; let decoded: BendlManifest = serde_json::from_str(json).unwrap(); assert_eq!(decoded.variant, None); assert!(decoded.complete); diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index d9c486e..084c811 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -12,21 +12,18 @@ use crate::io::bundle::reader::{ validate_directory_entries, BendlReader, BundleAssignmentReaderError, BundleValidationError, }; -/// Stamp a valid CRC32C and `ASSET_FLAG_CHECKSUM` onto a hand-built -/// directory entry whose on-disk payload bytes are `payload`. Use this -/// in test fixtures so the entry round-trips through the verify-on-touch -/// reader APIs. Tests that want to exercise the foreign-bundle / -/// clear-flag path build entries directly with the flag clear and -/// `checksum: None`. +/// Stamp a valid CRC32C and `ASSET_FLAG_CHECKSUM` onto a hand-built directory entry whose on-disk +/// payload bytes are `payload`. Use this in test fixtures so the entry round-trips through the +/// verify-on-touch reader APIs. Tests that want to exercise the foreign-bundle / clear-flag path +/// build entries directly with the flag clear and `checksum: None`. fn with_crc(mut entry: BendlDirectoryEntry, payload: &[u8]) -> BendlDirectoryEntry { entry.asset_flags |= ASSET_FLAG_CHECKSUM; entry.checksum = Some(crc32c::crc32c(payload).to_le_bytes().to_vec()); entry } -/// Build a complete in-memory finalized bundle with two assets: -/// an xz-compressed `graph.json` and a raw custom blob, followed by -/// a fake BEN stream and a trailing directory. +/// Build a complete in-memory finalized bundle with two assets: an xz-compressed `graph.json` and a +/// raw custom blob, followed by a fake BEN stream and a trailing directory. fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { // Asset payloads (decoded): let graph_json = br#"{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}"#.to_vec(); @@ -259,9 +256,8 @@ fn validate_directory_catches_wrong_canonical_name() { // Robustness tests // ----------------------------------------------------------------------- -/// Build a small finalized bundle with a known graph asset, metadata -/// asset, empty stream, and no validation pitfalls. Useful as a base -/// that tests can mutate byte-by-byte. +/// Build a small finalized bundle with a known graph asset, metadata asset, empty stream, and no +/// validation pitfalls. Useful as a base that tests can mutate byte-by-byte. fn build_basic_finalized_bundle() -> Vec { let mut bytes = Vec::new(); bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); @@ -337,8 +333,8 @@ fn open_rejects_directory_with_inflated_entry_count() { let mut bytes = build_basic_finalized_bundle(); // Read directory_offset from the header (bytes 24..32). let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Blow up the entry count at the start of the directory to a - // value that cannot possibly fit in the remaining file bytes. + // Blow up the entry count at the start of the directory to a value that cannot possibly fit in + // the remaining file bytes. bytes[directory_offset..directory_offset + 4].copy_from_slice(&9999u32.to_le_bytes()); match BendlReader::open(Cursor::new(bytes)) { Err(BendlFormatError::Io(_)) => {} @@ -349,8 +345,8 @@ fn open_rejects_directory_with_inflated_entry_count() { #[test] fn open_rejects_directory_with_chopped_final_entry() { - // Drop the last byte of the file, which lies inside the name - // field of the final directory entry. + // Drop the last byte of the file, which lies inside the name field of the final directory + // entry. let mut bytes = build_basic_finalized_bundle(); bytes.pop(); match BendlReader::open(Cursor::new(bytes)) { @@ -403,24 +399,22 @@ fn interleaved_reads_do_not_corrupt_each_other() { #[test] fn asset_bytes_errors_when_declared_length_runs_past_eof() { - // Hand-construct a bundle where the metadata directory entry - // claims a payload_len that extends well past EOF. + // Hand-construct a bundle where the metadata directory entry claims a payload_len that extends + // well past EOF. let mut bytes = build_basic_finalized_bundle(); // Parse the directory offset to find where the entry lives. let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Skip the u32 entry count (4 bytes) and then the 16-byte fixed - // entry header up to `payload_len` (bytes 16..24 of the entry). + // Skip the u32 entry count (4 bytes) and then the 16-byte fixed entry header up to + // `payload_len` (bytes 16..24 of the entry). let entry_start = directory_offset + 4; let payload_len_offset = entry_start + 16; bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); - // The reader opens fine — the directory parses. But reading the - // asset bytes must surface an error eventually (short read vs - // declared length). xz would also trip on this, but this is the - // raw-asset path. - // Either returns an error or a slice shorter than u64::MAX. + // The reader opens fine — the directory parses. But reading the asset bytes must surface an + // error eventually (short read vs declared length). xz would also trip on this, but this is the + // raw-asset path. Either returns an error or a slice shorter than u64::MAX. reader .asset_bytes(&entry) .map(|b| assert!(b.len() < u64::MAX as usize)) @@ -429,9 +423,8 @@ fn asset_bytes_errors_when_declared_length_runs_past_eof() { #[test] fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { - // Build an incomplete bundle but stuff a stale sample count into - // the header. `sample_count()` must still return None because - // the `complete` flag is what makes the value authoritative. + // Build an incomplete bundle but stuff a stale sample count into the header. `sample_count()` + // must still return None because the `complete` flag is what makes the value authoritative. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, @@ -455,8 +448,8 @@ fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { #[test] fn unknown_assignment_format_reports_none_on_typed_getter() { - // Build a finalized but otherwise-empty bundle and corrupt the - // assignment_format byte to a value that is neither BEN nor XBEN. + // Build a finalized but otherwise-empty bundle and corrupt the assignment_format byte to a + // value that is neither BEN nor XBEN. let mut bytes = build_basic_finalized_bundle(); // assignment_format byte is at offset 13 in the header. bytes[13] = 42; @@ -508,10 +501,9 @@ fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { #[test] fn validate_directory_catches_duplicate_singleton_types() { - // Two entries of type METADATA. The second one uses a non-canonical - // name to confirm the canonical-name check fires (it lands first - // here, and is the path we cover; the singleton check is exercised - // elsewhere via duplicate standardized names). + // Two entries of type METADATA. The second one uses a non-canonical name to confirm the + // canonical-name check fires (it lands first here, and is the path we cover; the singleton + // check is exercised elsewhere via duplicate standardized names). let entries = vec![ BendlDirectoryEntry { asset_type: ASSET_TYPE_METADATA, @@ -524,16 +516,16 @@ fn validate_directory_catches_duplicate_singleton_types() { BendlDirectoryEntry { asset_type: ASSET_TYPE_METADATA, asset_flags: 0, - // Distinct name so the duplicate-name check does not fire - // first; the singleton-type check should catch this. + // Distinct name so the duplicate-name check does not fire first; the singleton-type + // check should catch this. name: "meta2.json".to_string(), payload_offset: 65, payload_len: 1, checksum: None, }, ]; - // The second entry has asset_type METADATA but name "meta2.json" - // which fails the canonical-name check. + // The second entry has asset_type METADATA but name "meta2.json" which fails the canonical-name + // check. let err = validate_directory_entries(&entries).unwrap_err(); assert!(matches!( err, @@ -543,8 +535,8 @@ fn validate_directory_catches_duplicate_singleton_types() { #[test] fn validate_directory_accepts_well_formed_multi_singleton_bundle() { - // A bundle with one of every singleton type, plus two custom - // assets with distinct names, should validate cleanly. + // A bundle with one of every singleton type, plus two custom assets with distinct names, should + // validate cleanly. let entries = vec![ BendlDirectoryEntry { asset_type: ASSET_TYPE_METADATA, @@ -592,9 +584,8 @@ fn validate_directory_accepts_well_formed_multi_singleton_bundle() { #[test] fn stress_thousand_custom_assets_round_trip() { - // Build a directory with 1000 small custom assets, each with a - // unique payload derived from its index, and confirm they all - // round-trip via `asset_bytes`. This catches any off-by-one or + // Build a directory with 1000 small custom assets, each with a unique payload derived from its + // index, and confirm they all round-trip via `asset_bytes`. This catches any off-by-one or // seek-caching bugs that might only show up with many entries. const N: usize = 1000; @@ -660,9 +651,8 @@ fn stress_thousand_custom_assets_round_trip() { #[test] fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { - // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ - // whose payload bytes are not a valid xz container. `asset_bytes` - // must surface an io::Error rather than panicking. + // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ whose payload bytes are not a + // valid xz container. `asset_bytes` must surface an io::Error rather than panicking. let mut bytes = Vec::new(); bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); @@ -710,15 +700,13 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { #[test] fn reader_scales_to_very_wide_stream_offset_field() { - // Confirm the `Take` bound clamps a stream reader even when the - // header's stream_len is much larger than the actual remaining - // bytes: the reader must return the shorter slice rather than + // Confirm the `Take` bound clamps a stream reader even when the header's stream_len is much + // larger than the actual remaining bytes: the reader must return the shorter slice rather than // loop forever or panic. This is a "short read" tolerance check. let fake_stream = b"STANDARD BEN FILE\x00\x01tiny".to_vec(); let actual_len = fake_stream.len() as u64; let directory_offset = HEADER_SIZE as u64 + actual_len; - // Build a bundle that lies about stream_len: claims ten times - // what's actually present. + // Build a bundle that lies about stream_len: claims ten times what's actually present. let entries: Vec = Vec::new(); let directory_bytes = encode_directory(&entries).unwrap(); let header = BendlHeader { @@ -742,25 +730,24 @@ fn reader_scales_to_very_wide_stream_offset_field() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let mut buf = Vec::new(); - // Take will try to read `stream_len` bytes but the Cursor will - // just return however many bytes remain from stream_offset to EOF. - // The reader must not panic; it must simply return what it got. + // Take will try to read `stream_len` bytes but the Cursor will just return however many bytes + // remain from stream_offset to EOF. The reader must not panic; it must simply return what it + // got. reader .assignment_stream_reader() .unwrap() .read_to_end(&mut buf) .unwrap(); - // Take includes the directory bytes in the window since they come - // after stream_offset and the claim exceeds file size — so we - // assert only that we got *at least* the real stream bytes as a + // Take includes the directory bytes in the window since they come after stream_offset and the + // claim exceeds file size — so we assert only that we got *at least* the real stream bytes as a // prefix, which is the basic "no truncation of what exists" check. assert!(buf.starts_with(&fake_stream)); } #[test] fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { - // An incomplete bundle where directory_offset is non-zero: - // the stream end is taken as directory_offset, not EOF. + // An incomplete bundle where directory_offset is non-zero: the stream end is taken as + // directory_offset, not EOF. let fake_stream = b"STANDARD BEN FILE\x00partial".to_vec(); let fake_dir = b"some-directory-bytes"; let stream_start = HEADER_SIZE as u64; @@ -814,24 +801,21 @@ fn validate_directory_rejects_wrong_canonical_name() { // Asset CRC32C verification // ===================================================================== // -// These tests pin the verify-on-touch contract for directory-entry -// assets. The structural split is: +// These tests pin the verify-on-touch contract for directory-entry assets. The structural split is: // -// - explicit verifier (`verify_asset_checksum`) vs implicit -// verifier (`asset_bytes` / `asset_reader`), +// - explicit verifier (`verify_asset_checksum`) vs implicit verifier (`asset_bytes` / +// `asset_reader`), // - uncompressed vs xz-compressed assets, -// - stored-checksum corruption vs payload corruption (vs xz-framing -// corruption for compressed assets). +// - stored-checksum corruption vs payload corruption (vs xz-framing corruption for compressed +// assets). // -// The unverified APIs (`*_unverified`) are pinned in matching tests to -// ensure they NEVER surface a `ChecksumError` (codec errors are still -// permitted). +// The unverified APIs (`*_unverified`) are pinned in matching tests to ensure they NEVER surface a +// `ChecksumError` (codec errors are still permitted). use crate::io::bundle::error::{BendlReadError, ChecksumError, ChecksumTarget}; -/// Build a finalized bundle with exactly one uncompressed asset whose -/// payload bytes are `payload`. Returns `(bundle_bytes, asset_name, -/// directory_offset, payload_offset)` for hand-patching tests. +/// Build a finalized bundle with exactly one uncompressed asset whose payload bytes are `payload`. +/// Returns `(bundle_bytes, asset_name, directory_offset, payload_offset)` for hand-patching tests. fn make_single_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, u64, u64) { let mut bytes = Vec::new(); bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); @@ -873,14 +857,10 @@ fn make_single_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, u64 (bytes, name.to_string(), directory_offset, payload_offset) } -/// Build a finalized bundle whose only asset is `payload` stored xz- -/// compressed. The stored CRC is over the **compressed** bytes (CRC is -/// pre-decompression). Returns +/// Build a finalized bundle whose only asset is `payload` stored xz- compressed. The stored CRC is +/// over the **compressed** bytes (CRC is pre-decompression). Returns /// `(bundle_bytes, name, compressed_payload, directory_offset, payload_offset)`. -fn make_single_xz_asset_bundle( - name: &str, - payload: &[u8], -) -> (Vec, String, Vec, u64, u64) { +fn make_single_xz_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, Vec, u64, u64) { let mut encoder = XzEncoder::new(Vec::new(), 6); encoder.write_all(payload).unwrap(); let compressed = encoder.finish().unwrap(); @@ -931,13 +911,12 @@ fn make_single_xz_asset_bundle( ) } -/// Locate the offset of an asset's stored CRC32C bytes inside a -/// hand-built single-asset bundle. Assumes the directory starts at -/// `directory_offset`, the entry count is one, and the entry's +/// Locate the offset of an asset's stored CRC32C bytes inside a hand-built single-asset bundle. +/// Assumes the directory starts at `directory_offset`, the entry count is one, and the entry's /// `checksum_len` is 4 (the only legal value when the flag is set). fn stored_checksum_offset(directory_offset: u64, name: &str) -> usize { - // directory layout: [u32 count][entry][...] - // entry layout: [28-byte header][name bytes][checksum bytes] + // directory layout: [u32 count][entry][...] entry layout: [28-byte header][name bytes][checksum + // bytes] let entry_start = directory_offset as usize + 4; entry_start + 28 + name.len() } @@ -974,14 +953,16 @@ fn verify_asset_checksum_uncompressed_corrupt_payload_byte_returns_mismatch() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.verify_asset_checksum(&entry).unwrap_err(); - assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { .. }) + )); } #[test] fn verify_asset_checksum_xz_corrupt_stored_crc_returns_mismatch_no_decoder() { - // The explicit verifier reads raw bytes — no XzDecoder is invoked, - // so even an intact compressed payload reports `Mismatch` - // deterministically when only the stored CRC has been corrupted. + // The explicit verifier reads raw bytes — no XzDecoder is invoked, so even an intact compressed + // payload reports `Mismatch` deterministically when only the stored CRC has been corrupted. let (mut bytes, name, _, dir_off, _) = make_single_xz_asset_bundle("blob.xz", b"some compressible content"); let crc_off = stored_checksum_offset(dir_off, &name); @@ -989,14 +970,16 @@ fn verify_asset_checksum_xz_corrupt_stored_crc_returns_mismatch_no_decoder() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.verify_asset_checksum(&entry).unwrap_err(); - assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { .. }) + )); } #[test] fn verify_asset_checksum_xz_corrupt_payload_returns_mismatch_no_decoder() { - // Verifier is over raw bytes — a payload flip that breaks xz framing - // still surfaces as Mismatch, NOT a decoder error, because the - // explicit verifier never invokes the decoder. + // Verifier is over raw bytes — a payload flip that breaks xz framing still surfaces as + // Mismatch, NOT a decoder error, because the explicit verifier never invokes the decoder. let (mut bytes, name, compressed, _, payload_off) = make_single_xz_asset_bundle("blob.xz", b"some compressible content"); assert!(compressed.len() > 5); @@ -1004,7 +987,10 @@ fn verify_asset_checksum_xz_corrupt_payload_returns_mismatch_no_decoder() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.verify_asset_checksum(&entry).unwrap_err(); - assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { .. }) + )); } #[test] @@ -1066,7 +1052,10 @@ fn asset_bytes_uncompressed_corrupt_payload_returns_checksum_mismatch() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.asset_bytes(&entry).unwrap_err(); - assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { .. }) + )); } #[test] @@ -1076,17 +1065,15 @@ fn asset_bytes_unverified_uncompressed_returns_corrupted_bytes_no_check() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let got = reader.asset_bytes_unverified(&entry).unwrap(); - // The bytes returned are the corrupted bytes; we do not assert - // exact content, only that the operation succeeded — the - // *_unverified contract is that ChecksumError NEVER fires. + // The bytes returned are the corrupted bytes; we do not assert exact content, only that the + // operation succeeded — the *_unverified contract is that ChecksumError NEVER fires. assert_eq!(got.len(), b"hello world".len()); } #[test] fn asset_bytes_xz_corrupt_stored_crc_returns_checksum_mismatch() { - // xz framing intact, but stored CRC is wrong. The codec reaches EOF - // cleanly first and then the BENDL-owned wrapper reports - // `ChecksumError::Mismatch`. + // xz framing intact, but stored CRC is wrong. The codec reaches EOF cleanly first and then the + // BENDL-owned wrapper reports `ChecksumError::Mismatch`. let (mut bytes, name, _, dir_off, _) = make_single_xz_asset_bundle("blob.xz", b"some compressible content"); let crc_off = stored_checksum_offset(dir_off, &name); @@ -1094,14 +1081,16 @@ fn asset_bytes_xz_corrupt_stored_crc_returns_checksum_mismatch() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.asset_bytes(&entry).unwrap_err(); - assert!(matches!(err, BendlReadError::Checksum(ChecksumError::Mismatch { .. }))); + assert!(matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { .. }) + )); } #[test] fn asset_bytes_xz_corrupt_framing_returns_decode_error_not_checksum() { - // Payload flip breaks xz framing — the decoder fails before the - // CRC tee reaches raw EOF, so the variant is - // `BendlReadError::Decode`, not `BendlReadError::Checksum`. + // Payload flip breaks xz framing — the decoder fails before the CRC tee reaches raw EOF, so the + // variant is `BendlReadError::Decode`, not `BendlReadError::Checksum`. let (mut bytes, name, compressed, _, payload_off) = make_single_xz_asset_bundle("blob.xz", b"some compressible content"); assert!(compressed.len() > 5); @@ -1124,8 +1113,7 @@ fn asset_bytes_unverified_xz_corrupt_framing_returns_decode_error_never_checksum let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let err = reader.asset_bytes_unverified(&entry).unwrap_err(); - // Unverified path NEVER surfaces a checksum error; codec errors - // are still allowed. + // Unverified path NEVER surfaces a checksum error; codec errors are still allowed. assert!(!matches!(err, BendlReadError::Checksum(_))); assert!(matches!(err, BendlReadError::Decode(_))); } @@ -1178,9 +1166,8 @@ fn asset_bytes_returns_unavailable_when_flag_clear() { #[test] fn asset_reader_uncompressed_surfaces_mismatch_on_final_read() { - // Drive `asset_reader` byte-by-byte and assert the call that - // would otherwise return Ok(0) at EOF returns InvalidData wrapping - // ChecksumError::Mismatch. + // Drive `asset_reader` byte-by-byte and assert the call that would otherwise return Ok(0) at + // EOF returns InvalidData wrapping ChecksumError::Mismatch. let (mut bytes, name, _, payload_off) = make_single_asset_bundle("blob", b"abcdef"); bytes[payload_off as usize] ^= 0x01; let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); @@ -1211,9 +1198,8 @@ fn asset_reader_uncompressed_surfaces_mismatch_on_final_read() { #[test] fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { - // Build a bundle with two assets, both corrupted. The bulk - // verifier must return the *first* mismatch in directory order - // and stop. Construct manually so we can corrupt independently. + // Build a bundle with two assets, both corrupted. The bulk verifier must return the *first* + // mismatch in directory order and stop. Construct manually so we can corrupt independently. let p1 = b"first".to_vec(); let p2 = b"second".to_vec(); let mut bytes = Vec::new(); @@ -1280,25 +1266,22 @@ fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { #[test] fn crc32c_polynomial_pin_against_known_vectors() { - // Pin known CRC32C (Castagnoli) values so a future accidental - // swap to IEEE CRC-32 is caught at test time. The IEEE CRC-32 of - // [0x01,0x02,0x03,0x04] is 0xB63CFBCD; the CRC32C value below + // Pin known CRC32C (Castagnoli) values so a future accidental swap to IEEE CRC-32 is caught at + // test time. The IEEE CRC-32 of [0x01,0x02,0x03,0x04] is 0xB63CFBCD; the CRC32C value below // diverges from that, which is the whole point of the pin. // // CRC32C("") = 0x00000000 // CRC32C([1,2,3,4]) = 0x8A2D413B // CRC32C(b"123456789") = 0xE3069283 (Castagnoli check value) // - // The Castagnoli check value 0xE3069283 is the canonical CRC32C - // test vector cited in the IEEE 802.3 / SCTP RFC 3720 specs and - // diverges from the IEEE CRC-32 polynomial's check value - // (0xCBF43926). If a future contributor accidentally swaps to - // IEEE CRC-32, this assertion fires. + // The Castagnoli check value 0xE3069283 is the canonical CRC32C test vector cited in the IEEE + // 802.3 / SCTP RFC 3720 specs and diverges from the IEEE CRC-32 polynomial's check value + // (0xCBF43926). If a future contributor accidentally swaps to IEEE CRC-32, this assertion + // fires. assert_eq!(crc32c::crc32c(b""), 0x0000_0000); - // 0xE3069283 is the canonical Castagnoli check value - // (CRC32C of ASCII "123456789"); the IEEE CRC-32 polynomial's - // check value over the same input is 0xCBF43926, so any - // accidental swap is caught here. + // 0xE3069283 is the canonical Castagnoli check value (CRC32C of ASCII "123456789"); the IEEE + // CRC-32 polynomial's check value over the same input is 0xCBF43926, so any accidental swap is + // caught here. assert_eq!(crc32c::crc32c(b"123456789"), 0xE306_9283); // Extra sentinels to broaden the trip-wire. assert_eq!(crc32c::crc32c(&[0x01, 0x02, 0x03, 0x04]), 0x2930_8CF4); diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index fa145d7..ea42711 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -8,9 +8,7 @@ use crate::io::bundle::format::{ BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, }; use crate::io::bundle::reader::BendlReader; -use crate::io::bundle::writer::{ - AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter, -}; +use crate::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter}; use crate::io::reader::BenWireFormat; use crate::io::writer::BenStreamWriter; @@ -18,8 +16,8 @@ fn make_buffer() -> Cursor> { Cursor::new(Vec::new()) } -/// Test helper: replicate the deleted `BendlWriter::write_stream_bytes` -/// using the owned-session chain. Used purely to keep test bodies short. +/// Test helper: replicate the deleted `BendlWriter::write_stream_bytes` using the owned-session +/// chain. Used purely to keep test bodies short. fn write_stream_bytes_via_session( writer: BendlWriter>>, bytes: &[u8], @@ -80,9 +78,9 @@ fn graph_asset_is_compressed_by_default() { .cloned() .expect("graph entry present"); assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0); - // Compressed size should differ from the raw size for a non-trivial - // JSON payload. For very short payloads xz actually inflates the - // bytes, so this just checks the size is non-zero and different. + // Compressed size should differ from the raw size for a non-trivial JSON payload. For very + // short payloads xz actually inflates the bytes, so this just checks the size is non-zero and + // different. assert_ne!(entry.payload_len, graph.len() as u64); // Decoded bytes round-trip. @@ -164,8 +162,8 @@ fn writer_rejects_duplicate_custom_name() { #[test] fn writer_rejects_asset_added_after_stream_begins() { - // After a session has been finished, the writer is in `StreamWritten` - // and `add_*_asset` rejects further additions with `AssetsAfterStream`. + // After a session has been finished, the writer is in `StreamWritten` and `add_*_asset` rejects + // further additions with `AssetsAfterStream`. let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let mut writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); let err = writer @@ -212,9 +210,8 @@ fn finalized_directory_lives_at_eof() { // Append-path tests // ----------------------------------------------------------------------- -/// Build a finalized bundle with a single `metadata.json` asset and -/// a short fake stream, then return both the bytes and the byte -/// range (offset, len) occupied by the stream region. +/// Build a finalized bundle with a single `metadata.json` asset and a short fake stream, then +/// return both the bytes and the byte range (offset, len) occupied by the stream region. fn build_base_bundle() -> (Vec, (u64, u64)) { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer @@ -265,8 +262,8 @@ fn append_leaves_stream_bytes_byte_for_byte_unchanged() { .unwrap(); let buf = appender.commit().unwrap().into_inner(); - // Read back the new header to locate the stream region, then - // confirm the stream bytes are byte-identical to the original. + // Read back the new header to locate the stream region, then confirm the stream bytes are + // byte-identical to the original. let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); let (off, len) = (reader.header().stream_offset, reader.header().stream_len); let appended_stream_bytes = buf[off as usize..(off + len) as usize].to_vec(); @@ -323,8 +320,8 @@ fn append_rejects_duplicate_singleton_without_touching_file() { #[test] fn append_rejects_duplicate_custom_name_without_touching_file() { - // Start from a bundle containing a custom asset named "blob", then - // try to append another "blob". + // Start from a bundle containing a custom asset named "blob", then try to append another + // "blob". let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer .add_asset( @@ -376,8 +373,8 @@ fn append_rejects_wrong_canonical_name_without_touching_file() { #[test] fn append_rejects_incomplete_bundle() { - // Construct a minimal incomplete bundle: just the provisional - // header and some stream bytes, no directory. + // Construct a minimal incomplete bundle: just the provisional header and some stream bytes, no + // directory. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, @@ -405,8 +402,7 @@ fn append_rejects_incomplete_bundle() { #[test] fn append_rejects_complete_bundle_with_zero_directory() { - // Header claims complete but has directory_offset=0 — hits the second - // BundleIncomplete check. + // Header claims complete but has directory_offset=0 — hits the second BundleIncomplete check. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, @@ -458,8 +454,8 @@ fn append_multiple_assets_in_one_commit() { let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.assets().len(), 4); - // Round-trip the appended graph through the reader to confirm - // compression happened and decodes cleanly. + // Round-trip the appended graph through the reader to confirm compression happened and decodes + // cleanly. let graph_entry = reader .find_asset_by_name("graph.json") .cloned() @@ -670,12 +666,11 @@ fn fully_empty_bundle_finalizes_and_round_trips() { #[test] fn into_stream_session_after_stream_written_returns_wrong_state() { - // Regression fixture for the `into_stream_session` guard: a writer - // that has already finished one stream phase must reject a second - // attempt to enter the stream phase. Without this guard, a chained - // `into_stream_session → finish_into_writer → into_stream_session` - // would silently overwrite `header.stream_offset` and corrupt the - // bundle. This is the only runtime fixture for that guard. + // Regression fixture for the `into_stream_session` guard: a writer that has already finished + // one stream phase must reject a second attempt to enter the stream phase. Without this guard, + // a chained `into_stream_session → finish_into_writer → into_stream_session` would silently + // overwrite `header.stream_offset` and corrupt the bundle. This is the only runtime fixture for + // that guard. let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); // Writer is now in StreamWritten state; into_stream_session must fail. @@ -758,16 +753,14 @@ fn append_then_reopen_and_append_again() { assert!(names.contains(&"metadata.json")); assert!(names.contains(&"graph.json")); assert!(names.contains(&"extra.bin")); - // Sample count from the original stream is preserved across both - // appends. + // Sample count from the original stream is preserved across both appends. assert_eq!(reader.sample_count(), Some(3)); } #[test] fn append_does_not_disturb_front_loaded_asset_bytes() { - // Base bundle has a graph.json asset with known bytes; after - // append of a custom blob, reading graph.json must still return - // exactly the same decoded bytes as before. + // Base bundle has a graph.json asset with known bytes; after append of a custom blob, reading + // graph.json must still return exactly the same decoded bytes as before. let graph = br#"{"nodes":[0,1,2,3,4,5,6,7,8,9,10]}"#; let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer @@ -806,9 +799,9 @@ fn append_does_not_disturb_front_loaded_asset_bytes() { #[test] fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { - // A custom asset named "graph.json" is not a singleton because the - // singleton uniqueness check keys off asset_type, not name. Adding - // a real GRAPH singleton after it must then fail on DuplicateName. + // A custom asset named "graph.json" is not a singleton because the singleton uniqueness check + // keys off asset_type, not name. Adding a real GRAPH singleton after it must then fail on + // DuplicateName. let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer .add_asset( @@ -826,8 +819,8 @@ fn writer_accepts_custom_asset_with_canonical_name_but_non_canonical_type() { #[test] fn writer_asset_round_trips_with_auto_computed_crc32c() { - // Every asset gets ASSET_FLAG_CHECKSUM with a 4-byte CRC32C of the - // on-disk payload bytes (post-compression for xz-flagged assets). + // Every asset gets ASSET_FLAG_CHECKSUM with a 4-byte CRC32C of the on-disk payload bytes + // (post-compression for xz-flagged assets). let payload = b"hello".to_vec(); let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer @@ -857,8 +850,8 @@ fn writer_asset_round_trips_with_auto_computed_crc32c() { fn finished_writer_rejects_further_operations() { let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); - // `finish` consumes `self`, which is itself the protection — there - // is no way to call add_asset / into_stream_session afterwards. + // `finish` consumes `self`, which is itself the protection — there is no way to call add_asset + // / into_stream_session afterwards. let buf = writer.finish().unwrap().into_inner(); // The resulting buffer is a valid finalized bundle. let reader = BendlReader::open(Cursor::new(buf)).unwrap(); @@ -895,8 +888,8 @@ fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { .. } )); - // After a rejected add, no entries have been recorded — a - // subsequent valid add proceeds normally. + // After a rejected add, no entries have been recorded — a subsequent valid add proceeds + // normally. writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{}") .unwrap(); @@ -908,9 +901,8 @@ fn writer_rejects_add_json_asset_with_wrong_canonical_metadata_name() { #[test] fn writer_rejected_add_leaves_singleton_slot_usable() { - // A rejected singleton add must not consume the singleton slot — - // otherwise a future valid add with the correct standardized name - // would spuriously fail with DuplicateSingletonType. + // A rejected singleton add must not consume the singleton slot — otherwise a future valid add + // with the correct standardized name would spuriously fail with DuplicateSingletonType. let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); // First try with wrong standardized name — rejected. let _ = writer @@ -945,10 +937,9 @@ fn append_rejects_duplicate_name_across_existing_and_pending() { // Randomized / stress tests // ----------------------------------------------------------------------- -/// Build a bundle from a random set of custom assets (plus an optional -/// metadata asset) and fully round-trip it through the reader. Repeated -/// with a seeded ChaCha PRNG so the sequence is deterministic but -/// covers a wide surface. +/// Build a bundle from a random set of custom assets (plus an optional metadata asset) and fully +/// round-trip it through the reader. Repeated with a seeded ChaCha PRNG so the sequence is +/// deterministic but covers a wide surface. #[test] fn randomized_round_trip_many_custom_assets() { use rand::{Rng, SeedableRng}; @@ -976,8 +967,8 @@ fn randomized_round_trip_many_custom_assets() { let compress = rng.random_bool(0.4); let is_json = rng.random_bool(0.15) && size > 0; let payload = if is_json { - // Override with a synthetic JSON blob so the json flag - // actually matches the content. + // Override with a synthetic JSON blob so the json flag actually matches the + // content. format!(r#"{{"i":{i},"seed":{seed}}}"#).into_bytes() } else { payload @@ -999,8 +990,7 @@ fn randomized_round_trip_many_custom_assets() { expected.push((name, payload, is_json)); } - // Write a small deterministic stream so the bundle is - // assignment-complete. + // Write a small deterministic stream so the bundle is assignment-complete. let sample_count: i64 = rng.random_range(0..=20); let fake_stream = b"STANDARD BEN FILE\x00\x01\x02payload".to_vec(); let writer = write_stream_bytes_via_session(writer, &fake_stream, sample_count); @@ -1036,10 +1026,9 @@ fn randomized_round_trip_many_custom_assets() { #[test] fn five_successive_appends_preserve_everything() { - // Start from a finalized bundle with only a metadata asset and a - // short stream. Then open it five times via BendlAppender and add - // one asset per round. After every round, the previous assets must - // still be readable and sample_count must remain authoritative. + // Start from a finalized bundle with only a metadata asset and a short stream. Then open it + // five times via BendlAppender and add one asset per round. After every round, the previous + // assets must still be readable and sample_count must remain authoritative. let (mut buf, _) = build_base_bundle(); // Sanity-check the baseline. @@ -1068,8 +1057,8 @@ fn five_successive_appends_preserve_everything() { buf = commit.into_inner(); accumulated.push((name, payload)); - // Re-open and verify the full set is intact and sample_count - // still matches the baseline (append must not touch it). + // Re-open and verify the full set is intact and sample_count still matches the baseline + // (append must not touch it). let mut reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); assert!(reader.is_finalized(), "round {round}"); assert_eq!( @@ -1097,9 +1086,8 @@ fn five_successive_appends_preserve_everything() { #[test] fn randomized_append_sequence_preserves_all_prior_entries() { - // Independent coverage for append: random number of rounds, random - // payload sizes. Catches any bookkeeping drift in the appender's - // directory-rewrite path. + // Independent coverage for append: random number of rounds, random payload sizes. Catches any + // bookkeeping drift in the appender's directory-rewrite path. use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; @@ -1204,8 +1192,8 @@ fn stream_session_flush_succeeds() { let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); let mut session = writer.into_stream_session().unwrap(); session.flush().unwrap(); - // Discard the session — it would warn on Drop, but the test runner - // does not assert on log output, so this is fine for unit tests. + // Discard the session — it would warn on Drop, but the test runner does not assert on log + // output, so this is fine for unit tests. let _ = session.finish_into_writer(0); } @@ -1237,8 +1225,8 @@ fn appender_commit_auto_computes_crc32c_on_pending_assets() { #[test] fn appender_rejects_bundle_with_trailing_directory_bytes() { let (mut bundle, _) = build_base_bundle(); - // Patch the header's directory_len field (bytes 32-39) to claim - // the directory is 4 bytes longer than it actually is. + // Patch the header's directory_len field (bytes 32-39) to claim the directory is 4 bytes longer + // than it actually is. let old_len = u64::from_le_bytes(bundle[32..40].try_into().unwrap()); let patched = (old_len + 4).to_le_bytes(); bundle[32..40].copy_from_slice(&patched); @@ -1272,9 +1260,8 @@ fn finish_after_assignment_stream_produces_finalized_bundle() { // ── Plan verification tests ────────────────────────────────────── -/// Verification #7: dropping a `BendlStreamSession` mid-flight must -/// leave the bundle on disk unfinalized (no directory written, header -/// `finalized != FINALIZED_YES`). +/// Verification #7: dropping a `BendlStreamSession` mid-flight must leave the bundle on disk +/// unfinalized (no directory written, header `finalized != FINALIZED_YES`). #[test] fn bundle_streaming_session_drop_leaves_unfinalized() { let mut buf: Vec = Vec::new(); @@ -1294,17 +1281,16 @@ fn bundle_streaming_session_drop_leaves_unfinalized() { ); } -/// Verification #9: `BendlStreamSession::write` must increment its -/// internal byte counter by the returned write count, not by the -/// requested buffer length, so partial writes are accounted correctly -/// and the finalized header's `stream_len` matches the actual byte -/// count of the stream region. +/// Verification #9: `BendlStreamSession::write` must increment its internal byte counter by the +/// returned write count, not by the requested buffer length, so partial writes are accounted +/// correctly and the finalized header's `stream_len` matches the actual byte count of the stream +/// region. #[test] fn stream_session_partial_writes_account_returned_bytes() { use std::io::{self, Cursor as IoCursor, SeekFrom}; - /// Inner writer that always reports `cap` bytes written per call, - /// regardless of the buffer length, but writes the matching prefix. + /// Inner writer that always reports `cap` bytes written per call, regardless of the buffer + /// length, but writes the matching prefix. struct ShortWriter { cursor: IoCursor>, cap: usize, @@ -1335,8 +1321,8 @@ fn stream_session_partial_writes_account_returned_bytes() { let writer = BendlWriter::new(inner, AssignmentFormat::Ben).unwrap(); let mut session = writer.into_stream_session().unwrap(); - // Drive a few partial writes; total written should equal the sum - // of the returned `n` from each call. + // Drive a few partial writes; total written should equal the sum of the returned `n` from each + // call. let mut total_returned: u64 = 0; for _ in 0..5 { let n = session.write(b"hello world").unwrap(); diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 9cc6b55..07d8777 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -8,9 +8,9 @@ //! //! The writer operates in three logical phases, expressed via owned typestate transitions: //! -//! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero or more times. Each -//! call writes the (optionally xz-compressed) payload to the file and records its absolute -//! offset and length in an in-memory entry list. +//! 1. **asset phase** — the caller invokes [`BendlWriter::add_asset`] zero or more times. Each call +//! writes the (optionally xz-compressed) payload to the file and records its absolute offset and +//! length in an in-memory entry list. //! 2. **stream phase** — the caller invokes [`BendlWriter::into_stream_session`] to consume the //! writer and obtain a [`BendlStreamSession`] that owns the underlying writer and implements //! `Write`. When the stream is complete the caller calls @@ -69,12 +69,11 @@ impl BendlTruncate for std::io::Cursor> { /// `*_unverified` API and excluded from normal write paths. #[derive(Debug, Clone, Default)] pub struct AddAssetOptions { - /// Compression override. `None` means "follow the default policy for - /// this asset type"; `Some(true)` forces xz compression; `Some(false)` - /// forces a raw payload. + /// Compression override. `None` means "follow the default policy for this asset type"; + /// `Some(true)` forces xz compression; `Some(false)` forces a raw payload. pub compress: Option, - /// Whether the decoded payload is UTF-8 JSON. Adds the - /// [`ASSET_FLAG_JSON`] bit to the entry's flags. + /// Whether the decoded payload is UTF-8 JSON. Adds the [`ASSET_FLAG_JSON`] bit to the entry's + /// flags. pub is_json: bool, } @@ -96,8 +95,7 @@ impl AddAssetOptions { self } - /// Force the writer to store the payload raw even if the default - /// policy would compress it. + /// Force the writer to store the payload raw even if the default policy would compress it. pub fn raw(mut self) -> Self { self.compress = Some(false); self @@ -116,28 +114,26 @@ pub struct BendlWriter { #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum WriterState { - /// No assets have been written yet, but the provisional header is - /// already in place and the writer is positioned just after it. + /// No assets have been written yet, but the provisional header is already in place and the + /// writer is positioned just after it. Assets, - /// A stream session has been finished and the writer is ready for - /// [`BendlWriter::finish`]. The streaming phase itself is expressed - /// in the type system via [`BendlStreamSession`] and is therefore - /// not observable in this enum. + /// A stream session has been finished and the writer is ready for [`BendlWriter::finish`]. The + /// streaming phase itself is expressed in the type system via [`BendlStreamSession`] and is + /// therefore not observable in this enum. StreamWritten { stream_len: u64, sample_count: i64 }, } impl BendlWriter { /// Create a new writer by writing a provisional header at offset 0. /// - /// The assignment stream will begin immediately after the asset - /// payload region — [`BendlWriter::into_stream_session`] computes - /// the exact offset at the moment it is called, so asset writes - /// that happen between `new` and `into_stream_session` push the - /// stream out as expected. + /// The assignment stream will begin immediately after the asset payload region — + /// [`BendlWriter::into_stream_session`] computes the exact offset at the moment it is called, + /// so asset writes that happen between `new` and `into_stream_session` push the stream out as + /// expected. pub fn new(mut inner: W, assignment_format: AssignmentFormat) -> io::Result { inner.seek(SeekFrom::Start(0))?; - // stream_offset in the provisional header is patched at - // into_stream_session time; start it just after the header. + // stream_offset in the provisional header is patched at into_stream_session time; start it + // just after the header. let header = BendlHeader::provisional(assignment_format, HEADER_SIZE as u64); header.write_to(&mut inner)?; @@ -153,14 +149,12 @@ impl BendlWriter { /// Add an asset to the bundle. /// - /// The payload is written to the file immediately at the current - /// position (right after the previous asset, or right after the - /// header if this is the first asset). Its absolute offset and - /// length are recorded in the in-memory directory entry list. + /// The payload is written to the file immediately at the current position (right after the + /// previous asset, or right after the header if this is the first asset). Its absolute offset + /// and length are recorded in the in-memory directory entry list. /// - /// This method enforces the canonical-name and uniqueness rules - /// **before** writing any bytes, so a rejected asset leaves the - /// file untouched. + /// This method enforces the canonical-name and uniqueness rules **before** writing any bytes, + /// so a rejected asset leaves the file untouched. pub fn add_asset( &mut self, asset_type: u16, @@ -188,9 +182,8 @@ impl BendlWriter { // Unique name rule. if !self.names.insert(name.to_string()) { - // Roll back the singleton insertion before returning, so - // the writer remains in a consistent state. (Only known - // singleton types would have been inserted above.) + // Roll back the singleton insertion before returning, so the writer remains in a + // consistent state. (Only known singleton types would have been inserted above.) if standardized_name_for(asset_type).is_some() { self.singleton_types.remove(&asset_type); } @@ -211,9 +204,9 @@ impl BendlWriter { payload.to_vec() }; - // CRC32C over the on-disk payload bytes. For compressed assets this is the compressed - // bytes (verification happens before decompression). See ASSET_FLAG_CHECKSUM for the - // wire-format pin. + // CRC32C over the on-disk payload bytes. For compressed assets this is the compressed bytes + // (verification happens before decompression). See ASSET_FLAG_CHECKSUM for the wire-format + // pin. let crc = crc32c::crc32c(&payload_bytes); let checksum_bytes = crc.to_le_bytes().to_vec(); @@ -288,14 +281,14 @@ impl BendlWriter { /// Consume the writer and transition into the stream phase. /// - /// The returned [`BendlStreamSession`] owns the underlying writer and implements `Write`, so - /// it can be plumbed into a [`crate::io::writer::BenStreamWriter`] (or written to directly). - /// When the stream is complete the caller calls [`BendlStreamSession::finish_into_writer`] - /// to recover ownership of a [`BendlWriter`] in the `StreamWritten` state, ready for + /// The returned [`BendlStreamSession`] owns the underlying writer and implements `Write`, so it + /// can be plumbed into a [`crate::io::writer::BenStreamWriter`] (or written to directly). When + /// the stream is complete the caller calls [`BendlStreamSession::finish_into_writer`] to + /// recover ownership of a [`BendlWriter`] in the `StreamWritten` state, ready for /// [`BendlWriter::finish`]. /// - /// Returns [`BendlWriteError::WrongState`] when called on a writer that has already produced - /// a stream (e.g. via a prior `finish_into_writer`); this guard prevents a second + /// Returns [`BendlWriteError::WrongState`] when called on a writer that has already produced a + /// stream (e.g. via a prior `finish_into_writer`); this guard prevents a second /// `into_stream_session` from silently overwriting `header.stream_offset` and corrupting the /// bundle. pub fn into_stream_session(mut self) -> Result, BendlWriteError> { @@ -325,8 +318,7 @@ impl BendlWriter { }) } - /// Write the trailing directory, patch the header, and return the - /// underlying writer. + /// Write the trailing directory, patch the header, and return the underlying writer. pub fn finish(mut self) -> Result { let (stream_len, sample_count) = match self.state { WriterState::StreamWritten { @@ -334,8 +326,7 @@ impl BendlWriter { sample_count, } => (stream_len, sample_count), WriterState::Assets => { - // No stream written; treat as empty stream located just - // after the asset region. + // No stream written; treat as empty stream located just after the asset region. let stream_offset = self.inner.seek(SeekFrom::Current(0))?; self.header.stream_offset = stream_offset; (0, 0) @@ -369,9 +360,9 @@ impl BendlWriter { } } -/// Internal state of a [`BendlWriter`] that has been temporarily moved -/// into a [`BendlStreamSession`]. Stored as a single struct so -/// `finish_into_writer` can rebuild the writer with one move. +/// Internal state of a [`BendlWriter`] that has been temporarily moved into a +/// [`BendlStreamSession`]. Stored as a single struct so `finish_into_writer` can rebuild the writer +/// with one move. struct ParentState { header: BendlHeader, entries: Vec, @@ -379,18 +370,14 @@ struct ParentState { singleton_types: HashSet, } -/// Owned stream-phase session. Holds the underlying writer and the -/// parent [`BendlWriter`]'s in-memory state across the streaming phase, -/// implements `Write` so it can be plumbed into a -/// [`crate::io::writer::BenStreamWriter`], and exposes -/// [`Self::finish_into_writer`] to hand ownership back as a -/// [`BendlWriter`] in the `StreamWritten` state. +/// Owned stream-phase session. Holds the underlying writer and the parent [`BendlWriter`]'s +/// in-memory state across the streaming phase, implements `Write` so it can be plumbed into a +/// [`crate::io::writer::BenStreamWriter`], and exposes [`Self::finish_into_writer`] to hand +/// ownership back as a [`BendlWriter`] in the `StreamWritten` state. /// -/// `inner` and `parent` are wrapped in `Option` so `finish_into_writer` -/// can `take()` them without partial-moving out of a `Drop` type. The -/// [`Drop`] impl emits a `tracing::warn!` if the session is dropped -/// without `finish_into_writer`, since that leaves the bundle on disk -/// unfinalized. +/// `inner` and `parent` are wrapped in `Option` so `finish_into_writer` can `take()` them without +/// partial-moving out of a `Drop` type. The [`Drop`] impl emits a `tracing::warn!` if the session +/// is dropped without `finish_into_writer`, since that leaves the bundle on disk unfinalized. pub struct BendlStreamSession { inner: Option, parent: Option, @@ -399,25 +386,23 @@ pub struct BendlStreamSession { } impl BendlStreamSession { - /// Number of bytes written into the stream region so far. Pure - /// counter — no I/O, no `&mut` required. + /// Number of bytes written into the stream region so far. Pure counter — no I/O, no `&mut` + /// required. pub fn bytes_written(&self) -> u64 { self.bytes_written } - /// Offset (in the underlying writer) at which the stream region - /// began, recorded at session-construction time. + /// Offset (in the underlying writer) at which the stream region began, recorded at + /// session-construction time. pub fn start_offset(&self) -> u64 { self.start_offset } - /// End the stream phase and return ownership of a [`BendlWriter`] - /// in the `StreamWritten` state, ready for [`BendlWriter::finish`]. + /// End the stream phase and return ownership of a [`BendlWriter`] in the `StreamWritten` state, + /// ready for [`BendlWriter::finish`]. /// - /// Infallible: the body is `take()` + arithmetic + struct - /// construction with no I/O. Once this method returns, the - /// session's [`Drop`] impl observes `inner.is_none()` and skips - /// the warn. + /// Infallible: the body is `take()` + arithmetic + struct construction with no I/O. Once this + /// method returns, the session's [`Drop`] impl observes `inner.is_none()` and skips the warn. pub fn finish_into_writer(mut self, sample_count: i64) -> BendlWriter { let inner = self.inner.take().expect("session has not been finished"); let parent = self.parent.take().expect("session has not been finished"); @@ -514,24 +499,21 @@ pub enum BendlWriteError { // Append path // --------------------------------------------------------------------------- -/// Post-finalize appender that grows an existing `.bendl` file with new -/// assets without rewriting the assignment stream. +/// Post-finalize appender that grows an existing `.bendl` file with new assets without rewriting +/// the assignment stream. /// /// The workflow is: /// -/// 1. [`BendlAppender::open`] opens a finalized bundle and loads its -/// directory into memory. -/// 2. [`BendlAppender::add_asset`] (or [`BendlAppender::add_json_asset`]) -/// validates and buffers each new asset. Validation happens up front, -/// so duplicate singletons or names are rejected **before** any file -/// mutation, and a rejected add_asset leaves the file unchanged. -/// 3. [`BendlAppender::commit`] compresses the buffered assets (if any), -/// truncates the file at the old directory offset, writes the new -/// asset payloads, writes a new directory at the new EOF, and patches -/// the header. +/// 1. [`BendlAppender::open`] opens a finalized bundle and loads its directory into memory. +/// 2. [`BendlAppender::add_asset`] (or [`BendlAppender::add_json_asset`]) validates and buffers +/// each new asset. Validation happens up front, so duplicate singletons or names are rejected +/// **before** any file mutation, and a rejected add_asset leaves the file unchanged. +/// 3. [`BendlAppender::commit`] compresses the buffered assets (if any), truncates the file at the +/// old directory offset, writes the new asset payloads, writes a new directory at the new EOF, +/// and patches the header. /// -/// A [`BendlAppender`] that is dropped without calling `commit` leaves -/// the underlying file unchanged. +/// A [`BendlAppender`] that is dropped without calling `commit` leaves the underlying file +/// unchanged. pub struct BendlAppender { inner: W, header: BendlHeader, @@ -557,9 +539,8 @@ struct PendingAsset { impl BendlAppender { /// Open a finalized bundle for append. /// - /// Returns [`BendlWriteError::BundleIncomplete`] if the header's - /// `complete` flag is not set — append is unsafe on unfinalized - /// bundles because the stream region has no authoritative end. + /// Returns [`BendlWriteError::BundleIncomplete`] if the header's `complete` flag is not set — + /// append is unsafe on unfinalized bundles because the stream region has no authoritative end. pub fn open(mut inner: W) -> Result { inner.seek(SeekFrom::Start(0))?; let header = BendlHeader::read_from(&mut inner).map_err(BendlWriteError::Format)?; @@ -606,10 +587,9 @@ impl BendlAppender { /// Enqueue a new asset for append. /// - /// This validates the new asset against both the loaded directory - /// and any previously-enqueued pending assets. If validation fails, - /// the pending list is unchanged and no bytes have been written to - /// the file. + /// This validates the new asset against both the loaded directory and any previously-enqueued + /// pending assets. If validation fails, the pending list is unchanged and no bytes have been + /// written to the file. pub fn add_asset( &mut self, asset_type: u16, @@ -671,8 +651,8 @@ impl BendlAppender { ) } - /// Append one of the known singleton assets, using its reserved - /// asset-type integer and standardized name automatically. + /// Append one of the known singleton assets, using its reserved asset-type integer and + /// standardized name automatically. pub fn add_known_asset( &mut self, kind: KnownAssetKind, @@ -687,8 +667,8 @@ impl BendlAppender { ) } - /// Append a custom (writer-named) asset. The asset-type is set to - /// [`ASSET_TYPE_CUSTOM`] automatically. + /// Append a custom (writer-named) asset. The asset-type is set to [`ASSET_TYPE_CUSTOM`] + /// automatically. pub fn add_custom_asset( &mut self, name: &str, @@ -700,10 +680,9 @@ impl BendlAppender { /// Commit all pending appends. /// - /// This compresses any buffered payloads that need it (entirely in - /// memory), then performs the file mutation in a single burst: - /// truncate at the old directory offset, write new payloads, write - /// a new directory, and patch the header. + /// This compresses any buffered payloads that need it (entirely in memory), then performs the + /// file mutation in a single burst: truncate at the old directory offset, write new payloads, + /// write a new directory, and patch the header. /// /// If compression fails, the file is left unchanged. pub fn commit(mut self) -> Result { @@ -712,9 +691,8 @@ impl BendlAppender { return Ok(self.inner); } - // Phase 1: compress any pending payloads and build new entries with - // placeholder offsets. Do this entirely in memory so failures here - // leave the file untouched. + // Phase 1: compress any pending payloads and build new entries with placeholder offsets. Do + // this entirely in memory so failures here leave the file untouched. struct EncodedPending { asset_type: u16, name: String, @@ -754,11 +732,10 @@ impl BendlAppender { }); } - // Phase 2: file mutation. From this point forward, a failure - // leaves the bundle in a damaged state. We do everything in the - // order (truncate, write payloads, write directory, patch header) - // so that even if we crash mid-way, the header still points at - // the old directory until the very last write. + // Phase 2: file mutation. From this point forward, a failure leaves the bundle in a damaged + // state. We do everything in the order (truncate, write payloads, write directory, patch + // header) so that even if we crash mid-way, the header still points at the old directory + // until the very last write. let old_directory_offset = self.header.directory_offset; // Truncate at the old directory offset. @@ -799,8 +776,7 @@ impl BendlAppender { Ok(self.inner) } - /// Release the underlying reader without committing any pending - /// appends. The file is unchanged. + /// Release the underlying reader without committing any pending appends. The file is unchanged. pub fn abort(self) -> W { self.inner } diff --git a/ben/src/io/reader/stream_reader/ben.rs b/ben/src/io/reader/stream_reader/ben.rs index 8cf95ac..b043919 100644 --- a/ben/src/io/reader/stream_reader/ben.rs +++ b/ben/src/io/reader/stream_reader/ben.rs @@ -10,9 +10,8 @@ use crate::BenVariant; /// Read the next frame from the underlying BEN stream. /// -/// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire -/// format; this helper tracks that state so the frame module stays -/// variant-clean. +/// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire format; this helper tracks +/// that state so the frame module stays variant-clean. pub(super) fn pop_frame_from_reader( reader: &mut R, variant: BenVariant, @@ -103,10 +102,7 @@ pub(super) fn next_record_ben( Some(Ok((assignment, count))) } -pub(super) fn count_samples_ben( - mut reader: R, - variant: BenVariant, -) -> io::Result { +pub(super) fn count_samples_ben(mut reader: R, variant: BenVariant) -> io::Result { let mut twodelta_consumed_first_frame = false; let mut total = 0usize; while let Some(frame_res) = diff --git a/ben/src/io/reader/stream_reader/frames.rs b/ben/src/io/reader/stream_reader/frames.rs index fdc2c49..10f00f3 100644 --- a/ben/src/io/reader/stream_reader/frames.rs +++ b/ben/src/io/reader/stream_reader/frames.rs @@ -13,14 +13,13 @@ use crate::BenVariant; /// Iterator over raw frames from a [`BenStreamReader`]. /// -/// In the BEN arm: `Standard` and `MkvChain` frames are yielded as read off -/// the wire; `TwoDelta` frames are materialized as assignments and re-encoded -/// as `Standard` decode frames so downstream subsample consumers always see -/// self-contained frames. +/// In the BEN arm: `Standard` and `MkvChain` frames are yielded as read off the wire; `TwoDelta` +/// frames are materialized as assignments and re-encoded as `Standard` decode frames so downstream +/// subsample consumers always see self-contained frames. /// -/// In the XBEN arm: `Standard` and `MkvChain` frames are yielded as raw -/// ben32 byte slices with their repetition count; `TwoDelta` chunks are -/// materialized to assignments and re-encoded as ben32 frames. +/// In the XBEN arm: `Standard` and `MkvChain` frames are yielded as raw ben32 byte slices with +/// their repetition count; `TwoDelta` chunks are materialized to assignments and re-encoded as +/// ben32 frames. pub struct BenStreamFrameReader { inner: BenStreamReader, } @@ -113,9 +112,9 @@ impl Iterator for BenStreamFrameReader { "BenEncodeFrame::from_assignment(Standard) always returns Standard" ), }; - // Strip the 6-byte frame header so the emitted decode-side - // frame's raw_bytes matches the historical payload-only - // shape that BenDecodeFrame::Standard carries. + // Strip the 6-byte frame header so the emitted decode-side frame's + // raw_bytes matches the historical payload-only shape that + // BenDecodeFrame::Standard carries. let payload_only = raw_bytes[6..].to_vec(); Some(Ok(( DecodeFrame::Ben(BenDecodeFrame::Standard { @@ -140,8 +139,8 @@ impl Iterator for BenStreamFrameReader { /// Pull the next raw ben32 frame from an XBEN inner state. /// -/// For TwoDelta streams the underlying chunk is materialized via the record -/// iterator and re-encoded as a self-contained ben32 frame. +/// For TwoDelta streams the underlying chunk is materialized via the record iterator and re-encoded +/// as a self-contained ben32 frame. pub(super) fn next_frame_xben( inner: &mut XBenInner, variant: BenVariant, diff --git a/ben/src/io/reader/stream_reader/mod.rs b/ben/src/io/reader/stream_reader/mod.rs index ab9a943..86ed4f3 100644 --- a/ben/src/io/reader/stream_reader/mod.rs +++ b/ben/src/io/reader/stream_reader/mod.rs @@ -1,11 +1,9 @@ -//! Unified reader for the BEN-stack stream layer (layer 3 — see -//! `docs/glossary.md`). +//! Unified reader for the BEN-stack stream layer (layer 3 — see `docs/glossary.md`). //! -//! Hides the wire-format choice (BEN bit-packed vs ben32 columnar) and the -//! transport choice (plain vs xz-compressed) behind one type. The decode-side -//! laziness invariant is preserved on both wire formats: frame payload bytes -//! stay opaque until [`crate::codec::BenDecodeFrame::expand`] (frame-level -//! decode), not to be confused with +//! Hides the wire-format choice (BEN bit-packed vs ben32 columnar) and the transport choice (plain +//! vs xz-compressed) behind one type. The decode-side laziness invariant is preserved on both wire +//! formats: frame payload bytes stay opaque until [`crate::codec::BenDecodeFrame::expand`] +//! (frame-level decode), not to be confused with //! [`crate::io::reader::DecodeFrame::expand_self_contained`] (subsample-level). mod ben; @@ -28,9 +26,9 @@ pub use frames::BenStreamFrameReader; /// Wire format of a BEN-stack stream. /// -/// The Rust representation of the BEN/XBEN stream choice. This is the seam -/// the public reader API uses to dispatch on wire format; the bundle layer -/// owns its own conversion from `AssignmentFormat`. +/// The Rust representation of the BEN/XBEN stream choice. This is the seam the public reader API +/// uses to dispatch on wire format; the bundle layer owns its own conversion from +/// `AssignmentFormat`. #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum BenWireFormat { /// Plain BEN bit-packed wire format over an unwrapped byte stream. @@ -41,9 +39,8 @@ pub enum BenWireFormat { /// Reader for an encoded BEN-stack stream of samples. /// -/// Construct with [`BenStreamReader::from_ben`] or -/// [`BenStreamReader::from_xben`]. Both arms expose the same downstream -/// surface for assignment iteration, JSONL writing, sample counting, and +/// Construct with [`BenStreamReader::from_ben`] or [`BenStreamReader::from_xben`]. Both arms expose +/// the same downstream surface for assignment iteration, JSONL writing, sample counting, and /// subsampling. pub struct BenStreamReader { inner: BenStreamInner, @@ -51,9 +48,9 @@ pub struct BenStreamReader { silent: bool, } -/// Wire-format split: the `Ben` arm carries inline state, the `XBen` arm is -/// boxed so the enum's static size stays close to the smaller plain-BEN -/// footprint instead of being dictated by the larger xz state. +/// Wire-format split: the `Ben` arm carries inline state, the `XBen` arm is boxed so the enum's +/// static size stays close to the smaller plain-BEN footprint instead of being dictated by the +/// larger xz state. pub(crate) enum BenStreamInner { Ben { reader: R, @@ -103,8 +100,8 @@ impl BenStreamReader { }) } - /// Open an XBEN stream. The reader must produce, after xz decompression, - /// a 17-byte BEN banner followed by ben32 columnar frames. + /// Open an XBEN stream. The reader must produce, after xz decompression, a 17-byte BEN banner + /// followed by ben32 columnar frames. pub fn from_xben(reader: R) -> Result { let xz = XzDecoder::new(reader); let mut xz = BufReader::with_capacity(1 << 20, xz); @@ -119,12 +116,8 @@ impl BenStreamReader { Ok(Self::from_xben_decompressed(xz, variant)) } - /// Build from a decompressed XBEN stream already positioned past the - /// 17-byte BEN banner. - pub(crate) fn from_xben_decompressed( - xz: BufReader>, - variant: BenVariant, - ) -> Self { + /// Build from a decompressed XBEN stream already positioned past the 17-byte BEN banner. + pub(crate) fn from_xben_decompressed(xz: BufReader>, variant: BenVariant) -> Self { Self { inner: BenStreamInner::XBen(Box::new(XBenInner { xz, @@ -153,15 +146,11 @@ impl BenStreamReader { /// Suppress progress output from this decoder's iteration paths. /// - /// In the `Ben` arm, this clears any active spinner. In the `XBen` arm, - /// `for_each_assignment` consults `silent` before creating its local - /// spinner. + /// In the `Ben` arm, this clears any active spinner. In the `XBen` arm, `for_each_assignment` + /// consults `silent` before creating its local spinner. pub fn silent(mut self, silent: bool) -> Self { self.silent = silent; - if let BenStreamInner::Ben { - spinner, .. - } = &mut self.inner - { + if let BenStreamInner::Ben { spinner, .. } = &mut self.inner { if silent { *spinner = None; } @@ -178,8 +167,8 @@ impl BenStreamReader { &mut self.inner } - /// Consume this decoder and iterate over raw BEN/ben32 frames instead of - /// materialized assignments. + /// Consume this decoder and iterate over raw BEN/ben32 frames instead of materialized + /// assignments. pub fn into_frames(self) -> BenStreamFrameReader { BenStreamFrameReader::from_stream(self) } @@ -190,18 +179,16 @@ impl BenStreamReader { pub fn count_samples(self) -> io::Result { let variant = self.variant; match self.inner { - BenStreamInner::Ben { reader, .. } => { - ben::count_samples_ben(reader, variant) - } + BenStreamInner::Ben { reader, .. } => ben::count_samples_ben(reader, variant), BenStreamInner::XBen(inner) => xben::count_samples_xben(*inner, variant), } } /// Decode assignments and pass each one to a callback by reference. /// - /// Unlike [`Iterator`], this avoids cloning the assignment buffer on every - /// frame. The callback receives a borrowed slice and its repetition - /// count. Return `true` to continue or `false` to stop early. + /// Unlike [`Iterator`], this avoids cloning the assignment buffer on every frame. The callback + /// receives a borrowed slice and its repetition count. Return `true` to continue or `false` to + /// stop early. pub fn for_each_assignment(&mut self, f: F) -> io::Result<()> where F: FnMut(&[u16], u16) -> io::Result, @@ -233,8 +220,8 @@ impl BenStreamReader { /// Decode the remaining stream and write it as JSONL. /// - /// Each decoded sample is written as a JSON object containing an - /// `assignment` vector and a 1-based `sample` index. + /// Each decoded sample is written as a JSON object containing an `assignment` vector and a + /// 1-based `sample` index. pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { let mut sample_number = 0usize; self.for_each_assignment(|assignment, count| { @@ -281,8 +268,7 @@ impl Iterator for BenStreamReader { } impl BenStreamReader { - /// Convert this decoder into a subsampling iterator over explicit 1-based - /// indices. + /// Convert this decoder into a subsampling iterator over explicit 1-based indices. pub fn into_subsample_by_indices( self, indices: T, @@ -293,8 +279,8 @@ impl BenStreamReader { SubsampleFrameDecoder::by_indices(self.into_frames(), indices) } - /// Convert this decoder into a subsampling iterator over the inclusive - /// 1-based range `[start, end]`. + /// Convert this decoder into a subsampling iterator over the inclusive 1-based range + /// `[start, end]`. pub fn into_subsample_by_range( self, start: usize, @@ -303,8 +289,8 @@ impl BenStreamReader { SubsampleFrameDecoder::by_range(self.into_frames(), start, end) } - /// Convert this decoder into a subsampling iterator that selects every - /// `step` samples from the 1-based `offset`. + /// Convert this decoder into a subsampling iterator that selects every `step` samples from the + /// 1-based `offset`. pub fn into_subsample_every( self, step: usize, diff --git a/ben/src/io/reader/stream_reader/xben.rs b/ben/src/io/reader/stream_reader/xben.rs index 55fbf70..6e39970 100644 --- a/ben/src/io/reader/stream_reader/xben.rs +++ b/ben/src/io/reader/stream_reader/xben.rs @@ -12,8 +12,8 @@ use crate::BenVariant; /// Try to extract one complete ben32 frame from the buffered overflow. /// -/// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 -/// frame and, for MkvChain streams, reads the trailing repetition count. +/// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 frame and, for MkvChain +/// streams, reads the trailing repetition count. pub(super) fn pop_frame_from_overflow<'a>( variant: BenVariant, overflow: &'a [u8], @@ -86,9 +86,9 @@ fn pop_twodelta_frame_from_overflow( /// Try to parse a columnar TwoDelta chunk from the overflow buffer. /// -/// If the overflow starts with the chunk tag and contains enough bytes for -/// the full chunk, all frames are decoded and pushed onto `chunk_queue`. -/// Returns `true` on success, `false` when the overflow is incomplete. +/// If the overflow starts with the chunk tag and contains enough bytes for the full chunk, all +/// frames are decoded and pushed onto `chunk_queue`. Returns `true` on success, `false` when the +/// overflow is incomplete. fn try_parse_twodelta_chunk(inner: &mut XBenInner) -> bool { if inner.overflow.first() != Some(&XBEN_TWODELTA_CHUNK_TAG) { return false; @@ -167,7 +167,10 @@ fn try_parse_twodelta_chunk(inner: &mut XBenInner) -> bool { } /// Decode one raw ben32 frame from an XBEN stream into a full assignment vector. -fn decode_xben_frame_to_assignment(frame_bytes: &[u8], variant: BenVariant) -> io::Result> { +fn decode_xben_frame_to_assignment( + frame_bytes: &[u8], + variant: BenVariant, +) -> io::Result> { let (assignment, _) = decode_ben32_line(Cursor::new(frame_bytes), variant)?; Ok(assignment) } diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index a1de190..025a90c 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -26,12 +26,10 @@ pub enum DecodeFrame { impl DecodeFrame { /// Expand a self-contained subsample frame into an assignment vector. /// - /// Distinct from [`BenDecodeFrame::expand`] (which takes a previous - /// assignment for delta variants); the frame readers guarantee frames - /// reaching the subsample path are self-contained, so no `prev` is - /// needed: plain-BEN TwoDelta is materialized and re-encoded as - /// `Standard`, and XBEN TwoDelta is materialized and re-encoded as - /// ben32. + /// Distinct from [`BenDecodeFrame::expand`] (which takes a previous assignment for delta + /// variants); the frame readers guarantee frames reaching the subsample path are + /// self-contained, so no `prev` is needed: plain-BEN TwoDelta is materialized and re-encoded as + /// `Standard`, and XBEN TwoDelta is materialized and re-encoded as ben32. pub fn expand_self_contained(&self) -> io::Result> { match self { DecodeFrame::Ben(f) => f.expand(None), @@ -197,8 +195,8 @@ where /// Build a generic frame iterator from a BEN or XBEN file path. /// -/// Frame iteration is useful for subsampling and counting because it avoids -/// decoding every sample into a full assignment vector. +/// Frame iteration is useful for subsampling and counting because it avoids decoding every sample +/// into a full assignment vector. pub fn build_frame_iter(file_path: &PathBuf, format: BenWireFormat) -> io::Result { let file = File::options().read(true).open(file_path)?; let reader = BufReader::new(file); @@ -207,10 +205,9 @@ pub fn build_frame_iter(file_path: &PathBuf, format: BenWireFormat) -> io::Resul /// Build a generic frame iterator from an already-opened reader. /// -/// This is the reader-driven variant of [`build_frame_iter`], useful when -/// the caller needs to iterate frames over a sub-region of a file (e.g. -/// the assignment stream embedded in a `.bendl` bundle, wrapped in a -/// [`std::io::Read::take`] guard) without re-opening the file from offset +/// This is the reader-driven variant of [`build_frame_iter`], useful when the caller needs to +/// iterate frames over a sub-region of a file (e.g. the assignment stream embedded in a `.bendl` +/// bundle, wrapped in a [`std::io::Read::take`] guard) without re-opening the file from offset /// zero. pub fn build_frame_iter_from_reader( reader: R, @@ -230,8 +227,8 @@ pub fn build_frame_iter_from_reader( /// Count the number of samples in a BEN or XBEN file on disk. /// -/// The file is walked frame-by-frame, so this is linear in file size but avoids -/// materializing full assignment vectors. +/// The file is walked frame-by-frame, so this is linear in file size but avoids materializing full +/// assignment vectors. pub fn count_samples_from_file(path: &Path, format: BenWireFormat) -> io::Result { let iter = build_frame_iter(&path.to_path_buf(), format)?; count_samples_from_frame_iter(iter) @@ -239,10 +236,9 @@ pub fn count_samples_from_file(path: &Path, format: BenWireFormat) -> io::Result /// Count the number of samples reachable through a pre-built frame iterator. /// -/// Mirror of [`count_samples_from_file`] that operates on an existing -/// [`FrameIter`], so callers that already have one (e.g. constructed via -/// [`build_frame_iter_from_reader`] over a bundle's stream region) can -/// reuse the walking logic without re-opening any files. +/// Mirror of [`count_samples_from_file`] that operates on an existing [`FrameIter`], so callers +/// that already have one (e.g. constructed via [`build_frame_iter_from_reader`] over a bundle's +/// stream region) can reuse the walking logic without re-opening any files. pub fn count_samples_from_frame_iter(iter: FrameIter) -> io::Result { let mut total = 0usize; for item in iter { @@ -251,4 +247,3 @@ pub fn count_samples_from_frame_iter(iter: FrameIter) -> io::Result { } Ok(total) } - diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index d53867d..50f4645 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -10,7 +10,16 @@ use xz2::write::XzEncoder; /// Build a minimal XBEN stream from JSONL input for testing. fn make_xben(jsonl: &str, variant: BenVariant) -> Vec { let mut xben = Vec::new(); - encode_jsonl_to_xben(jsonl.as_bytes(), &mut xben, variant, Some(1), Some(1), None, None).unwrap(); + encode_jsonl_to_xben( + jsonl.as_bytes(), + &mut xben, + variant, + Some(1), + Some(1), + None, + None, + ) + .unwrap(); xben } @@ -656,11 +665,7 @@ fn subsample_indices_empty_yields_nothing() { #[test] fn subsample_twodelta_by_range() { - let assignments = vec![ - vec![1u16, 1, 2, 2], - vec![2, 1, 2, 2], - vec![2, 2, 2, 2], - ]; + let assignments = vec![vec![1u16, 1, 2, 2], vec![2, 1, 2, 2], vec![2, 2, 2, 2]]; let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader @@ -725,7 +730,9 @@ fn xz_twodelta_all_identical_single_value_roundtrip() { fn xz_twodelta_alternating_assignments_roundtrip() { let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; - let assignments: Vec<_> = (0..50).map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }).collect(); + let assignments: Vec<_> = (0..50) + .map(|i| if i % 2 == 0 { a.clone() } else { b.clone() }) + .collect(); let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); @@ -763,8 +770,7 @@ fn xz_twodelta_chunk_boundary_roundtrip() { { let encoder = XzEncoder::new(&mut xben, 1); let mut writer = - BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::TwoDelta, Some(3)) - .unwrap(); + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::TwoDelta, Some(3)).unwrap(); writer.write_assignment(anchor.clone()).unwrap(); for _ in 0..10 { writer.write_assignment(delta.clone()).unwrap(); @@ -821,8 +827,8 @@ fn xz_twodelta_repeated_delta_in_chunk_roundtrip() { #[test] fn translate_ben_twodelta_to_xben_roundtrip() { - use crate::codec::encode::encode_ben_to_xben; use crate::codec::decode::decode_xben_to_jsonl; + use crate::codec::encode::encode_ben_to_xben; use crate::io::writer::BenStreamWriter; use std::io::BufReader; @@ -840,7 +846,15 @@ fn translate_ben_twodelta_to_xben_roundtrip() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let mut jsonl = Vec::new(); decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); @@ -886,7 +900,15 @@ fn translate_ben_twodelta_to_xben_with_repetitions() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap()).collect(); @@ -915,7 +937,15 @@ fn translate_ben_twodelta_to_xben_many_deltas() { } let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None).unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); @@ -926,8 +956,10 @@ fn translate_ben_twodelta_to_xben_many_deltas() { #[test] fn count_samples_from_frame_iter_basic() { - use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::subsample::{ + build_frame_iter_from_reader, count_samples_from_frame_iter, + }; let jsonl = r#"{"assignment":[1,2],"sample":1} {"assignment":[3,4],"sample":2} @@ -942,7 +974,9 @@ fn count_samples_from_frame_iter_basic() { #[test] fn count_samples_from_frame_iter_xben() { - use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; + use crate::io::reader::subsample::{ + build_frame_iter_from_reader, count_samples_from_frame_iter, + }; let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[2,2,1,1],"sample":2} @@ -954,8 +988,10 @@ fn count_samples_from_frame_iter_xben() { #[test] fn count_samples_from_frame_iter_mkv() { - use crate::io::reader::subsample::{build_frame_iter_from_reader, count_samples_from_frame_iter}; use crate::codec::encode::encode_jsonl_to_ben; + use crate::io::reader::subsample::{ + build_frame_iter_from_reader, count_samples_from_frame_iter, + }; let jsonl = r#"{"assignment":[1,2],"sample":1} {"assignment":[1,2],"sample":2} @@ -1080,9 +1116,7 @@ fn xz_reader_standard_zero_count_frame_errors() { { let mut encoder = XzEncoder::new(&mut xben, 1); // Write banner - encoder - .write_all(b"STANDARD BEN FILE") - .unwrap(); + encoder.write_all(b"STANDARD BEN FILE").unwrap(); // Write a ben32 frame: one RLE pair (value=1, count=3) + zero terminator let frame: &[u8] = &[ 0, 1, 0, 3, // (value=1, count=3) @@ -1092,19 +1126,16 @@ fn xz_reader_standard_zero_count_frame_errors() { encoder.finish().unwrap(); } - // Manually patch: for Standard, there's no count field after the - // terminator. Zero-count only fires for MkvChain where the count is explicit. - // So test MkvChain zero-count instead. + // Manually patch: for Standard, there's no count field after the terminator. Zero-count only + // fires for MkvChain where the count is explicit. So test MkvChain zero-count instead. let mut xben_mkv = Vec::new(); { let mut encoder = XzEncoder::new(&mut xben_mkv, 1); - encoder - .write_all(b"MKVCHAIN BEN FILE") - .unwrap(); + encoder.write_all(b"MKVCHAIN BEN FILE").unwrap(); let frame: &[u8] = &[ 0, 1, 0, 3, // (value=1, count=3) 0, 0, 0, 0, // zero terminator - 0, 0, // count = 0 <-- triggers zero_count_frame_error + 0, 0, // count = 0 <-- triggers zero_count_frame_error ]; encoder.write_all(frame).unwrap(); encoder.finish().unwrap(); @@ -1122,9 +1153,7 @@ fn xz_reader_twodelta_unknown_frame_tag_errors() { let mut xben = Vec::new(); { let mut encoder = XzEncoder::new(&mut xben, 1); - encoder - .write_all(b"TWODELTA BEN FILE") - .unwrap(); + encoder.write_all(b"TWODELTA BEN FILE").unwrap(); // Write a byte with unknown tag (0xFF) encoder.write_all(&[0xFF]).unwrap(); encoder.finish().unwrap(); @@ -1142,9 +1171,7 @@ fn xz_reader_truncated_stream_errors() { let mut xben = Vec::new(); { let mut encoder = XzEncoder::new(&mut xben, 1); - encoder - .write_all(b"STANDARD BEN FILE") - .unwrap(); + encoder.write_all(b"STANDARD BEN FILE").unwrap(); // Write a partial ben32 frame (no zero terminator) encoder.write_all(&[0, 1, 0, 3]).unwrap(); encoder.finish().unwrap(); @@ -1159,8 +1186,8 @@ fn xz_reader_truncated_stream_errors() { #[test] fn subsample_every_first_past_hi() { - // 4 samples, step=10, offset=5: first selected = 5, but only 4 samples - // exist → the `first > hi` branch fires for every frame. + // 4 samples, step=10, offset=5: first selected = 5, but only 4 samples exist → the `first > hi` + // branch fires for every frame. let jsonl = concat!( "{\"assignment\":[1,2],\"sample\":1}\n", "{\"assignment\":[3,4],\"sample\":2}\n", @@ -1246,7 +1273,7 @@ fn xz_reader_twodelta_chunk_zero_count_errors() { // Chunk (tag=2) with 1 frame, count=0 encoder.write_all(&[2u8]).unwrap(); // tag=2 encoder.write_all(&1u32.to_be_bytes()).unwrap(); // n_frames=1 - // Pair channel: (2,1) + // Pair channel: (2,1) encoder.write_all(&2u16.to_be_bytes()).unwrap(); encoder.write_all(&1u16.to_be_bytes()).unwrap(); // Count channel: 0 @@ -1263,15 +1290,19 @@ fn xz_reader_twodelta_chunk_zero_count_errors() { let results: Vec<_> = reader.collect(); assert_eq!(results.len(), 2); // anchor + chunk frame assert!(results[0].is_ok()); - assert!(results[1].as_ref().unwrap_err().to_string().contains("zero")); + assert!(results[1] + .as_ref() + .unwrap_err() + .to_string() + .contains("zero")); } // ── Subsample with indices that skip past frame boundaries ────────── #[test] fn subsample_indices_skip_past_lo() { - // MkvChain stream where first frame has count=5 but we only want indices [7,8]. - // This forces the Indices selection to skip past `lo` (line 160-161 in subsample.rs). + // MkvChain stream where first frame has count=5 but we only want indices [7,8]. This forces the + // Indices selection to skip past `lo` (line 160-161 in subsample.rs). let jsonl = concat!( "{\"assignment\":[1,2,3],\"sample\":1}\n", "{\"assignment\":[1,2,3],\"sample\":2}\n", @@ -1388,8 +1419,7 @@ fn xz_reader_twodelta_tag1_rejected_as_unknown() { let _first = iter.next().unwrap().unwrap(); // consume the valid full frame let err = iter.next().unwrap().unwrap_err(); assert!( - err.to_string().to_lowercase().contains("unknown") - || err.to_string().contains("tag"), + err.to_string().to_lowercase().contains("unknown") || err.to_string().contains("tag"), "expected unknown-tag error, got: {}", err ); @@ -1492,15 +1522,15 @@ fn xz_frame_reader_twodelta_truncated_errors() { fn xz_reader_standard_corrupt_frame_errors() { use xz2::write::XzEncoder; - // Write a valid-looking ben32 frame structure but with corrupted content - // that decode_xben_frame_to_assignment can't parse + // Write a valid-looking ben32 frame structure but with corrupted content that + // decode_xben_frame_to_assignment can't parse let mut xben = Vec::new(); { let mut encoder = XzEncoder::new(&mut xben, 1); encoder.write_all(b"STANDARD BEN FILE").unwrap(); - // Write 4 bytes followed by zero terminator — the frame decodes to - // a single run (value=255, count=255). This should actually be valid. - // Instead, write a completely empty frame (just the zero terminator). + // Write 4 bytes followed by zero terminator — the frame decodes to a single run (value=255, + // count=255). This should actually be valid. Instead, write a completely empty frame + // (just the zero terminator). encoder.write_all(&[0, 0, 0, 0]).unwrap(); // just zero terminator (no runs) encoder.finish().unwrap(); } @@ -1516,17 +1546,15 @@ fn xz_reader_standard_corrupt_frame_errors() { #[test] fn subsample_decoder_zero_count_frame_errors() { - // A frame iterator that yields a frame with count=0 should produce an - // InvalidData error from SubsampleFrameDecoder::next(). + // A frame iterator that yields a frame with count=0 should produce an InvalidData error from + // SubsampleFrameDecoder::next(). let frame = DecodeFrame::XBen( vec![0, 1, 0, 2, 0, 0, 0, 0], // valid ben32: [1,2] + zero terminator BenVariant::Standard, ); let items: Vec> = vec![Ok((frame, 0))]; - let mut decoder = SubsampleFrameDecoder::new( - items.into_iter(), - Selection::Range { start: 1, end: 10 }, - ); + let mut decoder = + SubsampleFrameDecoder::new(items.into_iter(), Selection::Range { start: 1, end: 10 }); let err = decoder.next().unwrap().unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("zero"), "got: {}", err); @@ -1536,9 +1564,8 @@ fn subsample_decoder_zero_count_frame_errors() { #[test] fn xz_frame_reader_twodelta_into_frames() { - // Verify that into_frames() works for TwoDelta streams. The frame reader - // takes the TwoDelta short-circuit path (re-encoding decoded assignments - // back to ben32). + // Verify that into_frames() works for TwoDelta streams. The frame reader takes the TwoDelta + // short-circuit path (re-encoding decoded assignments back to ben32). let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} {"assignment":[2,1,2,2],"sample":2} "#; @@ -1596,15 +1623,16 @@ fn raw_frame_iter_propagates_twodelta_decode_error() { writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } - // Locate the TwoDelta delta frame start by parsing the anchor (MkvChain) - // frame header: banner(17) + max_val_bits(1) + max_len_bits(1) + - // n_bytes(4 BE) + payload(n_bytes) + count(2) = anchor_end. + // Locate the TwoDelta delta frame start by parsing the anchor (MkvChain) frame header: + // banner(17) + max_val_bits(1) + max_len_bits(1) + n_bytes(4 BE) + payload(n_bytes) + count(2) + // = anchor_end. let banner_len = 17usize; - let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let n_bytes = + u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; let anchor_end = banner_len + 6 + n_bytes + 2; - // The TwoDelta delta frame: pair_a(2) + pair_b(2) + max_len_bits(1) + ... - // Set max_len_bits to 0, which triggers InvalidData during decoding. + // The TwoDelta delta frame: pair_a(2) + pair_b(2) + max_len_bits(1) + ... Set max_len_bits to + // 0, which triggers InvalidData during decoding. ben[anchor_end + 4] = 0; let reader = BenStreamReader::from_ben(Cursor::new(ben)).unwrap(); @@ -1642,9 +1670,7 @@ fn assignment_reader_for_each_rejects_zero_count_frame() { use crate::io::reader::BenStreamReader; let data = make_mkvchain_zero_count_frame(); let mut reader = BenStreamReader::from_ben(Cursor::new(data)).unwrap(); - let err = reader - .for_each_assignment(|_, _| Ok(true)) - .unwrap_err(); + let err = reader.for_each_assignment(|_, _| Ok(true)).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } diff --git a/ben/src/io/writer/options.rs b/ben/src/io/writer/options.rs index fb04e31..83ccfbd 100644 --- a/ben/src/io/writer/options.rs +++ b/ben/src/io/writer/options.rs @@ -1,10 +1,9 @@ //! Encode-side configuration knobs for the unified BEN stream writer. //! -//! Mirrors the discipline of `RelabelOptions`: a `#[non_exhaustive]` struct -//! with private fields and value-taking builder setters, so adding a knob -//! later is non-breaking. `None` semantically means "use the codec/lzma -//! default" and is distinct from any specific user-provided value; -//! callers who want defaults simply do not call the setter. +//! Mirrors the discipline of `RelabelOptions`: a `#[non_exhaustive]` struct with private fields and +//! value-taking builder setters, so adding a knob later is non-breaking. `None` semantically means +//! "use the codec/lzma default" and is distinct from any specific user-provided value; callers who +//! want defaults simply do not call the setter. use super::twodelta::DEFAULT_TWODELTA_CHUNK_SIZE; @@ -42,8 +41,8 @@ impl XzEncodeOptions { self } - /// Set the TwoDelta columnar chunk size. `0` normalizes to `1`. - /// Ignored for Standard and MkvChain XBEN streams. + /// Set the TwoDelta columnar chunk size. `0` normalizes to `1`. Ignored for Standard and + /// MkvChain XBEN streams. pub fn with_twodelta_chunk_size(mut self, size: usize) -> Self { self.twodelta_chunk_size = size.max(1); self diff --git a/ben/src/io/writer/stream_writer/ben.rs b/ben/src/io/writer/stream_writer/ben.rs index b15b2c0..0789d72 100644 --- a/ben/src/io/writer/stream_writer/ben.rs +++ b/ben/src/io/writer/stream_writer/ben.rs @@ -31,8 +31,8 @@ impl BenState { } } - /// Encode and write the buffered assignment with the accumulated repetition count. - /// No-op when nothing is pending. + /// Encode and write the buffered assignment with the accumulated repetition count. No-op when + /// nothing is pending. pub(super) fn flush_pending_frame(&mut self) -> io::Result<()> { let pending = match self.pending_assignment.take() { Some(p) => p, @@ -45,8 +45,8 @@ impl BenState { Ok(()) } - /// Encode one `(assignment, count)` directly, used for both flush and `write_frame`. - /// Updates `previous_masks` for TwoDelta. + /// Encode one `(assignment, count)` directly, used for both flush and `write_frame`. Updates + /// `previous_masks` for TwoDelta. fn encode_and_write_frame(&mut self, assignment: &[u16], count: u16) -> io::Result<()> { match self.variant { BenVariant::Standard => { @@ -62,8 +62,8 @@ impl BenState { } BenVariant::TwoDelta => { if self.previous_assignment.is_empty() { - // First frame: encode as MkvChain wire format and seed - // the position masks for subsequent delta frames. + // First frame: encode as MkvChain wire format and seed the position masks for + // subsequent delta frames. for (idx, &val) in assignment.iter().enumerate() { self.previous_masks.entry(val).or_default().push(idx); } @@ -108,25 +108,21 @@ impl BenState { Ok(()) } - /// Encode one frame with the supplied count, flushing any pending merge state first. - /// Caller has already verified `count != 0` and that the writer is in a valid state. + /// Encode one frame with the supplied count, flushing any pending merge state first. Caller has + /// already verified `count != 0` and that the writer is in a valid state. pub(super) fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { self.flush_pending_frame()?; self.encode_and_write_frame(&assignment, count)?; - // For TwoDelta, the next delta is encoded against the just-emitted - // frame. `encode_and_write_frame` already updated `previous_masks` - // when the previous_assignment was empty; in all variants we need - // to update `previous_assignment` here so a subsequent + // For TwoDelta, the next delta is encoded against the just-emitted frame. + // `encode_and_write_frame` already updated `previous_masks` when the previous_assignment + // was empty; in all variants we need to update `previous_assignment` here so a subsequent // `write_assignment` sees the right baseline. self.previous_assignment = assignment; Ok(()) } } -pub(crate) fn twodelta_repeat_frame( - assignment: &[u16], - count: u16, -) -> io::Result { +pub(crate) fn twodelta_repeat_frame(assignment: &[u16], count: u16) -> io::Result { let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; Ok(BenEncodeFrame::from_run_lengths( pair, diff --git a/ben/src/io/writer/stream_writer/mod.rs b/ben/src/io/writer/stream_writer/mod.rs index 7c7b21b..c1cea66 100644 --- a/ben/src/io/writer/stream_writer/mod.rs +++ b/ben/src/io/writer/stream_writer/mod.rs @@ -1,8 +1,7 @@ -//! Unified writer for the BEN-stack stream layer (layer 3 — see -//! `docs/glossary.md`). +//! Unified writer for the BEN-stack stream layer (layer 3 — see `docs/glossary.md`). //! -//! Hides the wire-format choice (BEN bit-packed vs ben32 / XBEN columnar) -//! and the transport choice (plain vs xz-compressed) behind one type. +//! Hides the wire-format choice (BEN bit-packed vs ben32 / XBEN columnar) and the transport choice +//! (plain vs xz-compressed) behind one type. mod ben; mod xben; @@ -31,23 +30,20 @@ pub use crate::io::reader::BenWireFormat; use ben::BenState; use xben::XBenInner; -/// Writer for an encoded BEN-stack stream of samples (layer 3 — see -/// `docs/glossary.md`). +/// Writer for an encoded BEN-stack stream of samples (layer 3 — see `docs/glossary.md`). /// -/// Construct with [`BenStreamWriter::for_ben`] for plain BEN or -/// [`BenStreamWriter::for_xben`] for XBEN. `write_assignment` is available -/// on both arms; `write_frame` is plain-BEN-only and preserves one frame -/// boundary per call. Calling `write_frame` on an XBEN writer returns +/// Construct with [`BenStreamWriter::for_ben`] for plain BEN or [`BenStreamWriter::for_xben`] for +/// XBEN. `write_assignment` is available on both arms; `write_frame` is plain-BEN-only and +/// preserves one frame boundary per call. Calling `write_frame` on an XBEN writer returns /// `InvalidInput`. pub struct BenStreamWriter { - /// Wrapped in `Option` so [`Self::finish_into_inner`] can `take()` it - /// without partial-moving out of a `Drop` type. All other access - /// sites unwrap with `.expect("inner present")` — only the consuming - /// `finish_into_inner` ever leaves it `None`. + /// Wrapped in `Option` so [`Self::finish_into_inner`] can `take()` it without partial-moving + /// out of a `Drop` type. All other access sites unwrap with `.expect("inner present")` — only + /// the consuming `finish_into_inner` ever leaves it `None`. inner: Option>, state: WriterState, - /// Tracks whether any sample-writing or direct-ingest operation has - /// touched the writer. `ingest_ben_stream` requires this to be `false`. + /// Tracks whether any sample-writing or direct-ingest operation has touched the writer. + /// `ingest_ben_stream` requires this to be `false`. body_started: bool, } @@ -67,9 +63,8 @@ enum BenStreamInner { impl BenStreamWriter { /// Open a plain-BEN writer. Emits the BEN banner immediately. /// - /// On error, the underlying `writer` is dropped — no partial - /// `BenStreamWriter` is returned. The caller treats the output as - /// failed and discards. + /// On error, the underlying `writer` is dropped — no partial `BenStreamWriter` is returned. The + /// caller treats the output as failed and discards. pub fn for_ben(mut writer: W, variant: BenVariant) -> io::Result { writer.write_all(banner_for_variant(variant))?; Ok(Self { @@ -79,13 +74,9 @@ impl BenStreamWriter { }) } - /// Open an XBEN writer. Builds the xz encoder from `options` and emits - /// the BEN banner inside the compressed stream. - pub fn for_xben( - writer: W, - variant: BenVariant, - options: XzEncodeOptions, - ) -> io::Result { + /// Open an XBEN writer. Builds the xz encoder from `options` and emits the BEN banner inside + /// the compressed stream. + pub fn for_xben(writer: W, variant: BenVariant, options: XzEncodeOptions) -> io::Result { let n_cpus = resolve_threads(options.n_threads); let level = options.compression_level.unwrap_or(9).min(9); let mt: Stream = build_mt_stream(n_cpus, level, options.block_size)?; @@ -93,10 +84,9 @@ impl BenStreamWriter { Self::for_xben_with_encoder(encoder, variant, Some(options.twodelta_chunk_size)) } - /// Open an XBEN writer around an already-built xz encoder. Used by codec - /// plumbing that constructs encoders explicitly. The TwoDelta chunk - /// size is passed independently because compression options have - /// already been consumed building the encoder; `None` means default. + /// Open an XBEN writer around an already-built xz encoder. Used by codec plumbing that + /// constructs encoders explicitly. The TwoDelta chunk size is passed independently because + /// compression options have already been consumed building the encoder; `None` means default. pub(crate) fn for_xben_with_encoder( mut encoder: XzEncoder, variant: BenVariant, @@ -131,14 +121,15 @@ impl BenStreamWriter { } } - /// Encode one assignment vector. Count-capable formats buffer - /// adjacent-equal assignments into counted frames; XBEN-Standard writes - /// each assignment immediately, and Standard BEN expands buffered - /// counts into one-sample frames on flush. + /// Encode one assignment vector. Count-capable formats buffer adjacent-equal assignments into + /// counted frames; XBEN-Standard writes each assignment immediately, and Standard BEN expands + /// buffered counts into one-sample frames on flush. pub fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { match self.state { WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { - return Err(invalid_input("writer is not in a state that accepts samples")); + return Err(invalid_input( + "writer is not in a state that accepts samples", + )); } WriterState::Open => {} } @@ -154,16 +145,17 @@ impl BenStreamWriter { result } - /// Plain-BEN only: encode one assignment vector with a caller-supplied - /// count. MkvChain/TwoDelta emit one counted frame; Standard expands - /// `count` into one-sample frames. + /// Plain-BEN only: encode one assignment vector with a caller-supplied count. MkvChain/TwoDelta + /// emit one counted frame; Standard expands `count` into one-sample frames. /// - /// Guard order: writer-state, then mode, then zero-count no-op, then - /// the stateful flush/encode path. + /// Guard order: writer-state, then mode, then zero-count no-op, then the stateful flush/encode + /// path. pub fn write_frame(&mut self, assignment: Vec, count: u16) -> io::Result<()> { match self.state { WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { - return Err(invalid_input("writer is not in a state that accepts frames")); + return Err(invalid_input( + "writer is not in a state that accepts frames", + )); } WriterState::Open => {} } @@ -189,7 +181,9 @@ impl BenStreamWriter { pub fn write_json_value(&mut self, data: Value) -> io::Result<()> { match self.state { WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { - return Err(invalid_input("writer is not in a state that accepts samples")); + return Err(invalid_input( + "writer is not in a state that accepts samples", + )); } WriterState::Open => {} } @@ -207,9 +201,8 @@ impl BenStreamWriter { result } - /// Crate-private XBEN-only direct ingest. Fresh-writer-only and terminal - /// for sample writes: on success the writer transitions to `BodyClosed` - /// and only `finish()` remains valid. + /// Crate-private XBEN-only direct ingest. Fresh-writer-only and terminal for sample writes: on + /// success the writer transitions to `BodyClosed` and only `finish()` remains valid. pub(crate) fn ingest_ben_stream(&mut self, reader: impl BufRead) -> io::Result<()> { match self.state { WriterState::Complete | WriterState::Failed | WriterState::BodyClosed => { @@ -245,10 +238,9 @@ impl BenStreamWriter { } } - /// Flush buffered BEN/XBEN state and finalize the underlying compressed - /// stream when present. Valid from `Open` and `BodyClosed`. Repeated - /// `finish()` after success returns `Ok(())`. Once finalization enters - /// the stateful path, any encode/writer/encoder error transitions the + /// Flush buffered BEN/XBEN state and finalize the underlying compressed stream when present. + /// Valid from `Open` and `BodyClosed`. Repeated `finish()` after success returns `Ok(())`. Once + /// finalization enters the stateful path, any encode/writer/encoder error transitions the /// writer to `Failed`; subsequent calls return `InvalidInput`. pub fn finish(&mut self) -> io::Result<()> { match self.state { @@ -292,17 +284,14 @@ impl BenStreamWriter { } } - /// Consume the writer, flush any buffered state, finalize the - /// underlying compressed stream when present (XBEN), and return the - /// underlying `W`. + /// Consume the writer, flush any buffered state, finalize the underlying compressed stream when + /// present (XBEN), and return the underlying `W`. /// - /// Unlike `std::io::BufWriter::into_inner`, this method's name is - /// intentionally `finish_into_inner` because errors from the BEN - /// flush or the consuming `XzEncoder::finish()` can still lose - /// access to the inner writer. Returns `InvalidInput` if the writer - /// is in `Failed`. Accepted from `Open`, `BodyClosed`, and - /// `Complete`; the `Complete` path simply extracts the inner writer - /// after prior finalization. + /// Unlike `std::io::BufWriter::into_inner`, this method's name is intentionally + /// `finish_into_inner` because errors from the BEN flush or the consuming `XzEncoder::finish()` + /// can still lose access to the inner writer. Returns `InvalidInput` if the writer is in + /// `Failed`. Accepted from `Open`, `BodyClosed`, and `Complete`; the `Complete` path simply + /// extracts the inner writer after prior finalization. pub fn finish_into_inner(mut self) -> io::Result { let state = self.state; match state { @@ -330,8 +319,7 @@ impl BenStreamWriter { impl Drop for BenStreamWriter { fn drop(&mut self) { - if self.inner.is_some() - && matches!(self.state, WriterState::Open | WriterState::BodyClosed) + if self.inner.is_some() && matches!(self.state, WriterState::Open | WriterState::BodyClosed) { let _ = self.finish(); } diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index 3e4655c..4698fad 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -72,7 +72,11 @@ impl XBenState { } impl XBenInner { - pub(super) fn new(encoder: XzEncoder, variant: BenVariant, twodelta_chunk_size: usize) -> Self { + pub(super) fn new( + encoder: XzEncoder, + variant: BenVariant, + twodelta_chunk_size: usize, + ) -> Self { Self { encoder, state: XBenState::new(variant, twodelta_chunk_size), @@ -222,8 +226,8 @@ impl XBenInner { } } - /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without - /// materializing full assignment vectors. + /// Translate a BEN TwoDelta stream directly to XBEN TwoDelta without materializing full + /// assignment vectors. fn translate_ben_twodelta_to_xben(&mut self, mut reader: impl Read) -> io::Result<()> { let chunk_size = match &self.state { XBenState::TwoDelta { @@ -307,8 +311,7 @@ impl XBenInner { /// Crate-private direct ingest entry point. /// - /// Standard/MkvChain accept bannered or bannerless input; TwoDelta - /// requires a banner. + /// Standard/MkvChain accept bannered or bannerless input; TwoDelta requires a banner. pub(super) fn ingest_ben_stream(&mut self, mut reader: impl BufRead) -> io::Result<()> { let peek = reader.fill_buf()?; let has_banner = peek.len() >= BANNER_LEN && has_known_banner_prefix(peek); diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 301a892..19112ca 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -4,8 +4,8 @@ use crate::BenVariant; use std::io::Cursor; use xz2::write::XzEncoder; -/// Build a `BenStreamWriter` over an explicit single-thread XZ encoder so -/// the resulting xben byte stream is deterministic and small. +/// Build a `BenStreamWriter` over an explicit single-thread XZ encoder so the resulting xben byte +/// stream is deterministic and small. fn build_xben_writer<'a>( out: &'a mut Vec, variant: BenVariant, @@ -192,8 +192,8 @@ fn writer_twodelta_u16_max_value_in_assignment() { #[test] fn ben_writer_twodelta_repeat_frame_via_u16max_overflow() { - // Assignment with 3 distinct values exercises the `continue` skip path - // inside `twodelta_repeat_frame` for values outside the picked pair. + // Assignment with 3 distinct values exercises the `continue` skip path inside + // `twodelta_repeat_frame` for values outside the picked pair. let assign = vec![1u16, 2, 3, 1, 2]; let n = u16::MAX as usize + 2; // 65537: triggers overflow → repeat frame @@ -373,8 +373,8 @@ fn writer_twodelta_stress_many_unique_deltas() { #[test] fn writer_twodelta_anchor_count_overflow_u16max() { - // Use 3 distinct values to exercise the `continue` skip in - // twodelta_repeat_buffered_frame for values outside the picked pair. + // Use 3 distinct values to exercise the `continue` skip in twodelta_repeat_buffered_frame for + // values outside the picked pair. let assign = vec![1u16, 2, 3, 1, 2]; let n = u16::MAX as usize + 2; // 65537 — triggers the overflow branch @@ -451,8 +451,8 @@ fn writer_translate_ben_twodelta_chunk_flush() { #[test] fn xz_writer_twodelta_too_many_ids_propagates_on_write() { - // Writing a third assignment that changes 3 distinct IDs errors at the - // TwoDelta encode boundary. + // Writing a third assignment that changes 3 distinct IDs errors at the TwoDelta encode + // boundary. let anchor = vec![1u16, 1, 2, 2]; let invalid = vec![2u16, 3, 1, 3]; // 3 distinct changing ids let mut xben = Vec::new(); @@ -486,11 +486,13 @@ fn writer_mkv_count_overflow_u16max() { #[test] fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { - use super::stream_writer::test_helpers::{twodelta_repeat_buffered_frame, twodelta_repeat_frame}; + use super::stream_writer::test_helpers::{ + twodelta_repeat_buffered_frame, twodelta_repeat_frame, + }; use std::io; - // All-identical-value assignment with 65536 elements: the pair-position - // run reaches u16::MAX and the encoder must error. + // All-identical-value assignment with 65536 elements: the pair-position run reaches u16::MAX + // and the encoder must error. let assign = vec![1u16; 65536]; let err = twodelta_repeat_frame(&assign, 1).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidInput); @@ -507,9 +509,9 @@ fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { fn translate_twodelta_non_eof_read_error_propagates() { use std::io::{self, Read}; - // ingest_ben_stream in TwoDelta mode calls translate_ben_twodelta_to_xben. - // After reading the anchor frame it loops reading delta frames; a - // non-EOF error on pair_a (first u16 read in the loop) must propagate. + // ingest_ben_stream in TwoDelta mode calls translate_ben_twodelta_to_xben. After reading the + // anchor frame it loops reading delta frames; a non-EOF error on pair_a (first u16 read in the + // loop) must propagate. let mut xben = Vec::new(); let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); @@ -578,9 +580,8 @@ fn ben_write_frame_then_write_assignment_mixed_mkv() { #[test] fn ben_write_frame_zero_count_is_noop_and_does_not_flush() { - // write_assignment(a); write_frame(b, 0); write_assignment(a) should - // act like two adjacent write_assignment(a) calls — no inserted - // frame boundary. + // write_assignment(a); write_frame(b, 0); write_assignment(a) should act like two adjacent + // write_assignment(a) calls — no inserted frame boundary. let a = vec![1u16, 2, 3]; let b = vec![4u16, 5, 6]; @@ -627,8 +628,8 @@ fn ben_twodelta_write_frame_updates_previous_assignment_for_next_delta() { w.write_assignment(b.clone()).unwrap(); w.finish().unwrap(); } - // Round-trip must reproduce the inputs, which proves the delta against - // the emitted anchor was encoded correctly. + // Round-trip must reproduce the inputs, which proves the delta against the emitted anchor was + // encoded correctly. let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let mut samples: Vec> = Vec::new(); reader @@ -689,8 +690,7 @@ fn write_methods_after_finish_return_invalid_input() { #[test] fn write_frame_after_finish_with_zero_count_still_returns_invalid_input() { - // Pin guard ordering: finished/wrong-mode checks happen before the - // zero-count no-op. + // Pin guard ordering: finished/wrong-mode checks happen before the zero-count no-op. let mut ben = Vec::new(); let mut w = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); w.finish().unwrap(); @@ -767,10 +767,9 @@ fn ingest_ben_stream_rejects_ben_mode_writer() { #[test] fn ben_writer_failed_state_after_underlying_writer_error() { - // The banner write happens during construction; constructor failure - // bypasses WriterState entirely. To exercise the post-construction - // poisoning path we wrap a buffer that accepts only the 17 banner - // bytes and errors on subsequent writes. + // The banner write happens during construction; constructor failure bypasses WriterState + // entirely. To exercise the post-construction poisoning path we wrap a buffer that accepts only + // the 17 banner bytes and errors on subsequent writes. struct FailAfterN { buf: Vec, n: usize, @@ -798,8 +797,8 @@ fn ben_writer_failed_state_after_underlying_writer_error() { .unwrap(); // First call buffers the assignment as pending; no IO yet. w.write_assignment(vec![1u16, 2, 3]).unwrap(); - // Second call with a different assignment triggers a flush, which - // must fail and poison the writer. + // Second call with a different assignment triggers a flush, which must fail and poison the + // writer. let err = w.write_assignment(vec![4u16, 5, 6]).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); let err = w.write_assignment(vec![1u16, 2, 3]).unwrap_err(); diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 3e02226..3437990 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -6,12 +6,11 @@ pub(crate) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; /// Default number of delta frames per columnar chunk in XBEN TwoDelta. pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; -/// Walk a TwoDelta repeat-eligible assignment and emit the `(pair, run_lengths)` -/// describing it. +/// Walk a TwoDelta repeat-eligible assignment and emit the `(pair, run_lengths)` describing it. /// -/// Used by both the BEN and XBEN writers to construct the body of a TwoDelta -/// "repeat" frame: each writer wraps the result in its own frame type. Returns -/// an `InvalidInput` error if any run exceeds `u16::MAX` in length. +/// Used by both the BEN and XBEN writers to construct the body of a TwoDelta "repeat" frame: each +/// writer wraps the result in its own frame type. Returns an `InvalidInput` error if any run +/// exceeds `u16::MAX` in length. pub(crate) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16), Vec)> { let first = assignment.first().copied().unwrap_or(0); let second = assignment diff --git a/ben/src/io/writer/utils.rs b/ben/src/io/writer/utils.rs index c4d8c01..2e5e497 100644 --- a/ben/src/io/writer/utils.rs +++ b/ben/src/io/writer/utils.rs @@ -12,8 +12,8 @@ use std::io::{self, Result}; /// /// # Returns /// -/// Returns a `Vec` of assignment values, or an error if the field is -/// missing, not an array, or contains values that do not fit in a `u16`. +/// Returns a `Vec` of assignment values, or an error if the field is missing, not an array, or +/// contains values that do not fit in a `u16`. pub(crate) fn parse_json_assignment(data: Value) -> Result> { let assign_vec = data["assignment"].as_array().ok_or_else(|| { io::Error::new( @@ -47,8 +47,8 @@ pub(crate) fn parse_json_assignment(data: Value) -> Result> { /// Encode an assignment vector as a full XBEN two-delta frame. /// -/// The frame begins with a full-frame tag byte followed by RLE-encoded -/// assignment runs in big-endian format. +/// The frame begins with a full-frame tag byte followed by RLE-encoded assignment runs in +/// big-endian format. /// /// # Arguments /// diff --git a/ben/src/json/graph/errors.rs b/ben/src/json/graph/errors.rs index 445e03a..5718a3e 100644 --- a/ben/src/json/graph/errors.rs +++ b/ben/src/json/graph/errors.rs @@ -1,8 +1,7 @@ use serde_json::Value; use std::fmt; -/// Errors that can occur when converting between [`NxGraphAdjFormat`] and -/// [`PetxGraph`]. +/// Errors that can occur when converting between [`NxGraphAdjFormat`] and [`PetxGraph`]. #[derive(Debug)] pub(crate) enum NxPetgraphError { /// The `directed` flag on the input does not match the target graph type. diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index 4eea7fd..b25cf38 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -8,18 +8,15 @@ use std::cmp::Reverse; use std::collections::HashSet; use std::time::Duration; -/// Per-phase progress tracker for MLC, with one spinner line per recursion -/// depth. +/// Per-phase progress tracker for MLC, with one spinner line per recursion depth. /// -/// Phase 1 (depth 0) processes the original nodes; phase 2 processes the -/// level-1 clusters produced by phase 1; and so on. Bars are added lazily -/// the first time a given depth is reached, and each bar's total grows as -/// new work at that depth is discovered (e.g. when the next top-level +/// Phase 1 (depth 0) processes the original nodes; phase 2 processes the level-1 clusters produced +/// by phase 1; and so on. Bars are added lazily the first time a given depth is reached, and each +/// bar's total grows as new work at that depth is discovered (e.g. when the next top-level /// component recurses and introduces more coarse nodes). /// -/// Spinners auto-hide when stderr is not a terminal (e.g. under `cargo -/// test` or when output is piped), so no config is needed for CI/test -/// environments. +/// Spinners auto-hide when stderr is not a terminal (e.g. under `cargo test` or when output is +/// piped), so no config is needed for CI/test environments. struct MlcProgress { multi: MultiProgress, bars: Vec, @@ -38,8 +35,7 @@ impl MlcProgress { } } - /// Make sure a bar exists for `depth`, creating any intermediate bars - /// that don't exist yet. + /// Make sure a bar exists for `depth`, creating any intermediate bars that don't exist yet. fn ensure_depth(&mut self, depth: usize) { while self.bars.len() <= depth { let bar = self.multi.add(ProgressBar::new_spinner()); @@ -108,19 +104,19 @@ impl MlcProgress { /// Compute a multilevel cluster ordering and apply it to the graph in place. /// -/// The graph is reordered so that nodes which are topologically close end up -/// at adjacent indices. Each connected component is ordered independently, -/// and components are sorted by their minimum node index. +/// The graph is reordered so that nodes which are topologically close end up at adjacent indices. +/// Each connected component is ordered independently, and components are sorted by their minimum +/// node index. /// /// Arguments: /// -/// - `petx_graph`: The graph to reorder in place. Only edge topology is -/// considered; node and edge attributes are preserved but relocated. +/// - `petx_graph`: The graph to reorder in place. Only edge topology is considered; node and edge +/// attributes are preserved but relocated. /// /// Returns: /// -/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` -/// the node occupied before reordering. +/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` the node occupied +/// before reordering. pub(super) fn apply_multi_level_clustering(petx_graph: &mut PetxGraph) -> Vec where Ty: petgraph::EdgeType, @@ -134,27 +130,27 @@ where order } -/// Recursively order each connected component via multilevel clustering, then -/// concatenate the results. +/// Recursively order each connected component via multilevel clustering, then concatenate the +/// results. /// -/// Components are sorted by decreasing size (ties broken by minimum label) -/// so that larger components occupy the beginning of the output. Each -/// component is ordered independently by [`mlc_component`]. +/// Components are sorted by decreasing size (ties broken by minimum label) so that larger +/// components occupy the beginning of the output. Each component is ordered independently by +/// [`mlc_component`]. /// /// # Arguments /// -/// * `graph` - The input graph to order. Generic over node/edge weights and -/// edge type so it also works with the coarse graph during recursion. -/// * `labels` - A per-node label vector used for tie-breaking when choosing -/// seeds and sorting neighbors. Indexed by `NodeIndex::index()`. +/// * `graph` - The input graph to order. Generic over node/edge weights and edge type so it also +/// works with the coarse graph during recursion. +/// * `labels` - A per-node label vector used for tie-breaking when choosing seeds and sorting +/// neighbors. Indexed by `NodeIndex::index()`. /// * `progress` - Progress tracker for the multi-phase spinner display. -/// * `depth` - Recursion depth (0 at the top level). Used to route progress -/// updates to the correct phase bar. +/// * `depth` - Recursion depth (0 at the top level). Used to route progress updates to the correct +/// phase bar. /// /// # Returns /// -/// A permutation vector where `order[new_index]` is the `NodeIndex` of the -/// node that should occupy position `new_index`. +/// A permutation vector where `order[new_index]` is the `NodeIndex` of the node that should occupy +/// position `new_index`. fn mlc_order_inner( graph: &Graph, labels: &[usize], @@ -186,34 +182,29 @@ where order } -/// Order a single connected component by seed-expansion clustering plus -/// recursive coarsening. +/// Order a single connected component by seed-expansion clustering plus recursive coarsening. /// /// Steps: /// /// 1. Singleton components return immediately. /// 2. [`greedy_cluster_partition`] carves the component into stars. -/// 3. Each cluster is re-ordered internally via [`rcm_component`] on its -/// induced subgraph, so peripheral leaves bracket the cluster and the -/// high-degree seed sits in the interior. -/// 4. If the partition returns a single cluster (a star that covers the -/// whole component), that RCM-ordered cluster is the final order. -/// 5. Otherwise a coarse graph is built with one node per cluster, and -/// [`mlc_order_inner`] recurses on it to decide the order in which -/// clusters are emitted. The recursion terminates when each coarse -/// component collapses to a single cluster. -/// 6. The final order is produced by unrolling: emit clusters in the -/// recursive coarse order, each cluster in its RCM-ordered form. +/// 3. Each cluster is re-ordered internally via [`rcm_component`] on its induced subgraph, so +/// peripheral leaves bracket the cluster and the high-degree seed sits in the interior. +/// 4. If the partition returns a single cluster (a star that covers the whole component), that +/// RCM-ordered cluster is the final order. +/// 5. Otherwise a coarse graph is built with one node per cluster, and [`mlc_order_inner`] recurses +/// on it to decide the order in which clusters are emitted. The recursion terminates when each +/// coarse component collapses to a single cluster. +/// 6. The final order is produced by unrolling: emit clusters in the recursive coarse order, each +/// cluster in its RCM-ordered form. /// /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by -/// `NodeIndex::index()`. +/// * `labels` - Per-node labels for tie-breaking, indexed by `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to order. /// * `progress` - Progress tracker for the multi-phase spinner display. -/// * `depth` - Recursion depth; routes progress updates to the correct -/// phase bar. +/// * `depth` - Recursion depth; routes progress updates to the correct phase bar. /// /// # Returns /// @@ -233,16 +224,14 @@ where return vec![component[0]]; } - // `greedy_cluster_partition` ticks this depth's progress per cluster, - // so every node in `component` contributes to phase `depth+1` exactly - // once. + // `greedy_cluster_partition` ticks this depth's progress per cluster, so every node in + // `component` contributes to phase `depth+1` exactly once. let mut clusters = greedy_cluster_partition(graph, labels, component, progress, depth); - // Reorder each cluster internally via RCM on the subgraph induced by - // its members. This puts peripheral (degree-1) nodes at both ends of - // the cluster and the high-degree seed near the middle/end, which - // keeps cluster boundaries "loose" and avoids stranding the most- - // connected node next to the previous cluster. + // Reorder each cluster internally via RCM on the subgraph induced by its members. This puts + // peripheral (degree-1) nodes at both ends of the cluster and the high-degree seed near the + // middle/end, which keeps cluster boundaries "loose" and avoids stranding the most- connected + // node next to the previous cluster. for cluster in clusters.iter_mut() { *cluster = rcm_component(graph, labels, cluster); } @@ -252,8 +241,8 @@ where return clusters.into_iter().next().unwrap(); } - // Multi-cluster case: recurse on the coarse graph to decide the order - // in which the clusters appear. + // Multi-cluster case: recurse on the coarse graph to decide the order in which the clusters + // appear. let (coarse_graph, coarse_labels) = build_coarse_graph(graph, labels, &clusters); let coarse_order = mlc_order_inner(&coarse_graph, &coarse_labels, progress, depth + 1); @@ -264,35 +253,29 @@ where order } -/// Partition a component into star-shaped clusters using a greedy -/// seed-expansion strategy. +/// Partition a component into star-shaped clusters using a greedy seed-expansion strategy. /// -/// At each step, the lowest-degree unassigned node (ties broken by label) is -/// chosen as a seed, and the seed together with all of its unassigned -/// neighbors becomes the next cluster. Local degrees are then decremented -/// for every unassigned node adjacent to a newly-assigned one, so subsequent +/// At each step, the lowest-degree unassigned node (ties broken by label) is chosen as a seed, and +/// the seed together with all of its unassigned neighbors becomes the next cluster. Local degrees +/// are then decremented for every unassigned node adjacent to a newly-assigned one, so subsequent /// seed selections reflect the residual graph. /// -/// Only cluster *membership* is meaningful here; the internal order of each -/// returned cluster is not final and is expected to be overwritten by the -/// caller (e.g. via [`rcm_component`]). +/// Only cluster *membership* is meaningful here; the internal order of each returned cluster is not +/// final and is expected to be overwritten by the caller (e.g. via [`rcm_component`]). /// /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by -/// `NodeIndex::index()`. +/// * `labels` - Per-node labels for tie-breaking, indexed by `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to partition. -/// * `progress` - Progress tracker; `depth`'s done counter is advanced by -/// each cluster's size as the cluster is formed, so the caller's phase -/// bar fills up gradually during large partitions. -/// * `depth` - Recursion depth of the caller, used to select the phase -/// bar to update. +/// * `progress` - Progress tracker; `depth`'s done counter is advanced by each cluster's size as +/// the cluster is formed, so the caller's phase bar fills up gradually during large partitions. +/// * `depth` - Recursion depth of the caller, used to select the phase bar to update. /// /// # Returns /// -/// A vector of clusters, where each cluster is a vector of `NodeIndex` -/// values. Every node in `component` appears in exactly one cluster. +/// A vector of clusters, where each cluster is a vector of `NodeIndex` values. Every node in +/// `component` appears in exactly one cluster. fn greedy_cluster_partition( graph: &Graph, labels: &[usize], @@ -317,9 +300,8 @@ where let mut cluster = vec![seed]; assigned[seed.index()] = true; - // Cluster membership is seed + every unassigned in-component - // neighbor. Internal order here is irrelevant: the caller - // (`mlc_component`) overwrites it with an RCM ordering on the + // Cluster membership is seed + every unassigned in-component neighbor. Internal order here + // is irrelevant: the caller (`mlc_component`) overwrites it with an RCM ordering on the // cluster's induced subgraph. for neighbor in graph.neighbors(seed) { if component_set.contains(&neighbor) && !assigned[neighbor.index()] { @@ -347,25 +329,24 @@ where /// Build a coarse graph where each cluster is contracted into a single node. /// -/// The coarse graph is always undirected: an edge exists between two coarse -/// nodes whenever any original-graph edge connects their clusters. Each coarse -/// node's label is the minimum original label among its cluster members. +/// The coarse graph is always undirected: an edge exists between two coarse nodes whenever any +/// original-graph edge connects their clusters. Each coarse node's label is the minimum original +/// label among its cluster members. /// /// # Arguments /// /// * `graph` - The full graph containing the original edges. -/// * `labels` - Per-node labels for the original graph, indexed by -/// `NodeIndex::index()`. -/// * `clusters` - The partition produced by [`greedy_cluster_partition`]. -/// Cluster `i` maps to coarse node `i`. +/// * `labels` - Per-node labels for the original graph, indexed by `NodeIndex::index()`. +/// * `clusters` - The partition produced by [`greedy_cluster_partition`]. Cluster `i` maps to +/// coarse node `i`. /// /// # Returns /// /// A tuple of: -/// * The coarse `Graph<(), (), Undirected>` with one node per cluster and -/// one edge per inter-cluster connection. -/// * A label vector for the coarse graph (one entry per cluster), where -/// each label is the minimum original label in that cluster. +/// * The coarse `Graph<(), (), Undirected>` with one node per cluster and one edge per +/// inter-cluster connection. +/// * A label vector for the coarse graph (one entry per cluster), where each label is the minimum +/// original label in that cluster. fn build_coarse_graph( graph: &Graph, labels: &[usize], diff --git a/ben/src/json/graph/mod.rs b/ben/src/json/graph/mod.rs index e5bc88a..493f8ca 100644 --- a/ben/src/json/graph/mod.rs +++ b/ben/src/json/graph/mod.rs @@ -26,16 +26,14 @@ pub enum GraphOrderingMethod { /// Sorts a JSON-formatted NetworkX graph file by a node attribute. /// -/// Reads a NetworkX adjacency-format JSON graph, reorders nodes so that they -/// are sorted by the given attribute key, and writes the reordered graph back -/// as JSON. +/// Reads a NetworkX adjacency-format JSON graph, reorders nodes so that they are sorted by the +/// given attribute key, and writes the reordered graph back as JSON. /// /// # Arguments /// /// * `reader` - A source of JSON bytes in NetworkX adjacency format. /// * `writer` - Destination for the reordered JSON output. -/// * `key` - The node attribute name to sort by. Use `"id"` to sort by the -/// NetworkX node id. +/// * `key` - The node attribute name to sort by. Use `"id"` to sort by the NetworkX node id. /// /// # Returns /// @@ -61,9 +59,8 @@ pub fn sort_json_file_by_key( /// Reorder a JSON-formatted NetworkX graph file using a topology-based method. /// -/// Reads a NetworkX adjacency-format JSON graph, reorders nodes using the -/// specified graph ordering algorithm, and writes the reordered graph back -/// as JSON. +/// Reads a NetworkX adjacency-format JSON graph, reorders nodes using the specified graph ordering +/// algorithm, and writes the reordered graph back as JSON. /// /// # Arguments /// @@ -102,8 +99,8 @@ pub fn sort_json_file_by_ordering( /// /// # Returns /// -/// The permutation that was applied: `order[new_index]` is the `NodeIndex` -/// the node occupied before reordering. +/// The permutation that was applied: `order[new_index]` is the `NodeIndex` the node occupied before +/// reordering. fn run_ordering_method( petx: &mut PetxGraph, method: GraphOrderingMethod, @@ -118,8 +115,7 @@ fn run_ordering_method( /// /// # Arguments /// -/// * `order` - The permutation that was applied: `order[new_index]` is the -/// old `NodeIndex`. +/// * `order` - The permutation that was applied: `order[new_index]` is the old `NodeIndex`. /// /// # Returns /// @@ -148,8 +144,7 @@ fn write_nx_graph(mut writer: W, nx_graph: &NxGraphAdjFormat) -> io::R writer.write_all(rendered.as_bytes()) } -/// Convert an [`NxPetgraphError`] into a [`std::io::Error`] with -/// [`ErrorKind::InvalidData`]. +/// Convert an [`NxPetgraphError`] into a [`std::io::Error`] with [`ErrorKind::InvalidData`]. /// /// # Arguments /// @@ -162,8 +157,8 @@ fn nx_err(e: NxPetgraphError) -> Error { Error::new(ErrorKind::InvalidData, e) } -/// Convert an [`NxGraphAdjFormat`] into a directed [`PetxGraph`], apply an -/// in-place reordering operation, and convert back to JSON adjacency form. +/// Convert an [`NxGraphAdjFormat`] into a directed [`PetxGraph`], apply an in-place reordering +/// operation, and convert back to JSON adjacency form. fn reorder_directed( nx_graph: NxGraphAdjFormat, op: F, @@ -177,8 +172,8 @@ where Ok((result, order)) } -/// Convert an [`NxGraphAdjFormat`] into an undirected [`PetxGraph`], apply an -/// in-place reordering operation, and convert back to JSON adjacency form. +/// Convert an [`NxGraphAdjFormat`] into an undirected [`PetxGraph`], apply an in-place reordering +/// operation, and convert back to JSON adjacency form. fn reorder_undirected( nx_graph: NxGraphAdjFormat, op: F, diff --git a/ben/src/json/graph/nx_formats.rs b/ben/src/json/graph/nx_formats.rs index 5b002e9..3efbcf3 100644 --- a/ben/src/json/graph/nx_formats.rs +++ b/ben/src/json/graph/nx_formats.rs @@ -4,10 +4,9 @@ use std::collections::BTreeMap; /// A NetworkX graph in adjacency-format JSON. /// -/// This is the Rust representation of the JSON produced by -/// `networkx.adjacency_data()`. All fields use `#[serde(default)]` so that -/// inputs which omit optional keys (e.g. `"directed"`) still deserialize -/// successfully. +/// This is the Rust representation of the JSON produced by `networkx.adjacency_data()`. All fields +/// use `#[serde(default)]` so that inputs which omit optional keys (e.g. `"directed"`) still +/// deserialize successfully. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub(crate) struct NxGraphAdjFormat { /// Whether the graph is directed. Defaults to `false`. @@ -23,8 +22,8 @@ pub(crate) struct NxGraphAdjFormat { /// The list of nodes, each carrying an `id` and arbitrary attributes. #[serde(default)] pub nodes: Vec, - /// Adjacency lists parallel to `nodes`. `adjacency[i]` lists the - /// neighbors (and edge attributes) of `nodes[i]`. + /// Adjacency lists parallel to `nodes`. `adjacency[i]` lists the neighbors (and edge + /// attributes) of `nodes[i]`. #[serde(default)] pub adjacency: Vec>, } @@ -48,8 +47,7 @@ pub(crate) struct NxAdjEntry { #[serde(rename = "id")] pub id: Value, - /// The edge key, present only in multigraphs. Omitted from JSON when - /// `None`. + /// The edge key, present only in multigraphs. Omitted from JSON when `None`. #[serde(default, skip_serializing_if = "Option::is_none")] pub key: Option, diff --git a/ben/src/json/graph/petxgraph/mod.rs b/ben/src/json/graph/petxgraph/mod.rs index c78a99c..b3291b1 100644 --- a/ben/src/json/graph/petxgraph/mod.rs +++ b/ben/src/json/graph/petxgraph/mod.rs @@ -14,22 +14,22 @@ use std::collections::BTreeMap; /// A single node in a [`PetxGraph`]. /// -/// All NetworkX node attributes are stored in `attrs`, including the original -/// node id under the reserved key `"__networkx_id__"`. +/// All NetworkX node attributes are stored in `attrs`, including the original node id under the +/// reserved key `"__networkx_id__"`. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub(crate) struct PetxNode { - /// Node attributes. Always contains `"__networkx_id__"` holding the - /// original (or current) NetworkX node id as a [`Value`]. + /// Node attributes. Always contains `"__networkx_id__"` holding the original (or current) + /// NetworkX node id as a [`Value`]. pub attrs: BTreeMap, } /// A petgraph-backed graph that mirrors a NetworkX adjacency-format graph. /// -/// The type parameter `Ty` is either [`Directed`] or [`Undirected`] and -/// determines the edge semantics of the underlying [`Graph`]. +/// The type parameter `Ty` is either [`Directed`] or [`Undirected`] and determines the edge +/// semantics of the underlying [`Graph`]. /// -/// Graph-level attributes (the `"graph"` array in the NetworkX JSON) are -/// stored alongside the petgraph [`Graph`] so they survive roundtrips. +/// Graph-level attributes (the `"graph"` array in the NetworkX JSON) are stored alongside the +/// petgraph [`Graph`] so they survive roundtrips. #[derive(Debug, Clone)] pub(crate) struct PetxGraph where @@ -37,8 +37,8 @@ where { /// Graph-level key/value attributes from the NetworkX JSON `"graph"` field. pub graph_attrs: Vec<(String, Value)>, - /// The underlying petgraph graph. Nodes carry [`PetxNode`] weights and - /// edges carry [`NxAdjEntry`] weights. + /// The underlying petgraph graph. Nodes carry [`PetxNode`] weights and edges carry + /// [`NxAdjEntry`] weights. pub graph: Graph, } diff --git a/ben/src/json/graph/petxgraph/nx_convert.rs b/ben/src/json/graph/petxgraph/nx_convert.rs index b75e440..905cb09 100644 --- a/ben/src/json/graph/petxgraph/nx_convert.rs +++ b/ben/src/json/graph/petxgraph/nx_convert.rs @@ -8,8 +8,8 @@ use std::collections::{HashMap, HashSet}; /// Convert an [`NxNode`] into a [`PetxNode`]. /// -/// The node's `id` field is moved into the attribute map under the reserved -/// key `"__networkx_id__"` so it can be recovered later. +/// The node's `id` field is moved into the attribute map under the reserved key `"__networkx_id__"` +/// so it can be recovered later. /// /// # Arguments /// @@ -17,8 +17,7 @@ use std::collections::{HashMap, HashSet}; /// /// # Returns /// -/// A [`PetxNode`] whose `attrs` map contains all original attributes plus -/// `"__networkx_id__"`. +/// A [`PetxNode`] whose `attrs` map contains all original attributes plus `"__networkx_id__"`. pub(in crate::json::graph) fn nx_node_to_petx_node(nx_node: NxNode) -> PetxNode { let mut attrs = nx_node.attrs; attrs.insert("__networkx_id__".to_string(), nx_node.id); @@ -27,8 +26,8 @@ pub(in crate::json::graph) fn nx_node_to_petx_node(nx_node: NxNode) -> PetxNode /// Convert a [`PetxNode`] back into an [`NxNode`]. /// -/// The `"__networkx_id__"` entry is removed from the attribute map and -/// placed back into the `id` field. +/// The `"__networkx_id__"` entry is removed from the attribute map and placed back into the `id` +/// field. /// /// # Arguments /// @@ -40,8 +39,7 @@ pub(in crate::json::graph) fn nx_node_to_petx_node(nx_node: NxNode) -> PetxNode /// /// # Errors /// -/// Returns [`NxPetgraphError::Other`] if the node has no -/// `"__networkx_id__"` attribute. +/// Returns [`NxPetgraphError::Other`] if the node has no `"__networkx_id__"` attribute. pub(in crate::json::graph) fn petx_node_to_nx_node( petx_node: &PetxNode, ) -> Result { @@ -55,15 +53,13 @@ pub(in crate::json::graph) fn petx_node_to_nx_node( /// Build a [`PetxGraph`] from a parsed [`NxGraphAdjFormat`]. /// -/// Nodes are added in order and edges are extracted from the adjacency lists. -/// For undirected graphs, duplicate `(u,v)` / `(v,u)` entries are -/// deduplicated so each edge is stored only once. +/// Nodes are added in order and edges are extracted from the adjacency lists. For undirected +/// graphs, duplicate `(u,v)` / `(v,u)` entries are deduplicated so each edge is stored only once. /// /// # Arguments /// /// * `nx_graph` - The parsed NetworkX graph. Consumed by this function. -/// * `is_directed` - Whether the target graph should be directed. Must match -/// `nx_graph.directed`. +/// * `is_directed` - Whether the target graph should be directed. Must match `nx_graph.directed`. /// /// # Returns /// @@ -141,8 +137,8 @@ where .expect("__networkx_id__ always set by nx_node_to_petx_node"); // serde_json::Value is always serializable. - let source_key = serde_json::to_string(source_id) - .expect("serde_json::Value always serializes"); + let source_key = + serde_json::to_string(source_id).expect("serde_json::Value always serializes"); for edge in neighbors { let target_id = &edge.id; @@ -154,8 +150,8 @@ where graph.add_edge(source_idx, *target_idx, edge); } else { // serde_json::Value is always serializable. - let target_key = serde_json::to_string(target_id) - .expect("serde_json::Value always serializes"); + let target_key = + serde_json::to_string(target_id).expect("serde_json::Value always serializes"); let edge_key_str = edge .key @@ -180,8 +176,8 @@ where /// Check whether a graph contains parallel (multi) edges. /// -/// Two edges are considered parallel if they connect the same pair of -/// endpoints. For undirected graphs, `(u,v)` and `(v,u)` are the same pair. +/// Two edges are considered parallel if they connect the same pair of endpoints. For undirected +/// graphs, `(u,v)` and `(v,u)` are the same pair. /// /// # Arguments /// @@ -218,10 +214,9 @@ where /// Convert a [`PetxGraph`] back into an [`NxGraphAdjFormat`]. /// -/// Nodes are emitted in petgraph index order. For undirected graphs, each -/// edge appears in both endpoints' adjacency lists (except self-loops, -/// which appear only once). The `multigraph` flag is set automatically -/// based on whether parallel edges exist. +/// Nodes are emitted in petgraph index order. For undirected graphs, each edge appears in both +/// endpoints' adjacency lists (except self-loops, which appear only once). The `multigraph` flag is +/// set automatically based on whether parallel edges exist. /// /// # Arguments /// @@ -234,8 +229,7 @@ where /// /// # Errors /// -/// Returns [`NxPetgraphError::Other`] if any node is missing its -/// `"__networkx_id__"` attribute. +/// Returns [`NxPetgraphError::Other`] if any node is missing its `"__networkx_id__"` attribute. fn construct_networkx_from_petgraph( petx_graph: &PetxGraph, is_directed: bool, diff --git a/ben/src/json/graph/petxgraph/permutation.rs b/ben/src/json/graph/petxgraph/permutation.rs index 323d06e..d901d0b 100644 --- a/ben/src/json/graph/petxgraph/permutation.rs +++ b/ben/src/json/graph/petxgraph/permutation.rs @@ -4,22 +4,20 @@ use petgraph::graph::{Graph, NodeIndex}; use petgraph::visit::{EdgeRef, NodeIndexable}; use serde_json::Value; -/// Apply a node permutation to a `PetxGraph`, returning a new graph with nodes -/// reordered. +/// Apply a node permutation to a `PetxGraph`, returning a new graph with nodes reordered. /// /// Arguments: /// /// - `petx_graph`: The input graph to permute. -/// - `order`: A permutation where `order[new_index]` is the `NodeIndex` of the -/// node that should occupy position `new_index` in the output graph. Must be -/// a valid permutation of the graph's node indices. +/// - `order`: A permutation where `order[new_index]` is the `NodeIndex` of the node that should +/// occupy position `new_index` in the output graph. Must be a valid permutation of the graph's +/// node indices. /// /// Returns: /// -/// - A new `PetxGraph` with nodes in the specified order and edges remapped to -/// the new indices. Edge attributes (including `key` and `attrs`) are -/// preserved; the `NxAdjEntry::id` field is left as-is since -/// `construct_networkx_from_petgraph` overwrites it on export. +/// - A new `PetxGraph` with nodes in the specified order and edges remapped to the new indices. +/// Edge attributes (including `key` and `attrs`) are preserved; the `NxAdjEntry::id` field is +/// left as-is since `construct_networkx_from_petgraph` overwrites it on export. pub(in crate::json::graph) fn apply_permutation( petx_graph: &PetxGraph, order: &[NodeIndex], diff --git a/ben/src/json/graph/petxgraph/sort.rs b/ben/src/json/graph/petxgraph/sort.rs index d8c1c8c..262271c 100644 --- a/ben/src/json/graph/petxgraph/sort.rs +++ b/ben/src/json/graph/petxgraph/sort.rs @@ -6,8 +6,8 @@ use std::cmp::Ordering; /// Sort a `PetxGraph` by a node attribute and apply the permutation in place. /// -/// Nodes are ordered by the value of `key` in their attribute map, using -/// numeric comparison when possible and falling back to string comparison. +/// Nodes are ordered by the value of `key` in their attribute map, using numeric comparison when +/// possible and falling back to string comparison. /// /// Returns the permutation that was applied. pub(in crate::json::graph) fn sort_by_key( @@ -32,19 +32,17 @@ where /// Look up the sort attribute for a node. /// -/// The special key `"id"` is mapped to the internal `"__networkx_id__"` -/// attribute so callers can sort by the NetworkX node id. +/// The special key `"id"` is mapped to the internal `"__networkx_id__"` attribute so callers can +/// sort by the NetworkX node id. /// /// # Arguments /// /// * `node` - The node whose attribute is being looked up. -/// * `key` - The attribute name. `"id"` is treated as an alias for -/// `"__networkx_id__"`. +/// * `key` - The attribute name. `"id"` is treated as an alias for `"__networkx_id__"`. /// /// # Returns /// -/// A reference to the attribute [`Value`], or `None` if the attribute is -/// absent. +/// A reference to the attribute [`Value`], or `None` if the attribute is absent. fn get_sort_attr<'a>(node: &'a PetxNode, key: &str) -> Option<&'a Value> { if key == "id" { node.attrs.get("__networkx_id__") @@ -55,9 +53,9 @@ fn get_sort_attr<'a>(node: &'a PetxNode, key: &str) -> Option<&'a Value> { /// Compare two optional attribute values for sorting. /// -/// Values are compared numerically when both can be interpreted as `f64` -/// (covers integers, floats, and numeric strings). Otherwise they are -/// compared as strings. `None` is treated as the string `"null"`. +/// Values are compared numerically when both can be interpreted as `f64` (covers integers, floats, +/// and numeric strings). Otherwise they are compared as strings. `None` is treated as the string +/// `"null"`. /// /// # Arguments /// diff --git a/ben/src/json/graph/rcm.rs b/ben/src/json/graph/rcm.rs index 39235ab..8628823 100644 --- a/ben/src/json/graph/rcm.rs +++ b/ben/src/json/graph/rcm.rs @@ -6,8 +6,8 @@ use std::collections::{HashSet, VecDeque}; /// Compute a Reverse Cuthill-McKee ordering and apply it to the graph in place. /// -/// Each connected component is ordered independently via RCM, and components -/// are sorted by their minimum node index. The graph is then permuted in place. +/// Each connected component is ordered independently via RCM, and components are sorted by their +/// minimum node index. The graph is then permuted in place. /// /// Arguments: /// @@ -15,8 +15,8 @@ use std::collections::{HashSet, VecDeque}; /// /// Returns: /// -/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` -/// the node occupied before reordering. +/// - The permutation that was applied: `order[new_index]` is the `NodeIndex` the node occupied +/// before reordering. pub(super) fn apply_reverse_cuthill_mckee(petx_graph: &mut PetxGraph) -> Vec where Ty: petgraph::EdgeType, @@ -46,14 +46,13 @@ where /// Reverse Cuthill-McKee ordering for a single connected component. /// -/// Starts BFS from the minimum-degree node (ties broken by label), then -/// reverses the result to produce the RCM permutation. +/// Starts BFS from the minimum-degree node (ties broken by label), then reverses the result to +/// produce the RCM permutation. /// /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by -/// `NodeIndex::index()`. +/// * `labels` - Per-node labels for tie-breaking, indexed by `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to order. /// /// # Returns @@ -100,22 +99,19 @@ where /// Compute the degree of each component node restricted to the component. /// -/// For each node in `component`, counts how many of its neighbors are also -/// in the component. The result is indexed by `NodeIndex::index()`, so -/// entries for nodes outside the component are zero. +/// For each node in `component`, counts how many of its neighbors are also in the component. The +/// result is indexed by `NodeIndex::index()`, so entries for nodes outside the component are zero. /// /// # Arguments /// /// * `graph` - The full graph. -/// * `component_set` - A `HashSet` of the nodes in the component, used for -/// fast membership checks. +/// * `component_set` - A `HashSet` of the nodes in the component, used for fast membership checks. /// * `component` - The slice of `NodeIndex` values in the component. /// /// # Returns /// -/// A vector of length `graph.node_bound()` where -/// `result[node.index()]` is the number of neighbors of `node` that are in -/// the component, or `0` for nodes not in the component. +/// A vector of length `graph.node_bound()` where `result[node.index()]` is the number of neighbors +/// of `node` that are in the component, or `0` for nodes not in the component. pub(super) fn local_degree_in_component( graph: &Graph, component_set: &HashSet, diff --git a/ben/src/json/graph/tests/test_algos.rs b/ben/src/json/graph/tests/test_algos.rs index d88e267..5f933c9 100644 --- a/ben/src/json/graph/tests/test_algos.rs +++ b/ben/src/json/graph/tests/test_algos.rs @@ -324,7 +324,6 @@ fn test_sort_json_file_by_multi_level_cluster() { assert_eq!(output_json["nodes"].as_array().unwrap().len(), 4); } - #[test] fn test_sort_by_ordering_directed_rcm() { let input = r#"{ @@ -450,8 +449,8 @@ fn test_sort_json_file_by_key_mixed_numeric_and_string() { #[test] fn test_sort_json_file_by_key_missing_attribute_uses_null() { - // When a node lacks the sort key, compare_attr_values receives None - // which maps to the string "null" for comparison purposes. + // When a node lacks the sort key, compare_attr_values receives None which maps to the string + // "null" for comparison purposes. let input = r#"{ "nodes": [ {"id": 0, "rank": 5}, @@ -478,8 +477,8 @@ fn test_sort_json_file_by_key_missing_attribute_uses_null() { #[test] fn test_mlc_with_isolated_node() { - // A graph containing an isolated node (no edges) triggers the - // single-node-component early return in mlc_component. + // A graph containing an isolated node (no edges) triggers the single-node-component early + // return in mlc_component. let input = r#"{ "nodes": [ {"id": 0}, @@ -514,9 +513,9 @@ fn test_mlc_with_isolated_node() { #[test] fn test_sort_json_file_by_key_fips_string_ids() { - // Node IDs are FIPS codes stored as JSON strings ("360191010003" etc.). - // The mapping must use original positions (0-indexed) as keys, not the - // raw FIPS values, so that downstream BEN relabeling can index correctly. + // Node IDs are FIPS codes stored as JSON strings ("360191010003" etc.). The mapping must use + // original positions (0-indexed) as keys, not the raw FIPS values, so that downstream BEN + // relabeling can index correctly. let input = r#"{ "nodes": [ {"id": "360191010003", "rank": 30}, @@ -542,7 +541,7 @@ fn test_sort_json_file_by_key_fips_string_ids() { assert_eq!(mapping[&0], 2); // pos 0 (rank=30) → new pos 2 assert_eq!(mapping[&1], 0); // pos 1 (rank=10) → new pos 0 assert_eq!(mapping[&2], 1); // pos 2 (rank=20) → new pos 1 - // All new positions 0..N-1 are valid BEN array indices. + // All new positions 0..N-1 are valid BEN array indices. let mut new_positions: Vec = mapping.values().copied().collect(); new_positions.sort(); assert_eq!(new_positions, vec![0, 1, 2]); @@ -550,9 +549,9 @@ fn test_sort_json_file_by_key_fips_string_ids() { #[test] fn test_sort_json_file_by_key_float_sort_values() { - // Sort key values are JSON floats; they should sort numerically, not as strings - // (e.g. 1.5 < 10.0, not "1.5" < "10.0" would also hold, but 2.5 < 10.0 would break - // lexicographically as "10.0" < "2.5"). + // Sort key values are JSON floats; they should sort numerically, not as strings (e.g. 1.5 < + // 10.0, not "1.5" < "10.0" would also hold, but 2.5 < 10.0 would break lexicographically as + // "10.0" < "2.5"). let input = r#"{ "nodes": [ {"id": 0, "score": 10.0}, diff --git a/ben/src/json/graph/tests/test_io.rs b/ben/src/json/graph/tests/test_io.rs index 711f959..12d327f 100644 --- a/ben/src/json/graph/tests/test_io.rs +++ b/ben/src/json/graph/tests/test_io.rs @@ -36,8 +36,8 @@ fn edge_set_directed(graph: &PetxDiInnerGraph) -> Vec<(usize, usize)> { edges } -/// Normalize an NxGraphAdjFormat by sorting each adjacency list by target id, -/// so structural equality can be checked after roundtrip. +/// Normalize an NxGraphAdjFormat by sorting each adjacency list by target id, so structural +/// equality can be checked after roundtrip. fn normalize(format: &mut NxGraphAdjFormat) { for adj_list in &mut format.adjacency { adj_list.sort_by(|a, b| { @@ -629,8 +629,8 @@ fn type_aliases_work() { #[test] fn undirected_dedup_produces_correct_edge_count() { - // NetworkX adjacency format lists each undirected edge twice: once from - // each endpoint. The converter should deduplicate to a single petgraph edge. + // NetworkX adjacency format lists each undirected edge twice: once from each endpoint. The + // converter should deduplicate to a single petgraph edge. let nx = parse_nx(P4_JSON); // P4 adjacency has 6 total entries (1+2+2+1) but only 3 unique edges let total_adj_entries: usize = nx.adjacency.iter().map(|a| a.len()).sum(); @@ -642,8 +642,7 @@ fn undirected_dedup_produces_correct_edge_count() { #[test] fn construct_nx_from_petx_restores_both_directions() { - // When converting back, each undirected edge should appear in both - // endpoints' adjacency lists. + // When converting back, each undirected edge should appear in both endpoints' adjacency lists. let nx_original = parse_nx(P4_JSON); let petx: PetxUnGraph = nx_original.try_into().unwrap(); let nx_roundtrip = NxGraphAdjFormat::try_from(&petx).unwrap(); @@ -702,13 +701,12 @@ fn graph_with_parallel_edges_sets_multigraph_true() { // // These tests verify that the full pipeline // JSON string → NxGraphAdjFormat → PetxGraph → NxGraphAdjFormat → JSON string -// produces output whose serde_json::Value representation matches the input. -// This catches serialization artifacts (e.g. `"key": null` for absent fields) -// that struct-level roundtrip tests miss. +// produces output whose serde_json::Value representation matches the input. This catches +// serialization artifacts (e.g. `"key": null` for absent fields) that struct-level roundtrip tests +// miss. -/// Normalize a `serde_json::Value` representing an NxGraphAdjFormat by sorting -/// each adjacency list by the stringified `"id"` field, so edge-order -/// differences don't cause spurious failures. +/// Normalize a `serde_json::Value` representing an NxGraphAdjFormat by sorting each adjacency list +/// by the stringified `"id"` field, so edge-order differences don't cause spurious failures. fn normalize_json_value(v: &mut serde_json::Value) { if let Some(adj) = v.get_mut("adjacency").and_then(|a| a.as_array_mut()) { for list in adj.iter_mut() { diff --git a/ben/src/lib.rs b/ben/src/lib.rs index afa1056..2cca17d 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -1,24 +1,21 @@ //! Tools for working with binary ensembles of districting plans. //! -//! This crate provides several command line tools and library functions for -//! converting ensembles of districting plans contained in a JSONL file with -//! lines of the form +//! This crate provides several command line tools and library functions for converting ensembles of +//! districting plans contained in a JSONL file with lines of the form //! //! ```text //! {"assignment": , "sample": } //! ``` //! -//! into binary ensembles (BEN) and extremely compressed binary ensembles -//! (XBEN). It also provides several tools for working with these files -//! including several tools for relabeling the ensembles to improve -//! compression ratios. +//! into binary ensembles (BEN) and extremely compressed binary ensembles (XBEN). It also provides +//! several tools for working with these files including several tools for relabeling the ensembles +//! to improve compression ratios. //! //! The main CLI tools provided by this crate are: //! -//! - `ben`: A tool for converting JSONL files into BEN files. -//! and for converting between BEN and XBEN files. +//! - `ben`: A tool for converting JSONL files into BEN files. and for converting between BEN and +//! XBEN files. //! - `reben`: A tool for relabeling BEN files to improve compression ratios. -//! #[cfg(not(target_pointer_width = "64"))] compile_error!("binary-ensemble requires a 64-bit target"); @@ -56,13 +53,12 @@ pub enum BenVariant { TwoDelta, } -/// The subset of [`BenVariant`] values that pass through the BEN32 intermediate -/// wire format (see `docs/glossary.md`). +/// The subset of [`BenVariant`] values that pass through the BEN32 intermediate wire format +/// (see `docs/glossary.md`). /// -/// `TwoDelta` streams use a separate XBEN columnar layout and are intentionally -/// excluded; functions parameterised by `XBenVariant` cannot be called for -/// TwoDelta at compile time. Convert with `From for BenVariant` -/// or `TryFrom for XBenVariant`. +/// `TwoDelta` streams use a separate XBEN columnar layout and are intentionally excluded; functions +/// parameterised by `XBenVariant` cannot be called for TwoDelta at compile time. Convert with +/// `From for BenVariant` or `TryFrom for XBenVariant`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum XBenVariant { Standard, @@ -78,8 +74,8 @@ impl From for BenVariant { } } -/// Returned by `TryFrom for XBenVariant` when the input is -/// `TwoDelta`, which has no BEN32 representation. +/// Returned by `TryFrom for XBenVariant` when the input is `TwoDelta`, which has no +/// BEN32 representation. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct TwoDeltaNotXBenError; diff --git a/ben/src/logging.rs b/ben/src/logging.rs index 37e6e57..43542a6 100644 --- a/ben/src/logging.rs +++ b/ben/src/logging.rs @@ -5,14 +5,12 @@ static INIT_LOGGER: Once = Once::new(); /// Initialize the global `tracing` subscriber used by the BEN CLIs. /// -/// The subscriber reads `RUST_LOG` when present and otherwise defaults to -/// logging being disabled. Initialization is guarded so it is safe to call -/// multiple times. +/// The subscriber reads `RUST_LOG` when present and otherwise defaults to logging being disabled. +/// Initialization is guarded so it is safe to call multiple times. /// /// # Returns /// -/// This function does not return a value. Repeated calls after the first are -/// no-ops. +/// This function does not return a value. Repeated calls after the first are no-ops. pub fn init_logging() { INIT_LOGGER.call_once(|| { let filter = EnvFilter::try_from_default_env() diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index 051723e..b178fbc 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -108,8 +108,8 @@ pub fn extract_assignment_xben( for frame in frame_iterator { let (decode_frame, count) = frame.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + count as usize > sample_number { - // The frame iterator guarantees complete zero-sentinel ben32 - // frames in the XBEN arm, so decode_ben32_line always succeeds. + // The frame iterator guarantees complete zero-sentinel ben32 frames in the XBEN arm, so + // decode_ben32_line always succeeds. let bytes = match &decode_frame { crate::io::reader::DecodeFrame::XBen(b, _) => b, crate::io::reader::DecodeFrame::Ben(_) => { diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 873a404..5ecb669 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -144,7 +144,7 @@ fn test_extract_assignment_xben_roundtrip_and_errors() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -200,8 +200,14 @@ fn extract_assignment_ben_path_returns_assignment() { let path = unique_path("extract-path.ben"); std::fs::write(&path, &ben_bytes).unwrap(); - assert_eq!(extract_assignment_ben_path(&path, 1).unwrap(), vec![1, 2, 3]); - assert_eq!(extract_assignment_ben_path(&path, 2).unwrap(), vec![3, 2, 1]); + assert_eq!( + extract_assignment_ben_path(&path, 1).unwrap(), + vec![1, 2, 3] + ); + assert_eq!( + extract_assignment_ben_path(&path, 2).unwrap(), + vec![3, 2, 1] + ); let _ = std::fs::remove_file(&path); } diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index e819ec6..d4c2585 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -1,7 +1,7 @@ //! Relabeling operations for BEN files. //! -//! All seven logical relabel/convert operations route through the single -//! [`relabel_ben_file`] driver, parameterised by [`RelabelOptions`]. +//! All seven logical relabel/convert operations route through the single [`relabel_ben_file`] +//! driver, parameterised by [`RelabelOptions`]. mod errors; mod permutation; @@ -39,20 +39,18 @@ pub enum RelabelTransform { #[derive(Clone, Copy, Debug, Eq, PartialEq)] #[non_exhaustive] pub enum RunPolicy { - /// Each input frame produces a separate output frame; counts are preserved - /// where the target variant can encode them, and expanded to one-sample - /// frames otherwise. + /// Each input frame produces a separate output frame; counts are preserved where the target + /// variant can encode them, and expanded to one-sample frames otherwise. PreserveFrameBoundaries, - /// Adjacent identical output assignments are merged into a single counted - /// frame where the target variant can encode counts. + /// Adjacent identical output assignments are merged into a single counted frame where the + /// target variant can encode counts. CollapseAdjacentEqualAssignments, } /// Options for [`relabel_ben_file`]. /// -/// Constructed via [`RelabelOptions::first_seen`], -/// [`RelabelOptions::node_permutation`], or [`RelabelOptions::convert_to`], -/// then refined with the `with_*` builder methods. +/// Constructed via [`RelabelOptions::first_seen`], [`RelabelOptions::node_permutation`], or +/// [`RelabelOptions::convert_to`], then refined with the `with_*` builder methods. #[non_exhaustive] pub struct RelabelOptions { transform: RelabelTransform, @@ -62,8 +60,7 @@ pub struct RelabelOptions { } impl RelabelOptions { - /// First-seen district relabeling, preserving the input variant and frame - /// boundaries. + /// First-seen district relabeling, preserving the input variant and frame boundaries. pub fn first_seen() -> Self { Self { transform: RelabelTransform::FirstSeen, @@ -73,8 +70,8 @@ impl RelabelOptions { } } - /// Node permutation through `new_idx -> old_idx`, preserving the input - /// variant and frame boundaries. + /// Node permutation through `new_idx -> old_idx`, preserving the input variant and frame + /// boundaries. pub fn node_permutation(map: HashMap) -> Self { Self { transform: RelabelTransform::NodePermutation(map), @@ -84,8 +81,8 @@ impl RelabelOptions { } } - /// Convert to `target` without relabeling, collapsing adjacent equal - /// assignments to preserve today's conversion compression behavior. + /// Convert to `target` without relabeling, collapsing adjacent equal assignments to preserve + /// today's conversion compression behavior. pub fn convert_to(target: BenVariant) -> Self { Self { transform: RelabelTransform::Identity, @@ -135,10 +132,9 @@ impl RelabelOptions { /// Process a BEN file according to the supplied options. /// -/// All seven logical relabel/convert operations route through this driver. -/// Internally chooses between an RLE-fast-path byte walker (first-seen -/// relabeling, no variant change, frame-preserving, Standard/MkvChain input) -/// and the high-level decoder driver (everything else). +/// All seven logical relabel/convert operations route through this driver. Internally chooses +/// between an RLE-fast-path byte walker (first-seen relabeling, no variant change, +/// frame-preserving, Standard/MkvChain input) and the high-level decoder driver (everything else). pub fn relabel_ben_file( reader: R, writer: W, @@ -202,8 +198,8 @@ pub fn convert_ben_file( /// True when the driver may take the byte-walking RLE fast path. /// -/// The predicate is one boolean computed once. See `risks` in the plan for -/// why it is its own pure function and gets a dedicated unit-test matrix. +/// The predicate is one boolean computed once. See `risks` in the plan for why it is its own pure +/// function and gets a dedicated unit-test matrix. fn can_use_first_seen_fast_path( transform: &RelabelTransform, target_variant: Option, @@ -216,15 +212,13 @@ fn can_use_first_seen_fast_path( && matches!(input, BenVariant::Standard | BenVariant::MkvChain) } -/// Decode a BEN stream, apply a per-assignment transform, and re-encode into -/// the target variant. +/// Decode a BEN stream, apply a per-assignment transform, and re-encode into the target variant. /// -/// With [`RunPolicy::PreserveFrameBoundaries`], the implementation never -/// merges across input frame boundaries: MkvChain/TwoDelta targets receive -/// counted output frames, Standard targets receive `count` one-sample frames -/// because Standard cannot encode repetition counts. With -/// [`RunPolicy::CollapseAdjacentEqualAssignments`], the existing -/// [`BenStreamWriter`] merging path is used. +/// With [`RunPolicy::PreserveFrameBoundaries`], the implementation never merges across input frame +/// boundaries: MkvChain/TwoDelta targets receive counted output frames, Standard targets receive +/// `count` one-sample frames because Standard cannot encode repetition counts. With +/// [`RunPolicy::CollapseAdjacentEqualAssignments`], the existing [`BenStreamWriter`] merging path +/// is used. fn relabel_via_decoder( reader: R, writer: W, @@ -295,10 +289,9 @@ where /// Byte-walking RLE fast path for first-seen relabeling on Standard/MkvChain. /// -/// Walks 6-byte frame headers, decodes the RLE in place, applies first-seen -/// relabeling on the `(val, len)` pairs, and re-encodes. Skips assignment -/// vector materialization entirely. The output banner has been emitted by the -/// caller before this is invoked. +/// Walks 6-byte frame headers, decodes the RLE in place, applies first-seen relabeling on the +/// `(val, len)` pairs, and re-encodes. Skips assignment vector materialization entirely. The output +/// banner has been emitted by the caller before this is invoked. fn relabel_first_seen_via_byte_walk( mut reader: R, mut writer: W, @@ -337,8 +330,7 @@ fn relabel_first_seen_via_byte_walk( 1 }; - let frame = - BenEncodeFrame::from_rle(ben_line, input_variant, Some(count_occurrences)); + let frame = BenEncodeFrame::from_rle(ben_line, input_variant, Some(count_occurrences)); writer.write_all(frame.as_slice())?; sample_number += count_occurrences as usize; diff --git a/ben/src/ops/relabel/permutation.rs b/ben/src/ops/relabel/permutation.rs index bef676e..dde6bf8 100644 --- a/ben/src/ops/relabel/permutation.rs +++ b/ben/src/ops/relabel/permutation.rs @@ -177,9 +177,8 @@ mod tests { assert_eq!(runs, vec![(1, 3), (2, 1), (1, 2), (3, 1)]); } - /// Cross-check: assignment-level and RLE-level first-seen relabeling must - /// agree for any input. This pins the equivalence as a property, not a - /// coincidence (decision #6 / risk mitigation). + /// Cross-check: assignment-level and RLE-level first-seen relabeling must agree for any input. + /// This pins the equivalence as a property, not a coincidence (decision #6 / risk mitigation). #[test] fn first_seen_relabel_assignment_equals_rle_path() { let inputs: Vec> = vec![ diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 782f27a..164d344 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -42,9 +42,9 @@ where map } -/// Wrap a banner-stripped frame payload back into a full BEN file by prepending -/// the banner. Tests that previously fed banner-less buffers feed a full BEN -/// file under the new API and call [`relabel_ben_file`] directly. +/// Wrap a banner-stripped frame payload back into a full BEN file by prepending the banner. Tests +/// that previously fed banner-less buffers feed a full BEN file under the new API and call +/// [`relabel_ben_file`] directly. fn with_banner(variant: BenVariant, payload: &[u8]) -> Vec { let mut out = crate::format::banners::banner_for_variant(variant).to_vec(); out.extend_from_slice(payload); @@ -69,7 +69,10 @@ fn test_relabel_ben_line_simple() { ) .unwrap(); - assert_eq!(&buf[..BANNER_LEN], crate::format::banners::STANDARD_BEN_BANNER); + assert_eq!( + &buf[..BANNER_LEN], + crate::format::banners::STANDARD_BEN_BANNER + ); assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } @@ -272,7 +275,10 @@ fn test_relabel_ben_line_with_map() { ) .unwrap(); - assert_eq!(&buf[..BANNER_LEN], crate::format::banners::STANDARD_BEN_BANNER); + assert_eq!( + &buf[..BANNER_LEN], + crate::format::banners::STANDARD_BEN_BANNER + ); assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } @@ -572,9 +578,8 @@ fn test_relabel_file_with_map_rejects_invalid_header() { #[test] fn test_relabel_lines_propagate_non_eof_reader_error() { - // Reader returns a valid Standard banner via Cursor, then the BoomReader - // produces a non-EOF I/O error on the body. The byte-walk fast path - // returns this I/O error unchanged. + // Reader returns a valid Standard banner via Cursor, then the BoomReader produces a non-EOF I/O + // error on the body. The byte-walk fast path returns this I/O error unchanged. struct BoomReader { returned_first: bool, } @@ -590,11 +595,10 @@ fn test_relabel_lines_propagate_non_eof_reader_error() { } } - let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain( - BoomReader { + let chained = + io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain(BoomReader { returned_first: false, - }, - ); + }); let err = relabel_ben_file(chained, Vec::new(), RelabelOptions::first_seen()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); } @@ -616,11 +620,10 @@ fn test_relabel_lines_with_map_propagate_non_eof_reader_error() { } } - let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain( - BoomReader { + let chained = + io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()).chain(BoomReader { returned_first: false, - }, - ); + }); let err = relabel_ben_file( chained, Vec::new(), @@ -968,8 +971,8 @@ fn test_relabel_ben_file_as_variant_rejects_invalid_banner() { #[test] fn test_relabel_ben_length_mismatch() { - // BEN stream with assignment length 3 ([1,2,3]); permutation of length 5 - // — triggers LengthMismatch. + // BEN stream with assignment length 3 ([1,2,3]); permutation of length 5 — triggers + // LengthMismatch. let jsonl = r#"{"assignment":[1,2,3],"sample":1} "#; let mut ben = Vec::new(); @@ -998,8 +1001,7 @@ fn test_relabel_ben_lines_non_eof_read_error_propagates() { let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()) .chain(ErrorAfterOneByte); let mut output = Vec::new(); - let err = - relabel_ben_file(chained, &mut output, RelabelOptions::first_seen()).unwrap_err(); + let err = relabel_ben_file(chained, &mut output, RelabelOptions::first_seen()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } @@ -1009,12 +1011,8 @@ fn test_relabel_ben_file_with_map_non_eof_read_error_propagates() { let chained = io::Cursor::new(crate::format::banners::STANDARD_BEN_BANNER.to_vec()) .chain(ErrorAfterOneByte); let mut output = Vec::new(); - let err = relabel_ben_file( - chained, - &mut output, - RelabelOptions::node_permutation(map), - ) - .unwrap_err(); + let err = + relabel_ben_file(chained, &mut output, RelabelOptions::node_permutation(map)).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } @@ -1023,19 +1021,20 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { // Build a valid 2-sample TwoDelta BEN file, then corrupt the delta frame. let mut ben: Vec = Vec::new(); { - let mut writer = crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta) - .unwrap(); + let mut writer = + crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } let banner_len = 17usize; - let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let n_bytes = + u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; let anchor_end = banner_len + 6 + n_bytes + 2; ben[anchor_end + 4] = 0; let mut output = Vec::new(); - let err = relabel_ben_file(ben.as_slice(), &mut output, RelabelOptions::first_seen()) - .unwrap_err(); + let err = + relabel_ben_file(ben.as_slice(), &mut output, RelabelOptions::first_seen()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); } @@ -1043,13 +1042,14 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { fn test_relabel_ben_file_with_map_twodelta_malformed_frame_error_propagates() { let mut ben: Vec = Vec::new(); { - let mut writer = crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta) - .unwrap(); + let mut writer = + crate::io::writer::BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } let banner_len = 17usize; - let n_bytes = u32::from_be_bytes(ben[banner_len+2..banner_len+6].try_into().unwrap()) as usize; + let n_bytes = + u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; let anchor_end = banner_len + 6 + n_bytes + 2; ben[anchor_end + 4] = 0; @@ -1107,9 +1107,9 @@ fn fast_path_predicate_matrix() { assert_eq!(true_cases, 2, "expected exactly two true matrix entries"); } -/// Forced-slow vs. fast-path equivalence for first-seen relabeling on -/// Standard input. Forcing the slow path uses `with_target_variant(input)` -/// per decision #5 (`is_none()` semantics in the predicate). +/// Forced-slow vs. fast-path equivalence for first-seen relabeling on Standard input. Forcing the +/// slow path uses `with_target_variant(input)` per decision #5 (`is_none()` semantics in the +/// predicate). #[test] fn fast_path_matches_slow_path_standard() { let file = concat!( @@ -1180,8 +1180,8 @@ fn fast_path_matches_slow_path_mkvchain() { ) .unwrap(); - // Decoded equivalence is the load-bearing assertion. Byte-identity is also - // expected here (per plan verification step 4) — tighten if it holds. + // Decoded equivalence is the load-bearing assertion. Byte-identity is also expected here + // (per plan verification step 4) — tighten if it holds. let mut fast_jsonl = Vec::new(); decode_ben_to_jsonl(fast_out.as_slice(), &mut fast_jsonl).unwrap(); let mut slow_jsonl = Vec::new(); @@ -1191,10 +1191,9 @@ fn fast_path_matches_slow_path_mkvchain() { #[test] fn collapse_policy_disables_fast_path() { - // With CollapseAdjacentEqualAssignments + first-seen on Standard input, - // the predicate must be false (fast path disabled). We verify behaviorally - // by running both: the merging path should produce the same decoded - // content but takes a different code path internally. + // With CollapseAdjacentEqualAssignments + first-seen on Standard input, the predicate must be + // false (fast path disabled). We verify behaviorally by running both: the merging path should + // produce the same decoded content but takes a different code path internally. let file = concat!( "{\"assignment\":[3,1,2],\"sample\":1}\n", "{\"assignment\":[3,1,2],\"sample\":2}\n", @@ -1211,8 +1210,7 @@ fn collapse_policy_disables_fast_path() { relabel_ben_file( encoded.as_slice(), &mut out, - RelabelOptions::first_seen() - .with_run_policy(RunPolicy::CollapseAdjacentEqualAssignments), + RelabelOptions::first_seen().with_run_policy(RunPolicy::CollapseAdjacentEqualAssignments), ) .unwrap(); @@ -1222,22 +1220,19 @@ fn collapse_policy_disables_fast_path() { assert!(s.contains("\"assignment\":[1,2,3]")); } -/// Decision #9: with `PreserveFrameBoundaries`, two adjacent input frames -/// with the same assignment but distinct counts must remain distinct counted -/// frames at MkvChain target — not merged into one frame with summed count. -/// With `CollapseAdjacentEqualAssignments`, they are merged. +/// Decision #9: with `PreserveFrameBoundaries`, two adjacent input frames with the same assignment +/// but distinct counts must remain distinct counted frames at MkvChain target — not merged into one +/// frame with summed count. With `CollapseAdjacentEqualAssignments`, they are merged. #[test] fn run_policy_pins_frame_preservation_and_collapse() { - // Build an MkvChain BEN file with two adjacent equal-assignment frames of - // counts 5 and 7 (12 total samples). + // Build an MkvChain BEN file with two adjacent equal-assignment frames of counts 5 and 7 + // (12 total samples). let mut input = Vec::new(); { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = - BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = - BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } @@ -1258,8 +1253,7 @@ fn run_policy_pins_frame_preservation_and_collapse() { let mut frames = 0; while i < ben.len() { // header: max_val_bits(1), max_len_bits(1), n_bytes(4), payload(n_bytes), count(2) - let n_bytes = - u32::from_be_bytes(ben[i + 2..i + 6].try_into().unwrap()) as usize; + let n_bytes = u32::from_be_bytes(ben[i + 2..i + 6].try_into().unwrap()) as usize; i += 6 + n_bytes + 2; frames += 1; } @@ -1304,8 +1298,8 @@ fn run_policy_pins_frame_preservation_and_collapse() { ); } -/// Cross-policy invariant for Standard targets: byte-identical output -/// regardless of run policy, because Standard cannot encode counts. +/// Cross-policy invariant for Standard targets: byte-identical output regardless of run policy, +/// because Standard cannot encode counts. #[test] fn standard_target_cross_policy_byte_identity() { // Build the same (5, 7) MkvChain fixture. @@ -1313,10 +1307,8 @@ fn standard_target_cross_policy_byte_identity() { { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = - BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = - BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } diff --git a/ben/src/progress/mod.rs b/ben/src/progress/mod.rs index ca55d15..7a33705 100644 --- a/ben/src/progress/mod.rs +++ b/ben/src/progress/mod.rs @@ -1,18 +1,17 @@ //! In-place progress spinners for streaming encode/decode/relabel loops. //! -//! Streaming operations have no upfront totals (BEN/JSONL inputs are read -//! frame-by-frame), so a percentage bar is not possible — this module -//! provides a running-counter spinner instead. The spinner writes directly -//! to stderr via [`indicatif`], bypassing `tracing` (whose fmt subscriber +//! Streaming operations have no upfront totals (BEN/JSONL inputs are read frame-by-frame), so a +//! percentage bar is not possible — this module provides a running-counter spinner instead. The +//! spinner writes directly to stderr via [`indicatif`], bypassing `tracing` (whose fmt subscriber //! appends `\n` and would defeat carriage-return redraws). //! //! Visibility is gated by two checks performed at construction time: //! 1. `cli::common::is_quiet()` — the `--quiet` CLI flag. -//! 2. `std::io::stderr().is_terminal()` — auto-disable when stderr is -//! redirected, so logs and pipelines stay clean. +//! 2. `std::io::stderr().is_terminal()` — auto-disable when stderr is redirected, so logs and +//! pipelines stay clean. //! -//! Both checks happen once in [`Spinner::new`]; the resulting [`Spinner`] -//! is either a live indicatif bar or a no-op stub. +//! Both checks happen once in [`Spinner::new`]; the resulting [`Spinner`] is either a live +//! indicatif bar or a no-op stub. use std::io::IsTerminal; use std::time::Duration; @@ -21,9 +20,8 @@ use indicatif::{ProgressBar, ProgressStyle}; /// A scope-bound progress spinner backed by [`indicatif::ProgressBar`]. /// -/// The spinner animates on a steady tick and exposes a single counter via -/// [`Spinner::set_count`]. On drop, the spinner clears its line so that -/// subsequent stderr writes start fresh. +/// The spinner animates on a steady tick and exposes a single counter via [`Spinner::set_count`]. +/// On drop, the spinner clears its line so that subsequent stderr writes start fresh. pub struct Spinner { bar: Option, } @@ -31,13 +29,11 @@ pub struct Spinner { impl Spinner { /// Build a spinner for a streaming operation. /// - /// Returns a no-op spinner when `--quiet` is set or when stderr is not - /// a TTY. + /// Returns a no-op spinner when `--quiet` is set or when stderr is not a TTY. /// /// # Arguments /// - /// * `prefix` - The label shown before the running counter, e.g. - /// `"Encoding line"`. + /// * `prefix` - The label shown before the running counter, e.g. `"Encoding line"`. /// /// # Returns /// diff --git a/ben/src/test_utils.rs b/ben/src/test_utils.rs index be16f4e..f757aa1 100644 --- a/ben/src/test_utils.rs +++ b/ben/src/test_utils.rs @@ -1,9 +1,8 @@ //! Test helpers shared across unit and integration tests. //! -//! This module is always-compiled (not `#[cfg(test)]`) so integration tests -//! in `ben/tests/` — which are separate crates — can reuse the same helpers -//! as unit tests inside `ben/src/.../tests.rs`. It is `#[doc(hidden)]` and -//! is not part of the stable public API. +//! This module is always-compiled (not `#[cfg(test)]`) so integration tests in `ben/tests/` — which +//! are separate crates — can reuse the same helpers as unit tests inside `ben/src/.../tests.rs`. It +//! is `#[doc(hidden)]` and is not part of the stable public API. use std::io::{Cursor, Write}; use std::path::PathBuf; @@ -16,9 +15,9 @@ use crate::io::bundle::format::AssignmentFormat; use crate::io::bundle::BendlWriter; use crate::BenVariant; -/// Return a unique temp path of the form `binary-ensemble-{name}-{nonce}` in -/// the system temp directory. The nonce is the current monotonic-ish time in -/// nanoseconds, sufficient to avoid collisions between parallel test runs. +/// Return a unique temp path of the form `binary-ensemble-{name}-{nonce}` in the system temp +/// directory. The nonce is the current monotonic-ish time in nanoseconds, sufficient to avoid +/// collisions between parallel test runs. pub fn unique_path(name: &str) -> PathBuf { let nonce = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -27,8 +26,7 @@ pub fn unique_path(name: &str) -> PathBuf { std::env::temp_dir().join(format!("binary-ensemble-{name}-{nonce}")) } -/// Build a JSONL byte buffer from a sequence of assignment vectors, -/// numbering samples from 1. +/// Build a JSONL byte buffer from a sequence of assignment vectors, numbering samples from 1. pub fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { let mut buf = Vec::new(); for (i, a) in assignments.iter().enumerate() { @@ -37,8 +35,7 @@ pub fn jsonl_from_assignments(assignments: &[Vec]) -> Vec { buf } -/// Expand an RLE sequence `(value, length)` into a flat assignment vector, -/// truncating at `cap`. +/// Expand an RLE sequence `(value, length)` into a flat assignment vector, truncating at `cap`. pub fn expand_rle(rle: &[(u16, u16)], cap: usize) -> Vec { let mut v = Vec::with_capacity(cap); for &(val, len) in rle { @@ -51,17 +48,16 @@ pub fn expand_rle(rle: &[(u16, u16)], cap: usize) -> Vec { v } -/// Encode the given JSONL bytes as a BEN byte vector, including the 17-byte -/// banner. Panics on encoder error; intended only for fixture construction. +/// Encode the given JSONL bytes as a BEN byte vector, including the 17-byte banner. Panics on +/// encoder error; intended only for fixture construction. pub fn sample_ben_bytes(jsonl: &[u8], variant: BenVariant) -> Vec { let mut out = Vec::new(); encode_jsonl_to_ben(jsonl, &mut out, variant).unwrap(); out } -/// Build a minimal finalized `.bendl` byte vector containing the given -/// pre-encoded assignment stream bytes. Panics on writer error; intended -/// only for fixture construction. +/// Build a minimal finalized `.bendl` byte vector containing the given pre-encoded assignment +/// stream bytes. Panics on writer error; intended only for fixture construction. pub fn sample_bendl_bytes(stream: &[u8], format: AssignmentFormat) -> Vec { let mut buf: Vec = Vec::new(); { diff --git a/ben/tests/common/mod.rs b/ben/tests/common/mod.rs index 0a87f43..b86adbf 100644 --- a/ben/tests/common/mod.rs +++ b/ben/tests/common/mod.rs @@ -1,8 +1,7 @@ //! Helpers shared across `ben/tests/*.rs` integration tests. //! -//! Each integration test crate declares `mod common;` to opt in. The -//! module re-exports the in-crate test utilities from -//! `binary_ensemble::test_utils` and adds integration-only helpers +//! Each integration test crate declares `mod common;` to opt in. The module re-exports the in-crate +//! test utilities from `binary_ensemble::test_utils` and adds integration-only helpers //! (subprocess paths, etc.). #![allow(dead_code, unused_imports)] @@ -13,9 +12,8 @@ pub use binary_ensemble::test_utils::{ /// Path to a compiled binary for shelling out from integration tests. /// -/// Returns the same `env!("CARGO_BIN_EXE_*")` value the existing test -/// helpers use; centralised here so future CLI tests can pick up the -/// canonical lookup table. +/// Returns the same `env!("CARGO_BIN_EXE_*")` value the existing test helpers use; centralised here +/// so future CLI tests can pick up the canonical lookup table. pub fn binary_path(name: &str) -> &'static str { match name { "ben" => env!("CARGO_BIN_EXE_ben"), diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index bcf2eb7..4f34c69 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -1,8 +1,8 @@ //! Rigorous tests for `BenStreamReader` with the MkvChain and TwoDelta BEN variants. //! -//! Standard-variant tests already exist in `test_coverage.rs`. This file adds -//! equivalent depth for the two more complex variants. The helpers intentionally -//! mirror those in `test_coverage.rs` so that the two suites are easy to compare. +//! Standard-variant tests already exist in `test_coverage.rs`. This file adds equivalent depth for +//! the two more complex variants. The helpers intentionally mirror those in `test_coverage.rs` so +//! that the two suites are easy to compare. use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::codec::encode::encode_jsonl_to_ben; @@ -16,8 +16,7 @@ use std::io::{self, Cursor}; mod common; use common::jsonl_from_assignments; -// ────────────────────────────────────────────────────────────────────────────── -// Shared helpers +// ────────────────────────────────────────────────────────────────────────────── Shared helpers // ────────────────────────────────────────────────────────────────────────────── fn encode_ben(assignments: &[Vec], variant: BenVariant) -> Vec { @@ -42,8 +41,7 @@ fn expand_assignments(ben: &[u8]) -> Vec> { out } -// ────────────────────────────────────────────────────────────────────────────── -// MkvChain variant +// ────────────────────────────────────────────────────────────────────────────── MkvChain variant // ────────────────────────────────────────────────────────────────────────────── mod mkvchain { @@ -67,7 +65,9 @@ mod mkvchain { #[test] fn empty_payload_yields_nothing() { let ben = MKVCHAIN_BEN_BANNER.to_vec(); - let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert!(frames.is_empty()); } @@ -79,7 +79,9 @@ mod mkvchain { let assignment = vec![3u16, 3, 1, 2, 2, 1]; let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); assert_eq!(decoded, assignment); @@ -91,7 +93,9 @@ mod mkvchain { let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![2u16, 1, 3]]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); for expected in &assignments { let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1, "distinct assignment should have count=1"); @@ -107,7 +111,9 @@ mod mkvchain { let assignments = vec![assignment.clone(); 5]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 5, "expected compressed count=5, got {count}"); assert_eq!(decoded, assignment); @@ -130,7 +136,9 @@ mod mkvchain { ]; let ben = encode_ben(&assignments, BenVariant::MkvChain); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (d1, c1) = decoder.next().unwrap().unwrap(); assert_eq!(c1, 3); @@ -437,9 +445,9 @@ mod mkvchain { // ─── subsampling ────────────────────────────────────────────────────────── // - // SubsampleFrameDecoder operates at the frame level: it returns one - // (assignment, count) tuple per frame that contains any selected indices, - // where count is the number of selected indices in that frame. + // SubsampleFrameDecoder operates at the frame level: it returns one (assignment, count) tuple + // per frame that contains any selected indices, where count is the number of selected indices + // in that frame. #[test] fn subsample_by_indices_locates_correct_sample_in_run() { @@ -484,7 +492,8 @@ mod mkvchain { #[test] fn subsample_by_range_spans_repeated_frames() { - // A×3, B×3; range [2, 5] → A contributes samples 2,3 (count=2) and B contributes 4,5 (count=2). + // A×3, B×3; range [2, 5] → A contributes samples 2,3 (count=2) and B contributes 4,5 + // (count=2). let a = vec![10u16; 3]; let b = vec![20u16; 3]; let assignments: Vec<_> = (0..3) @@ -599,8 +608,7 @@ mod mkvchain { } } -// ────────────────────────────────────────────────────────────────────────────── -// TwoDelta variant +// ────────────────────────────────────────────────────────────────────────────── TwoDelta variant // ────────────────────────────────────────────────────────────────────────────── mod twodelta { @@ -667,15 +675,17 @@ mod twodelta { #[test] fn delta_values_are_applied_correctly() { - // Explicit value check: the decoder must correctly update the previous - // assignment when it applies the delta. + // Explicit value check: the decoder must correctly update the previous assignment when it + // applies the delta. // anchor: [1, 2, 1, 2, 1] // next: [2, 1, 2, 1, 2] (every element swaps 1↔2) let anchor = vec![1u16, 2, 1, 2, 1]; let next = vec![2u16, 1, 2, 1, 2]; let ben = encode_twodelta(&[anchor, next.clone()]); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let _ = decoder.next().unwrap().unwrap(); // skip anchor let (decoded_next, _) = decoder.next().unwrap().unwrap(); assert_eq!(decoded_next, next); @@ -711,7 +721,9 @@ mod twodelta { let anchor = vec![1u16, 1, 2, 2]; let ben = encode_twodelta(&vec![anchor.clone(); 3]); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 3, "anchor count should be 3"); assert_eq!(decoded, anchor); @@ -728,7 +740,9 @@ mod twodelta { .collect(); let ben = encode_twodelta(&assignments); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (d_anchor, c_anchor) = decoder.next().unwrap().unwrap(); assert_eq!(c_anchor, 1, "anchor count"); @@ -743,7 +757,7 @@ mod twodelta { #[test] fn anchor_and_delta_repetitions_round_trip() { - // a×2, b×3 → anchor(2), delta(3). Expanding must give 5 correct assignments. + // a×2, b×3 → anchor(2), delta(3). Expanding must give 5 correct assignments. let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; let assignments: Vec<_> = (0..2) @@ -1125,9 +1139,8 @@ mod twodelta { #[test] fn subsample_by_indices_across_repeated_frames() { - // a×3, b×3 → 6 samples from 2 frames. - // Indices 1 and 3 fall in the anchor (a) frame → (a, 2). - // Index 4 is the first b → (b, 1). + // a×3, b×3 → 6 samples from 2 frames. Indices 1 and 3 fall in the anchor (a) frame → + // (a, 2). Index 4 is the first b → (b, 1). let a = vec![1u16, 1, 2, 2]; let b = vec![2u16, 2, 1, 1]; let assignments: Vec<_> = (0..3) diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index dd3b635..f67c690 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -1,9 +1,8 @@ //! Rigorous coverage tests for the binary-ensemble `ben` library. //! -//! These tests target code paths and edge-cases that are not covered by the -//! existing integration / property-based suites. They are deliberately strict: -//! if the implementation behaves in an unexpected way the test should fail -//! rather than silently accept wrong output. +//! These tests target code paths and edge-cases that are not covered by the existing integration / +//! property-based suites. They are deliberately strict: if the implementation behaves in an +//! unexpected way the test should fail rather than silently accept wrong output. use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben}; use binary_ensemble::codec::encode::{ @@ -32,8 +31,7 @@ use std::io::{self, BufReader, Cursor}; mod common; use common::jsonl_from_assignments; -// ────────────────────────────────────────────────────────────────────────────── -// Helpers +// ────────────────────────────────────────────────────────────────────────────── Helpers // ────────────────────────────────────────────────────────────────────────────── /// Encode assignments as a Standard BEN byte vector (including the 17-byte banner). @@ -62,14 +60,14 @@ fn encode_xben(assignments: &[Vec], variant: BenVariant) -> Vec { Some(1), Some(1), None, - None, + None, ) .unwrap(); xben } -/// Build a ring-graph JSON string with `n` nodes (0-based ids). -/// Each node i is connected to (i-1) mod n and (i+1) mod n. +/// Build a ring-graph JSON string with `n` nodes (0-based ids). Each node i is connected to (i-1) +/// mod n and (i+1) mod n. fn make_ring_graph_json(n: usize) -> String { let nodes: Vec = (0..n).map(|i| json!({"id": i})).collect(); let adjacency: Vec = (0..n) @@ -82,8 +80,7 @@ fn make_ring_graph_json(n: usize) -> String { serde_json::to_string(&json!({"nodes": nodes, "adjacency": adjacency})).unwrap() } -// ────────────────────────────────────────────────────────────────────────────── -// format::banners +// ────────────────────────────────────────────────────────────────────────────── format::banners // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -175,8 +172,7 @@ fn has_known_banner_prefix_rejects_garbage() { assert!(!has_known_banner_prefix(b"\x00")); } -// ────────────────────────────────────────────────────────────────────────────── -// util::rle +// ────────────────────────────────────────────────────────────────────────────── util::rle // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -253,9 +249,8 @@ fn rle_roundtrip_with_max_values() { assert_eq!(recovered, original); } -// ────────────────────────────────────────────────────────────────────────────── -// io::reader – DecoderInitError -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── io::reader – +// DecoderInitError ────────────────────────────────────────────────────────────────────────────── #[test] fn decoder_init_error_display_io_variant() { @@ -323,9 +318,8 @@ fn decoder_init_error_converts_to_io_error_from_invalid_format() { assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); } -// ────────────────────────────────────────────────────────────────────────────── -// io::reader – BenStreamReader -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── io::reader – +// BenStreamReader ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_rejects_empty_input() { @@ -377,7 +371,9 @@ fn ben_decoder_standard_multiple_assignments_round_trip() { let assignments = vec![vec![1u16, 2, 3], vec![3u16, 2, 1], vec![1u16, 1, 1]]; let ben = encode_standard_ben(&assignments); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); for expected in &assignments { let (decoded, count) = decoder.next().unwrap().unwrap(); assert_eq!(count, 1); @@ -402,7 +398,9 @@ fn ben_decoder_mkv_preserves_repetition_counts() { let mut ben = Vec::new(); encode_jsonl_to_ben(jsonl.as_bytes(), &mut ben, BenVariant::MkvChain).unwrap(); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let (a1, c1) = decoder.next().unwrap().unwrap(); assert_eq!(a1, vec![1u16, 2, 3]); @@ -465,7 +463,9 @@ fn ben_decoder_for_each_assignment_early_stop() { let assignments = vec![vec![1u16, 2], vec![3u16, 4], vec![5u16, 6]]; let ben = encode_standard_ben(&assignments); - let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let mut seen = Vec::new(); decoder .for_each_assignment(|a, _count| { @@ -479,9 +479,8 @@ fn ben_decoder_for_each_assignment_early_stop() { assert_eq!(seen[1], vec![3u16, 4]); } -// ────────────────────────────────────────────────────────────────────────────── -// io::reader – BenStreamReader -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── io::reader – +// BenStreamReader ────────────────────────────────────────────────────────────────────────────── fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { let jsonl = jsonl_from_assignments(assignments); @@ -493,7 +492,7 @@ fn make_xben(assignments: &[Vec], variant: BenVariant) -> Vec { Some(1), Some(1), None, - None, + None, ) .unwrap(); xben @@ -525,9 +524,8 @@ fn xben_decoder_reads_variant_from_banner_twodelta() { assert_eq!(decoder.variant(), BenVariant::TwoDelta); } -// ────────────────────────────────────────────────────────────────────────────── -// io::writer – BenEncoder -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── io::writer – +// BenEncoder ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_encoder_writes_correct_banner_standard() { @@ -663,8 +661,8 @@ fn ben_encoder_mkv_identical_assignments_deduplicated() { enc.finish().unwrap(); } - // The BEN payload should be much smaller than 3 independent frames. - // More importantly, decoding must give back 3 lines. + // The BEN payload should be much smaller than 3 independent frames. More importantly, decoding + // must give back 3 lines. let mut decoded = Vec::new(); decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); @@ -690,8 +688,8 @@ fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { assert_eq!(lines.len(), 2, "decoded:\n{s}"); } -// ────────────────────────────────────────────────────────────────────────────── -// codec::encode – encode_ben_vec_from_rle and encode_ben_vec_from_assign +// ────────────────────────────────────────────────────────────────────────────── codec::encode – +// encode_ben_vec_from_rle and encode_ben_vec_from_assign // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -725,9 +723,8 @@ fn encode_ben_vec_from_assign_all_same() { assert!(!frame.as_slice().is_empty()); } -// ────────────────────────────────────────────────────────────────────────────── -// codec::encode – encode_ben_to_xben -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── codec::encode – +// encode_ben_to_xben ────────────────────────────────────────────────────────────────────────────── #[test] fn encode_ben_to_xben_and_back_standard() { @@ -735,7 +732,15 @@ fn encode_ben_to_xben_and_back_standard() { let ben = encode_standard_ben(&assignments); let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, None, None, None, None).unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + None, + None, + None, + None, + ) + .unwrap(); let mut ben2 = Vec::new(); decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben2).unwrap(); @@ -752,8 +757,8 @@ fn encode_ben_to_xben_and_back_standard() { assert_eq!(String::from_utf8(decoded).unwrap(), expected); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – convert_ben_file and convert_ben_file_limit +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// convert_ben_file and convert_ben_file_limit // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -841,8 +846,8 @@ fn convert_ben_file_limit_zero_produces_banner_only() { assert!(decoded.is_empty()); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – relabel_ben_lines_limit +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// relabel_ben_lines_limit // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -872,8 +877,8 @@ fn relabel_ben_lines_limit_truncates_standard() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – relabel_ben_file_as_variant +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// relabel_ben_file_as_variant // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -895,9 +900,8 @@ fn relabel_ben_file_as_variant_standard_to_standard() { decode_ben_to_jsonl(out.as_slice(), &mut decoded).unwrap(); let s = String::from_utf8(decoded).unwrap(); - // Each frame is canonicalized independently (first-seen within the frame → 1, etc.). - // Frame 1: [5,5,1] → first 5→1, then 1→2 → [1,1,2] - // Frame 2: [1,5,5] → first 1→1, then 5→2 → [1,2,2] + // Each frame is canonicalized independently (first-seen within the frame → 1, etc.). Frame 1: + // [5,5,1] → first 5→1, then 1→2 → [1,1,2] Frame 2: [1,5,5] → first 1→1, then 5→2 → [1,2,2] assert!( s.contains("\"assignment\":[1,1,2]"), "frame1 mismatch, got: {s}" @@ -983,8 +987,8 @@ fn relabel_ben_file_as_variant_limit_zero_gives_empty() { assert!(decoded.is_empty(), "expected empty output for limit=0"); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – relabel_ben_file_with_map_as_variant +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// relabel_ben_file_with_map_as_variant // ────────────────────────────────────────────────────────────────────────────── /// Build a map that reverses a 3-element assignment: new[0]←old[2], etc. @@ -1001,8 +1005,7 @@ fn relabel_ben_file_with_map_as_variant_standard_to_standard() { relabel_ben_file( ben.as_slice(), &mut out, - RelabelOptions::node_permutation(reverse_map_3()) - .with_target_variant(BenVariant::Standard), + RelabelOptions::node_permutation(reverse_map_3()).with_target_variant(BenVariant::Standard), ) .unwrap(); @@ -1028,8 +1031,7 @@ fn relabel_ben_file_with_map_as_variant_standard_to_mkvchain() { relabel_ben_file( ben.as_slice(), &mut out, - RelabelOptions::node_permutation(reverse_map_3()) - .with_target_variant(BenVariant::MkvChain), + RelabelOptions::node_permutation(reverse_map_3()).with_target_variant(BenVariant::MkvChain), ) .unwrap(); @@ -1045,8 +1047,7 @@ fn relabel_ben_file_with_map_as_variant_rejects_invalid_header() { let err = relabel_ben_file( b"NOT A VALID BEN!!".as_slice(), Vec::new(), - RelabelOptions::node_permutation(reverse_map_3()) - .with_target_variant(BenVariant::Standard), + RelabelOptions::node_permutation(reverse_map_3()).with_target_variant(BenVariant::Standard), ) .unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); @@ -1098,8 +1099,8 @@ fn relabel_ben_file_with_map_as_variant_limit_zero_gives_empty() { assert!(decoded.is_empty()); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – dense_permutation edge cases (tested indirectly) +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// dense_permutation edge cases (tested indirectly) // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1120,8 +1121,8 @@ fn relabel_file_with_map_detects_gap_in_permutation() { assert_eq!(err.kind(), io::ErrorKind::InvalidInput); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – convert_ben_file with MkvChain truncation +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// convert_ben_file with MkvChain truncation // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1157,8 +1158,8 @@ fn convert_ben_file_limit_with_mkvchain_repetitions() { assert_eq!(decoded.iter().filter(|&&b| b == b'\n').count(), 3); } -// ────────────────────────────────────────────────────────────────────────────── -// ops::relabel – relabel_ben_file TwoDelta (canonicalization path) +// ────────────────────────────────────────────────────────────────────────────── ops::relabel – +// relabel_ben_file TwoDelta (canonicalization path) // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1176,12 +1177,7 @@ fn relabel_ben_file_twodelta_canonicalizes_labels() { encode_jsonl_to_ben(file.as_bytes(), &mut ben, BenVariant::TwoDelta).unwrap(); let mut relabeled = Vec::new(); - relabel_ben_file( - ben.as_slice(), - &mut relabeled, - RelabelOptions::first_seen(), - ) - .unwrap(); + relabel_ben_file(ben.as_slice(), &mut relabeled, RelabelOptions::first_seen()).unwrap(); let mut decoded = Vec::new(); decode_ben_to_jsonl(relabeled.as_slice(), &mut decoded).unwrap(); @@ -1191,9 +1187,8 @@ fn relabel_ben_file_twodelta_canonicalizes_labels() { assert!(s.contains("\"assignment\":[1,1,2,2,3,3]"), "got: {s}"); } -// ────────────────────────────────────────────────────────────────────────────── -// Encoding – empty assignment vectors -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── Encoding – empty +// assignment vectors ────────────────────────────────────────────────────────────────────────────── #[test] fn encode_and_decode_empty_assignment_standard() { @@ -1208,9 +1203,8 @@ fn encode_and_decode_empty_assignment_standard() { assert!(s.contains("\"assignment\":[]"), "got: {s}"); } -// ────────────────────────────────────────────────────────────────────────────── -// Encoding – large u16 values -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── Encoding – large +// u16 values ────────────────────────────────────────────────────────────────────────────── #[test] fn encode_and_decode_max_u16_values_standard() { @@ -1223,8 +1217,8 @@ fn encode_and_decode_max_u16_values_standard() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// Encoding – single-sample files +// ────────────────────────────────────────────────────────────────────────────── Encoding – +// single-sample files // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1249,8 +1243,7 @@ fn single_sample_mkvchain_round_trip() { assert!(s.contains("\"assignment\":[1,2,3]"), "got: {s}"); } -// ────────────────────────────────────────────────────────────────────────────── -// Decode error paths +// ────────────────────────────────────────────────────────────────────────────── Decode error paths // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1274,8 +1267,8 @@ fn decode_ben_to_jsonl_rejects_truncated_frame_header() { assert_ne!(err.kind(), io::ErrorKind::Other); // not just "ok" } -// ────────────────────────────────────────────────────────────────────────────── -// XBEN round-trip with various compression levels +// ────────────────────────────────────────────────────────────────────────────── XBEN round-trip +// with various compression levels // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1291,7 +1284,7 @@ fn xben_round_trip_with_level_0_compression() { Some(1), Some(0), // compression level 0 None, - None, + None, ) .unwrap(); @@ -1324,7 +1317,7 @@ fn xben_mkvchain_round_trip_preserves_all_samples() { Some(1), Some(1), None, - None, + None, ) .unwrap(); @@ -1340,8 +1333,8 @@ fn xben_mkvchain_round_trip_preserves_all_samples() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// Relabel – file_as_variant with MkvChain source +// ────────────────────────────────────────────────────────────────────────────── Relabel – +// file_as_variant with MkvChain source // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1376,8 +1369,8 @@ fn relabel_ben_file_as_variant_mkvchain_to_standard() { assert_eq!(s.lines().count(), 3); } -// ────────────────────────────────────────────────────────────────────────────── -// Relabel – with_map_as_variant permutation correctness +// ────────────────────────────────────────────────────────────────────────────── Relabel – +// with_map_as_variant permutation correctness // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1403,15 +1396,16 @@ fn relabel_ben_file_with_map_as_variant_permutes_correctly() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// BenStreamReader – iterator interface -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── BenStreamReader – +// iterator interface ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_decoder_iterator_collects_all_frames() { let assignments = vec![vec![1u16, 2, 3], vec![4u16, 5, 6], vec![7u16, 8, 9]]; let ben = encode_standard_ben(&assignments); - let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert_eq!(frames.len(), 3); for (i, (a, count)) in frames.iter().enumerate() { @@ -1423,13 +1417,15 @@ fn ben_decoder_iterator_collects_all_frames() { #[test] fn ben_decoder_iterator_on_empty_payload_yields_nothing() { let ben = STANDARD_BEN_BANNER.to_vec(); // banner only, no frames - let decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let decoder = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let frames: Vec<_> = decoder.collect::>>().unwrap(); assert!(frames.is_empty()); } -// ────────────────────────────────────────────────────────────────────────────── -// Relabeling – idempotence of canonicalization +// ────────────────────────────────────────────────────────────────────────────── Relabeling – +// idempotence of canonicalization // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1465,8 +1461,8 @@ fn relabel_ben_file_standard_is_idempotent() { assert_eq!(decoded1, decoded2, "relabeling is not idempotent"); } -// ────────────────────────────────────────────────────────────────────────────── -// Edge case: assignment with a single unique label +// ────────────────────────────────────────────────────────────────────────────── Edge case: +// assignment with a single unique label // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1486,12 +1482,7 @@ fn single_unique_label_relabeled_to_one() { let ben = encode_standard_ben(&[assignment]); let mut relabeled = Vec::new(); - relabel_ben_file( - ben.as_slice(), - &mut relabeled, - RelabelOptions::first_seen(), - ) - .unwrap(); + relabel_ben_file(ben.as_slice(), &mut relabeled, RelabelOptions::first_seen()).unwrap(); let decoded_str = decode_ben_to_string(&relabeled); // All 99s should become 1s. @@ -1501,8 +1492,8 @@ fn single_unique_label_relabeled_to_one() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// Edge case: frame with maximum run-length value +// ────────────────────────────────────────────────────────────────────────────── Edge case: frame +// with maximum run-length value // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1522,9 +1513,8 @@ fn encode_decode_max_run_length_standard() { ); } -// ────────────────────────────────────────────────────────────────────────────── -// BenVariant debug / clone / copy -// ────────────────────────────────────────────────────────────────────────────── +// ────────────────────────────────────────────────────────────────────────────── BenVariant debug / +// clone / copy ────────────────────────────────────────────────────────────────────────────── #[test] fn ben_variant_clone_and_copy() { @@ -1541,8 +1531,8 @@ fn ben_variant_debug() { assert_eq!(s, "TwoDelta"); } -// ────────────────────────────────────────────────────────────────────────────── -// Cursor::new round trips for Cursor-based readers +// ────────────────────────────────────────────────────────────────────────────── Cursor::new round +// trips for Cursor-based readers // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1664,7 +1654,10 @@ fn twodelta_frame_from_parts_round_trip() { ); assert_eq!(original.as_slice(), reconstructed.as_slice()); assert_eq!(original.pair().unwrap(), reconstructed.pair().unwrap()); - assert_eq!(original.max_len_bit_count(), reconstructed.max_len_bit_count()); + assert_eq!( + original.max_len_bit_count(), + reconstructed.max_len_bit_count() + ); assert_eq!(original.n_bytes(), reconstructed.n_bytes()); assert_eq!(original.count(), reconstructed.count()); } @@ -1680,8 +1673,8 @@ fn twodelta_frame_asref_and_deref() { assert_eq!(as_ref, frame.as_slice()); } -// ────────────────────────────────────────────────────────────────────────────── -// EncodeBenFrame (BenFrame from codec::encode) accessors +// ────────────────────────────────────────────────────────────────────────────── EncodeBenFrame +// (BenFrame from codec::encode) accessors // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1709,7 +1702,8 @@ fn encode_ben_frame_max_len_bits() { #[test] fn encode_ben_frame_n_bytes_consistent() { - // Frame layout: 1 byte (max_val_bits) + 1 byte (max_len_bits) + 4 bytes (n_bytes header) + n_bytes payload + // Frame layout: 1 byte (max_val_bits) + 1 byte (max_len_bits) + 4 bytes (n_bytes header) + + // n_bytes payload let runs = vec![(1u16, 5u16), (2u16, 3u16)]; let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); assert_eq!(frame.n_bytes() as usize + 6, frame.as_slice().len()); @@ -1759,8 +1753,8 @@ fn encode_ben_frame_from_assignment() { assert_eq!(runs, &[(1u16, 2u16), (2u16, 2u16), (3u16, 1u16)]); } -// ────────────────────────────────────────────────────────────────────────────── -// Graph ordering with >8 nodes (triggers multilevel clustering recursion) +// ────────────────────────────────────────────────────────────────────────────── Graph ordering +// with >8 nodes (triggers multilevel clustering recursion) // ────────────────────────────────────────────────────────────────────────────── #[test] @@ -1898,8 +1892,8 @@ fn graph_invalid_link_id_errors() { #[test] fn sort_by_ordering_large_graph_multilevel_verifies_permutation() { - // 30-node ring — large enough that greedy_cluster_partition produces multiple clusters - // and the coarse graph recursion fires + // 30-node ring — large enough that greedy_cluster_partition produces multiple clusters and the + // coarse graph recursion fires let graph_json = make_ring_graph_json(30); let mut output = Vec::new(); let mapping = sort_json_file_by_ordering( @@ -1916,8 +1910,8 @@ fn sort_by_ordering_large_graph_multilevel_verifies_permutation() { assert_eq!(new_ids, (0..30).collect::>()); } -// ────────────────────────────────────────────────────────────────────────────── -// BenStreamReader / BenStreamFrameReader +// ────────────────────────────────────────────────────────────────────────────── BenStreamReader / +// BenStreamFrameReader // ────────────────────────────────────────────────────────────────────────────── #[test] diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index e7430b6..235d1db 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -67,9 +67,8 @@ where // ---------- proptest strategies ---------- -/// Strategy for a single assignment vector: -/// Generate as RLE runs (value in [1, max_val], length in [1, max_run]), -/// expand to a bounded length. +/// Strategy for a single assignment vector: Generate as RLE runs (value in [1, max_val], length in +/// [1, max_run]), expand to a bounded length. fn strat_assignment(max_val: u16, max_run: u16, max_len: usize) -> impl Strategy> { // up to ~50 runs per vector to keep things small/fast let runs = 1..=50usize; @@ -211,8 +210,7 @@ proptest! { prop_assert_eq!(out, jsonl); } - // JSONL -> XBEN(Standard) -> BEN -> JSONL - // Also vary threads & compression level. + // JSONL -> XBEN(Standard) -> BEN -> JSONL Also vary threads & compression level. #[test] fn fuzz_roundtrip_xben_standard(seq in strat_assignment_seq(), params in strat_threads_levels()) { let (threads, level) = params; @@ -590,12 +588,19 @@ fn invalid_ben_header_yields_error() { #[test] fn xben_decoder_rejects_bad_banner() { - // Valid XZ container but wrong banner should raise InvalidData - // Build a minimal XBEN stream with a wrong banner inside. + // Valid XZ container but wrong banner should raise InvalidData Build a minimal XBEN stream with + // a wrong banner inside. let mut inner = Vec::new(); inner.extend_from_slice(b"BAD BAD BAD BAD!!"); // 17 bytes let mut xz = Vec::new(); - xz_compress(BufReader::new(inner.as_slice()), &mut xz, Some(1), Some(0), None).unwrap(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); let err = BenStreamReader::from_xben(xz.as_slice()) .err() @@ -619,7 +624,7 @@ fn subsample_every_respects_offset() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -679,7 +684,7 @@ fn xbenencoder_drop_flushes_tail_group() { Some(1), Some(0), None, - None, + None, ) .unwrap(); out @@ -702,7 +707,7 @@ fn ben_new_invalid_header_detects_xz() { &mut xz, Some(1), Some(0), - None, + None, ) .unwrap(); @@ -730,7 +735,7 @@ fn xben_new_invalid_banner() { &mut wrong, Some(1), Some(0), - None, + None, ) .unwrap(); let err = BenStreamReader::from_xben(wrong.as_slice()) @@ -756,7 +761,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -844,7 +849,7 @@ fn subsample_by_indices_sorts_and_dedups() { Some(1), Some(0), None, - None, + None, ) .unwrap(); let xb = BenStreamReader::from_xben(xz.as_slice()).unwrap(); @@ -890,7 +895,7 @@ fn ben_encode_xben_respects_existing_ben_header() { Some(1), Some(0), None, - None, + None, ) .expect("ben->xben failed"); @@ -915,7 +920,7 @@ fn xz_mt_params_are_capped_and_safe() { Some(10_000), Some(42), None, - None, + None, ) .unwrap(); let mut ben = Vec::new(); @@ -945,7 +950,9 @@ fn ben_encoder_write_assignment_path_roundtrips() { #[test] fn ben_decoder_new_reports_short_header_as_io_error() { - let err = BenStreamReader::from_ben([1u8, 2, 3].as_slice()).err().unwrap(); + let err = BenStreamReader::from_ben([1u8, 2, 3].as_slice()) + .err() + .unwrap(); match err { DecoderInitError::Io(e) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), other => panic!("unexpected error: {other:?}"), @@ -987,7 +994,7 @@ fn xben_frame_decoder_new_and_truncated_iteration_paths() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -1163,7 +1170,7 @@ fn decoder_init_error_display_source_and_conversion_paths() { &mut buf, Some(1), Some(0), - None, + None, ) .unwrap(); buf @@ -1212,7 +1219,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { Some(1), Some(0), None, - None, + None, ) .unwrap(); assert_eq!( @@ -1235,7 +1242,7 @@ fn ben_decoder_and_xben_decoder_count_samples() { Some(1), Some(0), None, - None, + None, ) .unwrap(); assert_eq!( @@ -1272,7 +1279,7 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { Some(1), Some(0), None, - None, + None, ) .unwrap(); let xben_path = unique_temp_path("sample.xben"); @@ -1284,8 +1291,14 @@ fn build_frame_iter_and_count_samples_from_file_cover_public_file_api() { let xben_iter = build_frame_iter(&xben_path, BenWireFormat::XBen).unwrap(); assert_eq!(collect_frames(xben_iter).unwrap().len(), 2); - assert_eq!(count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), 3); - assert_eq!(count_samples_from_file(&xben_path, BenWireFormat::XBen).unwrap(), 3); + assert_eq!( + count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), + 3 + ); + assert_eq!( + count_samples_from_file(&xben_path, BenWireFormat::XBen).unwrap(), + 3 + ); fs::remove_file(ben_path).unwrap(); fs::remove_file(xben_path).unwrap(); @@ -1368,7 +1381,9 @@ fn twodelta_roundtrips_and_counts_repeated_frames() { decode_ben_to_jsonl(ben.as_slice(), &mut jsonl).unwrap(); assert_eq!(jsonl, jsonl_from_assignments(&assignments)); - let frames = BenStreamReader::from_ben(ben.as_slice()).unwrap().into_frames(); + let frames = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .into_frames(); assert_eq!(collect_frames(frames).unwrap().len(), 3); } @@ -1464,6 +1479,9 @@ fn twodelta_supports_frame_iteration_counting_and_sample_extraction() { let ben_path = unique_temp_path("twodelta_sample.ben"); fs::write(&ben_path, &ben).unwrap(); - assert_eq!(count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), 4); + assert_eq!( + count_samples_from_file(&ben_path, BenWireFormat::Ben).unwrap(), + 4 + ); fs::remove_file(ben_path).unwrap(); } diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 933bfdd..86b522c 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -266,7 +266,7 @@ fn test_xben_pipeline() { Some(1), Some(1), None, - None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -337,7 +337,7 @@ fn test_xmkvben_pipeline() { Some(1), Some(1), None, - None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); @@ -433,7 +433,7 @@ fn test_xtwodeltaben_pipeline() { Some(1), Some(1), None, - None, + None, ) .unwrap(); decode_xben_to_ben(&input_writer[..], &mut output_writer).unwrap(); diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 559a3d9..f1bbaeb 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -4,7 +4,6 @@ use binary_ensemble::codec::decode::{ }; use binary_ensemble::codec::encode::{encode_jsonl_to_xben, xz_compress}; use binary_ensemble::codec::BenEncodeFrame; -use binary_ensemble::BenVariant; use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; @@ -20,6 +19,7 @@ use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; +use binary_ensemble::BenVariant; use std::cell::RefCell; use std::collections::HashMap; use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; @@ -234,7 +234,9 @@ fn mkvchain_writer_splits_repetition_count_longer_than_u16_max() { writer.finish().unwrap(); } - let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut reader = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); let first = reader.next().unwrap().unwrap(); let second = reader.next().unwrap().unwrap(); assert!(reader.next().is_none()); @@ -286,7 +288,7 @@ fn xben_mkvchain_splits_repetition_count_longer_than_u16_max() { Some(1), Some(0), None, - None, + None, ) .unwrap(); @@ -317,13 +319,14 @@ fn malformed_ben_bit_widths_return_invalid_data() { #[test] fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { - let anchor = - BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)); + let anchor = BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)); let mut ben = TWODELTA_BEN_BANNER.to_vec(); ben.extend_from_slice(anchor.as_slice()); ben.extend_from_slice(&[0, 1, 0, 2, 0, 0, 0, 0, 0, 1]); - let mut reader = BenStreamReader::from_ben(ben.as_slice()).unwrap().silent(true); + let mut reader = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true); assert_eq!(reader.next().unwrap().unwrap(), (vec![1, 2], 1)); let err = reader.next().unwrap().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); @@ -344,7 +347,7 @@ fn direct_xben_helpers_propagate_corrupt_xz_errors() { Some(1), Some(0), None, - None, + None, ) .unwrap(); xben.truncate(xben.len() - 1); @@ -464,7 +467,7 @@ fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { &mut xben, Some(1), Some(0), - None, + None, ) .unwrap(); @@ -495,7 +498,7 @@ fn zero_count_frames_are_rejected() { &mut xben, Some(1), Some(0), - None, + None, ) .unwrap(); let err = BenStreamReader::from_xben(xben.as_slice()) @@ -510,7 +513,8 @@ fn zero_count_frames_are_rejected() { fn seeded_malformed_ben_bytes_do_not_panic() { let mut valid_standard = Vec::new(); { - let mut writer = BenStreamWriter::for_ben(&mut valid_standard, BenVariant::Standard).unwrap(); + let mut writer = + BenStreamWriter::for_ben(&mut valid_standard, BenVariant::Standard).unwrap(); writer.write_assignment(vec![1, 1, 2, 3]).unwrap(); writer.write_assignment(vec![3, 3, 2, 1]).unwrap(); writer.finish().unwrap(); @@ -527,7 +531,8 @@ fn seeded_malformed_ben_bytes_do_not_panic() { let mut valid_twodelta = Vec::new(); { - let mut writer = BenStreamWriter::for_ben(&mut valid_twodelta, BenVariant::TwoDelta).unwrap(); + let mut writer = + BenStreamWriter::for_ben(&mut valid_twodelta, BenVariant::TwoDelta).unwrap(); writer.write_assignment(vec![1, 1, 2, 2]).unwrap(); writer.write_assignment(vec![1, 2, 1, 2]).unwrap(); writer.write_assignment(vec![2, 2, 1, 1]).unwrap(); @@ -572,7 +577,7 @@ fn seeded_malformed_xben_bytes_do_not_panic() { Some(1), Some(0), Some(32), - None, + None, ) .unwrap(); seeds.push(xben); @@ -598,7 +603,7 @@ fn seeded_malformed_xben_bytes_do_not_panic() { &mut unknown_tag_xben, Some(1), Some(0), - None, + None, ) .unwrap(); assert_xben_bytes_do_not_panic(unknown_tag_xben); diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..8f67881 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,3 @@ +wrap_comments = true +comment_width = 100 +max_width = 100 From b76de0408a06f082832f9d9910bc4e8fde20abf8 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 09:01:01 -0600 Subject: [PATCH 100/221] checksum for the assignment streams --- ben/src/io/bundle/format.rs | 51 +++- ben/src/io/bundle/mod.rs | 2 +- ben/src/io/bundle/reader.rs | 410 +++++++++++++++++++++++++++--- ben/src/io/bundle/tests/format.rs | 5 +- ben/src/io/bundle/tests/reader.rs | 242 ++++++++++++++++-- ben/src/io/bundle/tests/writer.rs | 287 ++++++++++++++++++++- ben/src/io/bundle/writer.rs | 20 +- ben/tests/test_stress_edges.rs | 3 +- 8 files changed, 938 insertions(+), 82 deletions(-) diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 00c817c..2f2fac1 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -31,6 +31,15 @@ pub const FINALIZED_NO: u8 = 0; /// `finalized` flag value for finalized bundles. pub const FINALIZED_YES: u8 = 1; +/// Header flag bit 0: the `stream_checksum` field contains a valid CRC32C over the on-disk +/// assignment stream bytes (`stream_offset..stream_offset + stream_len`). For XBEN streams the CRC +/// covers the compressed bytes, not the decompressed content. Bits 1..31 are reserved; writers set +/// them to zero. +/// +/// Library writers always set this flag and write a valid checksum. The clear-flag state exists +/// only for adversarial reader fixtures and partial-recovery flows. +pub const HEADER_FLAG_STREAM_CHECKSUM: u32 = 1 << 0; + // --------------------------------------------------------------------------- // Assignment format identifiers // --------------------------------------------------------------------------- @@ -180,12 +189,19 @@ pub struct BendlHeader { pub finalized: u8, /// Container format of the embedded assignment stream. pub assignment_format: u8, - /// Padding after `assignment_format`; writers set to zero, readers ignore. - pub reserved_0: u16, - /// Bundle-level feature flags. - pub flags: u64, - /// Absolute byte offset of the directory table, or `0` if no directory has been written yet. In - /// a finalized bundle the directory lives at the end of the file. + /// Alignment padding after `assignment_format` that keeps the following 8-byte fields at + /// offset ≥ 24 8-byte aligned. Writers set this to zero; readers ignore non-zero bytes. + /// This is not a forward-compat slot — new fields must live elsewhere. + pub alignment_padding: u16, + /// Bundle-level feature flags (32-bit). See `HEADER_FLAG_*` constants. Bits without a defined + /// constant are reserved; readers must ignore them and writers must set them to zero. + pub flags: u32, + /// CRC32C of the on-disk assignment stream bytes. Valid only when + /// `HEADER_FLAG_STREAM_CHECKSUM` is set in `flags`. Writers set this to zero while the + /// bundle is unfinalized and patch it on finalization. + pub stream_checksum: u32, + /// Absolute byte offset of the directory table, or `0` if no directory has been written yet. + /// In a finalized bundle the directory lives at the end of the file. pub directory_offset: u64, /// Byte length of the directory table, or `0` if absent. pub directory_len: u64, @@ -206,8 +222,9 @@ impl BendlHeader { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: assignment_format.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset, @@ -216,6 +233,12 @@ impl BendlHeader { } } + /// Whether the `HEADER_FLAG_STREAM_CHECKSUM` bit is set, indicating the `stream_checksum` field + /// contains a valid CRC32C over the assignment stream bytes. + pub fn has_stream_checksum(&self) -> bool { + self.flags & HEADER_FLAG_STREAM_CHECKSUM != 0 + } + /// Whether the bundle has been finalized. pub fn is_finalized(&self) -> bool { self.finalized == FINALIZED_YES @@ -234,8 +257,9 @@ impl BendlHeader { out[10..12].copy_from_slice(&self.minor_version.to_le_bytes()); out[12] = self.finalized; out[13] = self.assignment_format; - out[14..16].copy_from_slice(&self.reserved_0.to_le_bytes()); - out[16..24].copy_from_slice(&self.flags.to_le_bytes()); + out[14..16].copy_from_slice(&self.alignment_padding.to_le_bytes()); + out[16..20].copy_from_slice(&self.flags.to_le_bytes()); + out[20..24].copy_from_slice(&self.stream_checksum.to_le_bytes()); out[24..32].copy_from_slice(&self.directory_offset.to_le_bytes()); out[32..40].copy_from_slice(&self.directory_len.to_le_bytes()); out[40..48].copy_from_slice(&self.stream_offset.to_le_bytes()); @@ -267,8 +291,9 @@ impl BendlHeader { minor_version, finalized: bytes[12], assignment_format: bytes[13], - reserved_0: u16::from_le_bytes(bytes[14..16].try_into().unwrap()), - flags: u64::from_le_bytes(bytes[16..24].try_into().unwrap()), + alignment_padding: u16::from_le_bytes(bytes[14..16].try_into().unwrap()), + flags: u32::from_le_bytes(bytes[16..20].try_into().unwrap()), + stream_checksum: u32::from_le_bytes(bytes[20..24].try_into().unwrap()), directory_offset: u64::from_le_bytes(bytes[24..32].try_into().unwrap()), directory_len: u64::from_le_bytes(bytes[32..40].try_into().unwrap()), stream_offset: u64::from_le_bytes(bytes[40..48].try_into().unwrap()), @@ -495,6 +520,10 @@ pub enum BendlFormatError { #[error("malformed directory: {0}")] MalformedDirectory(String), + /// The header's `assignment_format` byte did not map to any known assignment format. + #[error("unknown assignment_format byte in bundle header: {0}")] + UnknownAssignmentFormat(u8), + /// A directory entry's `ASSET_FLAG_CHECKSUM` bit and `checksum_len` disagree. The wire format /// requires `flag set iff checksum_len == 4` and `flag clear iff checksum_len == 0`. #[error( diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index bb4e642..95a5d53 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -22,5 +22,5 @@ pub mod writer; mod tests; pub use error::{BendlReadError, ChecksumError, ChecksumTarget}; -pub use reader::{BendlReader, BundleAssignmentReaderError, BundleValidationError}; +pub use reader::{BendlReader, BendlVerifiedStreamReader, BundleValidationError}; pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 3d377cc..9506376 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -17,8 +17,13 @@ //! - [`BendlReader::verify_asset_checksum`] and [`BendlReader::verify_all_asset_checksums`] are //! explicit raw-bytes verifiers (no decoding) that do not return decoded payload bytes. -use std::io::{self, Read, Seek, SeekFrom, Take}; +use std::io::{self, Read, Seek, SeekFrom, Take, Write}; +use std::sync::{ + atomic::{AtomicU32, Ordering}, + Arc, +}; +use serde_json::json; use xz2::read::XzDecoder; use super::error::{BendlReadError, ChecksumError, ChecksumTarget}; @@ -26,7 +31,8 @@ use super::format::{ read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_XZ, }; -use crate::io::reader::{BenStreamReader, BenWireFormat}; +use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat, SubsampleFrameDecoder}; +use crate::BenVariant; impl From for BenWireFormat { fn from(format: AssignmentFormat) -> Self { @@ -136,36 +142,155 @@ impl BendlReader { } } - /// Return a `Take` reader positioned at the start of the assignment stream and limited to its - /// declared length. The caller is expected to wrap the returned reader in a [`BenStreamReader`] - /// (via [`BendlReader::open_assignment_reader`] or directly) as appropriate for - /// [`BendlReader::assignment_format`]. - pub fn assignment_stream_reader(&mut self) -> io::Result> { + /// Return a verified reader for the assignment stream that checks the stored CRC32C at raw EOF. + /// + /// Returns `Err(ChecksumError::BundleIncomplete)` for unfinalized bundles (the stored + /// `stream_checksum` is not authoritative until the bundle is finalized). + /// Returns `Err(ChecksumError::Unavailable)` when `HEADER_FLAG_STREAM_CHECKSUM` is clear + /// (foreign or hand-built bytes; the library writer always sets this flag). + /// + /// On success, CRC mismatch surfaces from `Read::read` as + /// `io::Error::new(io::ErrorKind::InvalidData, ChecksumError::Mismatch)` on the call that + /// would otherwise return `Ok(0)` at raw EOF. For a raw copy that decodes nothing, driving the + /// returned reader to EOF is sufficient. For decoded access use + /// [`BendlReader::open_assignment_reader`]. + pub fn assignment_stream_reader( + &mut self, + ) -> Result, BendlReadError> { + if !self.header.is_finalized() { + return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream, + })); + } + if !self.header.has_stream_checksum() { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream, + })); + } + let expected = self.header.stream_checksum; + let (offset, len) = self.assignment_stream_range()?; + self.inner.seek(SeekFrom::Start(offset))?; + let raw = (&mut self.inner).take(len); + Ok(Box::new(RawVerifyingReader { + inner: raw, + hasher: 0, + expected, + target: ChecksumTarget::Stream, + state: VerifyState::Reading, + })) + } + + /// Return a raw bounded reader for the assignment stream **without** CRC verification. + /// + /// Works on both finalized and unfinalized bundles. Useful for recovery/debug flows and for + /// callers that need the raw bytes without the overhead of a CRC check. + pub fn assignment_stream_reader_unverified(&mut self) -> io::Result> { let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; Ok((&mut self.inner).take(len)) } - /// Construct the appropriate assignment decoder for the bundle's declared `assignment_format` - /// and return it as a [`BenStreamReader`] over the bundle's bounded stream region. + /// Construct a verified decoded assignment reader that checks the stream CRC32C after the + /// codec reaches EOF. The returned [`BendlVerifiedStreamReader`] forwards the full + /// [`BenStreamReader`] API surface and folds the CRC check into consuming methods. /// - /// Returns an error if the header's `assignment_format` field is unrecognized or the embedded - /// banner is malformed. + /// Returns `Err(BundleIncomplete)` for unfinalized bundles and `Err(Unavailable)` when the + /// stream checksum flag is clear. pub fn open_assignment_reader( &mut self, - ) -> Result>, BundleAssignmentReaderError> { - let format = self.assignment_format().ok_or( - BundleAssignmentReaderError::UnknownAssignmentFormat(self.header.assignment_format), - )?; - let stream = self.assignment_stream_reader()?; - match format { - AssignmentFormat::Ben => { - BenStreamReader::from_ben(stream).map_err(BundleAssignmentReaderError::Decoder) - } - AssignmentFormat::Xben => { - BenStreamReader::from_xben(stream).map_err(BundleAssignmentReaderError::Decoder) + ) -> Result, BendlReadError> { + // Finalization check must come first: if the bundle is unfinalized, stream_checksum is not + // authoritative and reporting Unavailable would be misleading. + if !self.header.is_finalized() { + return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream, + })); + } + if !self.header.has_stream_checksum() { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream, + })); + } + let expected = self.header.stream_checksum; + + let format = self.assignment_format().ok_or_else(|| { + BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat( + self.header.assignment_format, + )) + })?; + let (offset, len) = self.assignment_stream_range()?; + self.inner.seek(SeekFrom::Start(offset))?; + let raw = (&mut self.inner).take(len); + + let arc_hasher = Arc::new(AtomicU32::new(0)); + let shared_raw = ArcHasher { + inner: raw, + state: Arc::clone(&arc_hasher), + }; + + let inner = match format { + AssignmentFormat::Ben => BenStreamReader::from_ben(shared_raw)?, + AssignmentFormat::Xben => BenStreamReader::from_xben(shared_raw)?, + }; + + Ok(BendlVerifiedStreamReader { + inner, + expected, + arc_hasher, + state: StreamVerifyState::Running, + }) + } + + /// Verify the stored stream CRC32C by scanning the raw on-disk bytes of the assignment stream. + /// + /// This is the explicit full-scan verifier for callers that want to check integrity without + /// decoding the stream. For random-access extraction (which intentionally skips untouched + /// frames), call this separately to confirm the whole stream is intact. + /// + /// Returns `Err(BundleIncomplete)` for unfinalized bundles and `Err(Unavailable)` when the + /// stream checksum flag is clear. + pub fn verify_stream_checksum(&mut self) -> Result<(), BendlReadError> { + if !self.header.is_finalized() { + return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream, + })); + } + if !self.header.has_stream_checksum() { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream, + })); + } + let expected = self.header.stream_checksum; + let (offset, len) = self.assignment_stream_range()?; + self.inner.seek(SeekFrom::Start(offset))?; + + let mut remaining = len; + let mut buf = [0u8; 64 * 1024]; + let mut hasher: u32 = 0; + while remaining > 0 { + let want = remaining.min(buf.len() as u64) as usize; + let n = self.inner.read(&mut buf[..want])?; + if n == 0 { + return Err(BendlReadError::Io(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "stream ended {} byte(s) before declared length", + remaining + ), + ))); } + hasher = crc32c::crc32c_append(hasher, &buf[..n]); + remaining -= n as u64; + } + + if hasher != expected { + return Err(BendlReadError::Checksum(ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + computed: hasher, + expected, + })); } + Ok(()) } /// Read the fully-decoded bytes of an asset by directory entry, verifying its CRC32C before @@ -480,6 +605,233 @@ impl Read for DecodedVerifyingReader<'_, R> { } } +/// CRC accumulator that shares its running hash via an `Arc`. Used as the source reader +/// for [`BendlVerifiedStreamReader`]: the `Arc` lets the outer wrapper read the final hash after a +/// consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. +/// +/// Unlike `CrcTeeReader`, this type never substitutes a checksum error for raw EOF — it is always +/// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. The type is +/// exposed because it leaks through the return signatures of the wrapper's intentionally-partial +/// APIs (`into_frames`, `into_subsample_by_*`); callers should treat it as an opaque reader. +pub struct ArcHasher { + inner: R, + state: Arc, +} + +impl Read for ArcHasher { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let n = self.inner.read(buf)?; + if n > 0 { + let prev = self.state.load(Ordering::Relaxed); + self.state + .store(crc32c::crc32c_append(prev, &buf[..n]), Ordering::Relaxed); + } + Ok(n) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum StreamVerifyState { + Running, + /// CRC mismatch was returned once as `Some(Err(...))`. Subsequent iterator calls return `None`. + MismatchReported, + /// CRC matched after natural EOF. Subsequent iterator calls return `None`. + Verified, +} + +/// Verified decoded assignment reader returned by [`BendlReader::open_assignment_reader`]. +/// +/// Wraps a [`BenStreamReader`] over a CRC-accumulating source and checks the stored stream CRC32C +/// after the codec reaches natural EOF. CRC mismatch surfaces from [`Iterator::next`] as +/// `Some(Err(io::ErrorKind::InvalidData))` — returned once after the last decoded record, then +/// `None`. Consuming methods (`count_samples`, `write_all_jsonl`, `for_each_assignment` when driven +/// to natural EOF) also fold the CRC check into their return value. +/// +/// **Intentionally partial APIs** (`into_frames`, `into_subsample_by_*`) are forwarded for +/// ergonomics but do not automatically verify — the underlying reader is stopped short of raw EOF +/// so the CRC tee is never finalized. Callers that need integrity for partial reads must call +/// [`BendlReader::verify_stream_checksum`] separately. +pub struct BendlVerifiedStreamReader<'a, R: Read + Seek> { + inner: BenStreamReader>>, + expected: u32, + arc_hasher: Arc, + state: StreamVerifyState, +} + +impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.inner.variant() + } + + /// Return the wire format (BEN vs XBEN) of this stream. + pub fn wire_format(&self) -> BenWireFormat { + self.inner.wire_format() + } + + /// Suppress progress output from the decoder. + pub fn silent(mut self, silent: bool) -> Self { + self.inner = self.inner.silent(silent); + self + } + + /// Count the number of samples in the stream and verify the stream CRC32C. + /// + /// Drives the decoder to raw EOF as a side effect, finalizing the CRC accumulator. If the + /// count succeeds but the CRC does not match, the CRC mismatch is returned instead of the + /// count. + pub fn count_samples(self) -> io::Result { + let arc = Arc::clone(&self.arc_hasher); + let expected = self.expected; + let count = self.inner.count_samples()?; + let computed = arc.load(Ordering::Relaxed); + if computed != expected { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + computed, + expected, + }, + )); + } + Ok(count) + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// When the callback drives the reader to natural EOF, the stream CRC is verified and a + /// mismatch is returned as an error. When the callback stops early (`f` returns `Ok(false)`), + /// the CRC is not checked — only a full traversal can verify the whole stream. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + loop { + match self.next() { + Some(Ok((ref assignment, count))) => { + if !f(assignment, count)? { + return Ok(()); + } + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + } + } + } + + /// Decode the remaining stream, write it as JSONL, and verify the stream CRC32C. + /// + /// Each decoded sample is written as a JSON object containing an `assignment` vector and a + /// 1-based `sample` index. After all records are written, the stream CRC is checked; a + /// mismatch is returned instead of `Ok(())`. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + loop { + match self.next() { + Some(Ok((assignment, count))) => { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + } + } + } + + /// Consume the wrapper and iterate over raw BEN/ben32 frames instead of materialized + /// assignments. + /// + /// Frame iteration is intentionally partial: callers typically stop short of EOF, so the CRC + /// tee is never finalized and the stream is **not verified** by this path. Callers needing + /// integrity for partial reads should call [`BendlReader::verify_stream_checksum`] separately. + pub fn into_frames(self) -> BenStreamFrameReader>> { + self.inner.into_frames() + } +} + +impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { + /// Convert into a subsampling iterator over explicit 1-based indices. + /// + /// Subsampling is intentionally partial: the underlying reader is stopped short of raw EOF, so + /// the CRC tee is never finalized and the stream is **not verified** by this path. Use + /// [`BendlReader::verify_stream_checksum`] for an explicit full-stream integrity check. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder>>> + where + T: IntoIterator, + { + self.inner.into_subsample_by_indices(indices) + } + + /// Convert into a subsampling iterator over the inclusive 1-based range `[start, end]`. + /// + /// Subsampling is intentionally partial and is **not verified** by this path; see + /// [`Self::into_subsample_by_indices`]. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder>>> { + self.inner.into_subsample_by_range(start, end) + } + + /// Convert into a subsampling iterator that selects every `step` samples from the 1-based + /// `offset`. + /// + /// Subsampling is intentionally partial and is **not verified** by this path; see + /// [`Self::into_subsample_by_indices`]. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder>>> { + self.inner.into_subsample_every(step, offset) + } +} + +impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { + type Item = io::Result<(Vec, u16)>; + + fn next(&mut self) -> Option { + match self.state { + StreamVerifyState::MismatchReported | StreamVerifyState::Verified => return None, + StreamVerifyState::Running => {} + } + match self.inner.next() { + Some(item) => Some(item), + None => { + // Inner reached natural EOF — finalize the CRC check. + let computed = self.arc_hasher.load(Ordering::Relaxed); + if computed == self.expected { + self.state = StreamVerifyState::Verified; + None + } else { + self.state = StreamVerifyState::MismatchReported; + Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + computed, + expected: self.expected, + }, + ))) + } + } + } + } +} + /// Map a `read_to_end`-time `io::Error` (or any `Read`-derived `io::Error`) into the right /// [`BendlReadError`] variant. /// @@ -536,20 +888,6 @@ pub(crate) fn validate_directory_entries( Ok(()) } -/// Errors raised by [`BendlReader::open_assignment_reader`]. -#[derive(Debug, thiserror::Error)] -pub enum BundleAssignmentReaderError { - /// The header's `assignment_format` byte did not map to a known format. - #[error("unknown assignment_format in bundle header: {0}")] - UnknownAssignmentFormat(u8), - /// The embedded BEN/XBEN decoder rejected the stream banner. - #[error(transparent)] - Decoder(#[from] crate::io::reader::DecoderInitError), - /// An underlying I/O error occurred while seeking to the stream. - #[error(transparent)] - Io(#[from] io::Error), -} - /// Errors raised when a directory violates the canonical-name or uniqueness rules. #[derive(Debug, thiserror::Error)] pub enum BundleValidationError { diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index 731a017..cf0ff7d 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -70,8 +70,9 @@ fn header_round_trip_finalized() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: ASSIGNMENT_FORMAT_BEN, - reserved_0: 0, - flags: 0x0000_0000_0000_000F, + alignment_padding: 0, + flags: 0x0000_000F, + stream_checksum: 1_234_567, directory_offset: 1_000_000, directory_len: 256, stream_offset: 64, diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 084c811..b726b7e 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -6,11 +6,9 @@ use crate::io::bundle::format::{ encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, - BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, -}; -use crate::io::bundle::reader::{ - validate_directory_entries, BendlReader, BundleAssignmentReaderError, BundleValidationError, + BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; +use crate::io::bundle::reader::{validate_directory_entries, BendlReader, BundleValidationError}; /// Stamp a valid CRC32C and `ASSET_FLAG_CHECKSUM` onto a hand-built directory entry whose on-disk /// payload bytes are `payload`. Use this in test fixtures so the entry round-trips through the @@ -85,15 +83,17 @@ fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { bundle.extend_from_slice(&directory_bytes); let directory_len = directory_bytes.len() as u64; - // Now patch the header. + // Now patch the header. Set HEADER_FLAG_STREAM_CHECKSUM so the finalized bundle supports the + // verified assignment_stream_reader() path in tests. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: crc32c::crc32c(&fake_stream), directory_offset, directory_len, stream_offset, @@ -168,8 +168,9 @@ fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset: HEADER_SIZE as u64, @@ -190,7 +191,7 @@ fn incomplete_bundle_reports_no_directory_and_stream_runs_to_eof() { let mut buf = Vec::new(); reader - .assignment_stream_reader() + .assignment_stream_reader_unverified() .unwrap() .read_to_end(&mut buf) .unwrap(); @@ -288,14 +289,17 @@ fn build_basic_finalized_bundle() -> Vec { bytes.extend_from_slice(&directory); let directory_len = directory.len() as u64; + // Set HEADER_FLAG_STREAM_CHECKSUM so open_assignment_reader() passes the checksum check. + // The stream is empty here so the CRC32C of zero bytes is 0x00000000. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, - flags: 0, + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, directory_offset, directory_len, stream_offset, @@ -431,8 +435,9 @@ fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset: HEADER_SIZE as u64, @@ -465,7 +470,7 @@ fn open_assignment_reader_rejects_unknown_assignment_format() { bytes[13] = 42; // corrupt assignment format byte let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); match reader.open_assignment_reader() { - Err(BundleAssignmentReaderError::UnknownAssignmentFormat(42)) => {} + Err(BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat(42))) => {} Err(other) => panic!("expected UnknownAssignmentFormat(42), got {other:?}"), Ok(_) => panic!("expected error, got Ok"), } @@ -480,8 +485,9 @@ fn incomplete_bundle_stream_range_runs_to_eof_without_directory() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset: HEADER_SIZE as u64, @@ -627,8 +633,9 @@ fn stress_thousand_custom_assets_round_trip() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len, stream_offset, @@ -682,8 +689,9 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset, @@ -715,8 +723,9 @@ fn reader_scales_to_very_wide_stream_offset_field() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory_bytes.len() as u64, stream_offset: HEADER_SIZE as u64, @@ -732,9 +741,9 @@ fn reader_scales_to_very_wide_stream_offset_field() { let mut buf = Vec::new(); // Take will try to read `stream_len` bytes but the Cursor will just return however many bytes // remain from stream_offset to EOF. The reader must not panic; it must simply return what it - // got. + // got. Use the unverified reader since this bundle has no stream checksum. reader - .assignment_stream_reader() + .assignment_stream_reader_unverified() .unwrap() .read_to_end(&mut buf) .unwrap(); @@ -759,8 +768,9 @@ fn incomplete_bundle_with_nonzero_directory_offset_uses_it_as_stream_end() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: dir_offset, directory_len: 0, stream_offset: stream_start, @@ -845,8 +855,9 @@ fn make_single_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, u64 minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset, @@ -893,8 +904,9 @@ fn make_single_xz_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset, @@ -1019,8 +1031,9 @@ fn verify_asset_checksum_returns_unavailable_when_flag_clear() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset, @@ -1143,8 +1156,9 @@ fn asset_bytes_returns_unavailable_when_flag_clear() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset: directory_offset, @@ -1240,8 +1254,9 @@ fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: directory.len() as u64, stream_offset, @@ -1286,3 +1301,180 @@ fn crc32c_polynomial_pin_against_known_vectors() { // Extra sentinels to broaden the trip-wire. assert_eq!(crc32c::crc32c(&[0x01, 0x02, 0x03, 0x04]), 0x2930_8CF4); } + +// ===================================================================== +// Stream CRC32C verification — API surface tests +// ===================================================================== +// +// These tests cover the error cases for unfinalized bundles and unflagged bundles (hand-built +// fixtures only). The round-trip correctness and corruption tests are in tests/writer.rs, which +// has the writer infrastructure needed to produce real BEN streams. + +/// Build the smallest possible finalized bundle with a known non-empty fake stream and no stream +/// checksum flag (simulates a foreign/pre-checksummed bundle). +fn make_unflagged_stream_bundle() -> Vec { + let fake_stream = b"hello stream".to_vec(); + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let stream_offset = bytes.len() as u64; + bytes.extend_from_slice(&fake_stream); + let directory_offset = bytes.len() as u64; + let directory = encode_directory(&[]).unwrap(); + bytes.extend_from_slice(&directory); + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: 0, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: fake_stream.len() as u64, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + bytes +} + +#[test] +fn assignment_stream_reader_returns_unavailable_when_flag_clear() { + let bytes = make_unflagged_stream_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = match reader.assignment_stream_reader() { + Err(e) => e, + Ok(_) => panic!("expected Err, got Ok"), + }; + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream + }) + ), + "expected Unavailable(Stream), got {err:?}" + ); + // The unverified path can still read the bytes. + let mut buf = Vec::new(); + reader + .assignment_stream_reader_unverified() + .unwrap() + .read_to_end(&mut buf) + .unwrap(); + assert_eq!(buf, b"hello stream"); +} + +#[test] +fn assignment_stream_reader_returns_bundle_incomplete_for_unfinalized() { + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: 0, + stream_checksum: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = vec![0u8; HEADER_SIZE]; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = match reader.assignment_stream_reader() { + Err(e) => e, + Ok(_) => panic!("expected Err, got Ok"), + }; + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream + }) + ), + "expected BundleIncomplete(Stream), got {err:?}" + ); +} + +#[test] +fn open_assignment_reader_returns_bundle_incomplete_for_unfinalized() { + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: 0, + stream_checksum: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = vec![0u8; HEADER_SIZE]; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + match reader.open_assignment_reader() { + Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream, + })) => {} + Ok(_) => panic!("expected Err, got Ok"), + Err(e) => panic!("expected BundleIncomplete(Stream), got Err({e:?})"), + } +} + +#[test] +fn verify_stream_checksum_returns_unavailable_when_flag_clear() { + let bytes = make_unflagged_stream_bundle(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = reader.verify_stream_checksum().unwrap_err(); + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream + }) + ), + "expected Unavailable(Stream), got {err:?}" + ); +} + +#[test] +fn verify_stream_checksum_returns_bundle_incomplete_for_unfinalized() { + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: 0, + stream_checksum: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut bytes = vec![0u8; HEADER_SIZE]; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = reader.verify_stream_checksum().unwrap_err(); + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream + }) + ), + "expected BundleIncomplete(Stream), got {err:?}" + ); +} diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index ea42711..72b8f1d 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -2,15 +2,18 @@ use std::io::{Cursor, Read, Seek, Write}; use xz2::write::XzEncoder; +use crate::io::bundle::error::{BendlReadError, ChecksumError, ChecksumTarget}; use crate::io::bundle::format::{ AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, - BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, FINALIZED_YES, HEADER_SIZE, + BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, FINALIZED_YES, + HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; use crate::io::bundle::reader::BendlReader; use crate::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter}; use crate::io::reader::BenWireFormat; use crate::io::writer::BenStreamWriter; +use crate::BenVariant; fn make_buffer() -> Cursor> { Cursor::new(Vec::new()) @@ -381,8 +384,9 @@ fn append_rejects_incomplete_bundle() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_NO, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset: HEADER_SIZE as u64, @@ -409,8 +413,9 @@ fn append_rejects_complete_bundle_with_zero_directory() { minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset: 0, directory_len: 0, stream_offset: HEADER_SIZE as u64, @@ -1337,3 +1342,279 @@ fn stream_session_partial_writes_account_returned_bytes() { let header = BendlHeader::read_from(&mut Cursor::new(&mut bundle_buf)).unwrap(); assert_eq!(header.stream_len, total_returned); } + +// ===================================================================== +// Stream CRC32C verification +// ===================================================================== +// +// Tests pin the writer→reader round-trip of stream_checksum and the behavioral contract of the +// verified stream reader APIs across both the raw-copy path (assignment_stream_reader) and the +// decoded path (open_assignment_reader / count_samples / write_all_jsonl). Each verified API +// surfaces ChecksumError::Mismatch when the stored stream_checksum is corrupted in-place. + +/// Build a finalized bundle containing a small plain-BEN stream with `count` samples. Returns +/// `(bundle_bytes, samples)`. +fn make_ben_stream_bundle(count: usize) -> (Vec, Vec>) { + let samples: Vec> = (0..count).map(|i| vec![i as u16, (i + 1) as u16]).collect(); + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + { + let mut ben = BenStreamWriter::for_ben(&mut session, BenVariant::Standard).unwrap(); + for s in &samples { + ben.write_assignment(s.clone()).unwrap(); + } + ben.finish().unwrap(); + } + let writer = session.finish_into_writer(count as i64); + let buf = writer.finish().unwrap().into_inner(); + (buf, samples) +} + +/// Corrupt the stored `stream_checksum` field in-place by flipping a byte at header offset 20. +fn corrupt_stream_checksum(bytes: &mut Vec) { + bytes[20] ^= 0xFF; +} + +/// Flip a byte in the stream payload to corrupt the stream contents without changing its length. +fn corrupt_stream_payload(bytes: &mut Vec, reader: &mut BendlReader>>) { + let (offset, len) = reader.assignment_stream_range().unwrap(); + assert!(len > 0, "stream must be non-empty to corrupt a payload byte"); + // Flip the last byte of the stream region. + bytes[(offset + len - 1) as usize] ^= 0x01; +} + +#[test] +fn writer_sets_header_flag_stream_checksum_on_finalization() { + let (buf, _) = make_ben_stream_bundle(3); + let header = BendlHeader::read_from(&mut Cursor::new(&buf)).unwrap(); + assert!( + header.flags & HEADER_FLAG_STREAM_CHECKSUM != 0, + "HEADER_FLAG_STREAM_CHECKSUM must be set after finalization" + ); + assert_ne!( + header.stream_checksum, 0, + "stream_checksum must be non-zero for a non-empty stream" + ); +} + +#[test] +fn writer_sets_stream_checksum_zero_for_empty_stream() { + // An empty stream has CRC32C(b"") = 0x00000000. + let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + let buf = writer.finish().unwrap().into_inner(); + let header = BendlHeader::read_from(&mut Cursor::new(&buf)).unwrap(); + assert!(header.flags & HEADER_FLAG_STREAM_CHECKSUM != 0); + assert_eq!(header.stream_checksum, 0); +} + +#[test] +fn assignment_stream_reader_verified_round_trips_stream_bytes() { + let (buf, _) = make_ben_stream_bundle(3); + // Capture the raw stream bytes first for comparison. + let mut r = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let (off, len) = r.assignment_stream_range().unwrap(); + let raw_stream: Vec = buf[off as usize..(off + len) as usize].to_vec(); + drop(r); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let mut got = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut got) + .unwrap(); + assert_eq!(got, raw_stream); +} + +#[test] +fn assignment_stream_reader_detects_corrupt_stored_checksum() { + let (mut buf, _) = make_ben_stream_bundle(3); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let mut sink = Vec::new(); + let err = reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut sink) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + let inner = err + .get_ref() + .and_then(|e| e.downcast_ref::()) + .expect("inner ChecksumError"); + assert!( + matches!(inner, ChecksumError::Mismatch { target: ChecksumTarget::Stream, .. }), + "expected Stream Mismatch, got {inner:?}" + ); +} + +#[test] +fn verify_stream_checksum_passes_on_intact_bundle() { + let (buf, _) = make_ben_stream_bundle(3); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + reader.verify_stream_checksum().unwrap(); +} + +#[test] +fn verify_stream_checksum_fails_on_corrupt_stored_checksum() { + let (mut buf, _) = make_ben_stream_bundle(3); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let err = reader.verify_stream_checksum().unwrap_err(); + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + }) + ), + "expected Mismatch(Stream), got {err:?}" + ); +} + +#[test] +fn verify_stream_checksum_fails_on_corrupt_stream_payload() { + let (mut buf, _) = make_ben_stream_bundle(3); + { + let mut r = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + corrupt_stream_payload(&mut buf, &mut r); + } + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let err = reader.verify_stream_checksum().unwrap_err(); + assert!( + matches!( + err, + BendlReadError::Checksum(ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + }) + ), + "expected Mismatch(Stream), got {err:?}" + ); +} + +#[test] +fn open_assignment_reader_iterator_detects_corrupt_stored_checksum() { + let (mut buf, samples) = make_ben_stream_bundle(3); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let mut decoder = reader.open_assignment_reader().unwrap(); + // Consume all real records; then the next call should report Mismatch. + let mut decoded_count = 0usize; + loop { + match decoder.next() { + Some(Ok(_)) => { + decoded_count += 1; + } + Some(Err(e)) => { + assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); + let inner = e + .get_ref() + .and_then(|x| x.downcast_ref::()) + .expect("inner ChecksumError"); + assert!( + matches!( + inner, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + } + ), + "expected Stream Mismatch, got {inner:?}" + ); + break; + } + None => panic!("expected ChecksumMismatch before None, got None"), + } + } + // Subsequent calls must return None (not repeat the error). + assert!(decoder.next().is_none(), "expected None after mismatch reported"); + assert_eq!(decoded_count, samples.len()); +} + +#[test] +fn count_samples_detects_corrupt_stored_checksum() { + let (mut buf, _) = make_ben_stream_bundle(4); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let decoder = reader.open_assignment_reader().unwrap(); + let err = decoder.count_samples().unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + let inner = err + .get_ref() + .and_then(|x| x.downcast_ref::()) + .expect("inner ChecksumError"); + assert!( + matches!( + inner, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + } + ), + "expected Stream Mismatch, got {inner:?}" + ); +} + +#[test] +fn write_all_jsonl_detects_corrupt_stored_checksum() { + let (mut buf, _) = make_ben_stream_bundle(3); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let mut decoder = reader.open_assignment_reader().unwrap(); + let err = decoder + .write_all_jsonl(std::io::sink()) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + let inner = err + .get_ref() + .and_then(|x| x.downcast_ref::()) + .expect("inner ChecksumError"); + assert!( + matches!( + inner, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + } + ), + "expected Stream Mismatch, got {inner:?}" + ); +} + +#[test] +fn for_each_assignment_detects_corrupt_stored_checksum_when_driven_to_eof() { + let (mut buf, _) = make_ben_stream_bundle(3); + corrupt_stream_checksum(&mut buf); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let mut decoder = reader.open_assignment_reader().unwrap(); + // Callback always returns Ok(true) so it drives to natural EOF. + let err = decoder + .for_each_assignment(|_, _| Ok(true)) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + let inner = err + .get_ref() + .and_then(|x| x.downcast_ref::()) + .expect("inner ChecksumError"); + assert!( + matches!( + inner, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + } + ), + "expected Stream Mismatch, got {inner:?}" + ); +} + +#[test] +fn open_assignment_reader_intact_bundle_round_trips_count_samples() { + let (buf, samples) = make_ben_stream_bundle(5); + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let decoder = reader.open_assignment_reader().unwrap(); + let n = decoder.count_samples().unwrap(); + assert_eq!(n, samples.len()); +} diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 07d8777..87fdd15 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -33,7 +33,7 @@ use super::format::{ default_compresses_by_type, encode_directory, read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, DEFAULT_XZ_PRESET, - FINALIZED_YES, HEADER_SIZE, + FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; /// Ability to truncate an underlying seekable target to a given length. @@ -315,6 +315,7 @@ impl BendlWriter { }), start_offset: stream_offset, bytes_written: 0, + hasher: 0, }) } @@ -329,6 +330,9 @@ impl BendlWriter { // No stream written; treat as empty stream located just after the asset region. let stream_offset = self.inner.seek(SeekFrom::Current(0))?; self.header.stream_offset = stream_offset; + // CRC32C of an empty byte sequence is 0x00000000. + self.header.stream_checksum = 0; + self.header.flags |= HEADER_FLAG_STREAM_CHECKSUM; (0, 0) } }; @@ -383,6 +387,7 @@ pub struct BendlStreamSession { parent: Option, start_offset: u64, bytes_written: u64, + hasher: u32, } impl BendlStreamSession { @@ -405,7 +410,13 @@ impl BendlStreamSession { /// method returns, the session's [`Drop`] impl observes `inner.is_none()` and skips the warn. pub fn finish_into_writer(mut self, sample_count: i64) -> BendlWriter { let inner = self.inner.take().expect("session has not been finished"); - let parent = self.parent.take().expect("session has not been finished"); + let mut parent = self.parent.take().expect("session has not been finished"); + + // Patch the stream checksum into the in-memory header so BendlWriter::finish can write it + // to disk in a single header patch pass. + parent.header.stream_checksum = self.hasher; + parent.header.flags |= HEADER_FLAG_STREAM_CHECKSUM; + BendlWriter { inner, header: parent.header, @@ -424,7 +435,10 @@ impl Write for BendlStreamSession { fn write(&mut self, buf: &[u8]) -> io::Result { let inner = self.inner.as_mut().expect("session has not been finished"); let n = inner.write(buf)?; - self.bytes_written += n as u64; + if n > 0 { + self.bytes_written += n as u64; + self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); + } Ok(n) } diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index f1bbaeb..a83ab86 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -54,8 +54,9 @@ fn minimal_bendl_with_entries( minor_version: BENDL_MINOR_VERSION, finalized: FINALIZED_YES, assignment_format: AssignmentFormat::Ben.to_u8(), - reserved_0: 0, + alignment_padding: 0, flags: 0, + stream_checksum: 0, directory_offset, directory_len: (directory.len() as i64 + directory_len_adjustment.min(0)) as u64, stream_offset: directory_offset, From f4d95e3ad03ccfe40e2853f4cd83d9e0d6c6fccf Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 09:15:37 -0600 Subject: [PATCH 101/221] Add in some edge-case tests --- ben/src/io/bundle/tests/reader.rs | 112 +++++++++++++++++++++++++++++- ben/src/io/bundle/tests/writer.rs | 40 +++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index b726b7e..aaced9c 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -1,4 +1,4 @@ -use std::io::{Cursor, Read, Write}; +use std::io::{Cursor, Read, Seek, Write}; use xz2::write::XzEncoder; @@ -1447,6 +1447,116 @@ fn verify_stream_checksum_returns_unavailable_when_flag_clear() { ); } +#[test] +fn asset_payload_reader_unverified_returns_compressed_bytes_for_xz_asset() { + // For an xz-flagged asset, `asset_payload_reader_unverified` is the raw on-disk byte + // accessor — it must NOT invoke the xz decoder. This is the distinction from + // `asset_reader_unverified`, which decompresses but skips CRC verification. + let raw = b"the quick brown fox jumps over the lazy dog".to_vec(); + let (bytes, name, compressed, _, _) = make_single_xz_asset_bundle("xz_blob", &raw); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + + let mut payload_reader = reader.asset_payload_reader_unverified(&entry).unwrap(); + let mut out = Vec::new(); + payload_reader.read_to_end(&mut out).unwrap(); + drop(payload_reader); + assert_eq!(out, compressed, "payload reader returns raw compressed bytes"); + assert_ne!(out, raw, "payload reader did NOT decompress"); + + // For an uncompressed asset, the payload reader and the decoded unverified reader produce the + // same bytes — there is no codec to bypass. + let (bytes2, name2, _, _) = make_single_asset_bundle("raw_blob", b"plain payload"); + let mut reader2 = BendlReader::open(Cursor::new(bytes2)).unwrap(); + let entry2 = reader2.find_asset_by_name(&name2).cloned().unwrap(); + let mut via_payload = Vec::new(); + reader2 + .asset_payload_reader_unverified(&entry2) + .unwrap() + .read_to_end(&mut via_payload) + .unwrap(); + let mut via_unverified = Vec::new(); + reader2 + .asset_reader_unverified(&entry2) + .unwrap() + .read_to_end(&mut via_unverified) + .unwrap(); + assert_eq!(via_payload, b"plain payload".to_vec()); + assert_eq!(via_payload, via_unverified); +} + +#[test] +fn asset_bytes_surfaces_io_error_through_failing_reader() { + // Variant-discipline test: when the underlying Read+Seek fails after open, asset_bytes for an + // uncompressed asset must surface `BendlReadError::Io`, not `Decode`. There is no codec to + // blame on an uncompressed payload, so any io::Error reaching the wrapper must be classified + // as Io. + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::Arc; + + let (bytes, name, _, _) = make_single_asset_bundle("blob", b"never read me"); + let fail_flag = Arc::new(AtomicBool::new(false)); + let reader_inner = FailWhenArmed { + inner: Cursor::new(bytes), + armed: Arc::clone(&fail_flag), + }; + let mut reader = BendlReader::open(reader_inner).unwrap(); + // Arm AFTER open completes so header/directory reads succeed; the next read (issued by + // asset_bytes for the payload) hits the forced failure. + fail_flag.store(true, Ordering::SeqCst); + let entry = reader.find_asset_by_name(&name).cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + assert!( + matches!(err, BendlReadError::Io(ref e) if e.kind() == std::io::ErrorKind::Other), + "expected BendlReadError::Io(Other), got {err:?}" + ); +} + +#[test] +fn open_assignment_reader_returns_decoder_init_on_bad_banner() { + // Variant-discipline test: corrupt the BEN banner so the BenStreamReader rejects it during + // construction. The error must surface as `BendlReadError::DecoderInit`, distinct from `Io`, + // `Format`, `Decode`, and `Checksum`. The banner is parsed before the CRC tee reaches EOF, so + // DecoderInit wins over any checksum mismatch the corrupted byte would otherwise produce. + let (mut bytes, _, _, _) = build_finalized_bundle(); + let header = BendlHeader::from_bytes(bytes[..HEADER_SIZE].try_into().unwrap()).unwrap(); + let banner_offset = header.stream_offset as usize; + // "STANDARD BEN FILE\x00" prefix is the banner; flip the first byte so it no longer matches. + bytes[banner_offset] ^= 0x01; + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + match reader.open_assignment_reader() { + Err(BendlReadError::DecoderInit(_)) => {} + Err(other) => panic!("expected DecoderInit, got {other:?}"), + Ok(_) => panic!("expected Err, got Ok"), + } +} + +/// `Read + Seek` test double that returns a forced `io::Error` on every `read` call while the +/// shared `armed` flag is true. Used to pin the `BendlReadError::Io` wrap site without depending +/// on filesystem-specific behavior (a deleted file's open fd stays alive on Linux/macOS). +struct FailWhenArmed { + inner: R, + armed: std::sync::Arc, +} + +impl Read for FailWhenArmed { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.armed.load(std::sync::atomic::Ordering::SeqCst) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "forced read failure", + )); + } + self.inner.read(buf) + } +} + +impl Seek for FailWhenArmed { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { + self.inner.seek(pos) + } +} + #[test] fn verify_stream_checksum_returns_bundle_incomplete_for_unfinalized() { let header = BendlHeader { diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 72b8f1d..2171fe7 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -851,6 +851,46 @@ fn writer_asset_round_trips_with_auto_computed_crc32c() { ); } +#[test] +fn writer_xz_asset_stores_crc_over_compressed_bytes_not_raw() { + // The CRC contract for xz-flagged assets is "CRC32C over the on-disk bytes" — i.e. the + // compressed bytes, not the raw input. Pin this directly: re-compress the same input, compute + // the CRC over the compressed result, and assert it matches the stored value. Asserting that + // the stored CRC does NOT equal `crc32c(raw_input)` is what catches the "writer accidentally + // hashed pre-compression bytes" regression. + let payload = b"the quick brown fox jumps over the lazy dog".to_vec(); + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "xz_asset", + &payload, + AddAssetOptions::defaults().compress(), + ) + .unwrap(); + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); + let buf = writer.finish().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(buf)).unwrap(); + let entry = reader.find_asset_by_name("xz_asset").cloned().unwrap(); + assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0, "asset must be xz-flagged"); + assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); + + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder.write_all(&payload).unwrap(); + let compressed = encoder.finish().unwrap(); + assert_eq!( + entry.checksum_u32(), + Some(crc32c::crc32c(&compressed)), + "stored CRC must be over compressed on-disk bytes" + ); + assert_ne!( + entry.checksum_u32(), + Some(crc32c::crc32c(&payload)), + "stored CRC must NOT be over the raw pre-compression input" + ); +} + #[test] fn finished_writer_rejects_further_operations() { let writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); From c2aac6ea52d846a3f5491b2530f5c85ef99487cd Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 21:06:49 -0600 Subject: [PATCH 102/221] add strict payload length enforcement --- ben/src/io/bundle/reader.rs | 265 ++++++++++++++++++++++--- ben/src/io/bundle/tests/reader.rs | 313 +++++++++++++++++++++++++++--- 2 files changed, 517 insertions(+), 61 deletions(-) diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 9506376..efb6123 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -17,9 +17,10 @@ //! - [`BendlReader::verify_asset_checksum`] and [`BendlReader::verify_all_asset_checksums`] are //! explicit raw-bytes verifiers (no decoding) that do not return decoded payload bytes. -use std::io::{self, Read, Seek, SeekFrom, Take, Write}; +use std::fmt; +use std::io::{self, Read, Seek, SeekFrom, Write}; use std::sync::{ - atomic::{AtomicU32, Ordering}, + atomic::{AtomicBool, AtomicU32, Ordering}, Arc, }; @@ -170,7 +171,7 @@ impl BendlReader { let expected = self.header.stream_checksum; let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; - let raw = (&mut self.inner).take(len); + let raw = ExactLen::new(&mut self.inner, len, ShortRangeFlag::new()); Ok(Box::new(RawVerifyingReader { inner: raw, hasher: 0, @@ -184,10 +185,16 @@ impl BendlReader { /// /// Works on both finalized and unfinalized bundles. Useful for recovery/debug flows and for /// callers that need the raw bytes without the overhead of a CRC check. - pub fn assignment_stream_reader_unverified(&mut self) -> io::Result> { + pub fn assignment_stream_reader_unverified( + &mut self, + ) -> io::Result> { let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; - Ok((&mut self.inner).take(len)) + Ok(Box::new(ExactLen::new( + &mut self.inner, + len, + ShortRangeFlag::new(), + ))) } /// Construct a verified decoded assignment reader that checks the stream CRC32C after the @@ -220,7 +227,8 @@ impl BendlReader { })?; let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; - let raw = (&mut self.inner).take(len); + let short_flag = ShortRangeFlag::new(); + let raw = ExactLen::new(&mut self.inner, len, short_flag.clone()); let arc_hasher = Arc::new(AtomicU32::new(0)); let shared_raw = ArcHasher { @@ -228,15 +236,31 @@ impl BendlReader { state: Arc::clone(&arc_hasher), }; - let inner = match format { - AssignmentFormat::Ben => BenStreamReader::from_ben(shared_raw)?, - AssignmentFormat::Xben => BenStreamReader::from_xben(shared_raw)?, + let init = match format { + AssignmentFormat::Ben => BenStreamReader::from_ben(shared_raw), + AssignmentFormat::Xben => BenStreamReader::from_xben(shared_raw), + }; + let inner = match init { + Ok(inner) => inner, + Err(e) => { + // If the underlying ExactLen flagged a short range while the codec was reading its + // banner, surface a bundle-layer UnexpectedEof rather than a DecoderInit so callers + // see the structural truncation as the failure, not a banner parse error. + if short_flag.get() { + return Err(BendlReadError::Io(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + ))); + } + return Err(e.into()); + } }; Ok(BendlVerifiedStreamReader { inner, expected, arc_hasher, + short_flag, state: StreamVerifyState::Running, }) } @@ -273,10 +297,7 @@ impl BendlReader { if n == 0 { return Err(BendlReadError::Io(io::Error::new( io::ErrorKind::UnexpectedEof, - format!( - "stream ended {} byte(s) before declared length", - remaining - ), + ShortRangeMarker { remaining }, ))); } hasher = crc32c::crc32c_append(hasher, &buf[..n]); @@ -360,7 +381,8 @@ impl BendlReader { let target = ChecksumTarget::Asset(entry.name.clone()); self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - let raw = (&mut self.inner).take(entry.payload_len); + let short_flag = ShortRangeFlag::new(); + let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); if entry.asset_flags & ASSET_FLAG_XZ != 0 { // Compressed: CRC tee sits *inside* the XzDecoder so the tee accumulates over raw @@ -372,6 +394,7 @@ impl BendlReader { decoder, expected, target, + short_flag, state: VerifyState::Reading, })) } else { @@ -395,9 +418,16 @@ impl BendlReader { entry: &BendlDirectoryEntry, ) -> Result, BendlReadError> { self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - let raw = (&mut self.inner).take(entry.payload_len); + let short_flag = ShortRangeFlag::new(); + let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); if entry.asset_flags & ASSET_FLAG_XZ != 0 { - Ok(Box::new(XzDecoder::new(raw))) + // Wrap the decoder so that if xz reports a runtime error while the underlying + // ExactLen has flagged a short read, the surface is a short-range UnexpectedEof + // rather than a codec error. + Ok(Box::new(ShortRangeAwareReader { + inner: XzDecoder::new(raw), + short_flag, + })) } else { Ok(Box::new(raw)) } @@ -415,7 +445,11 @@ impl BendlReader { entry: &BendlDirectoryEntry, ) -> Result, BendlReadError> { self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - Ok(Box::new((&mut self.inner).take(entry.payload_len))) + Ok(Box::new(ExactLen::new( + &mut self.inner, + entry.payload_len, + ShortRangeFlag::new(), + ))) } /// Verify the stored CRC32C of a single asset without returning any decoded bytes. @@ -449,10 +483,7 @@ impl BendlReader { // callers can distinguish a truncated bundle from a CRC mismatch. return Err(BendlReadError::Io(io::Error::new( io::ErrorKind::UnexpectedEof, - format!( - "asset {:?} payload ended {} byte(s) before declared length", - entry.name, remaining - ), + ShortRangeMarker { remaining }, ))); } hasher = crc32c::crc32c_append(hasher, &buf[..n]); @@ -494,6 +525,124 @@ impl BendlReader { } } +// --------------------------------------------------------------------------- +// Strict-length plumbing +// --------------------------------------------------------------------------- + +/// Marker error attached to the `io::Error` returned when an [`ExactLen`] reader hits underlying +/// EOF before consuming its declared length. Used by convenience APIs to recognise a bundle-layer +/// short-range failure even when it has surfaced through a codec. +#[derive(Debug)] +pub(crate) struct ShortRangeMarker { + pub remaining: u64, +} + +impl fmt::Display for ShortRangeMarker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "bundle range ended {} byte(s) before declared length", + self.remaining + ) + } +} + +impl std::error::Error for ShortRangeMarker {} + +/// Shared flag set by an [`ExactLen`] reader when the underlying reader runs out of bytes before +/// the declared length is reached. Clones share state so a wrapper above a codec can detect the +/// short read even if the codec swallows the inner `UnexpectedEof` in favor of its own error. +#[derive(Clone, Default)] +pub struct ShortRangeFlag(Arc); + +impl ShortRangeFlag { + pub(crate) fn new() -> Self { + Self(Arc::new(AtomicBool::new(false))) + } + + pub(crate) fn set(&self) { + self.0.store(true, Ordering::Relaxed); + } + + pub(crate) fn get(&self) -> bool { + self.0.load(Ordering::Relaxed) + } +} + +/// Bounded reader that enforces an exact byte length. Behaves like [`std::io::Take`] for reads +/// within the declared length, but returns +/// `Err(io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker))` (and sets the shared +/// [`ShortRangeFlag`]) if the underlying reader signals EOF before the declared length is reached. +/// +/// `ExactLen` is the BENDL-layer guarantee that `payload_len` and `stream_len` are exact byte +/// counts of the on-disk range; a backing file shorter than declared is a corrupt bundle, not a +/// short successful read. +pub struct ExactLen { + inner: R, + remaining: u64, + flag: ShortRangeFlag, +} + +impl ExactLen { + pub(crate) fn new(inner: R, declared: u64, flag: ShortRangeFlag) -> Self { + Self { + inner, + remaining: declared, + flag, + } + } +} + +impl Read for ExactLen { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.remaining == 0 || buf.is_empty() { + return Ok(0); + } + let max = (buf.len() as u64).min(self.remaining) as usize; + let n = self.inner.read(&mut buf[..max])?; + if n == 0 { + // Underlying reader hit EOF before our declared length. Set the shared flag so a + // wrapper above a codec can recognise this as a bundle-range failure, and surface as + // UnexpectedEof carrying the marker. + let remaining = self.remaining; + self.flag.set(); + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining }, + )); + } + self.remaining -= n as u64; + Ok(n) + } +} + +/// Wraps a reader sitting above an [`ExactLen`]-bounded source. If the underlying reader returns +/// an error and the shared `ShortRangeFlag` is set, the error is replaced with an `UnexpectedEof` +/// carrying a [`ShortRangeMarker`] so callers see a bundle-layer short-range failure rather than a +/// codec-specific error message. +struct ShortRangeAwareReader { + inner: R, + short_flag: ShortRangeFlag, +} + +impl Read for ShortRangeAwareReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.inner.read(buf) { + Ok(n) => Ok(n), + Err(e) => { + if self.short_flag.get() { + Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )) + } else { + Err(e) + } + } + } + } +} + // --------------------------------------------------------------------------- // Verifying reader plumbing // --------------------------------------------------------------------------- @@ -514,7 +663,7 @@ enum VerifyState { /// as they fly past, and on raw EOF either confirms the checksum or returns /// [`ChecksumError::Mismatch`] in place of the usual `Ok(0)`. struct RawVerifyingReader<'a, R: Read + Seek> { - inner: Take<&'a mut R>, + inner: ExactLen<&'a mut R>, hasher: u32, expected: u32, target: ChecksumTarget, @@ -574,9 +723,10 @@ impl Read for CrcTeeReader { /// Verifying wrapper around an `XzDecoder>`. Lets the codec observe normal raw EOF /// before finalizing the CRC check at the decoded layer. struct DecodedVerifyingReader<'a, R: Read + Seek> { - decoder: XzDecoder>>, + decoder: XzDecoder>>, expected: u32, target: ChecksumTarget, + short_flag: ShortRangeFlag, state: VerifyState, } @@ -586,7 +736,21 @@ impl Read for DecodedVerifyingReader<'_, R> { VerifyState::EofChecked | VerifyState::Failed => return Ok(0), VerifyState::Reading => {} } - let n = self.decoder.read(buf)?; + let n = match self.decoder.read(buf) { + Ok(n) => n, + Err(e) => { + // If the underlying ExactLen flagged a short range, surface it as a bundle-layer + // UnexpectedEof rather than a codec error — the bytes were missing, not malformed. + self.state = VerifyState::Failed; + if self.short_flag.get() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )); + } + return Err(e); + } + }; if n == 0 { let computed = self.decoder.get_ref().hasher; if computed == self.expected { @@ -633,8 +797,13 @@ impl Read for ArcHasher { #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum StreamVerifyState { Running, - /// CRC mismatch was returned once as `Some(Err(...))`. Subsequent iterator calls return `None`. + /// A CRC mismatch was returned once as `Some(Err(...))`. Subsequent iterator calls return + /// `None`. MismatchReported, + /// A non-CRC terminal error (codec failure, bundle-layer short range, etc.) was returned once + /// as `Some(Err(...))`. Subsequent iterator calls return `None`. Kept distinct from + /// `MismatchReported` so the state machine self-documents which class of failure tripped it. + Errored, /// CRC matched after natural EOF. Subsequent iterator calls return `None`. Verified, } @@ -652,9 +821,10 @@ enum StreamVerifyState { /// so the CRC tee is never finalized. Callers that need integrity for partial reads must call /// [`BendlReader::verify_stream_checksum`] separately. pub struct BendlVerifiedStreamReader<'a, R: Read + Seek> { - inner: BenStreamReader>>, + inner: BenStreamReader>>, expected: u32, arc_hasher: Arc, + short_flag: ShortRangeFlag, state: StreamVerifyState, } @@ -683,7 +853,19 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { pub fn count_samples(self) -> io::Result { let arc = Arc::clone(&self.arc_hasher); let expected = self.expected; - let count = self.inner.count_samples()?; + let short_flag = self.short_flag.clone(); + let count = match self.inner.count_samples() { + Ok(count) => count, + Err(e) => { + if short_flag.get() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )); + } + return Err(e); + } + }; let computed = arc.load(Ordering::Relaxed); if computed != expected { return Err(io::Error::new( @@ -753,7 +935,7 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { /// Frame iteration is intentionally partial: callers typically stop short of EOF, so the CRC /// tee is never finalized and the stream is **not verified** by this path. Callers needing /// integrity for partial reads should call [`BendlReader::verify_stream_checksum`] separately. - pub fn into_frames(self) -> BenStreamFrameReader>> { + pub fn into_frames(self) -> BenStreamFrameReader>> { self.inner.into_frames() } } @@ -767,7 +949,7 @@ impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { pub fn into_subsample_by_indices( self, indices: T, - ) -> SubsampleFrameDecoder>>> + ) -> SubsampleFrameDecoder>>> where T: IntoIterator, { @@ -782,7 +964,7 @@ impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { self, start: usize, end: usize, - ) -> SubsampleFrameDecoder>>> { + ) -> SubsampleFrameDecoder>>> { self.inner.into_subsample_by_range(start, end) } @@ -795,7 +977,7 @@ impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { self, step: usize, offset: usize, - ) -> SubsampleFrameDecoder>>> { + ) -> SubsampleFrameDecoder>>> { self.inner.into_subsample_every(step, offset) } } @@ -805,10 +987,24 @@ impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { fn next(&mut self) -> Option { match self.state { - StreamVerifyState::MismatchReported | StreamVerifyState::Verified => return None, + StreamVerifyState::MismatchReported + | StreamVerifyState::Errored + | StreamVerifyState::Verified => return None, StreamVerifyState::Running => {} } match self.inner.next() { + Some(Err(e)) => { + // Non-CRC terminal error: codec failure, bundle-layer short range, or anything else + // the inner reader returned. CRC mismatch lives in the `None` branch below. + self.state = StreamVerifyState::Errored; + if self.short_flag.get() { + return Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + ))); + } + Some(Err(e)) + } Some(item) => Some(item), None => { // Inner reached natural EOF — finalize the CRC check. @@ -840,6 +1036,11 @@ impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { /// context. Codec-runtime errors from xz/BEN go to [`BendlReadError::Decode`] when the entry is /// xz-flagged; raw payload errors stay `Io`. fn classify_read_error(err: io::Error, entry: &BendlDirectoryEntry) -> BendlReadError { + // Bundle-layer short-range failures map to Io regardless of asset flags. A backing file + // shorter than payload_len is a structural bundle problem, not a codec error. + if err.get_ref().is_some_and(|e| e.is::()) { + return BendlReadError::Io(err); + } if err.get_ref().is_some_and(|e| e.is::()) { match err .into_inner() diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index aaced9c..f1c9603 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -1,4 +1,4 @@ -use std::io::{Cursor, Read, Seek, Write}; +use std::io::{self, Cursor, Read, Seek, Write}; use xz2::write::XzEncoder; @@ -402,27 +402,24 @@ fn interleaved_reads_do_not_corrupt_each_other() { } #[test] -fn asset_bytes_errors_when_declared_length_runs_past_eof() { - // Hand-construct a bundle where the metadata directory entry claims a payload_len that extends - // well past EOF. +fn asset_bytes_errors_with_unexpected_eof_when_payload_len_runs_past_eof() { + // Strict-EOF contract: a directory entry whose payload_len claims more bytes than the backing + // file provides must surface as BendlReadError::Io wrapping io::ErrorKind::UnexpectedEof. + // Returning a short successful read on a corrupt bundle is exactly the silent-corruption + // failure mode this contract exists to prevent. let mut bytes = build_basic_finalized_bundle(); - // Parse the directory offset to find where the entry lives. let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Skip the u32 entry count (4 bytes) and then the 16-byte fixed entry header up to - // `payload_len` (bytes 16..24 of the entry). let entry_start = directory_offset + 4; let payload_len_offset = entry_start + 16; bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); - // The reader opens fine — the directory parses. But reading the asset bytes must surface an - // error eventually (short read vs declared length). xz would also trip on this, but this is the - // raw-asset path. Either returns an error or a slice shorter than u64::MAX. - reader - .asset_bytes(&entry) - .map(|b| assert!(b.len() < u64::MAX as usize)) - .ok(); + let err = reader.asset_bytes(&entry).unwrap_err(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } } #[test] @@ -707,14 +704,13 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { } #[test] -fn reader_scales_to_very_wide_stream_offset_field() { - // Confirm the `Take` bound clamps a stream reader even when the header's stream_len is much - // larger than the actual remaining bytes: the reader must return the shorter slice rather than - // loop forever or panic. This is a "short read" tolerance check. +fn assignment_stream_reader_unverified_errors_when_stream_len_runs_past_eof() { + // Strict-EOF contract for the assignment stream: when stream_len claims more bytes than the + // backing file actually provides, the unverified stream reader must surface + // io::ErrorKind::UnexpectedEof rather than silently return a short slice. let fake_stream = b"STANDARD BEN FILE\x00\x01tiny".to_vec(); let actual_len = fake_stream.len() as u64; let directory_offset = HEADER_SIZE as u64 + actual_len; - // Build a bundle that lies about stream_len: claims ten times what's actually present. let entries: Vec = Vec::new(); let directory_bytes = encode_directory(&entries).unwrap(); let header = BendlHeader { @@ -729,7 +725,7 @@ fn reader_scales_to_very_wide_stream_offset_field() { directory_offset, directory_len: directory_bytes.len() as u64, stream_offset: HEADER_SIZE as u64, - stream_len: actual_len * 10, // lie + stream_len: actual_len * 10, // claim ten times the actual length sample_count: 0, }; let mut bytes = Vec::new(); @@ -739,18 +735,12 @@ fn reader_scales_to_very_wide_stream_offset_field() { let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); let mut buf = Vec::new(); - // Take will try to read `stream_len` bytes but the Cursor will just return however many bytes - // remain from stream_offset to EOF. The reader must not panic; it must simply return what it - // got. Use the unverified reader since this bundle has no stream checksum. - reader + let err = reader .assignment_stream_reader_unverified() .unwrap() .read_to_end(&mut buf) - .unwrap(); - // Take includes the directory bytes in the window since they come after stream_offset and the - // claim exceeds file size — so we assert only that we got *at least* the real stream bytes as a - // prefix, which is the basic "no truncation of what exists" check. - assert!(buf.starts_with(&fake_stream)); + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); } #[test] @@ -1588,3 +1578,268 @@ fn verify_stream_checksum_returns_bundle_incomplete_for_unfinalized() { "expected BundleIncomplete(Stream), got {err:?}" ); } + +// --------------------------------------------------------------------------- +// Strict payload_len / stream_len EOF enforcement +// --------------------------------------------------------------------------- + +/// Returns a bundle whose `metadata.json` entry's `payload_len` has been corrupted to point past +/// EOF, while the rest of the file remains structurally valid. +fn build_bundle_with_overlong_metadata_payload_len() -> Vec { + let mut bytes = build_basic_finalized_bundle(); + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let entry_start = directory_offset + 4; + let payload_len_offset = entry_start + 16; + bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); + bytes +} + +/// Returns a finalized BEN bundle whose declared `stream_len` is much longer than the bytes +/// actually present in the file. Lays the file out as `[header | directory | stream | EOF]` so the +/// stream is the last region; ExactLen-driven readers hit underlying EOF and surface +/// `UnexpectedEof` rather than overshoot into unrelated trailing bytes. +fn build_bundle_with_overlong_stream_len() -> Vec { + let fake_stream = b"STANDARD BEN FILE\x00\x01".to_vec(); + let actual_stream_len = fake_stream.len() as u64; + let directory = encode_directory(&[]).unwrap(); + let directory_offset = HEADER_SIZE as u64; + let stream_offset = directory_offset + directory.len() as u64; + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: crc32c::crc32c(&fake_stream), + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: actual_stream_len * 10, // claim ten times the actual length + sample_count: 0, + }; + + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&directory); + bytes.extend_from_slice(&fake_stream); + bytes +} + +#[test] +fn asset_bytes_unverified_errors_with_unexpected_eof_when_payload_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_metadata_payload_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + let err = reader.asset_bytes_unverified(&entry).unwrap_err(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } +} + +#[test] +fn asset_reader_returns_unexpected_eof_when_payload_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_metadata_payload_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + let mut r = reader.asset_reader(&entry).unwrap(); + let mut buf = Vec::new(); + let err = r.read_to_end(&mut buf).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn asset_reader_unverified_returns_unexpected_eof_when_payload_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_metadata_payload_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + let mut r = reader.asset_reader_unverified(&entry).unwrap(); + let mut buf = Vec::new(); + let err = r.read_to_end(&mut buf).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn asset_payload_reader_unverified_returns_unexpected_eof_when_payload_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_metadata_payload_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("metadata.json").cloned().unwrap(); + let mut r = reader.asset_payload_reader_unverified(&entry).unwrap(); + let mut buf = Vec::new(); + let err = r.read_to_end(&mut buf).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn asset_bytes_returns_unexpected_eof_for_xz_asset_with_overlong_payload_len() { + // For an xz-flagged asset whose payload_len claims more bytes than the backing file holds, the + // surface must be BendlReadError::Io(UnexpectedEof) — not BendlReadError::Decode — because the + // failure is a bundle-layer short range, not a codec failure. Layout the bundle as + // `[header | directory | compressed_payload | EOF]` so the compressed payload is the last + // region; otherwise xz would over-read into unrelated trailing bytes and report a corrupt-xz + // error instead of the short-range surface we want to assert. + let raw_payload = br#"{"hello":"world"}"#.to_vec(); + let mut compressed = Vec::new(); + let mut encoder = XzEncoder::new(&mut compressed, 6); + encoder.write_all(&raw_payload).unwrap(); + encoder.finish().unwrap(); + + let directory_offset = HEADER_SIZE as u64; + let placeholder_payload_offset = 0u64; + let placeholder_entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset: placeholder_payload_offset, + payload_len: u64::MAX, + checksum: None, + }, + &compressed, + )]; + let placeholder_directory = encode_directory(&placeholder_entries).unwrap(); + let payload_offset = directory_offset + placeholder_directory.len() as u64; + + let entries = vec![with_crc( + BendlDirectoryEntry { + asset_type: ASSET_TYPE_GRAPH, + asset_flags: ASSET_FLAG_JSON | ASSET_FLAG_XZ, + name: "graph.json".to_string(), + payload_offset, + payload_len: u64::MAX, // claim far more than is present + checksum: None, + }, + &compressed, + )]; + let directory = encode_directory(&entries).unwrap(); + assert_eq!(directory.len(), placeholder_directory.len()); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&directory); + bytes.extend_from_slice(&compressed); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = reader.find_asset_by_name("graph.json").cloned().unwrap(); + let err = reader.asset_bytes(&entry).unwrap_err(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } +} + +#[test] +fn assignment_stream_reader_returns_unexpected_eof_when_stream_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_stream_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let mut r = reader.assignment_stream_reader().unwrap(); + let mut buf = Vec::new(); + let err = r.read_to_end(&mut buf).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn open_assignment_reader_returns_unexpected_eof_when_stream_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_stream_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let mut decoded = reader.open_assignment_reader().unwrap(); + // Drive iteration; the underlying BEN decoder runs out of backing bytes before the declared + // stream_len, and the BENDL-owned wrapper surfaces that as UnexpectedEof rather than a codec + // error. + let final_item = loop { + match decoded.next() { + Some(Ok(_)) => continue, + Some(Err(e)) => break Some(e), + None => break None, + } + }; + let err = final_item.expect("expected an error before natural EOF"); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn verify_stream_checksum_returns_unexpected_eof_when_stream_len_runs_past_eof() { + let bytes = build_bundle_with_overlong_stream_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let err = reader.verify_stream_checksum().unwrap_err(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } +} + +#[test] +fn write_all_jsonl_returns_unexpected_eof_when_stream_len_runs_past_eof() { + // write_all_jsonl delegates to BendlVerifiedStreamReader::next, so the iterator's short-range + // translation must propagate through this consuming method too. Pin it explicitly so a future + // refactor that bypasses `next` (e.g. driving the inner reader directly) cannot quietly drop + // the UnexpectedEof surface in favor of a codec error. + let bytes = build_bundle_with_overlong_stream_len(); + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let mut decoded = reader.open_assignment_reader().unwrap(); + let err = decoded.write_all_jsonl(std::io::sink()).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn open_assignment_reader_returns_unexpected_eof_when_banner_falls_in_short_range() { + // Construction-time variant of the strict-EOF contract: if stream_len is so short that + // BenStreamReader can't even read its 17-byte banner, the surface must be a bundle-layer + // UnexpectedEof — not BendlReadError::DecoderInit. The banner read happens inside + // `from_ben`/`from_xben`, before any iterator step, so this catches the + // "codec-reclassification at construction" gap. + // + // Build a bundle whose declared stream_len claims 100 bytes but only provides 4 — fewer than + // the 17-byte banner needs. + let stream_bytes = b"STAN".to_vec(); + let directory = encode_directory(&[]).unwrap(); + let directory_offset = HEADER_SIZE as u64; + let stream_offset = directory_offset + directory.len() as u64; + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset, + stream_len: 100, // claim far more than is present + sample_count: 0, + }; + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header.to_bytes()); + bytes.extend_from_slice(&directory); + bytes.extend_from_slice(&stream_bytes); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + match reader.open_assignment_reader() { + Err(BendlReadError::Io(io_err)) => { + assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof); + } + Err(other) => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + Ok(_) => panic!("expected Err, got Ok"), + } +} From 18ad3ec81448ae02892a31fe063df6cff92ae7a6 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 21:20:15 -0600 Subject: [PATCH 103/221] add some fixtures and stability tests --- ben/tests/fixtures/v1.0.0/flags_set.bendl | Bin 0 -> 445 bytes ben/tests/fixtures/v1.0.0/mkvchain.ben | Bin 0 -> 54 bytes ben/tests/fixtures/v1.0.0/mkvchain.xben | Bin 0 -> 104 bytes ben/tests/fixtures/v1.0.0/source.jsonl | 5 + ben/tests/fixtures/v1.0.0/source_graph.json | 1 + .../fixtures/v1.0.0/source_metadata.json | 1 + ben/tests/fixtures/v1.0.0/standard.ben | Bin 0 -> 53 bytes ben/tests/fixtures/v1.0.0/standard.xben | Bin 0 -> 104 bytes ben/tests/fixtures/v1.0.0/twodelta.ben | Bin 0 -> 62 bytes ben/tests/fixtures/v1.0.0/twodelta.xben | Bin 0 -> 112 bytes ben/tests/fixtures/v1.0.0/unknown_flags.bendl | Bin 0 -> 507 bytes ben/tests/test_format_stability.rs | 407 ++++++++++++++++++ 12 files changed, 414 insertions(+) create mode 100644 ben/tests/fixtures/v1.0.0/flags_set.bendl create mode 100644 ben/tests/fixtures/v1.0.0/mkvchain.ben create mode 100644 ben/tests/fixtures/v1.0.0/mkvchain.xben create mode 100644 ben/tests/fixtures/v1.0.0/source.jsonl create mode 100644 ben/tests/fixtures/v1.0.0/source_graph.json create mode 100644 ben/tests/fixtures/v1.0.0/source_metadata.json create mode 100644 ben/tests/fixtures/v1.0.0/standard.ben create mode 100644 ben/tests/fixtures/v1.0.0/standard.xben create mode 100644 ben/tests/fixtures/v1.0.0/twodelta.ben create mode 100644 ben/tests/fixtures/v1.0.0/twodelta.xben create mode 100644 ben/tests/fixtures/v1.0.0/unknown_flags.bendl create mode 100644 ben/tests/test_format_stability.rs diff --git a/ben/tests/fixtures/v1.0.0/flags_set.bendl b/ben/tests/fixtures/v1.0.0/flags_set.bendl new file mode 100644 index 0000000000000000000000000000000000000000..9cb5cab2cc5afe3b56d77be024d387be305637cf GIT binary patch literal 445 zcmZ>A^>guIU|?iq01`|<45TG@H033M7(fsWp%{Ka=?o~%3Z?&=S4BiIFt9wkw#AK! zL6K1mC|;ufbTK1?PPI~AeoAVwl9h>$Qff*%h!q`epko-T6K$wt1SE}gjDe)FjzMf} zEy%cn57s+G%C}5mV06`$3krWBzwL7^*cmL5QPoOiiA9--c_m6#O2s9Kc`1oSDM~s@ zNu_xyIjQkwsYS(^`FToKhB`_>E0c>d3rfIzr7}Z313d!;pq8Y}oXnC+g|y6ylG377 zrCO-V8U731bYwYT913we#9I$9_=!JY2w~8UWzd|;q8hz%ZmHb;dBXp67s>Uz*st0s zv480|N5)Qd*TeatKYp9Z^xt5+J<$QEm(e|R(#icTY(?{`AORQ|1q@UMCI)r}E+7q3 z0s^p5WC60%ixLYm^s(-C1VIpr0i-fFwInemu_O_!xU8-E F4geLia?=0+ literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/mkvchain.ben b/ben/tests/fixtures/v1.0.0/mkvchain.ben new file mode 100644 index 0000000000000000000000000000000000000000..009a87f9c16970202312ccbc8ee0b0ffe04eef9a GIT binary patch literal 54 zcmebE4s-T!^z>73a`jVi^Yn3LVq#!mV9a7*WMTv|m`X1ISzw_`1||q&D+40{0}cm$ literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/mkvchain.xben b/ben/tests/fixtures/v1.0.0/mkvchain.xben new file mode 100644 index 0000000000000000000000000000000000000000..1da93e5a1a220525973717b6719c1095fcb50863 GIT binary patch literal 104 zcmexsUKJ6=z`*cd=%ypf0pmbLMlm1{0?K!Nzy5$BfI&N!L2c4;!RDKx@l92hCk$de zdAUy3ay@;~`J^LSAe&`+-xt2k&n-NQX0ppNw*mDsx(9aiUTS75npedLHYzd-01ZbW A{Qv*} literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/source.jsonl b/ben/tests/fixtures/v1.0.0/source.jsonl new file mode 100644 index 0000000..c479628 --- /dev/null +++ b/ben/tests/fixtures/v1.0.0/source.jsonl @@ -0,0 +1,5 @@ +{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,2,1,2],"sample":2} +{"assignment":[1,1,1,2],"sample":3} +{"assignment":[1,1,1,2],"sample":4} +{"assignment":[2,2,2,1],"sample":5} diff --git a/ben/tests/fixtures/v1.0.0/source_graph.json b/ben/tests/fixtures/v1.0.0/source_graph.json new file mode 100644 index 0000000..727f85b --- /dev/null +++ b/ben/tests/fixtures/v1.0.0/source_graph.json @@ -0,0 +1 @@ +{"nodes":4,"edges":[[0,1],[1,2],[2,3],[3,0]]} \ No newline at end of file diff --git a/ben/tests/fixtures/v1.0.0/source_metadata.json b/ben/tests/fixtures/v1.0.0/source_metadata.json new file mode 100644 index 0000000..a8dc128 --- /dev/null +++ b/ben/tests/fixtures/v1.0.0/source_metadata.json @@ -0,0 +1 @@ +{"variant":"standard","bundle_version":1,"description":"v1.0.0 stability fixture"} \ No newline at end of file diff --git a/ben/tests/fixtures/v1.0.0/standard.ben b/ben/tests/fixtures/v1.0.0/standard.ben new file mode 100644 index 0000000000000000000000000000000000000000..a49d8960207ef6941582955dba354024bf8c93c3 GIT binary patch literal 53 rcmWFzarAR>402I$a`jVi^Yn3LVq#!mV9a7-1X4_;7a-h96lyB~14;+> literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/standard.xben b/ben/tests/fixtures/v1.0.0/standard.xben new file mode 100644 index 0000000000000000000000000000000000000000..ab3d1a2ebf67482573ceec1544587d3538afbd19 GIT binary patch literal 104 zcmexsUKJ6=z`*cd=%ypf0pn0bMlm1{0v=xQ6Mw)E!k`_?pgEOAHG1RRQn~x{g#YO- zlIwS|U$s$U|I%-cjGgMPhx0>!{5F&6zrl8Uq61JbqkHJ2llxoPisn@@f{lud0s!mI BBKiOT literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/twodelta.ben b/ben/tests/fixtures/v1.0.0/twodelta.ben new file mode 100644 index 0000000000000000000000000000000000000000..9f50b1fc2cbd65143ae6c1c6e8b33f647c0e3684 GIT binary patch literal 62 zcmWFu_jhsi32{_#a`jVi^Yn3LVq#!mV9a7*1VSc85c>m!4Hmz^zyyShP&Oj~G_wb< literal 0 HcmV?d00001 diff --git a/ben/tests/fixtures/v1.0.0/twodelta.xben b/ben/tests/fixtures/v1.0.0/twodelta.xben new file mode 100644 index 0000000000000000000000000000000000000000..5c1fc7d914c83c5b0cee76129c7077c729c97052 GIT binary patch literal 112 zcmexsUKJ6=z`*cd=%ypf0qbBzMlm1{0=NV}wLD-5Vla+n&=Nhy&@zE#-ASY9N3(v# zRJzA^>guIU|?iq01`|L49q}Uaz|5MIYrCFi$U-PPnCy z8zuHH{pQHnsqT6>KlI0MGnxJyY_}&m0QEAuhfX@VzlE)6UKJz&Bcn=+5`i}7r7I+t z7MJAbDkK&crl^rFOq485%4{Jh8WlCCf^urlxhMOT27 z0YMOiVgRYkO)W`GNi0bOD=urRzQY2HGftpH5;O)ykrby^09~V(l$qDh?fo7A03m_Z literal 0 HcmV?d00001 diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs new file mode 100644 index 0000000..ea4730d --- /dev/null +++ b/ben/tests/test_format_stability.rs @@ -0,0 +1,407 @@ +//! Forward-compatibility stability tests. +//! +//! Each `(BenVariant × wire format)` combination, plus two BENDL bundles, is committed as a +//! byte-identical fixture under `tests/fixtures/v1.0.0/`. These tests decode each fixture and +//! confirm the produced JSONL matches the canonical input that minted it. +//! +//! Committed fixtures are a permanent compatibility contract: they MUST continue to decode cleanly +//! in every future v1.x release of the library. The `generate_format_stability_fixtures` regen test +//! at the bottom of this file is marked `#[ignore]` precisely so it is never run by accident — if +//! the wire format ever needs to change, add a new `tests/fixtures/v/` directory and a parallel +//! generator, but never overwrite an older one. See `docs/format-stability.md`. +//! +//! # Adding a new fixture for a new wire-format feature +//! +//! Within v1.0.0, the only way to add a fixture is alongside a minor-version feature that already +//! ships in v1.x. To add one: +//! +//! 1. Mint the new fixture into `tests/fixtures/v1.0.0/`. +//! 2. Add a stability test against the new file. +//! 3. Update `docs/format-stability.md` if the new fixture pins behavior not already covered. + +use std::io::{BufReader, Cursor, Read}; +use std::path::{Path, PathBuf}; + +use binary_ensemble::codec::decode::{decode_ben_to_jsonl, decode_xben_to_jsonl}; +use binary_ensemble::codec::encode::{encode_jsonl_to_ben, encode_jsonl_to_xben}; +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, + ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, HEADER_FLAG_STREAM_CHECKSUM, +}; +use binary_ensemble::io::bundle::reader::BendlReader; +use binary_ensemble::io::bundle::writer::{AddAssetOptions, BendlWriter}; +use binary_ensemble::BenVariant; + +/// Canonical JSONL used to mint every codec fixture. Chosen to exercise both Standard and +/// run-length-encoded variants: each line has multiple distinct partitions, and consecutive lines +/// repeat exactly so MkvChain/TwoDelta hit their run-length code paths. +const CANONICAL_JSONL: &str = "\ +{\"assignment\":[1,1,2,2],\"sample\":1} +{\"assignment\":[1,2,1,2],\"sample\":2} +{\"assignment\":[1,1,1,2],\"sample\":3} +{\"assignment\":[1,1,1,2],\"sample\":4} +{\"assignment\":[2,2,2,1],\"sample\":5} +"; + +/// Graph JSON committed as the `graph.json` asset inside the BENDL fixtures. Tiny but +/// representative of a real adjacency-style graph. +const CANONICAL_GRAPH_JSON: &str = "{\"nodes\":4,\"edges\":[[0,1],[1,2],[2,3],[3,0]]}"; + +/// Metadata JSON committed as the `metadata.json` asset inside the BENDL fixtures. +const CANONICAL_METADATA_JSON: &str = + "{\"variant\":\"standard\",\"bundle_version\":1,\"description\":\"v1.0.0 stability fixture\"}"; + +fn fixtures_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("v1.0.0") +} + +fn read_fixture(name: &str) -> Vec { + let path = fixtures_dir().join(name); + std::fs::read(&path).unwrap_or_else(|e| { + panic!( + "missing fixture {path:?}: {e}. Did you run `cargo test -- --ignored \ + generate_format_stability_fixtures`?" + ) + }) +} + +/// Decode a committed BEN fixture and assert the round-trip matches `CANONICAL_JSONL`. +fn assert_ben_fixture_round_trips(name: &str) { + let bytes = read_fixture(name); + let mut out = Vec::new(); + decode_ben_to_jsonl(&bytes[..], &mut out).expect("ben decode"); + assert_eq!( + String::from_utf8(out).expect("decoder output is utf-8"), + CANONICAL_JSONL, + "fixture {name} did not round-trip" + ); +} + +/// Decode a committed XBEN fixture and assert the round-trip matches `CANONICAL_JSONL`. +fn assert_xben_fixture_round_trips(name: &str) { + let bytes = read_fixture(name); + let mut out = Vec::new(); + decode_xben_to_jsonl(BufReader::new(&bytes[..]), &mut out).expect("xben decode"); + assert_eq!( + String::from_utf8(out).expect("decoder output is utf-8"), + CANONICAL_JSONL, + "fixture {name} did not round-trip" + ); +} + +#[test] +fn standard_ben_v1_0_0_round_trips() { + assert_ben_fixture_round_trips("standard.ben"); +} + +#[test] +fn mkvchain_ben_v1_0_0_round_trips() { + assert_ben_fixture_round_trips("mkvchain.ben"); +} + +#[test] +fn twodelta_ben_v1_0_0_round_trips() { + assert_ben_fixture_round_trips("twodelta.ben"); +} + +#[test] +fn standard_xben_v1_0_0_round_trips() { + assert_xben_fixture_round_trips("standard.xben"); +} + +#[test] +fn mkvchain_xben_v1_0_0_round_trips() { + assert_xben_fixture_round_trips("mkvchain.xben"); +} + +#[test] +fn twodelta_xben_v1_0_0_round_trips() { + assert_xben_fixture_round_trips("twodelta.xben"); +} + +#[test] +fn flags_set_bendl_v1_0_0_decodes_all_assets_and_stream() { + // Bundle minted with every currently-defined flag bit set: header has + // HEADER_FLAG_STREAM_CHECKSUM; an xz+json+checksum graph asset is present; a json+checksum + // metadata asset is present. The reader must verify both assets and the stream cleanly, and + // the decoded stream must round-trip back to the canonical JSONL. + let bytes = read_fixture("flags_set.bendl"); + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open bundle"); + + assert!(reader.is_finalized()); + assert_eq!(reader.assignment_format(), Some(AssignmentFormat::Xben)); + assert!(reader.header().has_stream_checksum()); + + let graph_entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .expect("graph asset present") + .clone(); + assert_eq!( + graph_entry.asset_flags, + ASSET_FLAG_JSON | ASSET_FLAG_XZ | ASSET_FLAG_CHECKSUM, + "graph asset should have every defined bit set" + ); + let graph_bytes = reader.asset_bytes(&graph_entry).expect("graph asset bytes"); + assert_eq!(graph_bytes, CANONICAL_GRAPH_JSON.as_bytes()); + + let meta_entry = reader + .find_asset_by_type(ASSET_TYPE_METADATA) + .expect("metadata asset present") + .clone(); + assert_eq!( + meta_entry.asset_flags, + ASSET_FLAG_JSON | ASSET_FLAG_CHECKSUM, + "metadata asset should be json+checksum" + ); + let meta_bytes = reader + .asset_bytes(&meta_entry) + .expect("metadata asset bytes"); + assert_eq!(meta_bytes, CANONICAL_METADATA_JSON.as_bytes()); + + reader + .verify_all_asset_checksums() + .expect("all asset checksums valid"); + reader + .verify_stream_checksum() + .expect("stream checksum valid"); + + let mut stream_bytes = Vec::new(); + reader + .assignment_stream_reader_unverified() + .expect("stream reader") + .read_to_end(&mut stream_bytes) + .expect("read stream"); + let mut decoded = Vec::new(); + decode_xben_to_jsonl(BufReader::new(&stream_bytes[..]), &mut decoded).expect("xben decode"); + assert_eq!( + String::from_utf8(decoded).expect("decoder output is utf-8"), + CANONICAL_JSONL, + "stream did not round-trip" + ); +} + +#[test] +fn unknown_flags_bendl_v1_0_0_opens_and_decodes_cleanly() { + // Bundle minted by taking flags_set.bendl and setting reserved bits on both the header flags + // and on the custom asset's asset_flags. Forward-compatible readers must ignore those bits: + // the bundle still opens, assets still verify, the stream still decodes. + let bytes = read_fixture("unknown_flags.bendl"); + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open bundle"); + + // Confirm at least one reserved header bit is set so this fixture really exercises the + // forward-compat surface. Otherwise this test would silently degrade if someone regenerated + // the file without preserving the unknown-bits property. + let known_header_bits = HEADER_FLAG_STREAM_CHECKSUM; + assert_ne!( + reader.header().flags & !known_header_bits, + 0, + "expected at least one reserved header bit set" + ); + + // The custom asset has a reserved bit set in its asset_flags. + let custom_entry = reader + .assets() + .iter() + .find(|e| e.asset_type == ASSET_TYPE_CUSTOM) + .expect("custom asset present") + .clone(); + let known_asset_bits = ASSET_FLAG_JSON | ASSET_FLAG_XZ | ASSET_FLAG_CHECKSUM; + assert_ne!( + custom_entry.asset_flags & !known_asset_bits, + 0, + "expected at least one reserved asset bit set" + ); + + // Despite the unknown bits, all known operations succeed. + reader + .verify_all_asset_checksums() + .expect("checksums still verify with unknown bits set"); + reader + .verify_stream_checksum() + .expect("stream checksum still verifies"); + + let mut stream_bytes = Vec::new(); + reader + .assignment_stream_reader_unverified() + .expect("stream reader") + .read_to_end(&mut stream_bytes) + .expect("read stream"); + let mut decoded = Vec::new(); + decode_xben_to_jsonl(BufReader::new(&stream_bytes[..]), &mut decoded).expect("xben decode"); + assert_eq!( + String::from_utf8(decoded).expect("decoder output is utf-8"), + CANONICAL_JSONL, + "stream did not round-trip" + ); +} + +// --------------------------------------------------------------------------- +// Fixture generation +// --------------------------------------------------------------------------- +// +// IMPORTANT: this is intentionally `#[ignore]`. Once v1.0.0 fixtures are committed, they MUST NOT +// be regenerated in place — see `docs/format-stability.md`. If a future format change requires +// new fixtures, add a `tests/fixtures/v/` directory and a parallel generator; never overwrite +// an older directory. + +fn write_fixture(name: &str, bytes: &[u8]) { + let dir = fixtures_dir(); + std::fs::create_dir_all(&dir).expect("create fixtures dir"); + let path = dir.join(name); + std::fs::write(&path, bytes).unwrap_or_else(|e| panic!("write {path:?}: {e}")); +} + +fn mint_ben(variant: BenVariant) -> Vec { + let mut out = Vec::new(); + encode_jsonl_to_ben(Cursor::new(CANONICAL_JSONL.as_bytes()), &mut out, variant) + .expect("encode ben"); + out +} + +fn mint_xben(variant: BenVariant) -> Vec { + let mut out = Vec::new(); + // Force single-threaded encoding with a fixed compression level so the bytes are deterministic. + // Defaults vary across machines (n_threads = available parallelism), which would make + // re-generation non-reproducible across hosts. + encode_jsonl_to_xben( + Cursor::new(CANONICAL_JSONL.as_bytes()), + &mut out, + variant, + Some(1), + Some(6), + None, + None, + ) + .expect("encode xben"); + out +} + +fn mint_flags_set_bendl() -> Vec { + let mut backing = Cursor::new(Vec::::new()); + let mut writer = BendlWriter::new(&mut backing, AssignmentFormat::Xben).expect("new writer"); + + // Graph: json + xz + checksum (writer always adds the checksum bit). + writer + .add_known_asset( + binary_ensemble::io::bundle::format::KnownAssetKind::Graph, + CANONICAL_GRAPH_JSON.as_bytes(), + AddAssetOptions::defaults().json().compress(), + ) + .expect("add graph"); + + // Metadata: json + checksum only (no xz). + writer + .add_known_asset( + binary_ensemble::io::bundle::format::KnownAssetKind::Metadata, + CANONICAL_METADATA_JSON.as_bytes(), + AddAssetOptions::defaults().json().raw(), + ) + .expect("add metadata"); + + // Stream phase: write XBEN content driven from the canonical JSONL. + let session = writer.into_stream_session().expect("into stream session"); + let mut session = session; + encode_jsonl_to_xben( + Cursor::new(CANONICAL_JSONL.as_bytes()), + &mut session, + BenVariant::Standard, + Some(1), + Some(6), + None, + None, + ) + .expect("encode xben into session"); + // sample_count == 5 (lines in CANONICAL_JSONL). + let writer = session.finish_into_writer(5); + let _ = writer.finish().expect("finish bundle"); + + backing.into_inner() +} + +/// Returns a copy of `bytes` with reserved bits set on both the header flags and the custom +/// asset's asset_flags. Used to mint the `unknown_flags.bendl` fixture from a known-good bundle. +fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { + // 1. Set bit 1 of the header flags (offset 16..20). Bit 0 is HEADER_FLAG_STREAM_CHECKSUM; bit + // 1 is currently reserved. + let mut header_flags = u32::from_le_bytes(bytes[16..20].try_into().unwrap()); + header_flags |= 1 << 1; + bytes[16..20].copy_from_slice(&header_flags.to_le_bytes()); + + // 2. Add a custom asset entry's asset_flags reserved bit. Since the writer-minted bundle does + // not include a custom asset, append one to the directory before flipping. Rather than + // surgery, do the simpler thing: reopen the bundle, append a custom asset via the + // appender API, then flip a reserved bit on its directory entry. + let mut appender = binary_ensemble::io::bundle::writer::BendlAppender::open(Cursor::new(bytes)) + .expect("open appender"); + appender + .add_custom_asset( + "extra.bin", + b"trailing custom asset", + AddAssetOptions::defaults(), + ) + .expect("add custom asset"); + let cursor = appender.commit().expect("commit appender"); + let mut bytes = cursor.into_inner(); + + // 3. Locate the custom asset's directory entry and flip bit 7 of its asset_flags. + // Directory entry layout per `BendlDirectoryEntry::to_bytes`: + // [u16 asset_type][u16 asset_flags][u16 name_len][u16 reserved][u64 payload_offset] + // [u64 payload_len][u32 checksum_len][name bytes][checksum bytes] + // asset_flags is at byte offset 2 within each entry. We scan the directory and patch the + // entry whose asset_type is ASSET_TYPE_CUSTOM. + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let entry_count_offset = directory_offset; + let entry_count = u32::from_le_bytes( + bytes[entry_count_offset..entry_count_offset + 4] + .try_into() + .unwrap(), + ) as usize; + + let mut cursor = directory_offset + 4; + for _ in 0..entry_count { + let asset_type = u16::from_le_bytes(bytes[cursor..cursor + 2].try_into().unwrap()); + let name_len = u16::from_le_bytes(bytes[cursor + 4..cursor + 6].try_into().unwrap()) as usize; + let checksum_len = u32::from_le_bytes(bytes[cursor + 24..cursor + 28].try_into().unwrap()) + as usize; + if asset_type == ASSET_TYPE_CUSTOM { + let flags_offset = cursor + 2; + let mut asset_flags = u16::from_le_bytes( + bytes[flags_offset..flags_offset + 2].try_into().unwrap(), + ); + asset_flags |= 1 << 7; // currently reserved + bytes[flags_offset..flags_offset + 2].copy_from_slice(&asset_flags.to_le_bytes()); + return bytes; + } + cursor += 28 + name_len + checksum_len; + } + panic!("custom asset not found in directory"); +} + +#[test] +#[ignore = "regenerates committed v1.0.0 fixtures; never run as part of normal CI"] +fn generate_format_stability_fixtures() { + write_fixture("standard.ben", &mint_ben(BenVariant::Standard)); + write_fixture("mkvchain.ben", &mint_ben(BenVariant::MkvChain)); + write_fixture("twodelta.ben", &mint_ben(BenVariant::TwoDelta)); + write_fixture("standard.xben", &mint_xben(BenVariant::Standard)); + write_fixture("mkvchain.xben", &mint_xben(BenVariant::MkvChain)); + write_fixture("twodelta.xben", &mint_xben(BenVariant::TwoDelta)); + + let flags_set = mint_flags_set_bendl(); + write_fixture("flags_set.bendl", &flags_set); + + let unknown_flags = flip_unknown_flag_bits(flags_set); + write_fixture("unknown_flags.bendl", &unknown_flags); + + // Also commit the canonical sources alongside so a human can read what the fixtures represent + // without invoking the codec. + write_fixture("source.jsonl", CANONICAL_JSONL.as_bytes()); + write_fixture("source_graph.json", CANONICAL_GRAPH_JSON.as_bytes()); + write_fixture("source_metadata.json", CANONICAL_METADATA_JSON.as_bytes()); + + // Print a checklist so the engineer regenerating fixtures sees what landed. + eprintln!("Wrote v1.0.0 fixtures to {:?}", fixtures_dir()); +} From c04cee7a02fa1580f37959f4aa3f8e73e7fbc4e8 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 21:41:55 -0600 Subject: [PATCH 104/221] enforce bit width consistency and add size guard --- ben/src/codec/decode/ben.rs | 136 ++++++++++++++++++++++++++++++--- ben/tests/test_stress_edges.rs | 14 ++++ 2 files changed, 140 insertions(+), 10 deletions(-) diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index b6116c3..652b561 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -1,5 +1,11 @@ use std::io::{self, Read}; +/// Upper bound on `n_bytes` accepted by [`decode_ben_line`]. A frame larger than this is rejected +/// without allocating, so malformed or adversarial input cannot OOM the process during fuzzing or +/// stream decoding. The cap is well above any legitimate BEN frame: at 64 MiB of packed RLE data +/// it would hold tens of millions of run pairs. +const MAX_FRAME_PAYLOAD_BYTES: u32 = 1 << 26; + /// Decode a single BEN frame payload into run-length encoded assignments. /// /// This function expects only the packed payload bytes for one BEN frame, not the leading per-frame @@ -15,6 +21,18 @@ use std::io::{self, Read}; /// # Returns /// /// Returns the decoded run-length encoded assignment vector as `(value, count)` pairs. +/// +/// # Errors +/// +/// Returns [`io::ErrorKind::InvalidData`] for the following corrupt-frame conditions, which the +/// library writers never produce: +/// +/// - `max_val_bits` or `max_len_bits` outside `1..=16`. +/// - `n_bytes` larger than [`MAX_FRAME_PAYLOAD_BYTES`]. +/// - A decoded pair with a zero-length run before the trailing padding region. +/// - `n_bytes` not equal to `ceil(real_pairs * (mvb + mlb) / 8)` after decoding (the encoder uses +/// `div_ceil` to compute `n_bytes`, so any other value indicates a malformed or maliciously +/// crafted frame). pub fn decode_ben_line( mut reader: R, max_val_bits: u8, @@ -30,12 +48,26 @@ pub fn decode_ben_line( )); } + if n_bytes > MAX_FRAME_PAYLOAD_BYTES { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "BEN frame payload of {n_bytes} bytes exceeds {MAX_FRAME_PAYLOAD_BYTES}; \ + refusing to allocate" + ), + )); + } + let mut assign_bits: Vec = vec![0; n_bytes as usize]; reader.read_exact(&mut assign_bits)?; - let n_assignments: usize = - (n_bytes as f64 / ((max_val_bits + max_len_bits) as f64 / 8.0)) as usize; - let mut output_rle: Vec<(u16, u16)> = Vec::with_capacity(n_assignments); + // Bit-width invariants the encoder maintains. Width values themselves are bounded by 16 each + // (checked above), so the sum fits in u32 trivially and the per-pair extraction below stays + // within the 32-bit shift register. + let bit_width = u64::from(max_val_bits) + u64::from(max_len_bits); + let total_bits = u64::from(n_bytes) * 8; + let n_assignments_upper_bound = (total_bits / bit_width) as usize; + let mut output_rle: Vec<(u16, u16)> = Vec::with_capacity(n_assignments_upper_bound); let mut buffer: u32 = 0; let mut n_bits_in_buff: u16 = 0; @@ -45,6 +77,13 @@ pub fn decode_ben_line( let mut len = 0; let mut len_set = false; + // Tracks zero-length pairs seen since the last real (len > 0) pair. The encoder never emits + // zero-length runs, so any zero-length pair in the decoded stream is either trailing padding + // (for narrow bit widths, where padding bits may form a complete pair) or a corrupt-frame + // signal. We accumulate them until either (a) the frame ends — accepted as padding — or + // (b) a real pair follows — rejected as interior corruption. + let mut pending_zero_pairs: usize = 0; + for &byte in &assign_bits { buffer |= (byte as u32).to_be() >> n_bits_in_buff; n_bits_in_buff += 8; @@ -65,7 +104,12 @@ pub fn decode_ben_line( } if val_set && len_set { - if len > 0 { + if len == 0 { + pending_zero_pairs += 1; + } else { + if pending_zero_pairs > 0 { + return Err(interior_zero_length_run_error()); + } output_rle.push((val, len)); } val_set = false; @@ -86,27 +130,99 @@ pub fn decode_ben_line( buffer <<= max_len_bits; n_bits_in_buff -= max_len_bits as u16; - if len > 0 { + if len == 0 { + pending_zero_pairs += 1; + } else { + if pending_zero_pairs > 0 { + return Err(interior_zero_length_run_error()); + } output_rle.push((val, len)); } val_set = false; } } + // n_bytes consistency: the encoder writes `n_bytes = ceil(real_pairs * bit_width / 8)`. Any + // other relationship between n_bytes and the number of real pairs we recovered is a + // corrupt-frame signal (n_bytes overstated → extra "phantom" capacity the encoder wouldn't + // allocate; n_bytes understated → real pairs would have been truncated). + let real_pairs = output_rle.len() as u64; + let expected_bytes = (real_pairs * bit_width).div_ceil(8); + if u64::from(n_bytes) != expected_bytes { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "inconsistent BEN frame size: n_bytes={n_bytes} but {real_pairs} pair(s) at \ + {bit_width} bit(s)/pair require {expected_bytes} byte(s)" + ), + )); + } + Ok(output_rle) } +fn interior_zero_length_run_error() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + "BEN frame contains an interior zero-length run; the encoder never emits zero-length runs", + ) +} + #[cfg(test)] mod tests { use super::*; use std::io::Cursor; #[test] - fn decode_ben_line_skips_zero_length_run() { - // max_val_bits=1, max_len_bits=1, 1 byte payload = 0x80. Bit layout: [val=1][len=0] → run - // with len=0 is not pushed. - let result = decode_ben_line(Cursor::new(&[0x80u8]), 1, 1, 1).unwrap(); - assert!(result.is_empty()); + fn decode_ben_line_rejects_zero_length_run_when_trailing_real_pair_present() { + // Hand-built frame: mvb=4, mlb=4 (bit_width=8 = one full byte per pair). First byte + // 0x10 = (val=1, len=0) — zero-length, should not exist. Second byte 0x23 = (val=2, + // len=3). The trailing real pair makes the leading zero-length pair "interior", which is + // rejected. + let err = + decode_ben_line(Cursor::new(&[0x10u8, 0x23u8]), 4, 4, 2).expect_err("must reject"); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("interior zero-length")); + } + + #[test] + fn decode_ben_line_rejects_inconsistent_n_bytes() { + // Plan's headline case: mvb=8, mlb=8 → 16 bits/pair = 2 bytes/pair. n_bytes=3 should + // decode 1 pair but leaves a full byte of "padding" — the encoder uses div_ceil(2*16/8)=2, + // never 3. The post-decode consistency check rejects this. + let err = decode_ben_line(Cursor::new(&[0x01u8, 0x03u8, 0xff]), 8, 8, 3) + .expect_err("must reject"); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("inconsistent")); + } + + #[test] + fn decode_ben_line_rejects_oversized_n_bytes_without_allocating() { + // n_bytes way above the sanity cap must error before allocating. We don't supply any + // bytes here because the cap check fires first; read_exact would otherwise try to fill + // ~4GiB. + let err = decode_ben_line(Cursor::new(&[][..]), 8, 8, u32::MAX).expect_err("must reject"); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("exceeds")); + } + + #[test] + fn decode_ben_line_accepts_narrow_bit_width_with_trailing_zero_padding() { + // mvb=1, mlb=1, n_bytes=1, single real pair (1, 1) at the high bits. The remaining 6 bits + // are zero, which the decoder reads as three trailing (0, 0) "phantom" pairs. These are + // padding artifacts of the byte-aligned wire format and must be accepted. + let result = decode_ben_line(Cursor::new(&[0b11_00_00_00u8]), 1, 1, 1).unwrap(); + assert_eq!(result, vec![(1u16, 1u16)]); + } + + #[test] + #[allow(clippy::unusual_byte_groupings)] + fn decode_ben_line_accepts_non_byte_aligned_frame() { + // mvb=2, mlb=3 (bit_width=5), n_bytes=2 (16 bits = 3 real pairs + 1 padding bit). Encoder + // produces this layout for RLE [(1,4),(2,1),(3,3)]; the consistency check must accept it. + let result = + decode_ben_line(Cursor::new(&[0b01100_100u8, 0b01_11011_0u8]), 2, 3, 2).unwrap(); + assert_eq!(result, vec![(1u16, 4u16), (2, 1), (3, 3)]); } #[test] diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index a83ab86..68fc7b2 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -558,6 +558,20 @@ fn seeded_malformed_ben_bytes_do_not_panic() { assert_ben_bytes_do_not_panic(inflated_frame_len); } } + + // Explicit no-panic seed: misaligned frame size. mvb=8, mlb=8 means a pair is 2 bytes; this + // frame claims n_bytes=3 with 2 real-pair bytes plus 1 phantom byte. The decoder must reject + // (InvalidData) without panicking. + let mut misaligned_standard = STANDARD_BEN_BANNER.to_vec(); + misaligned_standard.extend_from_slice(&[8u8, 8, 0, 0, 0, 3, 0x01, 0x03, 0xff]); + assert_ben_bytes_do_not_panic(misaligned_standard); + + // Explicit no-panic seed: interior zero-length run. mvb=4, mlb=4 → 1 pair per byte. Byte 1 = + // (val=1, len=0) (zero-length pair), byte 2 = (val=2, len=3) (real pair). The decoder must + // reject (InvalidData) for the interior zero-length run without panicking. + let mut interior_zero_standard = STANDARD_BEN_BANNER.to_vec(); + interior_zero_standard.extend_from_slice(&[4u8, 4, 0, 0, 0, 2, 0x10, 0x23]); + assert_ben_bytes_do_not_panic(interior_zero_standard); } #[test] From 7acb8057c97ba6ad449d378a5d36939053c44411 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 21:46:50 -0600 Subject: [PATCH 105/221] add adversarial tests --- ben/tests/test_stress_edges.rs | 258 +++++++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 68fc7b2..040fbf1 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -670,3 +670,261 @@ fn bendl_append_truncated_new_directory_is_rejected_on_reopen() { let err = expect_bendl_open_err(appended); assert!(err.to_string().contains("IO error")); } + +// --------------------------------------------------------------------------- +// BENDL adversarial-bytes fuzz +// --------------------------------------------------------------------------- + +/// Mint a valid BENDL bundle that exercises every public surface the no-panic harness will drive: +/// a finalized header with `HEADER_FLAG_STREAM_CHECKSUM`, an xz-compressed graph asset, a raw JSON +/// metadata asset, a raw custom asset, and an XBEN assignment stream. This is the seed used by +/// `seeded_malformed_bendl_bytes_do_not_panic`. +fn valid_bendl_seed() -> Vec { + let mut writer = + BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Xben).unwrap(); + writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + br#"{"nodes":4,"edges":[[0,1],[1,2],[2,3]]}"#, + AddAssetOptions::defaults().json().compress(), + ) + .unwrap(); + writer + .add_asset( + binary_ensemble::io::bundle::format::ASSET_TYPE_METADATA, + "metadata.json", + br#"{"variant":"standard","bundle_version":1}"#, + AddAssetOptions::defaults().json().raw(), + ) + .unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"trailing custom asset", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + + let mut session = writer.into_stream_session().unwrap(); + encode_jsonl_to_xben( + Cursor::new(b"{\"assignment\":[1,1,2,2],\"sample\":1}\n{\"assignment\":[1,2,1,2],\"sample\":2}\n".as_slice()), + &mut session, + BenVariant::Standard, + Some(1), + Some(1), + None, + None, + ) + .unwrap(); + let writer = session.finish_into_writer(2); + writer.finish().unwrap().into_inner() +} + +/// Open the bundle and drive every public read accessor. Any panic from any reader path fails the +/// test loudly. Errors are expected (the input is adversarial) and are silently discarded; only +/// panics matter here. +fn assert_bendl_bytes_do_not_panic(bytes: Vec) { + let outcome = std::panic::catch_unwind(|| { + let mut reader = match BendlReader::open(Cursor::new(bytes)) { + Ok(r) => r, + Err(_) => return, + }; + + // Header / sample-count getters never read further bytes; they should always be safe. + let _ = reader.is_finalized(); + let _ = reader.sample_count(); + let _ = reader.assignment_format(); + + // Stream range computation; may seek to EOF on unfinalized bundles but never panics. + let _ = reader.assignment_stream_range(); + + // Drive each asset accessor with a bounded read so a wildly inflated payload_len cannot + // OOM the test process. We cap at 1 MiB per asset; legitimate fixtures here are well + // under that. + let entries: Vec<_> = reader.assets().to_vec(); + for entry in &entries { + if let Ok(mut r) = reader.asset_reader(entry) { + let mut buf = [0u8; 1024]; + for _ in 0..1024 { + match r.read(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => {} + } + } + } + if let Ok(mut r) = reader.asset_reader_unverified(entry) { + let mut buf = [0u8; 1024]; + for _ in 0..1024 { + match r.read(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => {} + } + } + } + if let Ok(mut r) = reader.asset_payload_reader_unverified(entry) { + let mut buf = [0u8; 1024]; + for _ in 0..1024 { + match r.read(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => {} + } + } + } + let _ = reader.verify_asset_checksum(entry); + } + + // verify_all_asset_checksums short-circuits on first mismatch; bounded by directory size. + let _ = reader.verify_all_asset_checksums(); + + // Stream accessors. The verified raw path may surface ChecksumError; that's fine — we + // only care about absence of panics. + if let Ok(mut r) = reader.assignment_stream_reader() { + let mut buf = [0u8; 1024]; + for _ in 0..1024 { + match r.read(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => {} + } + } + } + if let Ok(mut r) = reader.assignment_stream_reader_unverified() { + let mut buf = [0u8; 1024]; + for _ in 0..1024 { + match r.read(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => {} + } + } + } + if let Ok(decoded) = reader.open_assignment_reader() { + // Take a bounded prefix of the iterator to avoid spinning on adversarial frame + // counts. The frame-payload cap in decode_ben_line already bounds per-frame work. + for record in decoded.silent(true).take(64) { + let _ = record; + } + } + let _ = reader.verify_stream_checksum(); + }); + assert!(outcome.is_ok(), "BENDL reader panicked on adversarial input"); +} + +/// Sanity cap used when fuzzing length fields. Inflating to `u32::MAX` would turn this into an OOM +/// stress test; legitimate fixtures here are kilobytes at most. The cap is large enough that the +/// inflated frames still exercise "value far past end of input" paths, but small enough that the +/// resulting allocations are negligible. +const ADVERSARIAL_LEN_CAP: u32 = 1 << 16; // 64 KiB + +#[test] +fn seeded_malformed_bendl_bytes_do_not_panic() { + let seed = valid_bendl_seed(); + + // Truncation at every length, including zero and full size. + for len in 0..=seed.len() { + assert_bendl_bytes_do_not_panic(seed[..len].to_vec()); + } + + // Single-byte XOR mutations everywhere. This covers every byte of the header, directory, and + // payload regions — the same coverage pattern the BEN/XBEN fuzz tests use. + for idx in 0..seed.len() { + let mut mutated = seed.clone(); + mutated[idx] ^= 0xA5; + assert_bendl_bytes_do_not_panic(mutated); + } + + // Length-field inflation seeds. Header field offsets per the v1.0.0 spec: + // bytes 16..20 : flags (u32) + // bytes 20..24 : stream_checksum (u32) + // bytes 24..32 : directory_offset (u64) + // bytes 32..40 : directory_len (u64) + // bytes 40..48 : stream_offset (u64) + // bytes 48..56 : stream_len (u64) + let make_inflated = |range: std::ops::Range, value: u64| -> Vec { + let len = range.end - range.start; + let mut bytes = seed.clone(); + bytes[range].copy_from_slice(&value.to_le_bytes()[..len]); + bytes + }; + + // directory_offset past EOF. + assert_bendl_bytes_do_not_panic(make_inflated(24..32, u64::MAX)); + // directory_len past EOF (capped to avoid OOM if the implementation pre-allocates). + assert_bendl_bytes_do_not_panic(make_inflated(32..40, ADVERSARIAL_LEN_CAP as u64)); + // stream_offset past EOF. + assert_bendl_bytes_do_not_panic(make_inflated(40..48, u64::MAX)); + // stream_len past EOF (capped). + assert_bendl_bytes_do_not_panic(make_inflated(48..56, ADVERSARIAL_LEN_CAP as u64)); + // stream_offset + stream_len overflowing u64. + let mut overflow_bundle = seed.clone(); + overflow_bundle[40..48].copy_from_slice(&(u64::MAX - 1).to_le_bytes()); + overflow_bundle[48..56].copy_from_slice(&u64::MAX.to_le_bytes()); + assert_bendl_bytes_do_not_panic(overflow_bundle); + // Reserved header flag bits set. + assert_bendl_bytes_do_not_panic(make_inflated(16..20, u32::MAX as u64)); + // Non-zero alignment_padding at bytes 14..16; we don't have a make_inflated for u16 so do it + // inline. + let mut padded = seed.clone(); + padded[14..16].copy_from_slice(&u16::MAX.to_le_bytes()); + assert_bendl_bytes_do_not_panic(padded); + + // Directory-entry length-field inflation. The directory starts at directory_offset and begins + // with a u32 entry_count followed by the entries themselves. Each entry header is 28 bytes: + // u16 asset_type | u16 asset_flags | u16 name_len | u16 reserved + // u64 payload_offset | u64 payload_len | u32 checksum_len + let directory_offset = u64::from_le_bytes(seed[24..32].try_into().unwrap()) as usize; + let entry_count = u32::from_le_bytes( + seed[directory_offset..directory_offset + 4] + .try_into() + .unwrap(), + ); + assert!(entry_count > 0, "valid_bendl_seed must contain entries"); + + // entry_count inflation (capped to keep test runtime bounded — the reader must not try to + // pre-allocate a Vec with u32::MAX capacity, but we don't want to find out the hard way here). + let mut inflated_entry_count = seed.clone(); + inflated_entry_count[directory_offset..directory_offset + 4] + .copy_from_slice(&ADVERSARIAL_LEN_CAP.to_le_bytes()); + assert_bendl_bytes_do_not_panic(inflated_entry_count); + + // Walk each entry and inflate its per-entry length fields one at a time. + let mut entry_cursor = directory_offset + 4; + for _ in 0..entry_count { + let name_len = + u16::from_le_bytes(seed[entry_cursor + 4..entry_cursor + 6].try_into().unwrap()) + as usize; + let checksum_len = u32::from_le_bytes( + seed[entry_cursor + 24..entry_cursor + 28] + .try_into() + .unwrap(), + ) as usize; + let entry_size = 28 + name_len + checksum_len; + + // name_len inflation (capped). + let mut inflated = seed.clone(); + inflated[entry_cursor + 4..entry_cursor + 6] + .copy_from_slice(&(ADVERSARIAL_LEN_CAP as u16).to_le_bytes()); + assert_bendl_bytes_do_not_panic(inflated); + + // checksum_len inflation (capped). + let mut inflated = seed.clone(); + inflated[entry_cursor + 24..entry_cursor + 28] + .copy_from_slice(&ADVERSARIAL_LEN_CAP.to_le_bytes()); + assert_bendl_bytes_do_not_panic(inflated); + + // payload_len inflation to u64::MAX. ExactLen at read time, plus the per-frame decode + // cap, prevent any actual allocation. + let mut inflated = seed.clone(); + inflated[entry_cursor + 16..entry_cursor + 24] + .copy_from_slice(&u64::MAX.to_le_bytes()); + assert_bendl_bytes_do_not_panic(inflated); + + // payload_offset past EOF. + let mut inflated = seed.clone(); + inflated[entry_cursor + 8..entry_cursor + 16] + .copy_from_slice(&u64::MAX.to_le_bytes()); + assert_bendl_bytes_do_not_panic(inflated); + + entry_cursor += entry_size; + } +} From f4a93f9691d46dcb0a9fdfff0c967078ba032f5e Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:15:52 -0600 Subject: [PATCH 106/221] add more edge-case stress tests --- ben/tests/test_stress_edges.rs | 272 +++++++++++++++++++++++++++++++-- 1 file changed, 260 insertions(+), 12 deletions(-) diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 040fbf1..4be9bcd 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -8,14 +8,14 @@ use binary_ensemble::format::banners::{ MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::bundle::format::{ - encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, ASSET_TYPE_CUSTOM, - ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_YES, - HEADER_SIZE, + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, + ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, + FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; use binary_ensemble::io::bundle::writer::{ AddAssetOptions, BendlAppender, BendlTruncate, BendlWriter, }; -use binary_ensemble::io::bundle::BendlReader; +use binary_ensemble::io::bundle::{BendlReadError, BendlReader, ChecksumError, ChecksumTarget}; use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; @@ -680,8 +680,7 @@ fn bendl_append_truncated_new_directory_is_rejected_on_reopen() { /// metadata asset, a raw custom asset, and an XBEN assignment stream. This is the seed used by /// `seeded_malformed_bendl_bytes_do_not_panic`. fn valid_bendl_seed() -> Vec { - let mut writer = - BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Xben).unwrap(); + let mut writer = BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Xben).unwrap(); writer .add_asset( ASSET_TYPE_GRAPH, @@ -709,7 +708,10 @@ fn valid_bendl_seed() -> Vec { let mut session = writer.into_stream_session().unwrap(); encode_jsonl_to_xben( - Cursor::new(b"{\"assignment\":[1,1,2,2],\"sample\":1}\n{\"assignment\":[1,2,1,2],\"sample\":2}\n".as_slice()), + Cursor::new( + b"{\"assignment\":[1,1,2,2],\"sample\":1}\n{\"assignment\":[1,2,1,2],\"sample\":2}\n" + .as_slice(), + ), &mut session, BenVariant::Standard, Some(1), @@ -807,7 +809,10 @@ fn assert_bendl_bytes_do_not_panic(bytes: Vec) { } let _ = reader.verify_stream_checksum(); }); - assert!(outcome.is_ok(), "BENDL reader panicked on adversarial input"); + assert!( + outcome.is_ok(), + "BENDL reader panicked on adversarial input" + ); } /// Sanity cap used when fuzzing length fields. Inflating to `u32::MAX` would turn this into an OOM @@ -915,16 +920,259 @@ fn seeded_malformed_bendl_bytes_do_not_panic() { // payload_len inflation to u64::MAX. ExactLen at read time, plus the per-frame decode // cap, prevent any actual allocation. let mut inflated = seed.clone(); - inflated[entry_cursor + 16..entry_cursor + 24] - .copy_from_slice(&u64::MAX.to_le_bytes()); + inflated[entry_cursor + 16..entry_cursor + 24].copy_from_slice(&u64::MAX.to_le_bytes()); assert_bendl_bytes_do_not_panic(inflated); // payload_offset past EOF. let mut inflated = seed.clone(); - inflated[entry_cursor + 8..entry_cursor + 16] - .copy_from_slice(&u64::MAX.to_le_bytes()); + inflated[entry_cursor + 8..entry_cursor + 16].copy_from_slice(&u64::MAX.to_le_bytes()); assert_bendl_bytes_do_not_panic(inflated); entry_cursor += entry_size; } } + +// --------------------------------------------------------------------------- +// Open-rejected variant-pinning. Each fixture must fail BendlReader::open +// with a specific BendlFormatError variant, not just an unspecified Err. +// --------------------------------------------------------------------------- + +#[test] +fn bendl_open_rejects_directory_offset_past_eof() { + // directory_offset claims a position well past the actual file. Cursor seek succeeds (its + // position is u64) but the subsequent read returns Ok(0); read_directory's read_exact for the + // entry count fails with UnexpectedEof, which becomes BendlFormatError::Io. + let mut bytes = valid_bendl_seed(); + let past_eof = (bytes.len() as u64) + 4096; + bytes[24..32].copy_from_slice(&past_eof.to_le_bytes()); + let err = expect_bendl_open_err(bytes); + assert!( + matches!(err, BendlFormatError::Io(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof), + "expected BendlFormatError::Io(UnexpectedEof), got {err:?}" + ); +} + +#[test] +fn bendl_open_rejects_directory_offset_plus_directory_len_overflow() { + // directory_offset + directory_len overflows u64. The reader has no chance to read anything at + // u64::MAX - 4; the failure surface is the same UnexpectedEof from the bounded read attempt. + let mut bytes = valid_bendl_seed(); + bytes[24..32].copy_from_slice(&(u64::MAX - 4).to_le_bytes()); + bytes[32..40].copy_from_slice(&100u64.to_le_bytes()); + let err = expect_bendl_open_err(bytes); + assert!( + matches!(err, BendlFormatError::Io(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof), + "expected BendlFormatError::Io(UnexpectedEof), got {err:?}" + ); +} + +#[test] +fn bendl_open_rejects_name_len_longer_than_remaining_directory_bytes() { + // Build a one-entry directory by hand whose name_len field claims more bytes than the + // directory range provides. The bounded Take in BendlReader::open prevents the read from + // escaping into the asset region; read_exact for the name buffer then fails inside the bound. + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "ab".to_string(), + payload_offset: HEADER_SIZE as u64, + payload_len: 0, + checksum: None, + }]; + let mut bytes = minimal_bendl_with_entries(entries, 0); + + // Directory layout in the bundle starts at HEADER_SIZE: [u32 count][entry_header (28 bytes) + // including u16 name_len at offset +4][name bytes][checksum bytes]. + // Patch name_len from 2 to a huge value that exceeds the directory's declared length. + let name_len_offset = HEADER_SIZE + 4 + 4; + bytes[name_len_offset..name_len_offset + 2].copy_from_slice(&u16::MAX.to_le_bytes()); + + let err = expect_bendl_open_err(bytes); + assert!( + matches!(err, BendlFormatError::Io(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof), + "expected BendlFormatError::Io(UnexpectedEof), got {err:?}" + ); +} + +// --------------------------------------------------------------------------- +// Openable behavioral pins. Each fixture must let BendlReader::open succeed and then +// surface the documented behavior through the accessors. +// --------------------------------------------------------------------------- + +#[test] +fn bendl_unknown_header_flag_bits_are_ignored() { + // Forward-compat contract: bits 1..31 of `flags` are reserved. Setting them on a finalized + // bundle must not change anything observable — open succeeds, directory entries are intact, + // verify_stream_checksum passes, asset access works. + let mut bytes = valid_bendl_seed(); + let flags_offset = 16; + let original_flags = + u32::from_le_bytes(bytes[flags_offset..flags_offset + 4].try_into().unwrap()); + assert!( + original_flags & HEADER_FLAG_STREAM_CHECKSUM != 0, + "seed must have STREAM_CHECKSUM set; otherwise this test is testing the wrong contract" + ); + let polluted_flags = original_flags | (1u32 << 5) | (1u32 << 31); + bytes[flags_offset..flags_offset + 4].copy_from_slice(&polluted_flags.to_le_bytes()); + + let mut reader = + BendlReader::open(Cursor::new(bytes)).expect("unknown flag bits must not block open"); + assert!(reader.is_finalized()); + assert_eq!( + reader.assets().len(), + 3, + "all three seed assets must be present" + ); + + // Stream CRC must still pass — the verifier doesn't inspect reserved bits. + reader + .verify_stream_checksum() + .expect("stream CRC must still verify with unknown flag bits set"); + + // Asset access must work for both compressed and uncompressed entries. + for entry in reader.assets().to_vec() { + reader.asset_bytes(&entry).unwrap_or_else(|e| { + panic!( + "asset {} read failed with unknown flags set: {e:?}", + entry.name + ) + }); + } +} + +#[test] +fn bendl_clear_stream_checksum_flag_with_nonzero_bytes_returns_unavailable_not_mismatch() { + // Plan-mandated contract: when HEADER_FLAG_STREAM_CHECKSUM is clear, verified stream APIs + // must return Unavailable regardless of what's in bytes 20..24. Pin this by clearing the flag + // but leaving non-zero garbage in the stream_checksum slot — a buggy reader that interpreted + // bytes 20..24 unconditionally would return Mismatch (since the garbage would not match the + // actual CRC). + let mut bytes = valid_bendl_seed(); + let flags_offset = 16; + let cleared_flags = + u32::from_le_bytes(bytes[flags_offset..flags_offset + 4].try_into().unwrap()) + & !HEADER_FLAG_STREAM_CHECKSUM; + bytes[flags_offset..flags_offset + 4].copy_from_slice(&cleared_flags.to_le_bytes()); + bytes[20..24].copy_from_slice(&0xDEADBEEFu32.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + + let expect_unavailable = |result: Result<_, BendlReadError>| match result { + Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream, + })) => {} + Err(other) => panic!("expected Unavailable(Stream), got {other:?}"), + Ok(_) => panic!("expected Err, got Ok"), + }; + + expect_unavailable(reader.assignment_stream_reader().map(|_| ())); + expect_unavailable(reader.open_assignment_reader().map(|_| ())); + expect_unavailable(reader.verify_stream_checksum()); + + // Asset access is an independent checksum domain and must still verify normally. + for entry in reader.assets().to_vec() { + reader + .asset_bytes(&entry) + .unwrap_or_else(|e| panic!("asset {} read failed: {e:?}", entry.name)); + } +} + +#[test] +fn bendl_nonzero_alignment_padding_is_ignored() { + // alignment_padding occupies bytes 14..16. Writers zero it; readers must ignore non-zero bytes + // there. Forward-compat insurance: a future writer that accidentally stamps something into the + // padding region must not break readers. + let mut bytes = valid_bendl_seed(); + bytes[14..16].copy_from_slice(&u16::MAX.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)) + .expect("non-zero alignment_padding must not block open"); + reader + .verify_stream_checksum() + .expect("stream CRC must still verify with non-zero alignment_padding"); + for entry in reader.assets().to_vec() { + reader + .asset_bytes(&entry) + .unwrap_or_else(|e| panic!("asset {} read failed: {e:?}", entry.name)); + } +} + +#[test] +fn bendl_stream_offset_plus_stream_len_overflow_surfaces_short_range() { + // stream_offset + stream_len overflows u64. BendlReader::open does not validate stream range + // (intentional — keeps metadata inspection cheap), so open succeeds. Each accessor must + // surface the strict-EOF contract: the verified raw stream reader returns UnexpectedEof from + // read; verify_stream_checksum returns BendlReadError::Io(UnexpectedEof); + // open_assignment_reader either fails at construction or surfaces UnexpectedEof during + // iteration; assignment_stream_reader_unverified surfaces UnexpectedEof on read. + let mut bytes = valid_bendl_seed(); + bytes[40..48].copy_from_slice(&(u64::MAX - 5).to_le_bytes()); + bytes[48..56].copy_from_slice(&u64::MAX.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + + match reader.verify_stream_checksum() { + Err(BendlReadError::Io(ref e)) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } + + let mut buf = [0u8; 64]; + let mut raw = reader + .assignment_stream_reader() + .expect("constructor must succeed"); + let err = raw.read(&mut buf).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + drop(raw); + + let mut raw_unverified = reader + .assignment_stream_reader_unverified() + .expect("constructor must succeed"); + let err = raw_unverified.read(&mut buf).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + drop(raw_unverified); + + // open_assignment_reader either fails at construction (banner read into short range) or fails + // during iteration. Both surfaces are acceptable per the 0.1c contract; both must be Io-not- + // Decode. + match reader.open_assignment_reader() { + Err(BendlReadError::Io(ref e)) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), + Err(other) => panic!("expected Io(UnexpectedEof) at construction, got {other:?}"), + Ok(mut decoded) => { + let mut saw_unexpected_eof = false; + for record in (&mut decoded).take(64) { + if let Err(e) = record { + assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof); + saw_unexpected_eof = true; + break; + } + } + assert!( + saw_unexpected_eof, + "decoded iterator must surface UnexpectedEof" + ); + } + } +} + +#[test] +fn bendl_stream_offset_past_eof_surfaces_short_range() { + // stream_offset alone points past EOF. Same surface contract as the overflow case — open + // succeeds; every stream accessor reports UnexpectedEof on read. + let mut bytes = valid_bendl_seed(); + let past_eof = (bytes.len() as u64) + 4096; + bytes[40..48].copy_from_slice(&past_eof.to_le_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + + match reader.verify_stream_checksum() { + Err(BendlReadError::Io(ref e)) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), + other => panic!("expected BendlReadError::Io(UnexpectedEof), got {other:?}"), + } + + let mut buf = [0u8; 64]; + let mut raw_unverified = reader + .assignment_stream_reader_unverified() + .expect("constructor must succeed"); + let err = raw_unverified.read(&mut buf).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} From 3cfee6604b09bedbac4646b4e9906d9346170da3 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:24:36 -0600 Subject: [PATCH 107/221] check twodelta boundary test --- ben/src/io/writer/tests.rs | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 19112ca..6f48947 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -143,6 +143,63 @@ fn writer_twodelta_chunk_size_1() { assert_eq!(results, assignments); } +#[test] +fn writer_twodelta_chunk_boundary_off_by_one_grid() { + // Off-by-one bugs in the chunked TwoDelta path hide exactly at the boundaries between full + // chunks and partial trailing chunks: a flush that runs one short of the chunk boundary, a + // flush that exactly fills it, and a flush that spills one past. Sweep both the first chunk + // (samples = chunk - 1, chunk, chunk + 1) and the second chunk (samples = 2*chunk - 1, + // 2*chunk, 2*chunk + 1) for every plausible chunk size, including the default 10_000. + // + // Each test generates assignments that strictly alternate between an anchor pattern and a + // delta pattern so the writer is forced through both the anchor-frame and delta-frame paths; + // a writer that miscounts chunk boundaries would either drop the final partial chunk, write + // a stale anchor for the next chunk, or scramble the delta chain. + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![2u16, 2, 1, 1]; + + for &chunk_size in &[2usize, 7, 64, 10_000] { + for &n_samples in &[ + chunk_size.saturating_sub(1), + chunk_size, + chunk_size + 1, + 2 * chunk_size - 1, + 2 * chunk_size, + 2 * chunk_size + 1, + ] { + if n_samples == 0 { + continue; + } + let assignments: Vec> = (0..n_samples) + .map(|i| if i % 2 == 0 { anchor.clone() } else { delta.clone() }) + .collect(); + + let mut xben = Vec::new(); + { + let mut writer = + build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(chunk_size)); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); + let decoded: Vec> = reader + .flat_map(|r| { + let (a, count) = r.unwrap(); + std::iter::repeat(a).take(count as usize) + }) + .collect(); + assert_eq!( + decoded, assignments, + "TwoDelta chunk-boundary round-trip failed for chunk_size={chunk_size}, \ + n_samples={n_samples}" + ); + } + } +} + #[test] fn writer_twodelta_chunk_size_larger_than_stream() { let a = vec![1u16, 1, 2, 2]; From f8986c8d4508118b32f72e3f25c08fff5fd26d7e Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:29:32 -0600 Subject: [PATCH 108/221] more boundary tests --- ben/src/codec/encode/tests.rs | 135 ++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 98ce749..06d5b19 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1336,3 +1336,138 @@ fn ben32_encode_run_exceeding_u16_max_splits_correctly() { assert_eq!(second & 0xFFFF, 2u32); // remaining 2 elements assert_eq!(sentinel, 0u32); // always-present zero sentinel } + +// --------------------------------------------------------------------------- +// Bit-packing boundary widths +// --------------------------------------------------------------------------- + +/// Round-trip `assignment` through `BenStreamWriter::for_ben` + `BenStreamReader::from_ben`, +/// asserting the decoded result matches the input. Used by the bit-packing boundary-width sweep +/// below to confirm both encoder and decoder agree at every interesting width. +fn assert_ben_round_trip(assignment: Vec, variant: BenVariant) { + use crate::io::reader::BenStreamReader; + use crate::io::writer::BenStreamWriter; + use std::io::Cursor; + + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, variant).unwrap(); + writer.write_assignment(assignment.clone()).unwrap(); + if matches!(variant, BenVariant::TwoDelta) { + // TwoDelta needs at least one delta frame after the anchor for the + // delta-frame bit-packing path to be exercised at all; otherwise the round-trip only + // tests the anchor frame (which is MkvChain-shaped). + writer.write_assignment(assignment.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(Cursor::new(&ben)).unwrap(); + let decoded: Vec> = reader + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + let expected = if matches!(variant, BenVariant::TwoDelta) { + vec![assignment.clone(), assignment] + } else { + vec![assignment] + }; + assert_eq!( + decoded, expected, + "BEN round-trip failed for variant {variant:?}" + ); +} + +/// For each `width` in {1, 7, 8, 9, 16} — single-bit, just-under and just-over byte-aligned, +/// exactly byte-aligned, and the upper bound — exercise both the `mvb` and `mlb` sides of the +/// bit-packing code by constructing assignments whose encoded frame is forced to that width. The +/// encoder picks `max_val_bit_count = (16 - max_val.leading_zeros()).max(1)`, so a `max_val` of +/// `2.pow(width - 1)` (or 1 for width=1) lands on the requested width exactly. +/// +/// The upper bound of `17` is not tested here because the encoder cannot produce it — `u16` +/// values cap bit widths at 16. Rejection of a hand-built frame with mvb=17 is already pinned by +/// `malformed_ben_bit_widths_return_invalid_data` in `tests/test_stress_edges.rs`. +/// +/// `width=1` is a degenerate case: the only valid `max_val` is 1, so the assignment must contain +/// only the value 1, which collapses into a single run. mvb and mlb cannot be isolated +/// independently at this width, so it appears once via the `[1]` single-element fixture (mvb=1 +/// and mlb=1 together). +#[test] +fn bit_packing_boundary_widths_round_trip() { + // width=1: max_val=1 means all values are 1; the only way to keep mlb=1 too is a single- + // element assignment. Both sides at width=1 are pinned by this one fixture. + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_ben_round_trip(vec![1u16], variant); + } + + for &width in &[7u8, 8, 9, 16] { + let peak = 1u16 << (width - 1); + + // mvb sweep: alternating `1` and `peak` produces 4 runs of length 1, giving max_val=peak + // (so mvb=width) and max_len=1 (so mlb=1). TwoDelta uses only mlb (its label pair is not + // bit-packed against a max value), so it's skipped on this side. + let mvb_assignment = vec![1u16, peak, 1, peak]; + for variant in [BenVariant::Standard, BenVariant::MkvChain] { + assert_ben_round_trip(mvb_assignment.clone(), variant); + } + + // mlb sweep: a single run of length `peak` of value 1. max_val=1 (mvb=1), max_len=peak + // (mlb=width). All three variants exercise the run-length bit packing. + let mlb_assignment = vec![1u16; peak as usize]; + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_ben_round_trip(mlb_assignment.clone(), variant); + } + } +} + +/// Independently verify that the encoder actually picks the bit width we expect for each fixture +/// in `bit_packing_boundary_widths_round_trip`. If a future encoder change makes a different +/// width choice for the same input, the round-trip test above can still pass — this guards +/// against that drift. +#[test] +fn bit_packing_boundary_widths_pin_encoder_choice() { + fn standard_widths(assignment: Vec) -> (u8, u8) { + let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None); + if let BenEncodeFrame::Standard { + max_val_bit_count, + max_len_bit_count, + .. + } = frame + { + (max_val_bit_count, max_len_bit_count) + } else { + panic!("expected Standard frame"); + } + } + + // width=1: [1] → mvb=1, mlb=1. + assert_eq!(standard_widths(vec![1u16]), (1, 1), "width=1 drift"); + + for &width in &[7u8, 8, 9, 16] { + let peak = 1u16 << (width - 1); + + // mvb side: [1, peak, 1, peak] → max_val=peak (mvb=width), max_len=1 (mlb=1). + assert_eq!( + standard_widths(vec![1u16, peak, 1, peak]), + (width, 1), + "mvb drift at width={width}" + ); + + // mlb side: [1; peak] → max_val=1 (mvb=1), max_len=peak (mlb=width). + assert_eq!( + standard_widths(vec![1u16; peak as usize]), + (1, width), + "mlb drift at width={width}" + ); + } +} From 24f86722b49a06fe8ccdd02698ac3cfca625cf65 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:33:38 -0600 Subject: [PATCH 109/221] check label value 0 makes the round trip --- ben/src/codec/encode/tests.rs | 113 ++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 06d5b19..cca8815 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1337,6 +1337,119 @@ fn ben32_encode_run_exceeding_u16_max_splits_correctly() { assert_eq!(sentinel, 0u32); // always-present zero sentinel } +// --------------------------------------------------------------------------- +// Label-value 0 round-trips for MkvChain and TwoDelta +// --------------------------------------------------------------------------- + +/// MkvChain round-trip with label `0` in the assignment. The existing +/// `encode_jsonl_to_ben_single_zero` test covers Standard; MkvChain is structurally similar but +/// the count-byte plumbing and run-length code paths are distinct enough to warrant their own +/// pin. +#[test] +fn mkvchain_round_trip_with_label_zero() { + use crate::io::reader::BenStreamReader; + use crate::io::writer::BenStreamWriter; + use std::io::Cursor; + + let assignments = vec![vec![0u16, 0, 1], vec![0u16, 1, 1], vec![0u16, 0, 0]]; + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::MkvChain).unwrap(); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let decoded: Vec> = BenStreamReader::from_ben(Cursor::new(ben)) + .unwrap() + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!(decoded, assignments); +} + +/// TwoDelta round-trips with delta-frame pairs that contain `0`. The pair `(first, second)` is +/// computed by `twodelta_repeat_runs` from the first distinct values it sees; these fixtures +/// force both orderings — `(0, 1)` and `(1, 0)` — to confirm the bit-packing and unpacking +/// handle a zero-valued label on either side of the pair. Each fixture pairs an anchor with at +/// least one delta so the delta-frame path is exercised, not just the anchor. +/// +/// The degenerate `(0, 0)` case is not testable through this writer path: `twodelta_repeat_runs` +/// guarantees `second != first` via a `first + 1` fallback when only one distinct value is +/// present, so an assignment of `[0, 0, 0, 0]` yields the pair `(0, 1)` rather than `(0, 0)`. +/// The all-identical case is covered by `writer_twodelta_all_identical_values` in +/// `io/writer/tests.rs` (using `vec![3u16; 8]`); the only zero-specific aspect missing there is +/// `vec![0u16; 8]`, which would exercise the same code path with the value field cleared. +#[test] +fn twodelta_round_trip_with_label_zero_pairs() { + use crate::io::reader::BenStreamReader; + use crate::io::writer::BenStreamWriter; + use std::io::Cursor; + + let fixtures = vec![ + ("pair (0, 1)", vec![vec![0u16, 0, 1, 1], vec![0u16, 1, 0, 1]]), + ("pair (1, 0)", vec![vec![1u16, 1, 0, 0], vec![1u16, 0, 1, 0]]), + ]; + + for (label, assignments) in fixtures { + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let decoded: Vec> = BenStreamReader::from_ben(Cursor::new(ben)) + .unwrap() + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!( + decoded, assignments, + "TwoDelta round-trip failed for fixture: {label}" + ); + } +} + +/// All-zero assignment in TwoDelta: exercises the repeat/anchor-only path on the value `0`. The +/// existing `writer_twodelta_all_identical_values` test in `io/writer/tests.rs` already covers +/// this shape with value `3`; this companion confirms value `0` survives the same path, +/// guarding against any future code path that uses `0` as a sentinel. +#[test] +fn twodelta_round_trip_all_zero_assignment() { + use crate::io::reader::BenStreamReader; + use crate::io::writer::BenStreamWriter; + use std::io::Cursor; + + let assign = vec![0u16; 4]; + let assignments: Vec> = (0..5).map(|_| assign.clone()).collect(); + + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + for a in &assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let decoded: Vec> = BenStreamReader::from_ben(Cursor::new(ben)) + .unwrap() + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!(decoded, assignments); +} + // --------------------------------------------------------------------------- // Bit-packing boundary widths // --------------------------------------------------------------------------- From f146f52ecdeff2a1619f1cef8c4f05ab9989b410 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:39:15 -0600 Subject: [PATCH 110/221] add property tests for bendl --- ben/tests/test_bendl_append_proptest.rs | 294 ++++++++++++++++++++++++ 1 file changed, 294 insertions(+) create mode 100644 ben/tests/test_bendl_append_proptest.rs diff --git a/ben/tests/test_bendl_append_proptest.rs b/ben/tests/test_bendl_append_proptest.rs new file mode 100644 index 0000000..8ee2945 --- /dev/null +++ b/ben/tests/test_bendl_append_proptest.rs @@ -0,0 +1,294 @@ +//! Property-based stress test for [`BendlAppender`]. +//! +//! Append is the most subtle BENDL invariant: existing payload offsets must not move; the +//! directory rewrite must be atomic; aborted appends and drops must leave the file unchanged. +//! Existing fixed-seed tests in `ben/src/io/bundle/tests/writer.rs` cover the happy path but +//! cannot explore the full grammar of append/abort/drop sequences. +//! +//! This proptest generates sequences of `AddAsset` / `Commit` / `Abort` / `DropWithoutCommit` +//! operations and verifies: +//! +//! 1. After every `Commit`, every previously-committed asset is still readable and its decoded +//! bytes match what was originally added. +//! 2. After every `Commit`, every existing directory entry's `(payload_offset, payload_len)` is +//! unchanged, and the raw bytes at those offsets are byte-for-byte identical to before the +//! commit. This is the strong append-only invariant. +//! 3. After every `Abort` or drop-without-commit, the file is byte-identical to before the +//! appender was opened. + +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, BendlDirectoryEntry, ASSET_TYPE_CUSTOM, +}; +use binary_ensemble::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriter}; +use binary_ensemble::io::bundle::BendlReader; +use proptest::prelude::*; +use std::io::{Cursor, Read, Seek, SeekFrom}; + +#[derive(Debug, Clone)] +enum Op { + /// Open an appender (if none is open) and enqueue a pending asset. + AddAsset { + payload: Vec, + compress: bool, + }, + /// Commit the currently-open appender, if any. + Commit, + /// Abort the currently-open appender via the explicit `.abort()` API, if any. + Abort, + /// Drop the currently-open appender without committing, if any. Distinguished from `Abort` + /// because they exercise different code paths internally even though both leave the file + /// unchanged. + DropWithoutCommit, +} + +fn op_strategy() -> impl Strategy { + prop_oneof![ + 4 => (any::(), 0usize..=64usize).prop_map(|(compress, n)| Op::AddAsset { + payload: (0..n).map(|i| (i as u8).wrapping_mul(31)).collect(), + compress, + }), + 3 => Just(Op::Commit), + 1 => Just(Op::Abort), + 1 => Just(Op::DropWithoutCommit), + ] +} + +/// Build the seed bundle used by every proptest case: a finalized bundle with one initial custom +/// asset and a short stream so there's something to preserve across appends. +fn build_seed_bundle() -> Vec { + let mut writer = BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "seed.bin", + b"seed payload bytes", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + use std::io::Write; + session.write_all(b"STANDARD BEN FILE\x00\x01\x02").unwrap(); + let writer = session.finish_into_writer(1); + writer.finish().unwrap().into_inner() +} + +/// Read the raw bytes at a given offset/length range from a buffer. Used to compare an existing +/// directory entry's payload bytes before and after a commit. +fn raw_bytes_at(buf: &[u8], offset: u64, len: u64) -> Vec { + let start = offset as usize; + let end = start + len as usize; + buf[start..end].to_vec() +} + +/// Snapshot the (offset, len, raw payload bytes) for every directory entry in `bytes`. The +/// invariant is that these tuples must be unchanged after an append-only commit. +fn snapshot_existing_entries(bytes: &[u8]) -> Vec<(String, u64, u64, Vec)> { + let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + reader + .assets() + .iter() + .map(|e| { + let payload = raw_bytes_at(bytes, e.payload_offset, e.payload_len); + (e.name.clone(), e.payload_offset, e.payload_len, payload) + }) + .collect() +} + +/// Lookup an entry by name in a freshly-read directory. +fn find_entry<'a>(entries: &'a [BendlDirectoryEntry], name: &str) -> &'a BendlDirectoryEntry { + entries + .iter() + .find(|e| e.name == name) + .unwrap_or_else(|| panic!("entry {name:?} not found in directory")) +} + +/// Run a single sequence of ops against the seed bundle. Returns the final state for an outer +/// `prop_assert!` to inspect, but most assertions fire inline as the ops execute. +fn run_sequence(ops: &[Op]) { + let seed = build_seed_bundle(); + let baseline_reader = BendlReader::open(Cursor::new(&seed)).unwrap(); + let baseline_samples = baseline_reader.sample_count(); + drop(baseline_reader); + + // The "model" of every asset that has been committed to disk, in commit order. Each entry is + // (name, raw_payload_bytes_as_added, compress_flag). The decoded asset bytes returned by the + // reader must equal `raw_payload_bytes_as_added` regardless of compression. + let mut committed: Vec<(String, Vec, bool)> = vec![( + "seed.bin".to_string(), + b"seed payload bytes".to_vec(), + false, + )]; + + let mut current_bytes = seed.clone(); + + // Per-appender state: when an appender is open we hold its Vec of pending payloads alongside + // the snapshot we'll diff against if it commits. We don't keep the appender itself in this + // structure because moving it through closures with a snapshot is awkward; instead we + // construct/consume the appender inline at the next Op that uses it. We do, however, track + // the names allocated so the appender doesn't get hit with DuplicateName on the second + // AddAsset in the same round. + struct PendingRound { + pending: Vec<(String, Vec, bool)>, + next_name_index: usize, + } + let mut round: Option = None; + let mut name_counter: usize = 0; + + for op in ops { + match op { + Op::AddAsset { payload, compress } => { + let r = round.get_or_insert(PendingRound { + pending: Vec::new(), + next_name_index: 0, + }); + // Name allocation: use a global counter to guarantee uniqueness across rounds, + // and embed the round-local index so a successful Commit lands a stable name. + let name = format!("asset-{}-{}.bin", name_counter, r.next_name_index); + r.next_name_index += 1; + name_counter += 1; + r.pending.push((name, payload.clone(), *compress)); + } + Op::Commit => { + let Some(r) = round.take() else { continue }; + let snapshot = snapshot_existing_entries(¤t_bytes); + + let mut appender = BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); + for (name, payload, compress) in &r.pending { + let mut opts = AddAssetOptions::defaults(); + opts = if *compress { opts.compress() } else { opts.raw() }; + appender + .add_asset(ASSET_TYPE_CUSTOM, name, payload, opts) + .expect("add_asset on pending entry should succeed"); + } + let new_bytes = appender.commit().unwrap().into_inner(); + + // File must have grown (or stayed equal if pending was empty — but an empty + // round only happens when the user inserts nothing before Commit, which isn't a + // generated op here since AddAsset is the only way to enter Pending state). + assert!( + new_bytes.len() >= current_bytes.len(), + "file shrank after commit" + ); + + // Strong invariant: every previously-committed directory entry kept its offset, + // length, and raw payload bytes. + let new_reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); + let new_entries: Vec = new_reader.assets().to_vec(); + drop(new_reader); + for (name, old_offset, old_len, old_payload) in &snapshot { + let entry = find_entry(&new_entries, name); + assert_eq!( + (entry.payload_offset, entry.payload_len), + (*old_offset, *old_len), + "directory entry {name} (offset, len) drifted after commit" + ); + let new_raw = raw_bytes_at(&new_bytes, entry.payload_offset, entry.payload_len); + assert_eq!( + new_raw, *old_payload, + "directory entry {name} raw payload bytes drifted after commit" + ); + } + + // Append model: every previously-committed asset + every freshly-committed + // pending one is readable and decodes to the right bytes. + for (name, payload, _compress) in &r.pending { + committed.push((name.clone(), payload.clone(), false)); + } + let mut reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); + assert_eq!( + reader.assets().len(), + committed.len(), + "directory size mismatch after commit" + ); + assert_eq!( + reader.sample_count(), + baseline_samples, + "sample_count drifted across append" + ); + for (name, want, _) in &committed { + let entry = reader.find_asset_by_name(name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "decoded payload mismatch for {name}"); + } + + current_bytes = new_bytes; + } + Op::Abort => { + let Some(_r) = round.take() else { continue }; + let pre_bytes = current_bytes.clone(); + let appender = BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); + // .abort() consumes the appender and returns the underlying cursor. We never + // wrote anything to it (we never entered the pending state at the writer + // level), so the bytes must equal pre_bytes. + let cursor = appender.abort(); + let post_bytes = cursor.into_inner(); + assert_eq!( + post_bytes, pre_bytes, + "Abort modified the file (it must be a no-op)" + ); + } + Op::DropWithoutCommit => { + let Some(_r) = round.take() else { continue }; + let pre_bytes = current_bytes.clone(); + { + let mut appender = + BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); + // Re-enqueue the pending ops on this appender, then let it drop without + // committing. The file underlying `appender` is a clone of `current_bytes`, + // so dropping it can't affect `current_bytes` either way — but the + // assertion below pins that intent for clarity. + for (i, (_, payload, compress)) in _r.pending.iter().enumerate() { + let mut opts = AddAssetOptions::defaults(); + opts = if *compress { opts.compress() } else { opts.raw() }; + let name = format!("dropped-{name_counter}-{i}.bin"); + let _ = appender.add_asset(ASSET_TYPE_CUSTOM, &name, payload, opts); + } + // appender drops here without commit(). + } + assert_eq!( + current_bytes, pre_bytes, + "DropWithoutCommit modified the master file (it must be a no-op)" + ); + } + } + } + + // Final consistency check: open the file one last time, validate the directory, and confirm + // every committed asset is still readable. Any pending round at end-of-sequence is implicitly + // dropped (no commit), which must not affect `current_bytes`. + let mut reader = BendlReader::open(Cursor::new(¤t_bytes)).unwrap(); + reader.validate_directory().unwrap(); + assert_eq!( + reader.assets().len(), + committed.len(), + "final directory size mismatch" + ); + for (name, want, _) in &committed { + let entry = reader.find_asset_by_name(name).cloned().unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!(&got, want, "final decoded payload mismatch for {name}"); + } + + // Also drive a raw seek to EOF to confirm the file is structurally sound. + let mut tail = Vec::new(); + let mut cursor = Cursor::new(¤t_bytes); + cursor.seek(SeekFrom::Start(0)).unwrap(); + cursor.read_to_end(&mut tail).unwrap(); + assert_eq!(tail.len(), current_bytes.len()); +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 64, + ..ProptestConfig::default() + })] + + /// Drive a random sequence of append-grammar operations against the seed bundle. All + /// invariants are asserted inline by `run_sequence`. + #[test] + fn bendl_appender_preserves_existing_entries_and_no_op_aborts( + ops in proptest::collection::vec(op_strategy(), 1..30), + ) { + run_sequence(&ops); + } +} From 5d8406314aabea0d5be1595780ca679a4f58c620 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:45:25 -0600 Subject: [PATCH 111/221] add property-based equivalence tests --- ..._equivalence_proptest.proptest-regressions | 7 + ben/tests/test_ops_equivalence_proptest.rs | 225 ++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 ben/tests/test_ops_equivalence_proptest.proptest-regressions create mode 100644 ben/tests/test_ops_equivalence_proptest.rs diff --git a/ben/tests/test_ops_equivalence_proptest.proptest-regressions b/ben/tests/test_ops_equivalence_proptest.proptest-regressions new file mode 100644 index 0000000..9820fc4 --- /dev/null +++ b/ben/tests/test_ops_equivalence_proptest.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 35f2adb6ac746a76ea630c8185295f9a409227d55b830ffb4f88e9637fbe17c9 # shrinks to seq = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 2]] diff --git a/ben/tests/test_ops_equivalence_proptest.rs b/ben/tests/test_ops_equivalence_proptest.rs new file mode 100644 index 0000000..c3c2dfa --- /dev/null +++ b/ben/tests/test_ops_equivalence_proptest.rs @@ -0,0 +1,225 @@ +//! Property-based equivalence tests for the high-level ops (`relabel`, `extract`, `convert`). +//! +//! Existing proptests in `tests/test_impls_pipeline.rs` cover encoder/decoder round-trips +//! (`translate` direction). The complementary properties here pin the algebraic identities of +//! the post-decode operations: +//! +//! - **relabel composition:** for any node permutation `P` of length `L`, +//! `relabel(P^-1, relabel(P, x)) == x`. +//! - **extract correctness:** for any sample index `i` in `1..=N`, +//! `extract(i, encode(x)) == x[i-1]`. +//! - **convert variant round-trip:** for any variant pair `(A, B)`, +//! `convert(A, convert(B, x)) == x` (compared at the decoded-assignment level, since BEN +//! variants differ in frame structure and counts but not assignment data). + +use binary_ensemble::codec::decode::decode_ben_to_jsonl; +use binary_ensemble::codec::encode::encode_jsonl_to_ben; +use binary_ensemble::ops::extract::extract_assignment_ben; +use binary_ensemble::ops::relabel::{convert_ben_file, relabel_ben_file, RelabelOptions}; +use binary_ensemble::BenVariant; +use proptest::prelude::*; +use serde_json::json; +use std::collections::HashMap; +use std::io::{BufReader, Cursor, Write}; + +/// Build canonical JSONL from a sequence of equal-length assignments. +fn jsonl_from(seq: &[Vec]) -> Vec { + let mut buf = Vec::new(); + for (i, a) in seq.iter().enumerate() { + writeln!( + &mut buf, + "{}", + json!({"assignment": a, "sample": i + 1}) + ) + .unwrap(); + } + buf +} + +/// Strategy: a sequence of `n` assignments, each of length `L`, with values in `1..=max_val`. +fn strat_fixed_length_seq( + max_val: u16, + len: usize, + max_samples: usize, +) -> impl Strategy>> { + prop::collection::vec( + prop::collection::vec(1u16..=max_val, len), + 1..=max_samples, + ) +} + +/// Invert a permutation `P` (new_idx → old_idx). The inverse maps `old_idx → new_idx`. Given +/// the contract that callers pass a contiguous bijection over `0..len`, the inverse is also a +/// bijection over the same range and is what we apply to undo `P`. +fn invert_permutation(p: &HashMap) -> HashMap { + p.iter().map(|(&new, &old)| (old, new)).collect() +} + +/// Build a `Vec` permutation of `0..len` by shuffling deterministically from a `u64` seed. +/// Used inside proptest bodies to derive a shuffle from a generated seed input rather than +/// strategy-composing one (which would require ValueTree plumbing). +fn shuffled_indices(len: usize, seed: u64) -> Vec { + use rand::seq::SliceRandom; + use rand::SeedableRng; + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed); + let mut indices: Vec = (0..len).collect(); + indices.shuffle(&mut rng); + indices +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 48, + ..ProptestConfig::default() + })] + + /// `relabel(P^-1, relabel(P, x)) == x` for any node permutation `P` of fixed length. The + /// relabel API operates at the BEN-stream level: we encode `x` as BEN, run `relabel(P)`, + /// then `relabel(P^-1)`, decode the resulting BEN and compare the assignment sequence + /// against the original. + #[test] + fn relabel_composition_is_identity( + len in 1usize..=12, + n_samples in 1usize..=8usize, + ) { + let seq = (0..n_samples) + .map(|i| (0..len).map(|j| ((i * 31 + j * 7) % 5 + 1) as u16).collect::>()) + .collect::>(); + + // Generate a permutation deterministically from `len` so this case is reproducible. + let mut p: HashMap = HashMap::new(); + for i in 0..len { + // Rotate by 1: simple non-identity permutation that exercises every position. + p.insert(i, (i + 1) % len); + } + let p_inv = invert_permutation(&p); + + let jsonl = jsonl_from(&seq); + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::Standard) + .unwrap(); + + let mut after_p = Vec::new(); + relabel_ben_file( + ben.as_slice(), + &mut after_p, + RelabelOptions::node_permutation(p), + ) + .unwrap(); + + let mut after_p_inv = Vec::new(); + relabel_ben_file( + after_p.as_slice(), + &mut after_p_inv, + RelabelOptions::node_permutation(p_inv), + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(after_p_inv.as_slice(), &mut decoded).unwrap(); + prop_assert_eq!(decoded, jsonl); + } + + /// `relabel` with a random shuffle composed with its inverse is identity. The shuffle is + /// seeded from a generated `u64` so proptest can shrink the seed when a counterexample is + /// found. + #[test] + fn relabel_random_shuffle_composition_is_identity( + seq in strat_fixed_length_seq(5, 8, 6), + seed in any::(), + ) { + let len = seq[0].len(); + let shuffled = shuffled_indices(len, seed); + let p: HashMap = + (0..len).map(|new_idx| (new_idx, shuffled[new_idx])).collect(); + let p_inv = invert_permutation(&p); + + let jsonl = jsonl_from(&seq); + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::Standard) + .unwrap(); + + let mut after_p = Vec::new(); + relabel_ben_file( + ben.as_slice(), + &mut after_p, + RelabelOptions::node_permutation(p), + ) + .unwrap(); + + let mut after_p_inv = Vec::new(); + relabel_ben_file( + after_p.as_slice(), + &mut after_p_inv, + RelabelOptions::node_permutation(p_inv), + ) + .unwrap(); + + let mut decoded = Vec::new(); + decode_ben_to_jsonl(after_p_inv.as_slice(), &mut decoded).unwrap(); + prop_assert_eq!(decoded, jsonl); + } + + /// `extract(i, encode(x)) == x[i-1]` for every 1-based sample index `i` in `1..=N`. + /// Sweeps the entire sequence (not just a random index) because extract correctness for one + /// index is almost free to verify for all of them once the BEN file is built. + #[test] + fn extract_returns_the_correct_sample( + seq in strat_fixed_length_seq(8, 5, 6), + ) { + let jsonl = jsonl_from(&seq); + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut ben, BenVariant::Standard) + .unwrap(); + + for (i, expected) in seq.iter().enumerate() { + let extracted = extract_assignment_ben(Cursor::new(&ben), i + 1).unwrap(); + prop_assert_eq!(&extracted, expected, + "extract(sample_number={}) returned the wrong assignment", i + 1); + } + } + + /// `convert(A, convert(B, x)) == x`, asserted at the decoded-assignment level. BEN + /// variants encode assignment runs differently (Standard packs label/count pairs; MkvChain + /// adds an outer repetition count for adjacent equal assignments) and the round trip must + /// preserve every materialized assignment regardless of which intermediate representation + /// is used. Pairs `(Standard, MkvChain)` and the same in reverse pin both directions. + /// + /// TwoDelta is intentionally excluded from this sweep: it imposes a structural constraint + /// that each delta assignment must only contain values from a 2-value pair shared with the + /// previous assignment, so `convert(arbitrary_BEN, TwoDelta)` is not well-defined for the + /// general inputs this strategy generates. TwoDelta round-trips are exercised by the + /// dedicated `fuzz_roundtrip_*_twodelta` proptests in `test_impls_pipeline.rs`, which use + /// the `strat_twodelta_seq` strategy that respects those constraints. + #[test] + fn convert_variant_round_trip_preserves_assignments( + seq in strat_fixed_length_seq(8, 5, 6), + ) { + let jsonl = jsonl_from(&seq); + + for (source, intermediate) in &[ + (BenVariant::Standard, BenVariant::MkvChain), + (BenVariant::MkvChain, BenVariant::Standard), + ] { + let mut start_ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(jsonl.as_slice()), &mut start_ben, *source) + .unwrap(); + + let mut mid_ben = Vec::new(); + convert_ben_file(start_ben.as_slice(), &mut mid_ben, *intermediate).unwrap(); + let mut end_ben = Vec::new(); + convert_ben_file(mid_ben.as_slice(), &mut end_ben, *source).unwrap(); + + // Decode both endpoints to JSONL and compare. Direct byte comparison would over-pin + // frame boundaries and run-length grouping (e.g. `convert(MkvChain, ...)` may merge + // adjacent equal assignments into a single repeat-count frame). + let mut start_jsonl = Vec::new(); + decode_ben_to_jsonl(start_ben.as_slice(), &mut start_jsonl).unwrap(); + let mut end_jsonl = Vec::new(); + decode_ben_to_jsonl(end_ben.as_slice(), &mut end_jsonl).unwrap(); + prop_assert_eq!(end_jsonl, start_jsonl, + "convert {:?} -> {:?} -> {:?} did not preserve assignments", + source, intermediate, source); + } + } +} From da500346a1b95280f4d6e94c506d13e43fe73c8d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:50:19 -0600 Subject: [PATCH 112/221] add some forward-compatibility tests --- ben/src/io/bundle/tests/format.rs | 35 ++++++++++++ ben/src/io/bundle/tests/reader.rs | 64 +++++++++++++++++++++ ben/src/io/bundle/tests/writer.rs | 93 +++++++++++++++++++++++++++++-- 3 files changed, 188 insertions(+), 4 deletions(-) diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index cf0ff7d..ea1cafd 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -104,6 +104,41 @@ fn header_rejects_unsupported_major_version() { )); } +#[test] +fn header_accepts_higher_minor_version() { + // Minor-version bumps are additive and backwards-compatible: a v1.0 reader must accept a + // v1.999 header (same major version, higher minor) and round-trip the field cleanly. + let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); + bytes[10..12].copy_from_slice(&999u16.to_le_bytes()); + let decoded = BendlHeader::from_bytes(&bytes).expect("higher minor version should read"); + assert_eq!(decoded.minor_version, 999); +} + +#[test] +fn header_accepts_nonzero_alignment_padding() { + // alignment_padding (bytes 14..16) is reserved for header byte-alignment, not as a + // forward-compat slot. Writers must zero it, but readers must tolerate non-zero values for + // foreign or adversarial bundles. The field itself round-trips through the struct. + let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); + bytes[14..16].copy_from_slice(&u16::MAX.to_le_bytes()); + let decoded = BendlHeader::from_bytes(&bytes).expect("non-zero padding should read"); + assert_eq!(decoded.alignment_padding, u16::MAX); +} + +#[test] +fn header_accepts_reserved_flag_bits() { + // Bits 1..31 of the header `flags` field are reserved in v1.0.0; readers must ignore them + // (i.e., open the header cleanly and surface the full u32 value to callers, who in turn must + // only act on bit 0). + let mut bytes = BendlHeader::provisional(AssignmentFormat::Ben, 64).to_bytes(); + let reserved_bits: u32 = 0xFFFF_FFFE; + bytes[16..20].copy_from_slice(&reserved_bits.to_le_bytes()); + let decoded = BendlHeader::from_bytes(&bytes).expect("reserved flag bits should read"); + assert_eq!(decoded.flags, reserved_bits); + // has_stream_checksum() only inspects bit 0; reserved bits must not flip that. + assert!(!decoded.has_stream_checksum()); +} + #[test] fn directory_entry_round_trip_no_checksum() { let entry = BendlDirectoryEntry { diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index f1c9603..c6ec4d6 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -1843,3 +1843,67 @@ fn open_assignment_reader_returns_unexpected_eof_when_banner_falls_in_short_rang Ok(_) => panic!("expected Err, got Ok"), } } + +// --------------------------------------------------------------------------- +// Forward-compat: unknown asset-flag bits +// --------------------------------------------------------------------------- + +#[test] +fn asset_with_unknown_flag_bit_opens_and_verifies_checksum() { + // Hand-build a directory entry with the known ASSET_FLAG_CHECKSUM bit set AND a reserved + // bit (bit 7) also set, plus a valid CRC32C over the payload bytes. Reader must: + // 1. Open the bundle cleanly (no rejection on unknown flags). + // 2. Verify the asset's CRC successfully (the unknown bit must not interfere with the + // verifier's flag handling). + // 3. Return the decoded payload bytes from asset_bytes. + const RESERVED_BIT_7: u16 = 1 << 7; + let payload = b"asset bytes with reserved bit".to_vec(); + + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + + let directory_offset = bytes.len() as u64; + let crc = crc32c::crc32c(&payload).to_le_bytes().to_vec(); + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_CHECKSUM | RESERVED_BIT_7, + name: "custom.bin".to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: Some(crc), + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open succeeds"); + let entry = reader.find_asset_by_name("custom.bin").cloned().unwrap(); + assert_ne!( + entry.asset_flags & RESERVED_BIT_7, + 0, + "reserved bit must be preserved through the read path" + ); + reader + .verify_asset_checksum(&entry) + .expect("CRC verifies despite unknown flag bit"); + let got = reader.asset_bytes(&entry).expect("asset_bytes succeeds"); + assert_eq!(got, payload); +} diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 2171fe7..3673954 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -4,10 +4,10 @@ use xz2::write::XzEncoder; use crate::io::bundle::error::{BendlReadError, ChecksumError, ChecksumTarget}; use crate::io::bundle::format::{ - AssignmentFormat, BendlFormatError, BendlHeader, ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, - ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, BENDL_MAGIC, BENDL_MAJOR_VERSION, - BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, FINALIZED_YES, - HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, + ASSET_FLAG_CHECKSUM, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, DEFAULT_XZ_PRESET, FINALIZED_NO, + FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; use crate::io::bundle::reader::BendlReader; use crate::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter}; @@ -1658,3 +1658,88 @@ fn open_assignment_reader_intact_bundle_round_trips_count_samples() { let n = decoder.count_samples().unwrap(); assert_eq!(n, samples.len()); } + +// --------------------------------------------------------------------------- +// Forward-compat: appender preserves unknown asset-flag bits on existing entries +// --------------------------------------------------------------------------- + +/// Build a finalized BENDL bundle with a single custom asset whose `asset_flags` carries a +/// reserved (unknown-in-v1.0.0) bit alongside the known `ASSET_FLAG_CHECKSUM` bit. Used to +/// confirm that `BendlAppender::commit` clones the existing entry verbatim, preserving the +/// reserved bit so future readers that grow the spec are not silently downgraded by today's +/// appender. +fn bundle_with_reserved_asset_flag_bit() -> (Vec, u16) { + const RESERVED_BIT_7: u16 = 1 << 7; + let payload = b"forward-compat asset".to_vec(); + let mut bytes = Vec::new(); + bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(&payload); + + let directory_offset = bytes.len() as u64; + let crc = crc32c::crc32c(&payload).to_le_bytes().to_vec(); + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: ASSET_FLAG_CHECKSUM | RESERVED_BIT_7, + name: "forward.bin".to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: Some(crc), + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + (bytes, RESERVED_BIT_7) +} + +#[test] +fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { + // Open a bundle whose pre-existing entry has a reserved bit set; commit a new asset; reopen + // and assert the reserved bit is still set on the original entry. The new entry must not + // pick up any reserved bits. + let (initial_bytes, reserved_bit) = bundle_with_reserved_asset_flag_bit(); + let known_v1_bits: u16 = ASSET_FLAG_CHECKSUM | ASSET_FLAG_XZ | crate::io::bundle::format::ASSET_FLAG_JSON; + + let mut appender = BendlAppender::open(Cursor::new(initial_bytes)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "new.bin", + b"new asset bytes", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let final_bytes = appender.commit().unwrap().into_inner(); + + let reader = BendlReader::open(Cursor::new(final_bytes)).unwrap(); + + let original = reader.find_asset_by_name("forward.bin").unwrap(); + assert_ne!( + original.asset_flags & reserved_bit, + 0, + "appender must not clear reserved bits on existing entries" + ); + + let new_entry = reader.find_asset_by_name("new.bin").unwrap(); + assert_eq!( + new_entry.asset_flags & !known_v1_bits, + 0, + "appender must not set any unknown bits on newly written entries" + ); +} From 879743418ae428a20c66e80feb3429d2edab4ace Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:52:39 -0600 Subject: [PATCH 113/221] add tests for multi-step decode --- ben/tests/test_impls_pipeline.rs | 62 ++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 235d1db..b41f8f4 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -319,6 +319,68 @@ proptest! { prop_assert_eq!(direct, via); } + // Two-path equivalence for Standard: direct XBEN->JSONL must equal XBEN->BEN->JSONL. + // Pins the same invariant as `fuzz_decode_xben_direct_equals_via_ben` for the Standard + // variant, whose XBEN frame layout is structurally different from MkvChain (no outer + // repetition count) and so deserves its own coverage. + #[test] + fn fuzz_decode_xben_direct_equals_via_ben_standard(seq in strat_assignment_seq(), params in strat_threads_levels()) { + let (threads, level) = params; + let jsonl = jsonl_from_assignments(&seq); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::Standard, + Some(threads), + Some(level), + None, + None, + ).unwrap(); + + let mut direct = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut direct).unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + let mut via = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut via).unwrap(); + + prop_assert_eq!(direct, via); + } + + // Two-path equivalence for TwoDelta: the most distinct of the three variants because XBEN + // TwoDelta uses columnar chunk frames that bear no resemblance to BEN TwoDelta's per-sample + // delta frames. The XBEN->JSONL direct decoder and the XBEN->BEN->JSONL pipeline must still + // agree byte-for-byte. + #[test] + fn fuzz_decode_xben_direct_equals_via_ben_twodelta(seq in strat_twodelta_seq(), params in strat_threads_levels()) { + let (threads, level) = params; + let jsonl = jsonl_from_assignments(&seq); + + let mut xben = Vec::new(); + encode_jsonl_to_xben( + BufReader::new(jsonl.as_slice()), + &mut xben, + BenVariant::TwoDelta, + Some(threads), + Some(level), + None, + None, + ).unwrap(); + + let mut direct = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut direct).unwrap(); + + let mut ben = Vec::new(); + decode_xben_to_ben(BufReader::new(xben.as_slice()), &mut ben).unwrap(); + let mut via = Vec::new(); + decode_ben_to_jsonl(ben.as_slice(), &mut via).unwrap(); + + prop_assert_eq!(direct, via); + } + // Iterator surface: BenStreamReader -> records matches direct JSONL #[test] fn fuzz_xbendecoder_iterator_matches_jsonl(seq in strat_assignment_seq(), params in strat_threads_levels()) { From aa00f837282050904fc3abb2dc18a6606f707b88 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:56:37 -0600 Subject: [PATCH 114/221] test parallel reads --- ben/src/io/bundle/tests/writer.rs | 86 +++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 3673954..40ee9d6 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -1708,6 +1708,92 @@ fn bundle_with_reserved_asset_flag_bit() -> (Vec, u16) { (bytes, RESERVED_BIT_7) } +// --------------------------------------------------------------------------- +// Concurrent reader access +// --------------------------------------------------------------------------- + +#[test] +fn two_parallel_readers_against_the_same_bundle_agree() { + // Two `BendlReader`s opened from independent `Cursor`s over an `Arc>` shared + // buffer must produce identical results across the full accessor surface. The bundle + // bytes are immutable for the duration of the test — this pins that the reader holds no + // shared mutable state internally (e.g., no static caches, no thread-local position + // tracking) that would let one thread's reads scramble the other's. + // + // Reader-during-append is intentionally not covered here: today's append path truncates + // the old trailing directory before writing the new one, while the header still points at + // the old directory offset until the final patch. A concurrent reader during that window + // would observe a torn state. Whether the contract should weaken to "errors cleanly + // during torn states" or strengthen to "snapshot-style readers" is a design decision + // (see the coverage plan tier 0.12 for the design question), and the right pin here is + // not a test against the current behavior. + use std::sync::Arc; + use std::thread; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + br#"{"nodes":4,"edges":[[0,1],[1,2],[2,3]]}"#, + AddAssetOptions::defaults().json().compress(), + ) + .unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"a bit of custom payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00\x01\x02").unwrap(); + let writer = session.finish_into_writer(1); + let bytes = writer.finish().unwrap().into_inner(); + let shared = Arc::new(bytes); + + // Pre-compute the expected (asset_name, decoded_bytes) pairs on the main thread so each + // worker has a stable oracle to compare against without re-deriving it from the same + // reader API under test. + let oracle: Vec<(String, Vec)> = { + let mut reader = BendlReader::open(Cursor::new(shared.as_slice())).unwrap(); + let entries: Vec<_> = reader.assets().to_vec(); + entries + .iter() + .map(|e| (e.name.clone(), reader.asset_bytes(e).unwrap())) + .collect() + }; + + let mut handles = Vec::new(); + for _ in 0..4 { + let shared = Arc::clone(&shared); + let oracle = oracle.clone(); + handles.push(thread::spawn(move || { + for _ in 0..16 { + let mut reader = BendlReader::open(Cursor::new(shared.as_slice())).unwrap(); + assert!(reader.is_finalized()); + assert!(reader.header().has_stream_checksum()); + reader + .verify_all_asset_checksums() + .expect("asset checksums must verify under concurrent readers"); + reader + .verify_stream_checksum() + .expect("stream checksum must verify under concurrent readers"); + let entries: Vec<_> = reader.assets().to_vec(); + for (entry, (expected_name, expected_bytes)) in entries.iter().zip(oracle.iter()) { + assert_eq!(&entry.name, expected_name); + let got = reader.asset_bytes(entry).unwrap(); + assert_eq!(&got, expected_bytes); + } + } + })); + } + for h in handles { + h.join().expect("worker thread panicked"); + } +} + #[test] fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { // Open a bundle whose pre-existing entry has a reserved bit set; commit a new asset; reopen From 4187b8c66ab75e54ce792c6a9e12e98ff5ad779f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 21 May 2026 22:58:56 -0600 Subject: [PATCH 115/221] test zero and one sample edge cases --- ben/src/io/writer/tests.rs | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 6f48947..55bc11b 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -143,6 +143,114 @@ fn writer_twodelta_chunk_size_1() { assert_eq!(results, assignments); } +// ── Zero/one-sample edges ───────────────────────────────────────────── + +/// Round-trip an `assignments` list (possibly empty) through a BEN writer and reader for the +/// given variant, asserting the decoded sequence equals the input. Used by the zero/one-sample +/// matrix tests below. +fn assert_ben_round_trip(assignments: &[Vec], variant: BenVariant) { + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, variant).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let reader = BenStreamReader::from_ben(Cursor::new(&ben)).unwrap(); + let decoded: Vec> = reader + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!( + decoded, assignments, + "BEN round-trip failed for {variant:?}" + ); +} + +/// XBEN counterpart of [`assert_ben_round_trip`]. +fn assert_xben_round_trip(assignments: &[Vec], variant: BenVariant) { + let mut xben = Vec::new(); + { + let mut writer = build_xben_writer(&mut xben, variant, None); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let reader = BenStreamReader::from_xben(Cursor::new(&xben)).unwrap(); + let decoded: Vec> = reader + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!( + decoded, assignments, + "XBEN round-trip failed for {variant:?}" + ); +} + +/// Zero-sample (banner-only) BEN streams round-trip for every variant. Constructed by opening +/// the writer and immediately finishing it without any `write_assignment` calls. Catches stream +/// readers that assume at least one frame follows the banner. +#[test] +fn writer_ben_zero_sample_round_trip_per_variant() { + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_ben_round_trip(&[], variant); + } +} + +/// Zero-sample XBEN streams. XBEN adds an outer xz frame around the BEN content, so this also +/// covers any reader path that expects at least one BEN frame inside the compressed payload. +#[test] +fn writer_xben_zero_sample_round_trip_per_variant() { + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_xben_round_trip(&[], variant); + } +} + +/// One-sample BEN streams. Each fixture contains a single first frame; for TwoDelta this is the +/// MkvChain-shaped anchor frame (no delta frames follow, since there's no second sample). +#[test] +fn writer_ben_one_sample_round_trip_per_variant() { + let assignment = vec![1u16, 1, 2, 2]; + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_ben_round_trip(&[assignment.clone()], variant); + } +} + +/// One-sample XBEN streams. Mirrors the BEN matrix above but through the xz-compressed wire +/// format. For TwoDelta this exercises the XBEN columnar-chunk path when only an anchor exists +/// and no chunk has accumulated. +#[test] +fn writer_xben_one_sample_round_trip_per_variant() { + let assignment = vec![1u16, 1, 2, 2]; + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + assert_xben_round_trip(&[assignment.clone()], variant); + } +} + #[test] fn writer_twodelta_chunk_boundary_off_by_one_grid() { // Off-by-one bugs in the chunked TwoDelta path hide exactly at the boundaries between full From d6aef7051798f12c1b577814d6aef89eb1411317 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 22 May 2026 07:26:10 -0600 Subject: [PATCH 116/221] add cli tests --- ben/tests/test_cli.rs | 388 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 388 insertions(+) diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 28e3bfe..864b850 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1744,3 +1744,391 @@ fn bendl_cli_create_inspect_extract_append_roundtrip() { ); assert_failure(&append_duplicate); } + +// --------------------------------------------------------------------------- +// `ben encode --graph` and `ben x-encode --graph` +// --------------------------------------------------------------------------- + +#[test] +fn ben_encode_graph_requires_input_file_not_stdin() { + // `--graph` is structurally incompatible with stdin input because the output container has + // to seek to patch the header. The CLI must reject the bad combination explicitly. + let temp = TempDir::new("ben-encode-graph-stdin"); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + + let out = run( + "ben", + &[ + "--mode", + "encode", + "--graph", + graph_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&out); + let msg = String::from_utf8_lossy(&out.stderr); + assert!( + msg.contains("--graph") && msg.contains("input file"), + "expected '--graph requires an input file' error, got stderr: {msg}" + ); +} + +#[test] +fn ben_encode_graph_rejects_combination_with_print() { + // `--print` writes to stdout; bendl output requires a seekable file. Combination is invalid + // and must be rejected explicitly. + let temp = TempDir::new("ben-encode-graph-print"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + + let out = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--print", + ], + temp.path(), + ); + assert_failure(&out); + let msg = String::from_utf8_lossy(&out.stderr); + assert!( + msg.contains("--graph") && msg.contains("--print"), + "expected '--graph is incompatible with --print' error, got stderr: {msg}" + ); +} + +#[test] +fn ben_encode_graph_happy_path_produces_bendl() { + // Happy path for `ben --mode encode --graph`: produces a finalized .bendl whose decoded + // stream round-trips the input JSONL and whose graph asset matches the source. + let temp = TempDir::new("ben-encode-graph-happy"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + let out_path = temp.path().join("out.bendl"); + + let encode = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + ); + assert_success(&encode); + assert!(out_path.exists()); + + // Recover the embedded BEN stream and confirm it decodes back to the canonical JSONL. + let stream_path = temp.path().join("recovered.ben"); + let extract_stream = run( + "bendl", + &[ + "extract", + out_path.to_str().unwrap(), + "--stream", + "--output", + stream_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_stream); + + let decoded_path = temp.path().join("decoded.jsonl"); + let decode = run( + "ben", + &[ + "--mode", + "decode", + stream_path.to_str().unwrap(), + "--output-file", + decoded_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&decode); + assert_eq!(fs::read_to_string(&decoded_path).unwrap(), sample_jsonl()); + + // The graph asset should be embedded byte-equal. + let recovered_graph = temp.path().join("recovered-graph.json"); + let extract_graph = run( + "bendl", + &[ + "extract", + out_path.to_str().unwrap(), + "--asset", + "graph.json", + "--output", + recovered_graph.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_graph); + assert_eq!(fs::read_to_string(&recovered_graph).unwrap(), sample_graph()); +} + +#[test] +fn ben_xencode_graph_requires_input_file_not_stdin() { + let temp = TempDir::new("ben-xencode-graph-stdin"); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + + let out = run( + "ben", + &[ + "--mode", + "x-encode", + "--graph", + graph_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&out); + let msg = String::from_utf8_lossy(&out.stderr); + assert!( + msg.contains("--graph") && msg.contains("input file"), + "expected '--graph requires an input file' error, got stderr: {msg}" + ); +} + +#[test] +fn ben_xencode_graph_rejects_combination_with_print() { + let temp = TempDir::new("ben-xencode-graph-print"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + + let out = run( + "ben", + &[ + "--mode", + "x-encode", + jsonl_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--print", + ], + temp.path(), + ); + assert_failure(&out); + let msg = String::from_utf8_lossy(&out.stderr); + assert!( + msg.contains("--graph") && msg.contains("--print"), + "expected '--graph is incompatible with --print' error, got stderr: {msg}" + ); +} + +#[test] +fn ben_xencode_graph_with_ben_input_round_trips() { + // The `--graph` xencode handler dispatches on input extension: a `.ben` input takes the + // `encode_ben_to_xben` path (cli/ben/bundle.rs line 127), a `.jsonl` input takes the + // `encode_jsonl_to_xben` path. The happy-path test below only covers the `.jsonl` arm; + // this companion exercises the `.ben` arm. + let temp = TempDir::new("ben-xencode-graph-ben-input"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + + // Encode JSONL to a BEN file first; this is what we'll feed into --mode x-encode. + let ben_path = temp.path().join("samples.ben"); + let encode_ben = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + ben_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + ); + assert_success(&encode_ben); + + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + let out_path = temp.path().join("out.bendl"); + + let xencode = run( + "ben", + &[ + "--mode", + "x-encode", + ben_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&xencode); + assert!(out_path.exists()); + + // Round-trip: extract the XBEN stream, decode it back to JSONL, compare to the original. + let recovered_xben = temp.path().join("recovered.xben"); + let extract = run( + "bendl", + &[ + "extract", + out_path.to_str().unwrap(), + "--stream", + "--output", + recovered_xben.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract); + + let decoded_path = temp.path().join("decoded.jsonl"); + let decode = run( + "ben", + &[ + "--mode", + "x-decode", + recovered_xben.to_str().unwrap(), + "--output-file", + decoded_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&decode); + assert_eq!(fs::read_to_string(&decoded_path).unwrap(), sample_jsonl()); +} + +#[test] +fn ben_encode_graph_rejects_missing_graph_file() { + // A graph path that does not exist must surface a clean error, not a panic. + let temp = TempDir::new("ben-encode-graph-missing-file"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let nonexistent_graph = temp.path().join("does-not-exist.json"); + let out_path = temp.path().join("out.bendl"); + + let out = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + "--graph", + nonexistent_graph.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_failure(&out); +} + +#[test] +fn ben_encode_graph_refuses_to_overwrite_existing_file_without_flag() { + // Without --overwrite, an existing output path must be preserved. + let temp = TempDir::new("ben-encode-graph-overwrite-guard"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + let out_path = temp.path().join("out.bendl"); + fs::write(&out_path, b"prior contents").unwrap(); + + let out = run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_failure(&out); + // The prior file must remain untouched. + assert_eq!(fs::read(&out_path).unwrap(), b"prior contents"); +} + +#[test] +fn ben_xencode_graph_happy_path_produces_bendl() { + let temp = TempDir::new("ben-xencode-graph-happy"); + let jsonl_path = temp.path().join("samples.jsonl"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + let graph_path = temp.path().join("graph.json"); + fs::write(&graph_path, sample_graph()).unwrap(); + let out_path = temp.path().join("out.bendl"); + + let encode = run( + "ben", + &[ + "--mode", + "x-encode", + jsonl_path.to_str().unwrap(), + "--output-file", + out_path.to_str().unwrap(), + "--graph", + graph_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + ); + assert_success(&encode); + assert!(out_path.exists()); + + // Recover the embedded XBEN stream and decode it to confirm round-trip. + let stream_path = temp.path().join("recovered.xben"); + let extract_stream = run( + "bendl", + &[ + "extract", + out_path.to_str().unwrap(), + "--stream", + "--output", + stream_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&extract_stream); + + let decoded_path = temp.path().join("decoded.jsonl"); + let decode = run( + "ben", + &[ + "--mode", + "x-decode", + stream_path.to_str().unwrap(), + "--output-file", + decoded_path.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + ); + assert_success(&decode); + assert_eq!(fs::read_to_string(&decoded_path).unwrap(), sample_jsonl()); +} From 107d880dcd9c1fdd9ce1482ea9db44e70a0cc3be Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 22 May 2026 07:28:30 -0600 Subject: [PATCH 117/221] better testing of the bendl write path --- ben/src/cli/ben/bundle.rs | 165 ++++++++++++++++++++++++++++++ ben/src/io/bundle/error.rs | 54 ++++++++++ ben/src/io/bundle/tests/writer.rs | 146 ++++++++++++++++++++++++-- ben/src/io/writer/options.rs | 79 ++++++++++++++ ben/src/io/writer/tests.rs | 122 ++++++++++++++++++++++ 5 files changed, 555 insertions(+), 11 deletions(-) diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 47eb6da..dfbb7d5 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -144,3 +144,168 @@ pub(super) fn run_xencode_bundle_with_graph( append_graph_asset(out_path, graph_path) } + +#[cfg(test)] +mod tests { + //! In-process tests for the `--graph` bundle dispatchers. CLI subprocess tests + //! (`tests/test_cli.rs`) confirm argv parsing and exit codes; coverage instrumentation does + //! not follow subprocess boundaries, so unit tests here are required to actually exercise + //! these functions' branches. + use super::*; + use crate::test_utils::unique_path; + use std::io::Write; + + fn canonical_jsonl() -> &'static [u8] { + b"{\"assignment\":[1,1,2],\"sample\":1}\n{\"assignment\":[2,2,3],\"sample\":2}\n" + } + + fn canonical_graph() -> &'static [u8] { + b"{\"nodes\":3,\"edges\":[[0,1],[1,2]]}" + } + + /// Allocate a fresh per-test directory under the system temp dir. Returned path is created + /// on disk so callers can write files into it immediately. + fn fresh_temp_dir(label: &str) -> std::path::PathBuf { + let p = unique_path(label); + std::fs::create_dir_all(&p).unwrap(); + p + } + + #[test] + fn run_encode_bundle_with_graph_produces_readable_bundle() { + let temp = fresh_temp_dir("encode-bundle-graph"); + let input = temp.join("input.jsonl"); + std::fs::write(&input, canonical_jsonl()).unwrap(); + let graph = temp.join("graph.json"); + std::fs::write(&graph, canonical_graph()).unwrap(); + let out = temp.join("out.bendl"); + let out_str = out.to_string_lossy().into_owned(); + + run_encode_bundle_with_graph(&input, &out_str, BenVariant::Standard, &graph).unwrap(); + + let reader = crate::io::bundle::BendlReader::open( + std::fs::File::open(&out).expect("open bundle"), + ) + .expect("open bundle"); + assert!(reader.is_finalized()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + } + + #[test] + fn run_xencode_bundle_with_graph_from_jsonl_input_succeeds() { + // jsonl_and_xben dispatch arm: from_ben=false, encode_jsonl_to_xben path. + let temp = fresh_temp_dir("xencode-bundle-graph-jsonl"); + let input = temp.join("input.jsonl"); + std::fs::write(&input, canonical_jsonl()).unwrap(); + let graph = temp.join("graph.json"); + std::fs::write(&graph, canonical_graph()).unwrap(); + let out = temp.join("out.bendl"); + let out_str = out.to_string_lossy().into_owned(); + + run_xencode_bundle_with_graph( + &input, + &out_str, + BenVariant::Standard, + /* from_ben */ false, + Some(1), + Some(1), + None, + None, + &graph, + ) + .unwrap(); + + let reader = crate::io::bundle::BendlReader::open( + std::fs::File::open(&out).expect("open bundle"), + ) + .expect("open bundle"); + assert!(reader.is_finalized()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + } + + #[test] + fn run_xencode_bundle_with_graph_from_ben_input_succeeds() { + // from_ben=true dispatch arm — exercises the encode_ben_to_xben branch in the function + // body, the gap the CLI subprocess tests can't touch under llvm-cov. + let temp = fresh_temp_dir("xencode-bundle-graph-ben"); + + let jsonl = temp.join("input.jsonl"); + std::fs::write(&jsonl, canonical_jsonl()).unwrap(); + let ben_path = temp.join("input.ben"); + { + use std::io::BufReader; + let reader = BufReader::new(std::fs::File::open(&jsonl).unwrap()); + let mut writer = std::fs::File::create(&ben_path).unwrap(); + encode_jsonl_to_ben(reader, &mut writer, BenVariant::Standard).unwrap(); + writer.flush().unwrap(); + } + + let graph = temp.join("graph.json"); + std::fs::write(&graph, canonical_graph()).unwrap(); + let out = temp.join("out.bendl"); + let out_str = out.to_string_lossy().into_owned(); + + run_xencode_bundle_with_graph( + &ben_path, + &out_str, + BenVariant::Standard, + /* from_ben */ true, + Some(1), + Some(1), + None, + None, + &graph, + ) + .unwrap(); + + let reader = crate::io::bundle::BendlReader::open( + std::fs::File::open(&out).expect("open bundle"), + ) + .expect("open bundle"); + assert!(reader.is_finalized()); + assert!(reader.find_asset_by_name("graph.json").is_some()); + } + + #[test] + fn run_encode_bundle_with_graph_rejects_missing_graph_file() { + let temp = fresh_temp_dir("encode-bundle-missing-graph"); + let input = temp.join("input.jsonl"); + std::fs::write(&input, canonical_jsonl()).unwrap(); + let nonexistent_graph = temp.join("does-not-exist.json"); + let out = temp.join("out.bendl"); + let out_str = out.to_string_lossy().into_owned(); + + let err = run_encode_bundle_with_graph( + &input, + &out_str, + BenVariant::Standard, + &nonexistent_graph, + ) + .unwrap_err(); + assert!( + err.to_string().contains("graph") || err.kind() == io::ErrorKind::NotFound, + "expected missing-graph error, got {err}" + ); + } + + #[test] + fn append_graph_asset_rejects_missing_graph_path() { + let temp = fresh_temp_dir("append-graph-missing"); + let input = temp.join("input.jsonl"); + std::fs::write(&input, canonical_jsonl()).unwrap(); + let graph = temp.join("graph.json"); + std::fs::write(&graph, canonical_graph()).unwrap(); + let out = temp.join("out.bendl"); + let out_str = out.to_string_lossy().into_owned(); + run_encode_bundle_with_graph(&input, &out_str, BenVariant::Standard, &graph).unwrap(); + + // append_graph_asset is the function under test (separate from the dispatchers, which + // already validated graph existence at the top). + let missing = temp.join("missing.json"); + let err = append_graph_asset(&out_str, &missing).unwrap_err(); + assert!( + err.to_string().contains("graph") || err.kind() == io::ErrorKind::NotFound, + "expected missing-graph error, got {err}" + ); + } +} diff --git a/ben/src/io/bundle/error.rs b/ben/src/io/bundle/error.rs index 5d13569..becdec0 100644 --- a/ben/src/io/bundle/error.rs +++ b/ben/src/io/bundle/error.rs @@ -140,3 +140,57 @@ impl From for BendlReadError { BendlReadError::DecoderInit(e) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn checksum_target_display_includes_asset_name() { + let t = ChecksumTarget::Asset("graph.json".to_string()); + let s = format!("{t}"); + // Quote-wrapped via Debug; pin the meaningful substring rather than exact bytes. + assert!(s.contains("graph.json"), "got: {s}"); + assert!(s.contains("asset"), "got: {s}"); + } + + #[test] + fn checksum_target_display_for_stream_is_human_readable() { + assert_eq!(format!("{}", ChecksumTarget::Stream), "assignment stream"); + } + + #[test] + fn from_io_error_wraps_as_io_variant() { + let inner = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let err: BendlReadError = inner.into(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::BrokenPipe), + other => panic!("expected Io, got {other:?}"), + } + } + + #[test] + fn from_bendl_format_error_unwraps_io_arm() { + // The From impl is contractual: a format-layer Io error must surface as + // BendlReadError::Io, NOT as BendlReadError::Format(BendlFormatError::Io(_)). + let inner = io::Error::new(io::ErrorKind::UnexpectedEof, "truncated"); + let fmt_err = BendlFormatError::Io(inner); + let err: BendlReadError = fmt_err.into(); + match err { + BendlReadError::Io(io_err) => assert_eq!(io_err.kind(), io::ErrorKind::UnexpectedEof), + other => panic!("expected Io, got {other:?}"), + } + } + + #[test] + fn from_bendl_format_error_passes_through_non_io_variants() { + let fmt_err = BendlFormatError::MalformedDirectory("broken".to_string()); + let err: BendlReadError = fmt_err.into(); + match err { + BendlReadError::Format(BendlFormatError::MalformedDirectory(msg)) => { + assert_eq!(msg, "broken") + } + other => panic!("expected Format(MalformedDirectory), got {other:?}"), + } + } +} diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 40ee9d6..57debd5 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -873,7 +873,11 @@ fn writer_xz_asset_stores_crc_over_compressed_bytes_not_raw() { let reader = BendlReader::open(Cursor::new(buf)).unwrap(); let entry = reader.find_asset_by_name("xz_asset").cloned().unwrap(); - assert_ne!(entry.asset_flags & ASSET_FLAG_XZ, 0, "asset must be xz-flagged"); + assert_ne!( + entry.asset_flags & ASSET_FLAG_XZ, + 0, + "asset must be xz-flagged" + ); assert_ne!(entry.asset_flags & ASSET_FLAG_CHECKSUM, 0); let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); @@ -1418,7 +1422,10 @@ fn corrupt_stream_checksum(bytes: &mut Vec) { /// Flip a byte in the stream payload to corrupt the stream contents without changing its length. fn corrupt_stream_payload(bytes: &mut Vec, reader: &mut BendlReader>>) { let (offset, len) = reader.assignment_stream_range().unwrap(); - assert!(len > 0, "stream must be non-empty to corrupt a payload byte"); + assert!( + len > 0, + "stream must be non-empty to corrupt a payload byte" + ); // Flip the last byte of the stream region. bytes[(offset + len - 1) as usize] ^= 0x01; } @@ -1483,7 +1490,13 @@ fn assignment_stream_reader_detects_corrupt_stored_checksum() { .and_then(|e| e.downcast_ref::()) .expect("inner ChecksumError"); assert!( - matches!(inner, ChecksumError::Mismatch { target: ChecksumTarget::Stream, .. }), + matches!( + inner, + ChecksumError::Mismatch { + target: ChecksumTarget::Stream, + .. + } + ), "expected Stream Mismatch, got {inner:?}" ); } @@ -1569,7 +1582,10 @@ fn open_assignment_reader_iterator_detects_corrupt_stored_checksum() { } } // Subsequent calls must return None (not repeat the error). - assert!(decoder.next().is_none(), "expected None after mismatch reported"); + assert!( + decoder.next().is_none(), + "expected None after mismatch reported" + ); assert_eq!(decoded_count, samples.len()); } @@ -1603,9 +1619,7 @@ fn write_all_jsonl_detects_corrupt_stored_checksum() { corrupt_stream_checksum(&mut buf); let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); let mut decoder = reader.open_assignment_reader().unwrap(); - let err = decoder - .write_all_jsonl(std::io::sink()) - .unwrap_err(); + let err = decoder.write_all_jsonl(std::io::sink()).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); let inner = err .get_ref() @@ -1630,9 +1644,7 @@ fn for_each_assignment_detects_corrupt_stored_checksum_when_driven_to_eof() { let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); let mut decoder = reader.open_assignment_reader().unwrap(); // Callback always returns Ok(true) so it drives to natural EOF. - let err = decoder - .for_each_assignment(|_, _| Ok(true)) - .unwrap_err(); + let err = decoder.for_each_assignment(|_, _| Ok(true)).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); let inner = err .get_ref() @@ -1800,7 +1812,8 @@ fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { // and assert the reserved bit is still set on the original entry. The new entry must not // pick up any reserved bits. let (initial_bytes, reserved_bit) = bundle_with_reserved_asset_flag_bit(); - let known_v1_bits: u16 = ASSET_FLAG_CHECKSUM | ASSET_FLAG_XZ | crate::io::bundle::format::ASSET_FLAG_JSON; + let known_v1_bits: u16 = + ASSET_FLAG_CHECKSUM | ASSET_FLAG_XZ | crate::io::bundle::format::ASSET_FLAG_JSON; let mut appender = BendlAppender::open(Cursor::new(initial_bytes)).unwrap(); appender @@ -1829,3 +1842,114 @@ fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { "appender must not set any unknown bits on newly written entries" ); } + +// --------------------------------------------------------------------------- +// rollback paths and accessors +// --------------------------------------------------------------------------- + +#[test] +fn stream_session_start_offset_returns_recorded_value() { + // `BendlStreamSession::start_offset` records the file position at session-construction time so + // a caller can later size the stream region. The getter is a one-line method but is the only + // way to read this value, so pin it explicitly. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "blob.bin", + b"abcdef", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let session = writer.into_stream_session().unwrap(); + // Header is 64 bytes; one 6-byte asset payload follows → start_offset = 70. + assert_eq!(session.start_offset(), HEADER_SIZE as u64 + 6); +} + +#[test] +fn writer_duplicate_name_after_singleton_insert_rolls_back_singleton_state() { + // Trigger the rare DuplicateName-after-canonical-singleton-insert branch in BendlWriter:: + // add_asset (the `singleton_types.remove(&asset_type)` rollback path). Reach it by adding a + // custom asset that happens to take the canonical name of a known singleton type, then + // attempting to add the actual singleton: the canonical-name check passes, singleton_types + // accepts the new type, then names.insert fails because the custom asset already claimed + // that name. The rollback keeps the writer state consistent for a future retry. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "graph.json", + b"squatting on the canonical name", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let err = writer + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + b"the real graph", + AddAssetOptions::defaults().json().compress(), + ) + .unwrap_err(); + assert!( + matches!(err, BendlWriteError::DuplicateName(ref n) if n == "graph.json"), + "expected DuplicateName, got {err:?}" + ); + + // The rollback contract: a second attempt at adding ASSET_TYPE_GRAPH must NOT see a stale + // entry in singleton_types from the previous attempt. The writer is also expected to + // remain usable for non-conflicting additions. + writer + .add_asset( + ASSET_TYPE_METADATA, + "metadata.json", + br#"{"v":1}"#, + AddAssetOptions::defaults().json().raw(), + ) + .unwrap(); +} + +#[test] +fn appender_duplicate_name_after_singleton_insert_rolls_back_pending_state() { + // Same rollback contract for BendlAppender (rather than BendlWriter): a successful canonical- + // name singleton insert into pending_singleton_types must be undone if the name collides + // with an existing entry. Reach it by appending a custom asset that takes a canonical name, + // committing, then opening the appender and attempting the singleton add. + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "graph.json", + b"squatter", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let session = writer.into_stream_session().unwrap(); + let writer = session.finish_into_writer(0); + let bundle = writer.finish().unwrap().into_inner(); + + let mut appender = BendlAppender::open(Cursor::new(bundle)).unwrap(); + let err = appender + .add_asset( + ASSET_TYPE_GRAPH, + "graph.json", + b"the real graph", + AddAssetOptions::defaults().json().compress(), + ) + .unwrap_err(); + assert!( + matches!(err, BendlWriteError::DuplicateName(ref n) if n == "graph.json"), + "expected DuplicateName, got {err:?}" + ); + + // After the rejection, the appender must still be usable for non-conflicting additions + // (the rollback removed the stale pending_singleton_types entry). + appender + .add_asset( + ASSET_TYPE_METADATA, + "metadata.json", + br#"{"v":1}"#, + AddAssetOptions::defaults().json().raw(), + ) + .unwrap(); +} diff --git a/ben/src/io/writer/options.rs b/ben/src/io/writer/options.rs index 83ccfbd..34d8741 100644 --- a/ben/src/io/writer/options.rs +++ b/ben/src/io/writer/options.rs @@ -59,3 +59,82 @@ impl Default for XzEncodeOptions { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_equals_default() { + let a = XzEncodeOptions::new(); + let b = XzEncodeOptions::default(); + assert_eq!(a.n_threads, b.n_threads); + assert_eq!(a.compression_level, b.compression_level); + assert_eq!(a.block_size, b.block_size); + assert_eq!(a.twodelta_chunk_size, b.twodelta_chunk_size); + } + + #[test] + fn defaults_are_none_and_default_chunk_size() { + let o = XzEncodeOptions::default(); + assert_eq!(o.n_threads, None); + assert_eq!(o.compression_level, None); + assert_eq!(o.block_size, None); + assert_eq!(o.twodelta_chunk_size, DEFAULT_TWODELTA_CHUNK_SIZE); + } + + #[test] + fn with_n_threads_clamps_zero_to_one() { + // The clamp is part of the contract — the underlying xz mt encoder requires ≥1. + assert_eq!(XzEncodeOptions::new().with_n_threads(0).n_threads, Some(1)); + assert_eq!(XzEncodeOptions::new().with_n_threads(8).n_threads, Some(8)); + } + + #[test] + fn with_compression_level_clamps_to_nine() { + assert_eq!( + XzEncodeOptions::new().with_compression_level(99).compression_level, + Some(9) + ); + // Level 0 (store-mode) is a legitimate setting and must be preserved as-is. + assert_eq!( + XzEncodeOptions::new().with_compression_level(0).compression_level, + Some(0) + ); + assert_eq!( + XzEncodeOptions::new().with_compression_level(6).compression_level, + Some(6) + ); + } + + #[test] + fn with_block_size_round_trips_any_value() { + let o = XzEncodeOptions::new().with_block_size(64 * 1024 * 1024); + assert_eq!(o.block_size, Some(64 * 1024 * 1024)); + } + + #[test] + fn with_twodelta_chunk_size_clamps_zero_to_one() { + assert_eq!( + XzEncodeOptions::new().with_twodelta_chunk_size(0).twodelta_chunk_size, + 1 + ); + assert_eq!( + XzEncodeOptions::new().with_twodelta_chunk_size(7).twodelta_chunk_size, + 7 + ); + } + + #[test] + fn chained_builder_composes_all_fields() { + let o = XzEncodeOptions::new() + .with_n_threads(4) + .with_compression_level(3) + .with_block_size(1024) + .with_twodelta_chunk_size(128); + assert_eq!(o.n_threads, Some(4)); + assert_eq!(o.compression_level, Some(3)); + assert_eq!(o.block_size, Some(1024)); + assert_eq!(o.twodelta_chunk_size, 128); + } +} diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 55bc11b..fe9cb62 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -973,3 +973,125 @@ fn ben_writer_failed_state_after_underlying_writer_error() { let err = w.finish().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); } + +// ── stream_writer/mod.rs coverage ───────────────────────────────────── + +use crate::io::reader::BenWireFormat; +use crate::io::writer::XzEncodeOptions; + +#[test] +fn for_xben_top_level_constructor_round_trips_per_variant() { + // The internal codec plumbing builds XBEN writers through `for_xben_with_encoder` with a + // pre-built XzEncoder; the public `for_xben` constructor (which takes XzEncodeOptions and + // builds the encoder internally) is the path external callers use. Exercise it directly so + // the encoder-construction branch isn't only covered indirectly. + let assignment = vec![1u16, 1, 2, 2]; + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + let mut buf = Vec::new(); + { + let opts = XzEncodeOptions::new() + .with_n_threads(1) + .with_compression_level(1); + let mut writer = BenStreamWriter::for_xben(&mut buf, variant, opts).unwrap(); + writer.write_assignment(assignment.clone()).unwrap(); + writer.finish().unwrap(); + } + let decoded: Vec> = BenStreamReader::from_xben(Cursor::new(&buf)) + .unwrap() + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat(a).take(c as usize) + }) + .collect(); + assert_eq!(decoded, vec![assignment.clone()], "variant={variant:?}"); + } +} + +#[test] +fn writer_variant_and_wire_format_accessors_reflect_construction() { + // The variant() and wire_format() accessors are zero-cost getters but easy to regress — + // a future refactor that adds a third inner variant must keep these in sync. Pin both. + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + let mut buf = Vec::new(); + let ben_writer = BenStreamWriter::for_ben(&mut buf, variant).unwrap(); + assert_eq!(ben_writer.variant(), variant); + assert_eq!(ben_writer.wire_format(), BenWireFormat::Ben); + drop(ben_writer); // BEN writer drop is a no-op-flush. + + let mut buf = Vec::new(); + let xben_writer = build_xben_writer(&mut buf, variant, None); + assert_eq!(xben_writer.variant(), variant); + assert_eq!(xben_writer.wire_format(), BenWireFormat::XBen); + } +} + +#[test] +fn finish_into_inner_returns_underlying_buffer_for_ben_open_state() { + // `finish_into_inner` from the Open state must flush pending state and hand back the inner + // writer. Pins the BEN-Open branch (lines 303-307). + let buf = Vec::new(); + let mut writer = BenStreamWriter::for_ben(buf, BenVariant::Standard).unwrap(); + writer.write_assignment(vec![1u16, 2]).unwrap(); + let inner = writer.finish_into_inner().unwrap(); + // Inner must contain at least the banner; concrete bytes are pinned by other tests. + assert!(inner.starts_with(b"STANDARD BEN FILE")); +} + +#[test] +fn finish_into_inner_returns_underlying_buffer_for_xben_open_state() { + // Pins the XBEN-Open branch (lines 309-315): finish the xz encoder and return the inner + // buffer. + let buf = Vec::new(); + let encoder = XzEncoder::new(buf, 1); + let mut writer = + BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::Standard, None).unwrap(); + writer.write_assignment(vec![1u16, 2]).unwrap(); + let inner = writer.finish_into_inner().unwrap(); + // The inner buffer should be a complete xz stream (decompresses to the BEN stream). We + // don't pin exact bytes; just confirm the writer handed back a non-empty buffer. + assert!(!inner.is_empty()); +} + +#[test] +fn finish_into_inner_from_complete_state_returns_buffer_without_double_flush() { + // After `finish()` succeeds the writer is Complete; `finish_into_inner` must accept this + // state and return the inner writer without trying to flush again. + let buf = Vec::new(); + let mut writer = BenStreamWriter::for_ben(buf, BenVariant::Standard).unwrap(); + writer.write_assignment(vec![1u16, 2]).unwrap(); + writer.finish().unwrap(); + let inner = writer.finish_into_inner().unwrap(); + assert!(inner.starts_with(b"STANDARD BEN FILE")); +} + +#[test] +fn write_json_value_with_malformed_assignment_field_does_not_poison() { + // The JSON-parse step in write_json_value is preflight: a malformed input must error but + // leave the writer in Open so subsequent valid writes still work. This pins the contract + // that JSON validation happens before any stateful encode work. + use serde_json::json; + let mut buf = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut buf, BenVariant::Standard).unwrap(); + // Missing the "assignment" field -> rejected by parse_json_assignment, NOT a stateful + // write -> writer stays Open. + let bad = json!({"sample": 1}); + let err = writer.write_json_value(bad).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + // Writer must still accept a valid sample after the preflight rejection. + writer + .write_json_value(json!({"assignment": [1, 2], "sample": 1})) + .unwrap(); + writer.finish().unwrap(); + } + assert!(buf.starts_with(b"STANDARD BEN FILE")); +} From d0c5b714060ba5f707a577ec6ea91939dd1eb728 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 22 May 2026 08:25:21 -0600 Subject: [PATCH 118/221] more cli path tests for bendl --- ben/src/cli/bendl/tests.rs | 105 ++++++++++++++++++++++++++++ ben/src/cli/reben/tests.rs | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index f9884fb..b0f9308 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -472,3 +472,108 @@ fn run_append_errors_on_missing_custom_asset_file() { assert!(err.contains("failed to read")); let _ = std::fs::remove_file(&bendl); } + +// --------------------------------------------------------------------------- +// extract --stream + inspect display branches +// --------------------------------------------------------------------------- + +#[test] +fn run_extract_stream_writes_raw_assignment_bytes() { + // The existing run_extract_asset_by_name test covers --asset; this companion exercises + // --stream (lines 27-31 of extract.rs). + let known_stream = b"STANDARD BEN FILE\x00\x01known stream bytes"; + let bendl = unique_path("extract_stream.bendl"); + let buf = sample_bendl_bytes(known_stream, AssignmentFormat::Ben); + std::fs::write(&bendl, &buf).unwrap(); + + let out = unique_path("extract_stream_out.bin"); + let args = ExtractArgs::try_parse_from([ + "extract", + "--stream", + "--output", + out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + run_extract(args).unwrap(); + assert_eq!(std::fs::read(&out).unwrap(), known_stream); + + let _ = std::fs::remove_file(&bendl); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_extract_asset_with_unknown_name_errors_cleanly() { + // Pin the no-asset-named-X branch of extract.rs — find_asset_by_name returns None and the + // caller surfaces a clear "no asset named ..." error. + let bendl = write_temp_bendl("extract_unknown_asset.bendl", AssignmentFormat::Ben); + let out = unique_path("extract_unknown_out.bin"); + let args = ExtractArgs::try_parse_from([ + "extract", + "--asset", + "does-not-exist.txt", + "--output", + out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + let err = run_extract(args).unwrap_err(); + assert!( + err.contains("no asset") && err.contains("does-not-exist"), + "expected no-asset error mentioning the name, got: {err}" + ); + let _ = std::fs::remove_file(&bendl); + let _ = std::fs::remove_file(&out); +} + +#[test] +fn run_inspect_displays_asset_with_no_flags_as_dash() { + // Pin inspect.rs line 60 — the `"-".to_string()` fallback for an asset whose asset_flags + // bitmap has no known bits set. Reaching it requires hand-building a directory entry with + // asset_flags=0 (the library writer always sets ASSET_FLAG_CHECKSUM). + use crate::io::bundle::format::{ + encode_directory, BendlDirectoryEntry, BendlHeader, ASSET_TYPE_CUSTOM, BENDL_MAGIC, + BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, + HEADER_SIZE, + }; + + let payload = b"raw bytes"; + let mut bytes = vec![0u8; HEADER_SIZE]; + let payload_offset = bytes.len() as u64; + bytes.extend_from_slice(payload); + + let directory_offset = bytes.len() as u64; + let entries = vec![BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "flagless.bin".to_string(), + payload_offset, + payload_len: payload.len() as u64, + checksum: None, + }]; + let directory = encode_directory(&entries).unwrap(); + bytes.extend_from_slice(&directory); + + let header = BendlHeader { + magic: BENDL_MAGIC, + major_version: BENDL_MAJOR_VERSION, + minor_version: BENDL_MINOR_VERSION, + finalized: FINALIZED_YES, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: HEADER_FLAG_STREAM_CHECKSUM, + stream_checksum: 0, + directory_offset, + directory_len: directory.len() as u64, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: 0, + }; + bytes[..HEADER_SIZE].copy_from_slice(&header.to_bytes()); + + let bendl = unique_path("inspect_flagless.bendl"); + std::fs::write(&bendl, &bytes).unwrap(); + let args = InspectArgs::try_parse_from(["inspect", bendl.to_str().unwrap()]).unwrap(); + run_inspect(args).unwrap(); + let _ = std::fs::remove_file(&bendl); +} diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 1953f8d..f7a009a 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -569,3 +569,143 @@ fn run_ben_mode_with_output_variant_derives_name() { fs::remove_file(&input).unwrap(); result.unwrap(); } + +// --------------------------------------------------------------------------- +// --key / --ordering happy paths and rejection guards +// --------------------------------------------------------------------------- + +/// Minimal 3-node adjacency-style graph JSON, matching the shape `sort_json_file_by_*` accepts. +const SHAPE_JSON: &[u8] = br#"{"nodes":[{"id":0,"GEOID20":"B"},{"id":1,"GEOID20":"A"},{"id":2,"GEOID20":"C"}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#; + +#[test] +fn run_json_mode_with_key_happy_path() { + // Exercise the `if let Some(key)` arm of run_json_mode (sort_json_file_by_key path). The + // existing `run_json_mode_with_ordering_derives_output_name` test only covers the ordering + // arm; this companion pins the key arm. + let shape = unique_path("json_mode_key_shape.json"); + fs::write(&shape, SHAPE_JSON).unwrap(); + let args = Args::try_parse_from([ + "reben", + shape.to_str().unwrap(), + "--mode", + "json", + "--key", + "GEOID20", + ]) + .unwrap(); + let result = run_json_mode(args); + let stem = shape.to_str().unwrap().trim_end_matches(".json").to_owned(); + let derived_map = stem.clone() + "_sorted_by_GEOID20_map.json"; + let derived_sorted = stem + "_sorted_by_GEOID20.json"; + let _ = fs::remove_file(&derived_map); + let _ = fs::remove_file(&derived_sorted); + let _ = fs::remove_file(&shape); + result.unwrap(); +} + +#[test] +fn run_ben_mode_with_key_and_shape_happy_path() { + // Exercise the --key + --shape-file branch of run_ben_mode (lines 76-123 of ben_mode.rs): + // sort by key, generate a map file, then permute the BEN stream by that map. The existing + // tests cover the no-map/no-key path and the --map-file path; this is the gap. + let input = write_temp_ben("ben_mode_key_input.jsonl.ben"); + let shape = unique_path("ben_mode_key_shape.json"); + fs::write(&shape, SHAPE_JSON).unwrap(); + let out = unique_path("ben_mode_key_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--key", + "GEOID20", + "--shape-file", + shape.to_str().unwrap(), + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + + let shape_stem = shape.to_str().unwrap().trim_end_matches(".json").to_owned(); + let _ = fs::remove_file(shape_stem.clone() + "_sorted_by_GEOID20_map.json"); + let _ = fs::remove_file(shape_stem + "_sorted_by_GEOID20.json"); + let _ = fs::remove_file(&shape); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); + result.unwrap(); +} + +#[test] +fn run_ben_mode_with_ordering_and_shape_happy_path() { + // The complement of the --key test: --ordering instead. Exercises + // `sort_json_file_by_ordering` + `to_graph_ordering`. + let input = write_temp_ben("ben_mode_ord_input.jsonl.ben"); + let shape = unique_path("ben_mode_ord_shape.json"); + fs::write(&shape, SHAPE_JSON).unwrap(); + let out = unique_path("ben_mode_ord_output.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--ordering", + "reverse-cuthill-mckee", + "--shape-file", + shape.to_str().unwrap(), + "--output-file", + out.to_str().unwrap(), + ]) + .unwrap(); + let result = run_ben_mode(args); + + let shape_stem = shape.to_str().unwrap().trim_end_matches(".json").to_owned(); + let _ = fs::remove_file(shape_stem.clone() + "_sorted_by_reverse-cuthill-mckee_map.json"); + let _ = fs::remove_file(shape_stem + "_sorted_by_reverse-cuthill-mckee.json"); + let _ = fs::remove_file(&shape); + let _ = fs::remove_file(&input); + let _ = fs::remove_file(&out); + result.unwrap(); +} + +#[test] +fn run_ben_mode_rejects_map_file_combined_with_key() { + // The map-file + key/ordering conflict guard (ben_mode.rs lines 67-74). The check fires + // after the input file is opened, so we provide a valid BEN input to reach the guard. + let input = write_temp_ben("map_plus_key_input.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--map-file", + "m.json", + "--key", + "k", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + let _ = fs::remove_file(&input); + assert!( + err.contains("map file") || err.contains("sorting option"), + "expected map+sort conflict error, got: {err}" + ); +} + +#[test] +fn run_ben_mode_rejects_key_without_shape_file() { + // The shape-file presence guard (ben_mode.rs line 78-80). + let input = write_temp_ben("key_no_shape_input.jsonl.ben"); + let args = Args::try_parse_from([ + "reben", + input.to_str().unwrap(), + "--mode", + "ben", + "--key", + "GEOID20", + ]) + .unwrap(); + let err = run_ben_mode(args).unwrap_err(); + let _ = fs::remove_file(&input); + assert!(err.contains("shape file"), "got: {err}"); +} From 7b5841021df74b6c92d4d7af1c59ccb5c98b608d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 22 May 2026 08:43:18 -0600 Subject: [PATCH 119/221] remove redundant tests --- ben/src/cli/ben/bundle.rs | 164 ------------------ ben/tests/test_coverage.rs | 330 +------------------------------------ 2 files changed, 9 insertions(+), 485 deletions(-) diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index dfbb7d5..05e5682 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -145,167 +145,3 @@ pub(super) fn run_xencode_bundle_with_graph( append_graph_asset(out_path, graph_path) } -#[cfg(test)] -mod tests { - //! In-process tests for the `--graph` bundle dispatchers. CLI subprocess tests - //! (`tests/test_cli.rs`) confirm argv parsing and exit codes; coverage instrumentation does - //! not follow subprocess boundaries, so unit tests here are required to actually exercise - //! these functions' branches. - use super::*; - use crate::test_utils::unique_path; - use std::io::Write; - - fn canonical_jsonl() -> &'static [u8] { - b"{\"assignment\":[1,1,2],\"sample\":1}\n{\"assignment\":[2,2,3],\"sample\":2}\n" - } - - fn canonical_graph() -> &'static [u8] { - b"{\"nodes\":3,\"edges\":[[0,1],[1,2]]}" - } - - /// Allocate a fresh per-test directory under the system temp dir. Returned path is created - /// on disk so callers can write files into it immediately. - fn fresh_temp_dir(label: &str) -> std::path::PathBuf { - let p = unique_path(label); - std::fs::create_dir_all(&p).unwrap(); - p - } - - #[test] - fn run_encode_bundle_with_graph_produces_readable_bundle() { - let temp = fresh_temp_dir("encode-bundle-graph"); - let input = temp.join("input.jsonl"); - std::fs::write(&input, canonical_jsonl()).unwrap(); - let graph = temp.join("graph.json"); - std::fs::write(&graph, canonical_graph()).unwrap(); - let out = temp.join("out.bendl"); - let out_str = out.to_string_lossy().into_owned(); - - run_encode_bundle_with_graph(&input, &out_str, BenVariant::Standard, &graph).unwrap(); - - let reader = crate::io::bundle::BendlReader::open( - std::fs::File::open(&out).expect("open bundle"), - ) - .expect("open bundle"); - assert!(reader.is_finalized()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - } - - #[test] - fn run_xencode_bundle_with_graph_from_jsonl_input_succeeds() { - // jsonl_and_xben dispatch arm: from_ben=false, encode_jsonl_to_xben path. - let temp = fresh_temp_dir("xencode-bundle-graph-jsonl"); - let input = temp.join("input.jsonl"); - std::fs::write(&input, canonical_jsonl()).unwrap(); - let graph = temp.join("graph.json"); - std::fs::write(&graph, canonical_graph()).unwrap(); - let out = temp.join("out.bendl"); - let out_str = out.to_string_lossy().into_owned(); - - run_xencode_bundle_with_graph( - &input, - &out_str, - BenVariant::Standard, - /* from_ben */ false, - Some(1), - Some(1), - None, - None, - &graph, - ) - .unwrap(); - - let reader = crate::io::bundle::BendlReader::open( - std::fs::File::open(&out).expect("open bundle"), - ) - .expect("open bundle"); - assert!(reader.is_finalized()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - } - - #[test] - fn run_xencode_bundle_with_graph_from_ben_input_succeeds() { - // from_ben=true dispatch arm — exercises the encode_ben_to_xben branch in the function - // body, the gap the CLI subprocess tests can't touch under llvm-cov. - let temp = fresh_temp_dir("xencode-bundle-graph-ben"); - - let jsonl = temp.join("input.jsonl"); - std::fs::write(&jsonl, canonical_jsonl()).unwrap(); - let ben_path = temp.join("input.ben"); - { - use std::io::BufReader; - let reader = BufReader::new(std::fs::File::open(&jsonl).unwrap()); - let mut writer = std::fs::File::create(&ben_path).unwrap(); - encode_jsonl_to_ben(reader, &mut writer, BenVariant::Standard).unwrap(); - writer.flush().unwrap(); - } - - let graph = temp.join("graph.json"); - std::fs::write(&graph, canonical_graph()).unwrap(); - let out = temp.join("out.bendl"); - let out_str = out.to_string_lossy().into_owned(); - - run_xencode_bundle_with_graph( - &ben_path, - &out_str, - BenVariant::Standard, - /* from_ben */ true, - Some(1), - Some(1), - None, - None, - &graph, - ) - .unwrap(); - - let reader = crate::io::bundle::BendlReader::open( - std::fs::File::open(&out).expect("open bundle"), - ) - .expect("open bundle"); - assert!(reader.is_finalized()); - assert!(reader.find_asset_by_name("graph.json").is_some()); - } - - #[test] - fn run_encode_bundle_with_graph_rejects_missing_graph_file() { - let temp = fresh_temp_dir("encode-bundle-missing-graph"); - let input = temp.join("input.jsonl"); - std::fs::write(&input, canonical_jsonl()).unwrap(); - let nonexistent_graph = temp.join("does-not-exist.json"); - let out = temp.join("out.bendl"); - let out_str = out.to_string_lossy().into_owned(); - - let err = run_encode_bundle_with_graph( - &input, - &out_str, - BenVariant::Standard, - &nonexistent_graph, - ) - .unwrap_err(); - assert!( - err.to_string().contains("graph") || err.kind() == io::ErrorKind::NotFound, - "expected missing-graph error, got {err}" - ); - } - - #[test] - fn append_graph_asset_rejects_missing_graph_path() { - let temp = fresh_temp_dir("append-graph-missing"); - let input = temp.join("input.jsonl"); - std::fs::write(&input, canonical_jsonl()).unwrap(); - let graph = temp.join("graph.json"); - std::fs::write(&graph, canonical_graph()).unwrap(); - let out = temp.join("out.bendl"); - let out_str = out.to_string_lossy().into_owned(); - run_encode_bundle_with_graph(&input, &out_str, BenVariant::Standard, &graph).unwrap(); - - // append_graph_asset is the function under test (separate from the dispatchers, which - // already validated graph existence at the top). - let missing = temp.join("missing.json"); - let err = append_graph_asset(&out_str, &missing).unwrap_err(); - assert!( - err.to_string().contains("graph") || err.kind() == io::ErrorKind::NotFound, - "expected missing-graph error, got {err}" - ); - } -} diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index f67c690..4bf0515 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -10,7 +10,7 @@ use binary_ensemble::codec::encode::{ }; use binary_ensemble::codec::BenEncodeFrame; use binary_ensemble::format::banners::{ - banner_for_variant, has_known_banner_prefix, variant_from_banner, BANNER_LEN, + banner_for_variant, has_known_banner_prefix, variant_from_banner, MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::reader::{ @@ -83,21 +83,6 @@ fn make_ring_graph_json(n: usize) -> String { // ────────────────────────────────────────────────────────────────────────────── format::banners // ────────────────────────────────────────────────────────────────────────────── -#[test] -fn banner_constants_have_correct_length() { - assert_eq!(STANDARD_BEN_BANNER.len(), BANNER_LEN); - assert_eq!(MKVCHAIN_BEN_BANNER.len(), BANNER_LEN); - assert_eq!(TWODELTA_BEN_BANNER.len(), BANNER_LEN); - assert_eq!(BANNER_LEN, 17); -} - -#[test] -fn banner_constants_have_correct_content() { - assert_eq!(STANDARD_BEN_BANNER, b"STANDARD BEN FILE"); - assert_eq!(MKVCHAIN_BEN_BANNER, b"MKVCHAIN BEN FILE"); - assert_eq!(TWODELTA_BEN_BANNER, b"TWODELTA BEN FILE"); -} - #[test] fn banner_for_variant_returns_correct_banners() { assert_eq!( @@ -175,64 +160,6 @@ fn has_known_banner_prefix_rejects_garbage() { // ────────────────────────────────────────────────────────────────────────────── util::rle // ────────────────────────────────────────────────────────────────────────────── -#[test] -fn assign_to_rle_empty_vec() { - let v: Vec = vec![]; - assert_eq!(assign_to_rle(&v), vec![]); -} - -#[test] -fn rle_to_vec_empty_vec() { - let rle: Vec<(u16, u16)> = vec![]; - assert_eq!(rle_to_vec(rle), Vec::::new()); -} - -#[test] -fn assign_to_rle_single_element() { - assert_eq!(assign_to_rle(&[42u16]), vec![(42, 1)]); -} - -#[test] -fn assign_to_rle_all_same() { - let v = vec![7u16; 100]; - assert_eq!(assign_to_rle(&v), vec![(7, 100)]); -} - -#[test] -fn assign_to_rle_all_different() { - let v = vec![1u16, 2, 3, 4, 5]; - let expected = vec![(1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]; - assert_eq!(assign_to_rle(&v), expected); -} - -#[test] -fn assign_to_rle_alternating() { - let v = vec![1u16, 2, 1, 2, 1, 2]; - let expected = vec![(1, 1), (2, 1), (1, 1), (2, 1), (1, 1), (2, 1)]; - assert_eq!(assign_to_rle(&v), expected); -} - -#[test] -fn assign_to_rle_with_zero_values() { - let v = vec![0u16, 0, 1, 0, 0]; - let expected = vec![(0, 2), (1, 1), (0, 2)]; - assert_eq!(assign_to_rle(&v), expected); -} - -#[test] -fn assign_to_rle_max_u16_value() { - let v = vec![65535u16; 3]; - assert_eq!(assign_to_rle(&v), vec![(65535, 3)]); -} - -#[test] -fn rle_to_vec_single_long_run() { - let rle = vec![(99u16, 1000u16)]; - let result = rle_to_vec(rle); - assert_eq!(result.len(), 1000); - assert!(result.iter().all(|&v| v == 99)); -} - #[test] fn rle_roundtrip_preserves_data() { let original = vec![3u16, 3, 3, 1, 1, 4, 4, 4, 4, 2]; @@ -1595,52 +1522,9 @@ fn encode_twodelta_frame_single_value_swap() { } // ────────────────────────────────────────────────────────────────────────────── -// TwoDeltaEncodeFrame accessors +// TwoDeltaEncodeFrame round-trip // ────────────────────────────────────────────────────────────────────────────── -#[test] -fn twodelta_frame_pair_accessor() { - let pair = (3u16, 7u16); - let run_lengths = vec![2u16, 3, 1]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.pair().unwrap(), pair); -} - -#[test] -fn twodelta_frame_max_len_bits_accessor() { - // max run length = 4 = 0b100 → 3 bits - let pair = (1u16, 2u16); - let run_lengths = vec![4u16, 4]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.max_len_bit_count(), 3); -} - -#[test] -fn twodelta_frame_n_bytes_and_payload_consistent() { - let pair = (5u16, 10u16); - let run_lengths = vec![1u16, 2, 3]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.n_bytes() as usize, frame.payload().len()); -} - -#[test] -fn twodelta_frame_to_bytes_and_as_slice_same() { - let pair = (1u16, 2u16); - let run_lengths = vec![3u16, 2]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - assert_eq!(frame.to_bytes(), frame.as_slice()); -} - -#[test] -fn twodelta_frame_into_bytes_consumes() { - let pair = (1u16, 2u16); - let run_lengths = vec![3u16, 2]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - let expected = frame.to_bytes(); - let actual = frame.into_bytes(); - assert_eq!(actual, expected); -} - #[test] fn twodelta_frame_from_parts_round_trip() { let pair = (10u16, 20u16); @@ -1662,87 +1546,9 @@ fn twodelta_frame_from_parts_round_trip() { assert_eq!(original.count(), reconstructed.count()); } -#[test] -fn twodelta_frame_asref_and_deref() { - let pair = (1u16, 2u16); - let run_lengths = vec![3u16]; - let frame = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - let as_ref: &[u8] = frame.as_ref(); - let deref: &[u8] = &*frame; - assert_eq!(as_ref, deref); - assert_eq!(as_ref, frame.as_slice()); -} - -// ────────────────────────────────────────────────────────────────────────────── EncodeBenFrame -// (BenFrame from codec::encode) accessors // ────────────────────────────────────────────────────────────────────────────── - -#[test] -fn encode_ben_frame_from_rle_runs_accessor() { - let runs = vec![(3u16, 2u16), (5u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); - assert_eq!(frame.runs().unwrap().as_slice(), runs.as_slice()); -} - -#[test] -fn encode_ben_frame_max_val_bits() { - // max value = 5 = 0b101 → 3 bits - let runs = vec![(1u16, 3u16), (5u16, 2u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - assert_eq!(frame.max_val_bit_count(), Some(3)); -} - -#[test] -fn encode_ben_frame_max_len_bits() { - // max run length = 7 = 0b111 → 3 bits - let runs = vec![(1u16, 7u16), (2u16, 1u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - assert_eq!(frame.max_len_bit_count(), 3); -} - -#[test] -fn encode_ben_frame_n_bytes_consistent() { - // Frame layout: 1 byte (max_val_bits) + 1 byte (max_len_bits) + 4 bytes (n_bytes header) + - // n_bytes payload - let runs = vec![(1u16, 5u16), (2u16, 3u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - assert_eq!(frame.n_bytes() as usize + 6, frame.as_slice().len()); -} - -#[test] -fn encode_ben_frame_to_bytes_and_as_slice_same() { - let runs = vec![(1u16, 2u16), (3u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - assert_eq!(frame.to_bytes(), frame.as_slice()); -} - -#[test] -fn encode_ben_frame_into_bytes_consumes() { - let runs = vec![(1u16, 2u16), (3u16, 4u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - let expected = frame.to_bytes(); - let actual = frame.into_bytes(); - assert_eq!(actual, expected); -} - -#[test] -fn encode_ben_frame_eq_with_vec_u8() { - let runs = vec![(1u16, 2u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - let bytes = frame.to_bytes(); - assert!(frame == bytes); - assert!(bytes == frame); -} - -#[test] -fn encode_ben_frame_asref_and_deref() { - let runs = vec![(1u16, 1u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); - let as_ref: &[u8] = frame.as_ref(); - let deref: &[u8] = &*frame; - assert_eq!(as_ref, deref); - assert_eq!(as_ref, frame.as_slice()); -} +// EncodeBenFrame: from_assignment matches the RLE entrypoint +// ────────────────────────────────────────────────────────────────────────────── #[test] fn encode_ben_frame_from_assignment() { @@ -1995,126 +1801,8 @@ fn ben_frame_decoder_twodelta_yields_standard_frames() { assert_eq!(frames.len(), 2); } -// ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — BenStreamReader subsample methods -// ────────────────────────────────────────────────────────────────────────────── - -#[test] -fn ben_decoder_subsample_by_indices() { - let assignments: Vec> = (0u16..10).map(|i| vec![i; 4]).collect(); - let ben = encode_standard_ben(&assignments); - let decoder = BenStreamReader::from_ben(Cursor::new(ben)) - .unwrap() - .silent(true); - // 1-based indices: 2, 5, 8 - let selected: Vec> = decoder - .into_subsample_by_indices(vec![2usize, 5, 8]) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 3); - assert_eq!(selected[0], assignments[1]); // 1-based 2 = 0-based 1 - assert_eq!(selected[1], assignments[4]); // 1-based 5 = 0-based 4 - assert_eq!(selected[2], assignments[7]); // 1-based 8 = 0-based 7 -} - -#[test] -fn ben_decoder_subsample_by_range() { - let assignments: Vec> = (0u16..10).map(|i| vec![i; 3]).collect(); - let ben = encode_standard_ben(&assignments); - let decoder = BenStreamReader::from_ben(Cursor::new(ben)) - .unwrap() - .silent(true); - // Inclusive 1-based range [3, 6] - let selected: Vec> = decoder - .into_subsample_by_range(3, 6) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 4); - assert_eq!(selected[0], assignments[2]); // 3rd sample - assert_eq!(selected[3], assignments[5]); // 6th sample -} - -#[test] -fn ben_decoder_subsample_every_nth() { - let assignments: Vec> = (0u16..10).map(|i| vec![i; 2]).collect(); - let ben = encode_standard_ben(&assignments); - let decoder = BenStreamReader::from_ben(Cursor::new(ben)) - .unwrap() - .silent(true); - // Every 3rd sample starting at 1-based offset 1: samples 1, 4, 7, 10 - let selected: Vec> = decoder - .into_subsample_every(3, 1) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 4); - assert_eq!(selected[0], assignments[0]); - assert_eq!(selected[1], assignments[3]); - assert_eq!(selected[2], assignments[6]); - assert_eq!(selected[3], assignments[9]); -} - -#[test] -fn ben_decoder_subsample_by_indices_dedup() { - let assignments: Vec> = (0u16..5).map(|i| vec![i; 2]).collect(); - let ben = encode_standard_ben(&assignments); - let decoder = BenStreamReader::from_ben(Cursor::new(ben)) - .unwrap() - .silent(true); - // Duplicate index 2 → after dedup only samples 2 and 3 are selected - let selected: Vec> = decoder - .into_subsample_by_indices(vec![2usize, 2, 3]) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 2); - assert_eq!(selected[0], assignments[1]); - assert_eq!(selected[1], assignments[2]); -} - -// ────────────────────────────────────────────────────────────────────────────── -// SubsampleFrameDecoder — BenStreamReader subsample methods -// ────────────────────────────────────────────────────────────────────────────── - -#[test] -fn xben_decoder_subsample_by_indices() { - let assignments: Vec> = (1u16..=5).map(|i| vec![i; 4]).collect(); - let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); - let selected: Vec> = decoder - .into_subsample_by_indices(vec![1usize, 3, 5]) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 3); - assert_eq!(selected[0], assignments[0]); - assert_eq!(selected[1], assignments[2]); - assert_eq!(selected[2], assignments[4]); -} - -#[test] -fn xben_decoder_subsample_by_range() { - let assignments: Vec> = (0u16..6).map(|i| vec![i; 3]).collect(); - let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); - let selected: Vec> = decoder - .into_subsample_by_range(2, 4) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 3); - assert_eq!(selected[0], assignments[1]); - assert_eq!(selected[2], assignments[3]); -} - -#[test] -fn xben_decoder_subsample_every() { - let assignments: Vec> = (0u16..6).map(|i| vec![i; 2]).collect(); - let xben = encode_xben(&assignments, BenVariant::Standard); - let decoder = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); - // Every 2nd sample starting from offset 1: samples 1, 3, 5 - let selected: Vec> = decoder - .into_subsample_every(2, 1) - .map(|r| r.unwrap().0) - .collect(); - assert_eq!(selected.len(), 3); - assert_eq!(selected[0], assignments[0]); - assert_eq!(selected[1], assignments[2]); - assert_eq!(selected[2], assignments[4]); -} +// Subsample-method single-case tests were deleted in the suite-audit deletion pass: +// `fuzz_subsample_by_indices`, `fuzz_subsample_every`, `fuzz_subsample_range`, and +// `fuzz_subsample_by_indices_twodelta` in `tests/test_impls_pipeline.rs` exercise these methods +// over random sequences with random indices/ranges/strides for both BEN and XBEN, subsuming the +// per-method single-case checks formerly here. From abac92d069c89becb55b64ca352f0f2e14f41be7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 25 May 2026 11:23:18 -0600 Subject: [PATCH 120/221] formatting --- ben/src/cli/bendl/tests.rs | 4 +- ben/src/cli/reben/tests.rs | 4 +- ben/src/codec/encode/tests.rs | 8 +-- ben/src/io/bundle/format.rs | 28 +++++----- ben/src/io/bundle/reader.rs | 8 +-- ben/src/io/bundle/tests/reader.rs | 32 +++++++---- ben/src/io/bundle/tests/writer.rs | 28 +++++----- ben/src/io/bundle/writer.rs | 4 +- ben/src/json/graph/tests/test_io.rs | 84 ++++++++++++++--------------- ben/tests/test_cli.rs | 4 +- ben/tests/test_format_stability.rs | 4 +- ben/tests/test_impls_pipeline.rs | 16 ++++-- ben/tests/test_stress_edges.rs | 12 ++--- 13 files changed, 128 insertions(+), 108 deletions(-) diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index b0f9308..f799722 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -473,9 +473,9 @@ fn run_append_errors_on_missing_custom_asset_file() { let _ = std::fs::remove_file(&bendl); } -// --------------------------------------------------------------------------- +// ===================================================================== // extract --stream + inspect display branches -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn run_extract_stream_writes_raw_assignment_bytes() { diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index f7a009a..026e7fb 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -570,9 +570,9 @@ fn run_ben_mode_with_output_variant_derives_name() { result.unwrap(); } -// --------------------------------------------------------------------------- +// ===================================================================== // --key / --ordering happy paths and rejection guards -// --------------------------------------------------------------------------- +// ===================================================================== /// Minimal 3-node adjacency-style graph JSON, matching the shape `sort_json_file_by_*` accepts. const SHAPE_JSON: &[u8] = br#"{"nodes":[{"id":0,"GEOID20":"B"},{"id":1,"GEOID20":"A"},{"id":2,"GEOID20":"C"}],"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1}]]}"#; diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index cca8815..9caa5a3 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1337,9 +1337,9 @@ fn ben32_encode_run_exceeding_u16_max_splits_correctly() { assert_eq!(sentinel, 0u32); // always-present zero sentinel } -// --------------------------------------------------------------------------- +// ===================================================================== // Label-value 0 round-trips for MkvChain and TwoDelta -// --------------------------------------------------------------------------- +// ===================================================================== /// MkvChain round-trip with label `0` in the assignment. The existing /// `encode_jsonl_to_ben_single_zero` test covers Standard; MkvChain is structurally similar but @@ -1450,9 +1450,9 @@ fn twodelta_round_trip_all_zero_assignment() { assert_eq!(decoded, assignments); } -// --------------------------------------------------------------------------- +// ===================================================================== // Bit-packing boundary widths -// --------------------------------------------------------------------------- +// ===================================================================== /// Round-trip `assignment` through `BenStreamWriter::for_ben` + `BenStreamReader::from_ben`, /// asserting the decoded result matches the input. Used by the bit-packing boundary-width sweep diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 2f2fac1..3e2c3d6 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -11,9 +11,9 @@ use std::io::{self, Read, Write}; use thiserror::Error; -// --------------------------------------------------------------------------- +// ===================================================================== // Magic, version, and header layout -// --------------------------------------------------------------------------- +// ===================================================================== /// Magic bytes at offset 0 of every `.bendl` file. pub const BENDL_MAGIC: [u8; 8] = *b"BENDL\0\0\x01"; @@ -40,9 +40,9 @@ pub const FINALIZED_YES: u8 = 1; /// only for adversarial reader fixtures and partial-recovery flows. pub const HEADER_FLAG_STREAM_CHECKSUM: u32 = 1 << 0; -// --------------------------------------------------------------------------- +// ===================================================================== // Assignment format identifiers -// --------------------------------------------------------------------------- +// ===================================================================== /// Assignment format identifier: embedded BEN stream. pub const ASSIGNMENT_FORMAT_BEN: u8 = 1; @@ -80,9 +80,9 @@ impl AssignmentFormat { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Asset types, flags, standardized names -// --------------------------------------------------------------------------- +// ===================================================================== /// Asset type id for `metadata.json`. pub const ASSET_TYPE_METADATA: u16 = 1; @@ -172,9 +172,9 @@ pub const ASSET_CHECKSUM_LEN: u32 = 4; /// reasonable ratio/speed balance for JSON payloads. pub const DEFAULT_XZ_PRESET: u32 = 6; -// --------------------------------------------------------------------------- +// ===================================================================== // Header -// --------------------------------------------------------------------------- +// ===================================================================== /// In-memory representation of the fixed 64-byte `.bendl` header. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -315,9 +315,9 @@ impl BendlHeader { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Directory entry -// --------------------------------------------------------------------------- +// ===================================================================== /// Fixed-size header at the start of every directory entry, before the variable-length `name` and /// optional `checksum` bytes. @@ -442,9 +442,9 @@ impl BendlDirectoryEntry { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Directory table -// --------------------------------------------------------------------------- +// ===================================================================== /// Read the full directory table from a `Read` source. /// @@ -478,9 +478,9 @@ pub fn encode_directory(entries: &[BendlDirectoryEntry]) -> Result, Bend Ok(out) } -// --------------------------------------------------------------------------- +// ===================================================================== // Errors -// --------------------------------------------------------------------------- +// ===================================================================== /// Errors produced by the `.bendl` format layer. #[derive(Debug, Error)] diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index efb6123..a562d2f 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -525,9 +525,9 @@ impl BendlReader { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Strict-length plumbing -// --------------------------------------------------------------------------- +// ===================================================================== /// Marker error attached to the `io::Error` returned when an [`ExactLen`] reader hits underlying /// EOF before consuming its declared length. Used by convenience APIs to recognise a bundle-layer @@ -643,9 +643,9 @@ impl Read for ShortRangeAwareReader { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Verifying reader plumbing -// --------------------------------------------------------------------------- +// ===================================================================== #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum VerifyState { diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index c6ec4d6..05bf86f 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -253,9 +253,9 @@ fn validate_directory_catches_wrong_canonical_name() { )); } -// ----------------------------------------------------------------------- +// ===================================================================== // Robustness tests -// ----------------------------------------------------------------------- +// ===================================================================== /// Build a small finalized bundle with a known graph asset, metadata asset, empty stream, and no /// validation pitfalls. Useful as a base that tests can mutate byte-by-byte. @@ -923,7 +923,9 @@ fn stored_checksum_offset(directory_offset: u64, name: &str) -> usize { entry_start + 28 + name.len() } -// ----- Explicit verify_asset_checksum ------------------------------- +// ===================================================================== +// Explicit verify_asset_checksum +// ===================================================================== #[test] fn verify_asset_checksum_uncompressed_passes_on_intact_bundle() { @@ -1046,7 +1048,9 @@ fn verify_asset_checksum_returns_unavailable_when_flag_clear() { assert_eq!(got, payload); } -// ----- Verify-on-touch via asset_bytes ------------------------------ +// ===================================================================== +// Verify-on-touch via asset_bytes +// ===================================================================== #[test] fn asset_bytes_uncompressed_corrupt_payload_returns_checksum_mismatch() { @@ -1166,7 +1170,9 @@ fn asset_bytes_returns_unavailable_when_flag_clear() { )); } -// ----- asset_reader EOF semantics ---------------------------------- +// ===================================================================== +// asset_reader EOF semantics +// ===================================================================== #[test] fn asset_reader_uncompressed_surfaces_mismatch_on_final_read() { @@ -1198,7 +1204,9 @@ fn asset_reader_uncompressed_surfaces_mismatch_on_final_read() { assert_eq!(total_ok, b"abcdef".len()); } -// ----- Bulk verifier ------------------------------------------------- +// ===================================================================== +// Bulk verifier +// ===================================================================== #[test] fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { @@ -1267,7 +1275,9 @@ fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { assert!(matches!(&target, ChecksumTarget::Asset(n) if n == "first")); } -// ----- Polynomial pin ------------------------------------------------ +// ===================================================================== +// Polynomial pin +// ===================================================================== #[test] fn crc32c_polynomial_pin_against_known_vectors() { @@ -1579,9 +1589,9 @@ fn verify_stream_checksum_returns_bundle_incomplete_for_unfinalized() { ); } -// --------------------------------------------------------------------------- +// ===================================================================== // Strict payload_len / stream_len EOF enforcement -// --------------------------------------------------------------------------- +// ===================================================================== /// Returns a bundle whose `metadata.json` entry's `payload_len` has been corrupted to point past /// EOF, while the rest of the file remains structurally valid. @@ -1844,9 +1854,9 @@ fn open_assignment_reader_returns_unexpected_eof_when_banner_falls_in_short_rang } } -// --------------------------------------------------------------------------- +// ===================================================================== // Forward-compat: unknown asset-flag bits -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn asset_with_unknown_flag_bit_opens_and_verifies_checksum() { diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 57debd5..f19fc33 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -209,9 +209,9 @@ fn finalized_directory_lives_at_eof() { ); } -// ----------------------------------------------------------------------- +// ===================================================================== // Append-path tests -// ----------------------------------------------------------------------- +// ===================================================================== /// Build a finalized bundle with a single `metadata.json` asset and a short fake stream, then /// return both the bytes and the byte range (offset, len) occupied by the stream region. @@ -498,7 +498,9 @@ fn append_rejects_conflicting_pending_additions() { assert_eq!(buf, bundle_before); } -// -------- Phase 4: assignment-stream integration tests -------- +// ===================================================================== +// Phase 4: assignment-stream integration tests +// ===================================================================== #[test] fn bundle_ben_stream_round_trips_through_assignment_reader() { @@ -649,9 +651,9 @@ fn open_assignment_reader_reports_ben_wire_format() { assert_eq!(decoder.wire_format(), BenWireFormat::Ben); } -// ----------------------------------------------------------------------- +// ===================================================================== // Robustness tests -// ----------------------------------------------------------------------- +// ===================================================================== #[test] fn fully_empty_bundle_finalizes_and_round_trips() { @@ -982,9 +984,9 @@ fn append_rejects_duplicate_name_across_existing_and_pending() { assert!(reader.find_asset_by_name("blob").is_some()); } -// ----------------------------------------------------------------------- +// ===================================================================== // Randomized / stress tests -// ----------------------------------------------------------------------- +// ===================================================================== /// Build a bundle from a random set of custom assets (plus an optional metadata asset) and fully /// round-trip it through the reader. Repeated with a seeded ChaCha PRNG so the sequence is @@ -1671,9 +1673,9 @@ fn open_assignment_reader_intact_bundle_round_trips_count_samples() { assert_eq!(n, samples.len()); } -// --------------------------------------------------------------------------- +// ===================================================================== // Forward-compat: appender preserves unknown asset-flag bits on existing entries -// --------------------------------------------------------------------------- +// ===================================================================== /// Build a finalized BENDL bundle with a single custom asset whose `asset_flags` carries a /// reserved (unknown-in-v1.0.0) bit alongside the known `ASSET_FLAG_CHECKSUM` bit. Used to @@ -1720,9 +1722,9 @@ fn bundle_with_reserved_asset_flag_bit() -> (Vec, u16) { (bytes, RESERVED_BIT_7) } -// --------------------------------------------------------------------------- +// ===================================================================== // Concurrent reader access -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn two_parallel_readers_against_the_same_bundle_agree() { @@ -1843,9 +1845,9 @@ fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { ); } -// --------------------------------------------------------------------------- +// ===================================================================== // rollback paths and accessors -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn stream_session_start_offset_returns_recorded_value() { diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 87fdd15..d54730b 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -509,9 +509,9 @@ pub enum BendlWriteError { Io(#[from] io::Error), } -// --------------------------------------------------------------------------- +// ===================================================================== // Append path -// --------------------------------------------------------------------------- +// ===================================================================== /// Post-finalize appender that grows an existing `.bendl` file with new assets without rewriting /// the assignment stream. diff --git a/ben/src/json/graph/tests/test_io.rs b/ben/src/json/graph/tests/test_io.rs index 12d327f..7435687 100644 --- a/ben/src/json/graph/tests/test_io.rs +++ b/ben/src/json/graph/tests/test_io.rs @@ -48,9 +48,9 @@ fn normalize(format: &mut NxGraphAdjFormat) { } } -// ================================================================ -// == Fixtures (generated with `uv run --with networkx python3`) == -// ================================================================ +// ===================================================================== +// Fixtures (generated with `uv run --with networkx python3`) +// ===================================================================== const KARATE_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [["name", "Zachary's Karate Club"]], "nodes": [{"club": "Mr. Hi", "id": 0}, {"club": "Mr. Hi", "id": 1}, {"club": "Mr. Hi", "id": 2}, {"club": "Mr. Hi", "id": 3}, {"club": "Mr. Hi", "id": 4}, {"club": "Mr. Hi", "id": 5}, {"club": "Mr. Hi", "id": 6}, {"club": "Mr. Hi", "id": 7}, {"club": "Mr. Hi", "id": 8}, {"club": "Officer", "id": 9}, {"club": "Mr. Hi", "id": 10}, {"club": "Mr. Hi", "id": 11}, {"club": "Mr. Hi", "id": 12}, {"club": "Mr. Hi", "id": 13}, {"club": "Officer", "id": 14}, {"club": "Officer", "id": 15}, {"club": "Mr. Hi", "id": 16}, {"club": "Mr. Hi", "id": 17}, {"club": "Officer", "id": 18}, {"club": "Mr. Hi", "id": 19}, {"club": "Officer", "id": 20}, {"club": "Mr. Hi", "id": 21}, {"club": "Officer", "id": 22}, {"club": "Officer", "id": 23}, {"club": "Officer", "id": 24}, {"club": "Officer", "id": 25}, {"club": "Officer", "id": 26}, {"club": "Officer", "id": 27}, {"club": "Officer", "id": 28}, {"club": "Officer", "id": 29}, {"club": "Officer", "id": 30}, {"club": "Officer", "id": 31}, {"club": "Officer", "id": 32}, {"club": "Officer", "id": 33}], "adjacency": [[{"weight": 4, "id": 1}, {"weight": 5, "id": 2}, {"weight": 3, "id": 3}, {"weight": 3, "id": 4}, {"weight": 3, "id": 5}, {"weight": 3, "id": 6}, {"weight": 2, "id": 7}, {"weight": 2, "id": 8}, {"weight": 2, "id": 10}, {"weight": 3, "id": 11}, {"weight": 1, "id": 12}, {"weight": 3, "id": 13}, {"weight": 2, "id": 17}, {"weight": 2, "id": 19}, {"weight": 2, "id": 21}, {"weight": 2, "id": 31}], [{"weight": 4, "id": 0}, {"weight": 6, "id": 2}, {"weight": 3, "id": 3}, {"weight": 4, "id": 7}, {"weight": 5, "id": 13}, {"weight": 1, "id": 17}, {"weight": 2, "id": 19}, {"weight": 2, "id": 21}, {"weight": 2, "id": 30}], [{"weight": 5, "id": 0}, {"weight": 6, "id": 1}, {"weight": 3, "id": 3}, {"weight": 4, "id": 7}, {"weight": 5, "id": 8}, {"weight": 1, "id": 9}, {"weight": 3, "id": 13}, {"weight": 2, "id": 27}, {"weight": 2, "id": 28}, {"weight": 2, "id": 32}], [{"weight": 3, "id": 0}, {"weight": 3, "id": 1}, {"weight": 3, "id": 2}, {"weight": 3, "id": 7}, {"weight": 3, "id": 12}, {"weight": 3, "id": 13}], [{"weight": 3, "id": 0}, {"weight": 2, "id": 6}, {"weight": 3, "id": 10}], [{"weight": 3, "id": 0}, {"weight": 5, "id": 6}, {"weight": 3, "id": 10}, {"weight": 3, "id": 16}], [{"weight": 3, "id": 0}, {"weight": 2, "id": 4}, {"weight": 5, "id": 5}, {"weight": 3, "id": 16}], [{"weight": 2, "id": 0}, {"weight": 4, "id": 1}, {"weight": 4, "id": 2}, {"weight": 3, "id": 3}], [{"weight": 2, "id": 0}, {"weight": 5, "id": 2}, {"weight": 3, "id": 30}, {"weight": 3, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 1, "id": 2}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 3, "id": 4}, {"weight": 3, "id": 5}], [{"weight": 3, "id": 0}], [{"weight": 1, "id": 0}, {"weight": 3, "id": 3}], [{"weight": 3, "id": 0}, {"weight": 5, "id": 1}, {"weight": 3, "id": 2}, {"weight": 3, "id": 3}, {"weight": 3, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 3, "id": 5}, {"weight": 3, "id": 6}], [{"weight": 2, "id": 0}, {"weight": 1, "id": 1}], [{"weight": 1, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 1}, {"weight": 1, "id": 33}], [{"weight": 3, "id": 32}, {"weight": 1, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 1}], [{"weight": 2, "id": 32}, {"weight": 3, "id": 33}], [{"weight": 5, "id": 25}, {"weight": 4, "id": 27}, {"weight": 3, "id": 29}, {"weight": 5, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 25}, {"weight": 3, "id": 27}, {"weight": 2, "id": 31}], [{"weight": 5, "id": 23}, {"weight": 2, "id": 24}, {"weight": 7, "id": 31}], [{"weight": 4, "id": 29}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 4, "id": 23}, {"weight": 3, "id": 24}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 2, "id": 31}, {"weight": 2, "id": 33}], [{"weight": 3, "id": 23}, {"weight": 4, "id": 26}, {"weight": 4, "id": 32}, {"weight": 2, "id": 33}], [{"weight": 2, "id": 1}, {"weight": 3, "id": 8}, {"weight": 3, "id": 32}, {"weight": 3, "id": 33}], [{"weight": 2, "id": 0}, {"weight": 2, "id": 24}, {"weight": 7, "id": 25}, {"weight": 2, "id": 28}, {"weight": 4, "id": 32}, {"weight": 4, "id": 33}], [{"weight": 2, "id": 2}, {"weight": 3, "id": 8}, {"weight": 3, "id": 14}, {"weight": 3, "id": 15}, {"weight": 1, "id": 18}, {"weight": 3, "id": 20}, {"weight": 2, "id": 22}, {"weight": 5, "id": 23}, {"weight": 4, "id": 29}, {"weight": 3, "id": 30}, {"weight": 4, "id": 31}, {"weight": 5, "id": 33}], [{"weight": 4, "id": 8}, {"weight": 2, "id": 9}, {"weight": 3, "id": 13}, {"weight": 2, "id": 14}, {"weight": 4, "id": 15}, {"weight": 2, "id": 18}, {"weight": 1, "id": 19}, {"weight": 1, "id": 20}, {"weight": 4, "id": 23}, {"weight": 2, "id": 26}, {"weight": 4, "id": 27}, {"weight": 2, "id": 28}, {"weight": 2, "id": 29}, {"weight": 3, "id": 30}, {"weight": 4, "id": 31}, {"weight": 5, "id": 32}, {"weight": 3, "id": 22}]]}"#; @@ -72,9 +72,9 @@ const SELF_LOOP_JSON: &str = r#"{"directed": false, "multigraph": false, "graph" const EMPTY_EDGES_JSON: &str = r#"{"directed": false, "multigraph": false, "graph": [], "nodes": [{"id": 0}, {"id": 1}, {"id": 2}], "adjacency": [[], [], []]}"#; -// ============================= -// == Karate club graph tests == -// ============================= +// ===================================================================== +// Karate club graph tests +// ===================================================================== #[test] fn karate_club_node_and_edge_counts() { @@ -143,9 +143,9 @@ fn karate_club_roundtrip() { } } -// ======================= -// == Complete graph K5 == -// ======================= +// ===================================================================== +// Complete graph K5 +// ===================================================================== #[test] fn k5_node_and_edge_counts() { @@ -180,9 +180,9 @@ fn k5_roundtrip() { assert_eq!(nx_roundtrip, nx_expected); } -// =================== -// == Path graph P4 == -// =================== +// ===================================================================== +// Path graph P4 +// ===================================================================== #[test] fn p4_structure() { @@ -206,9 +206,9 @@ fn p4_roundtrip() { assert_eq!(nx_roundtrip, nx_expected); } -// ===================== -// == Directed graphs == -// ===================== +// ===================================================================== +// Directed graphs +// ===================================================================== #[test] fn small_directed_structure() { @@ -254,9 +254,9 @@ fn directed_cycle_roundtrip() { assert_eq!(nx_roundtrip, nx_expected); } -// ================ -// == Edge cases == -// ================ +// ===================================================================== +// Edge cases +// ===================================================================== #[test] fn single_node_no_edges() { @@ -335,9 +335,9 @@ fn two_triangles_roundtrip() { assert_eq!(nx_roundtrip, nx_expected); } -// ========================================= -// == String node IDs and edge attributes == -// ========================================= +// ===================================================================== +// String node IDs and edge attributes +// ===================================================================== #[test] fn string_ids_structure() { @@ -379,9 +379,9 @@ fn string_ids_roundtrip() { } } -// ============================== -// == graph_has_parallel_edges == -// ============================== +// ===================================================================== +// graph_has_parallel_edges +// ===================================================================== #[test] fn no_parallel_edges_simple_graph() { @@ -456,9 +456,9 @@ fn antiparallel_not_parallel_in_directed() { assert!(!graph_has_parallel_edges(&graph)); } -// ====================================== -// == nx_node <-> petx_node conversion == -// ====================================== +// ===================================================================== +// nx_node <-> petx_node conversion +// ===================================================================== #[test] fn nx_to_petx_node_stores_id_in_attrs() { @@ -500,9 +500,9 @@ fn petx_to_nx_node_missing_id_errors() { ); } -// ================= -// == Error cases == -// ================= +// ===================================================================== +// Error cases +// ===================================================================== #[test] fn directedness_mismatch_undirected_to_directed() { @@ -606,9 +606,9 @@ fn missing_neighbor_node_error() { ); } -// ============================================================ -// == Type alias smoke tests (ensures they compile and work) == -// ============================================================ +// ===================================================================== +// Type alias smoke tests (ensures they compile and work) +// ===================================================================== #[test] fn type_aliases_work() { @@ -623,9 +623,9 @@ fn type_aliases_work() { let _inner_di: &PetxDiInnerGraph = &_petx_di.graph; } -// =================================== -// == Undirected edge deduplication == -// =================================== +// ===================================================================== +// Undirected edge deduplication +// ===================================================================== #[test] fn undirected_dedup_produces_correct_edge_count() { @@ -657,9 +657,9 @@ fn construct_nx_from_petx_restores_both_directions() { assert_eq!(nx_roundtrip.adjacency[1].len(), 2); } -// ============================================ -// == multigraph flag detection on roundtrip == -// ============================================ +// ===================================================================== +// multigraph flag detection on roundtrip +// ===================================================================== #[test] fn simple_graph_roundtrip_multigraph_false() { @@ -695,9 +695,9 @@ fn graph_with_parallel_edges_sets_multigraph_true() { assert!(nx.multigraph); } -// ============================= -// == JSON roundtrip fidelity == -// ============================= +// ===================================================================== +// JSON roundtrip fidelity +// ===================================================================== // // These tests verify that the full pipeline // JSON string → NxGraphAdjFormat → PetxGraph → NxGraphAdjFormat → JSON string diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 864b850..e52fb86 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1745,9 +1745,9 @@ fn bendl_cli_create_inspect_extract_append_roundtrip() { assert_failure(&append_duplicate); } -// --------------------------------------------------------------------------- +// ===================================================================== // `ben encode --graph` and `ben x-encode --graph` -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn ben_encode_graph_requires_input_file_not_stdin() { diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs index ea4730d..a026609 100644 --- a/ben/tests/test_format_stability.rs +++ b/ben/tests/test_format_stability.rs @@ -238,9 +238,9 @@ fn unknown_flags_bendl_v1_0_0_opens_and_decodes_cleanly() { ); } -// --------------------------------------------------------------------------- +// ===================================================================== // Fixture generation -// --------------------------------------------------------------------------- +// ===================================================================== // // IMPORTANT: this is intentionally `#[ignore]`. Once v1.0.0 fixtures are committed, they MUST NOT // be regenerated in place — see `docs/format-stability.md`. If a future format change requires diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index b41f8f4..acea7c6 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -26,7 +26,9 @@ use std::time::{SystemTime, UNIX_EPOCH}; mod common; use common::{expand_rle, jsonl_from_assignments}; -// ---------- Helpers ---------- +// ===================================================================== +// Helpers +// ===================================================================== /// From a decoded `(assignment, count)` stream, reconstitute JSONL. fn jsonl_from_records(records: &[(Vec, u16)], start_at: usize) -> Vec { @@ -65,7 +67,9 @@ where Ok(out) } -// ---------- proptest strategies ---------- +// ===================================================================== +// proptest strategies +// ===================================================================== /// Strategy for a single assignment vector: Generate as RLE runs (value in [1, max_val], length in /// [1, max_run]), expand to a bounded length. @@ -168,7 +172,9 @@ fn strat_threads_levels() -> impl Strategy { (1u32..=4, 0u32..=9) } -// ---------- Tests ---------- +// ===================================================================== +// Tests +// ===================================================================== proptest! { // JSONL -> BEN(Standard) -> JSONL round-trip via BenEncoder/BenStreamReader entry points. @@ -631,7 +637,9 @@ proptest! { } } -// ---------- Non-proptest unit checks for headers/validation ---------- +// ===================================================================== +// Non-proptest unit checks for headers/validation +// ===================================================================== #[test] fn invalid_ben_header_yields_error() { diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 4be9bcd..463ffcb 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -671,9 +671,9 @@ fn bendl_append_truncated_new_directory_is_rejected_on_reopen() { assert!(err.to_string().contains("IO error")); } -// --------------------------------------------------------------------------- +// ===================================================================== // BENDL adversarial-bytes fuzz -// --------------------------------------------------------------------------- +// ===================================================================== /// Mint a valid BENDL bundle that exercises every public surface the no-panic harness will drive: /// a finalized header with `HEADER_FLAG_STREAM_CHECKSUM`, an xz-compressed graph asset, a raw JSON @@ -932,10 +932,10 @@ fn seeded_malformed_bendl_bytes_do_not_panic() { } } -// --------------------------------------------------------------------------- +// ===================================================================== // Open-rejected variant-pinning. Each fixture must fail BendlReader::open // with a specific BendlFormatError variant, not just an unspecified Err. -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn bendl_open_rejects_directory_offset_past_eof() { @@ -994,10 +994,10 @@ fn bendl_open_rejects_name_len_longer_than_remaining_directory_bytes() { ); } -// --------------------------------------------------------------------------- +// ===================================================================== // Openable behavioral pins. Each fixture must let BendlReader::open succeed and then // surface the documented behavior through the accessors. -// --------------------------------------------------------------------------- +// ===================================================================== #[test] fn bendl_unknown_header_flag_bits_are_ignored() { From 4f8c06482d689b3cdcbf9e23a90b732ad034a368 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 29 May 2026 22:59:23 -0600 Subject: [PATCH 121/221] split bundle reader up --- ben/src/io/bundle/mod.rs | 1 + ben/src/io/bundle/reader.rs | 727 ++++-------------------------------- ben/src/io/bundle/verify.rs | 655 ++++++++++++++++++++++++++++++++ 3 files changed, 725 insertions(+), 658 deletions(-) create mode 100644 ben/src/io/bundle/verify.rs diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index 95a5d53..b075107 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -16,6 +16,7 @@ pub mod error; pub mod format; pub mod manifest; pub mod reader; +pub mod verify; pub mod writer; #[cfg(test)] diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index a562d2f..3007efb 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -6,6 +6,9 @@ //! likewise exposed as a byte range the caller can plumb into a [`BenStreamReader`] without this //! module reinterpreting any BEN/XBEN internals. //! +//! The byte-level read adapters (bounded ranges, CRC tees, verifying wrappers) live in +//! [`super::verify`]; this module composes them behind the public API. +//! //! ## Verification surface //! //! - [`BendlReader::asset_bytes`] and [`BendlReader::asset_reader`] are **verify-on-touch**: the @@ -17,14 +20,8 @@ //! - [`BendlReader::verify_asset_checksum`] and [`BendlReader::verify_all_asset_checksums`] are //! explicit raw-bytes verifiers (no decoding) that do not return decoded payload bytes. -use std::fmt; -use std::io::{self, Read, Seek, SeekFrom, Write}; -use std::sync::{ - atomic::{AtomicBool, AtomicU32, Ordering}, - Arc, -}; +use std::io::{self, Read, Seek, SeekFrom}; -use serde_json::json; use xz2::read::XzDecoder; use super::error::{BendlReadError, ChecksumError, ChecksumTarget}; @@ -32,8 +29,13 @@ use super::format::{ read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, ASSET_FLAG_XZ, }; -use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat, SubsampleFrameDecoder}; -use crate::BenVariant; +use super::verify::{ + scan_range_crc32c, CrcTeeReader, ExactLen, ShortRangeAwareReader, ShortRangeFlag, + ShortRangeMarker, VerifyingReader, +}; +use crate::io::reader::{BenStreamReader, BenWireFormat}; + +pub use super::verify::BendlVerifiedStreamReader; impl From for BenWireFormat { fn from(format: AssignmentFormat) -> Self { @@ -124,6 +126,27 @@ impl BendlReader { self.directory.iter().find(|e| e.asset_type == asset_type) } + /// Resolve the stored stream CRC32C, enforcing the stream-checksum precondition once. + /// + /// Returns `Err(BundleIncomplete)` for unfinalized bundles (the stored `stream_checksum` is not + /// authoritative until the bundle is finalized) and `Err(Unavailable)` when + /// `HEADER_FLAG_STREAM_CHECKSUM` is clear (foreign or hand-built bytes; the library writer always + /// sets this flag). The finalization check comes first by design: reporting `Unavailable` for an + /// unfinalized bundle would be misleading. + fn require_stream_checksum(&self) -> Result { + if !self.header.is_finalized() { + return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { + target: ChecksumTarget::Stream, + })); + } + if !self.header.has_stream_checksum() { + return Err(BendlReadError::Checksum(ChecksumError::Unavailable { + target: ChecksumTarget::Stream, + })); + } + Ok(self.header.stream_checksum) + } + /// Return the byte range occupied by the assignment stream. /// /// For finalized bundles this is `(stream_offset, stream_len)` as recorded in the header. For @@ -145,49 +168,33 @@ impl BendlReader { /// Return a verified reader for the assignment stream that checks the stored CRC32C at raw EOF. /// - /// Returns `Err(ChecksumError::BundleIncomplete)` for unfinalized bundles (the stored - /// `stream_checksum` is not authoritative until the bundle is finalized). - /// Returns `Err(ChecksumError::Unavailable)` when `HEADER_FLAG_STREAM_CHECKSUM` is clear - /// (foreign or hand-built bytes; the library writer always sets this flag). + /// Returns `Err(ChecksumError::BundleIncomplete)` for unfinalized bundles and + /// `Err(ChecksumError::Unavailable)` when `HEADER_FLAG_STREAM_CHECKSUM` is clear. /// /// On success, CRC mismatch surfaces from `Read::read` as /// `io::Error::new(io::ErrorKind::InvalidData, ChecksumError::Mismatch)` on the call that /// would otherwise return `Ok(0)` at raw EOF. For a raw copy that decodes nothing, driving the /// returned reader to EOF is sufficient. For decoded access use /// [`BendlReader::open_assignment_reader`]. - pub fn assignment_stream_reader( - &mut self, - ) -> Result, BendlReadError> { - if !self.header.is_finalized() { - return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { - target: ChecksumTarget::Stream, - })); - } - if !self.header.has_stream_checksum() { - return Err(BendlReadError::Checksum(ChecksumError::Unavailable { - target: ChecksumTarget::Stream, - })); - } - let expected = self.header.stream_checksum; + pub fn assignment_stream_reader(&mut self) -> Result, BendlReadError> { + let expected = self.require_stream_checksum()?; let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; - let raw = ExactLen::new(&mut self.inner, len, ShortRangeFlag::new()); - Ok(Box::new(RawVerifyingReader { - inner: raw, - hasher: 0, + let short_flag = ShortRangeFlag::new(); + let raw = ExactLen::new(&mut self.inner, len, short_flag.clone()); + Ok(Box::new(VerifyingReader::new( + CrcTeeReader::new(raw), expected, - target: ChecksumTarget::Stream, - state: VerifyState::Reading, - })) + ChecksumTarget::Stream, + short_flag, + ))) } /// Return a raw bounded reader for the assignment stream **without** CRC verification. /// /// Works on both finalized and unfinalized bundles. Useful for recovery/debug flows and for /// callers that need the raw bytes without the overhead of a CRC check. - pub fn assignment_stream_reader_unverified( - &mut self, - ) -> io::Result> { + pub fn assignment_stream_reader_unverified(&mut self) -> io::Result> { let (offset, len) = self.assignment_stream_range()?; self.inner.seek(SeekFrom::Start(offset))?; Ok(Box::new(ExactLen::new( @@ -206,20 +213,7 @@ impl BendlReader { pub fn open_assignment_reader( &mut self, ) -> Result, BendlReadError> { - // Finalization check must come first: if the bundle is unfinalized, stream_checksum is not - // authoritative and reporting Unavailable would be misleading. - if !self.header.is_finalized() { - return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { - target: ChecksumTarget::Stream, - })); - } - if !self.header.has_stream_checksum() { - return Err(BendlReadError::Checksum(ChecksumError::Unavailable { - target: ChecksumTarget::Stream, - })); - } - let expected = self.header.stream_checksum; - + let expected = self.require_stream_checksum()?; let format = self.assignment_format().ok_or_else(|| { BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat( self.header.assignment_format, @@ -230,38 +224,9 @@ impl BendlReader { let short_flag = ShortRangeFlag::new(); let raw = ExactLen::new(&mut self.inner, len, short_flag.clone()); - let arc_hasher = Arc::new(AtomicU32::new(0)); - let shared_raw = ArcHasher { - inner: raw, - state: Arc::clone(&arc_hasher), - }; - - let init = match format { - AssignmentFormat::Ben => BenStreamReader::from_ben(shared_raw), - AssignmentFormat::Xben => BenStreamReader::from_xben(shared_raw), - }; - let inner = match init { - Ok(inner) => inner, - Err(e) => { - // If the underlying ExactLen flagged a short range while the codec was reading its - // banner, surface a bundle-layer UnexpectedEof rather than a DecoderInit so callers - // see the structural truncation as the failure, not a banner parse error. - if short_flag.get() { - return Err(BendlReadError::Io(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - ))); - } - return Err(e.into()); - } - }; - - Ok(BendlVerifiedStreamReader { - inner, - expected, - arc_hasher, - short_flag, - state: StreamVerifyState::Running, + BendlVerifiedStreamReader::new(raw, short_flag, expected, |source| match format { + AssignmentFormat::Ben => BenStreamReader::from_ben(source).map_err(Into::into), + AssignmentFormat::Xben => BenStreamReader::from_xben(source).map_err(Into::into), }) } @@ -274,40 +239,13 @@ impl BendlReader { /// Returns `Err(BundleIncomplete)` for unfinalized bundles and `Err(Unavailable)` when the /// stream checksum flag is clear. pub fn verify_stream_checksum(&mut self) -> Result<(), BendlReadError> { - if !self.header.is_finalized() { - return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { - target: ChecksumTarget::Stream, - })); - } - if !self.header.has_stream_checksum() { - return Err(BendlReadError::Checksum(ChecksumError::Unavailable { - target: ChecksumTarget::Stream, - })); - } - let expected = self.header.stream_checksum; + let expected = self.require_stream_checksum()?; let (offset, len) = self.assignment_stream_range()?; - self.inner.seek(SeekFrom::Start(offset))?; - - let mut remaining = len; - let mut buf = [0u8; 64 * 1024]; - let mut hasher: u32 = 0; - while remaining > 0 { - let want = remaining.min(buf.len() as u64) as usize; - let n = self.inner.read(&mut buf[..want])?; - if n == 0 { - return Err(BendlReadError::Io(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining }, - ))); - } - hasher = crc32c::crc32c_append(hasher, &buf[..n]); - remaining -= n as u64; - } - - if hasher != expected { + let computed = scan_range_crc32c(&mut self.inner, offset, len)?; + if computed != expected { return Err(BendlReadError::Checksum(ChecksumError::Mismatch { target: ChecksumTarget::Stream, - computed: hasher, + computed, expected, })); } @@ -384,27 +322,23 @@ impl BendlReader { let short_flag = ShortRangeFlag::new(); let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); + // The CRC tee always sits at the raw on-disk layer (over the compressed bytes for xz + // assets, so verification happens before decompression). For xz assets the decoder sits + // above the tee; the verifying wrapper finalizes the check once the source reaches EOF. if entry.asset_flags & ASSET_FLAG_XZ != 0 { - // Compressed: CRC tee sits *inside* the XzDecoder so the tee accumulates over raw - // compressed bytes; the BENDL-owned wrapper around the decoder finalizes the check - // after the codec reaches its own EOF. - let tee = CrcTeeReader::new(raw); - let decoder = XzDecoder::new(tee); - Ok(Box::new(DecodedVerifyingReader { - decoder, + Ok(Box::new(VerifyingReader::new( + XzDecoder::new(CrcTeeReader::new(raw)), expected, target, short_flag, - state: VerifyState::Reading, - })) + ))) } else { - Ok(Box::new(RawVerifyingReader { - inner: raw, - hasher: 0, + Ok(Box::new(VerifyingReader::new( + CrcTeeReader::new(raw), expected, target, - state: VerifyState::Reading, - })) + short_flag, + ))) } } @@ -424,10 +358,10 @@ impl BendlReader { // Wrap the decoder so that if xz reports a runtime error while the underlying // ExactLen has flagged a short read, the surface is a short-range UnexpectedEof // rather than a codec error. - Ok(Box::new(ShortRangeAwareReader { - inner: XzDecoder::new(raw), + Ok(Box::new(ShortRangeAwareReader::new( + XzDecoder::new(raw), short_flag, - })) + ))) } else { Ok(Box::new(raw)) } @@ -471,29 +405,11 @@ impl BendlReader { } }; - self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - let mut remaining = entry.payload_len; - let mut buf = [0u8; 64 * 1024]; - let mut hasher: u32 = 0; - while remaining > 0 { - let want = remaining.min(buf.len() as u64) as usize; - let n = self.inner.read(&mut buf[..want])?; - if n == 0 { - // Short read against the declared payload length — surface as an I/O error so - // callers can distinguish a truncated bundle from a CRC mismatch. - return Err(BendlReadError::Io(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining }, - ))); - } - hasher = crc32c::crc32c_append(hasher, &buf[..n]); - remaining -= n as u64; - } - - if hasher != expected { + let computed = scan_range_crc32c(&mut self.inner, entry.payload_offset, entry.payload_len)?; + if computed != expected { return Err(BendlReadError::Checksum(ChecksumError::Mismatch { target: ChecksumTarget::Asset(entry.name.clone()), - computed: hasher, + computed, expected, })); } @@ -517,517 +433,12 @@ impl BendlReader { /// rules. /// /// Returns [`BundleValidationError`] if any entry violates the rules. This is called - /// automatically by [`BendlReader::open`] when the `strict` constructor is used in tests; in - /// normal reads, the writer is already expected to enforce these rules and a malformed bundle - /// is a program bug somewhere else. + /// automatically by [`BendlReader::open`]. pub fn validate_directory(&self) -> Result<(), BundleValidationError> { validate_directory_entries(&self.directory) } } -// ===================================================================== -// Strict-length plumbing -// ===================================================================== - -/// Marker error attached to the `io::Error` returned when an [`ExactLen`] reader hits underlying -/// EOF before consuming its declared length. Used by convenience APIs to recognise a bundle-layer -/// short-range failure even when it has surfaced through a codec. -#[derive(Debug)] -pub(crate) struct ShortRangeMarker { - pub remaining: u64, -} - -impl fmt::Display for ShortRangeMarker { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "bundle range ended {} byte(s) before declared length", - self.remaining - ) - } -} - -impl std::error::Error for ShortRangeMarker {} - -/// Shared flag set by an [`ExactLen`] reader when the underlying reader runs out of bytes before -/// the declared length is reached. Clones share state so a wrapper above a codec can detect the -/// short read even if the codec swallows the inner `UnexpectedEof` in favor of its own error. -#[derive(Clone, Default)] -pub struct ShortRangeFlag(Arc); - -impl ShortRangeFlag { - pub(crate) fn new() -> Self { - Self(Arc::new(AtomicBool::new(false))) - } - - pub(crate) fn set(&self) { - self.0.store(true, Ordering::Relaxed); - } - - pub(crate) fn get(&self) -> bool { - self.0.load(Ordering::Relaxed) - } -} - -/// Bounded reader that enforces an exact byte length. Behaves like [`std::io::Take`] for reads -/// within the declared length, but returns -/// `Err(io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker))` (and sets the shared -/// [`ShortRangeFlag`]) if the underlying reader signals EOF before the declared length is reached. -/// -/// `ExactLen` is the BENDL-layer guarantee that `payload_len` and `stream_len` are exact byte -/// counts of the on-disk range; a backing file shorter than declared is a corrupt bundle, not a -/// short successful read. -pub struct ExactLen { - inner: R, - remaining: u64, - flag: ShortRangeFlag, -} - -impl ExactLen { - pub(crate) fn new(inner: R, declared: u64, flag: ShortRangeFlag) -> Self { - Self { - inner, - remaining: declared, - flag, - } - } -} - -impl Read for ExactLen { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.remaining == 0 || buf.is_empty() { - return Ok(0); - } - let max = (buf.len() as u64).min(self.remaining) as usize; - let n = self.inner.read(&mut buf[..max])?; - if n == 0 { - // Underlying reader hit EOF before our declared length. Set the shared flag so a - // wrapper above a codec can recognise this as a bundle-range failure, and surface as - // UnexpectedEof carrying the marker. - let remaining = self.remaining; - self.flag.set(); - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining }, - )); - } - self.remaining -= n as u64; - Ok(n) - } -} - -/// Wraps a reader sitting above an [`ExactLen`]-bounded source. If the underlying reader returns -/// an error and the shared `ShortRangeFlag` is set, the error is replaced with an `UnexpectedEof` -/// carrying a [`ShortRangeMarker`] so callers see a bundle-layer short-range failure rather than a -/// codec-specific error message. -struct ShortRangeAwareReader { - inner: R, - short_flag: ShortRangeFlag, -} - -impl Read for ShortRangeAwareReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self.inner.read(buf) { - Ok(n) => Ok(n), - Err(e) => { - if self.short_flag.get() { - Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )) - } else { - Err(e) - } - } - } - } -} - -// ===================================================================== -// Verifying reader plumbing -// ===================================================================== - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum VerifyState { - /// Still feeding bytes from the underlying reader. - Reading, - /// Underlying reader returned EOF and the CRC matched. Subsequent reads return `Ok(0)` - /// (normal EOF). - EofChecked, - /// CRC mismatch was reported to the caller. Subsequent reads return `Ok(0)` so the reader stays - /// well-behaved if the caller re-polls after the error. - Failed, -} - -/// Uncompressed-asset verifying reader: forwards bytes from the bounded payload, accumulates CRC32C -/// as they fly past, and on raw EOF either confirms the checksum or returns -/// [`ChecksumError::Mismatch`] in place of the usual `Ok(0)`. -struct RawVerifyingReader<'a, R: Read + Seek> { - inner: ExactLen<&'a mut R>, - hasher: u32, - expected: u32, - target: ChecksumTarget, - state: VerifyState, -} - -impl Read for RawVerifyingReader<'_, R> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self.state { - VerifyState::EofChecked | VerifyState::Failed => return Ok(0), - VerifyState::Reading => {} - } - let n = self.inner.read(buf)?; - if n == 0 { - if self.hasher == self.expected { - self.state = VerifyState::EofChecked; - return Ok(0); - } - let err = ChecksumError::Mismatch { - target: self.target.clone(), - computed: self.hasher, - expected: self.expected, - }; - self.state = VerifyState::Failed; - return Err(io::Error::new(io::ErrorKind::InvalidData, err)); - } - self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); - Ok(n) - } -} - -/// CRC accumulator that sits *inside* an [`XzDecoder`] for compressed assets. It must never -/// substitute a checksum error for raw EOF — the codec needs to see the natural `Ok(0)` so it can -/// flush pending output. The post-decoder wrapper ([`DecodedVerifyingReader`]) inspects this -/// struct's accumulated hash after codec EOF. -struct CrcTeeReader { - inner: R, - hasher: u32, -} - -impl CrcTeeReader { - fn new(inner: R) -> Self { - Self { inner, hasher: 0 } - } -} - -impl Read for CrcTeeReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let n = self.inner.read(buf)?; - if n > 0 { - self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); - } - Ok(n) - } -} - -/// Verifying wrapper around an `XzDecoder>`. Lets the codec observe normal raw EOF -/// before finalizing the CRC check at the decoded layer. -struct DecodedVerifyingReader<'a, R: Read + Seek> { - decoder: XzDecoder>>, - expected: u32, - target: ChecksumTarget, - short_flag: ShortRangeFlag, - state: VerifyState, -} - -impl Read for DecodedVerifyingReader<'_, R> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self.state { - VerifyState::EofChecked | VerifyState::Failed => return Ok(0), - VerifyState::Reading => {} - } - let n = match self.decoder.read(buf) { - Ok(n) => n, - Err(e) => { - // If the underlying ExactLen flagged a short range, surface it as a bundle-layer - // UnexpectedEof rather than a codec error — the bytes were missing, not malformed. - self.state = VerifyState::Failed; - if self.short_flag.get() { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )); - } - return Err(e); - } - }; - if n == 0 { - let computed = self.decoder.get_ref().hasher; - if computed == self.expected { - self.state = VerifyState::EofChecked; - return Ok(0); - } - let err = ChecksumError::Mismatch { - target: self.target.clone(), - computed, - expected: self.expected, - }; - self.state = VerifyState::Failed; - return Err(io::Error::new(io::ErrorKind::InvalidData, err)); - } - Ok(n) - } -} - -/// CRC accumulator that shares its running hash via an `Arc`. Used as the source reader -/// for [`BendlVerifiedStreamReader`]: the `Arc` lets the outer wrapper read the final hash after a -/// consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. -/// -/// Unlike `CrcTeeReader`, this type never substitutes a checksum error for raw EOF — it is always -/// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. The type is -/// exposed because it leaks through the return signatures of the wrapper's intentionally-partial -/// APIs (`into_frames`, `into_subsample_by_*`); callers should treat it as an opaque reader. -pub struct ArcHasher { - inner: R, - state: Arc, -} - -impl Read for ArcHasher { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let n = self.inner.read(buf)?; - if n > 0 { - let prev = self.state.load(Ordering::Relaxed); - self.state - .store(crc32c::crc32c_append(prev, &buf[..n]), Ordering::Relaxed); - } - Ok(n) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum StreamVerifyState { - Running, - /// A CRC mismatch was returned once as `Some(Err(...))`. Subsequent iterator calls return - /// `None`. - MismatchReported, - /// A non-CRC terminal error (codec failure, bundle-layer short range, etc.) was returned once - /// as `Some(Err(...))`. Subsequent iterator calls return `None`. Kept distinct from - /// `MismatchReported` so the state machine self-documents which class of failure tripped it. - Errored, - /// CRC matched after natural EOF. Subsequent iterator calls return `None`. - Verified, -} - -/// Verified decoded assignment reader returned by [`BendlReader::open_assignment_reader`]. -/// -/// Wraps a [`BenStreamReader`] over a CRC-accumulating source and checks the stored stream CRC32C -/// after the codec reaches natural EOF. CRC mismatch surfaces from [`Iterator::next`] as -/// `Some(Err(io::ErrorKind::InvalidData))` — returned once after the last decoded record, then -/// `None`. Consuming methods (`count_samples`, `write_all_jsonl`, `for_each_assignment` when driven -/// to natural EOF) also fold the CRC check into their return value. -/// -/// **Intentionally partial APIs** (`into_frames`, `into_subsample_by_*`) are forwarded for -/// ergonomics but do not automatically verify — the underlying reader is stopped short of raw EOF -/// so the CRC tee is never finalized. Callers that need integrity for partial reads must call -/// [`BendlReader::verify_stream_checksum`] separately. -pub struct BendlVerifiedStreamReader<'a, R: Read + Seek> { - inner: BenStreamReader>>, - expected: u32, - arc_hasher: Arc, - short_flag: ShortRangeFlag, - state: StreamVerifyState, -} - -impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { - /// Return the BEN variant detected from the stream banner. - pub fn variant(&self) -> BenVariant { - self.inner.variant() - } - - /// Return the wire format (BEN vs XBEN) of this stream. - pub fn wire_format(&self) -> BenWireFormat { - self.inner.wire_format() - } - - /// Suppress progress output from the decoder. - pub fn silent(mut self, silent: bool) -> Self { - self.inner = self.inner.silent(silent); - self - } - - /// Count the number of samples in the stream and verify the stream CRC32C. - /// - /// Drives the decoder to raw EOF as a side effect, finalizing the CRC accumulator. If the - /// count succeeds but the CRC does not match, the CRC mismatch is returned instead of the - /// count. - pub fn count_samples(self) -> io::Result { - let arc = Arc::clone(&self.arc_hasher); - let expected = self.expected; - let short_flag = self.short_flag.clone(); - let count = match self.inner.count_samples() { - Ok(count) => count, - Err(e) => { - if short_flag.get() { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )); - } - return Err(e); - } - }; - let computed = arc.load(Ordering::Relaxed); - if computed != expected { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - ChecksumError::Mismatch { - target: ChecksumTarget::Stream, - computed, - expected, - }, - )); - } - Ok(count) - } - - /// Decode assignments and pass each one to a callback by reference. - /// - /// When the callback drives the reader to natural EOF, the stream CRC is verified and a - /// mismatch is returned as an error. When the callback stops early (`f` returns `Ok(false)`), - /// the CRC is not checked — only a full traversal can verify the whole stream. - pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> - where - F: FnMut(&[u16], u16) -> io::Result, - { - loop { - match self.next() { - Some(Ok((ref assignment, count))) => { - if !f(assignment, count)? { - return Ok(()); - } - } - Some(Err(e)) => return Err(e), - None => return Ok(()), - } - } - } - - /// Decode the remaining stream, write it as JSONL, and verify the stream CRC32C. - /// - /// Each decoded sample is written as a JSON object containing an `assignment` vector and a - /// 1-based `sample` index. After all records are written, the stream CRC is checked; a - /// mismatch is returned instead of `Ok(())`. - pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { - let mut sample_number = 0usize; - loop { - match self.next() { - Some(Ok((assignment, count))) => { - for _ in 0..count { - sample_number += 1; - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - } - } - Some(Err(e)) => return Err(e), - None => return Ok(()), - } - } - } - - /// Consume the wrapper and iterate over raw BEN/ben32 frames instead of materialized - /// assignments. - /// - /// Frame iteration is intentionally partial: callers typically stop short of EOF, so the CRC - /// tee is never finalized and the stream is **not verified** by this path. Callers needing - /// integrity for partial reads should call [`BendlReader::verify_stream_checksum`] separately. - pub fn into_frames(self) -> BenStreamFrameReader>> { - self.inner.into_frames() - } -} - -impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { - /// Convert into a subsampling iterator over explicit 1-based indices. - /// - /// Subsampling is intentionally partial: the underlying reader is stopped short of raw EOF, so - /// the CRC tee is never finalized and the stream is **not verified** by this path. Use - /// [`BendlReader::verify_stream_checksum`] for an explicit full-stream integrity check. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder>>> - where - T: IntoIterator, - { - self.inner.into_subsample_by_indices(indices) - } - - /// Convert into a subsampling iterator over the inclusive 1-based range `[start, end]`. - /// - /// Subsampling is intentionally partial and is **not verified** by this path; see - /// [`Self::into_subsample_by_indices`]. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder>>> { - self.inner.into_subsample_by_range(start, end) - } - - /// Convert into a subsampling iterator that selects every `step` samples from the 1-based - /// `offset`. - /// - /// Subsampling is intentionally partial and is **not verified** by this path; see - /// [`Self::into_subsample_by_indices`]. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder>>> { - self.inner.into_subsample_every(step, offset) - } -} - -impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { - type Item = io::Result<(Vec, u16)>; - - fn next(&mut self) -> Option { - match self.state { - StreamVerifyState::MismatchReported - | StreamVerifyState::Errored - | StreamVerifyState::Verified => return None, - StreamVerifyState::Running => {} - } - match self.inner.next() { - Some(Err(e)) => { - // Non-CRC terminal error: codec failure, bundle-layer short range, or anything else - // the inner reader returned. CRC mismatch lives in the `None` branch below. - self.state = StreamVerifyState::Errored; - if self.short_flag.get() { - return Some(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - ))); - } - Some(Err(e)) - } - Some(item) => Some(item), - None => { - // Inner reached natural EOF — finalize the CRC check. - let computed = self.arc_hasher.load(Ordering::Relaxed); - if computed == self.expected { - self.state = StreamVerifyState::Verified; - None - } else { - self.state = StreamVerifyState::MismatchReported; - Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - ChecksumError::Mismatch { - target: ChecksumTarget::Stream, - computed, - expected: self.expected, - }, - ))) - } - } - } - } -} - /// Map a `read_to_end`-time `io::Error` (or any `Read`-derived `io::Error`) into the right /// [`BendlReadError`] variant. /// diff --git a/ben/src/io/bundle/verify.rs b/ben/src/io/bundle/verify.rs new file mode 100644 index 0000000..cd93ec1 --- /dev/null +++ b/ben/src/io/bundle/verify.rs @@ -0,0 +1,655 @@ +//! Strict-length and CRC-verification plumbing for the `.bendl` reader. +//! +//! This module owns the byte-level read adapters that the public [`super::reader::BendlReader`] API +//! composes: +//! +//! - [`ExactLen`] turns a backing range shorter than its declared length into a structural +//! short-range error rather than a silently-short successful read. +//! - [`CrcTeeReader`] accumulates a CRC32C over the bytes that flow through it, without ever +//! substituting an error for raw EOF. +//! - [`VerifyingReader`] wraps a CRC-accumulating byte source and, at the source's natural EOF, +//! either confirms the stored CRC32C or surfaces [`ChecksumError::Mismatch`] in place of the usual +//! `Ok(0)`. The same wrapper serves uncompressed assets (source = `CrcTeeReader>`) and +//! xz-compressed assets (source = `XzDecoder>>`): the only difference is +//! *where* the tee sits, which the [`CrcSource`] trait abstracts. +//! - [`BendlVerifiedStreamReader`] folds the same verify-at-EOF discipline into the assignment-stream +//! iterator API. + +use std::fmt; +use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::sync::{ + atomic::{AtomicBool, AtomicU32, Ordering}, + Arc, +}; + +use serde_json::json; +use xz2::read::XzDecoder; + +use super::error::{BendlReadError, ChecksumError, ChecksumTarget}; +use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat, SubsampleFrameDecoder}; +use crate::BenVariant; + +// ===================================================================== +// Strict-length plumbing +// ===================================================================== + +/// Marker error attached to the `io::Error` returned when an [`ExactLen`] reader hits underlying +/// EOF before consuming its declared length. Used by convenience APIs to recognise a bundle-layer +/// short-range failure even when it has surfaced through a codec. +#[derive(Debug)] +pub(crate) struct ShortRangeMarker { + pub remaining: u64, +} + +impl fmt::Display for ShortRangeMarker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "bundle range ended {} byte(s) before declared length", + self.remaining + ) + } +} + +impl std::error::Error for ShortRangeMarker {} + +/// Shared flag set by an [`ExactLen`] reader when the underlying reader runs out of bytes before +/// the declared length is reached. Clones share state so a wrapper above a codec can detect the +/// short read even if the codec swallows the inner `UnexpectedEof` in favor of its own error. +#[derive(Clone, Default)] +pub(crate) struct ShortRangeFlag(Arc); + +impl ShortRangeFlag { + pub(crate) fn new() -> Self { + Self(Arc::new(AtomicBool::new(false))) + } + + pub(crate) fn set(&self) { + self.0.store(true, Ordering::Relaxed); + } + + pub(crate) fn get(&self) -> bool { + self.0.load(Ordering::Relaxed) + } +} + +/// Bounded reader that enforces an exact byte length. Behaves like [`std::io::Take`] for reads +/// within the declared length, but returns +/// `Err(io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker))` (and sets the shared +/// [`ShortRangeFlag`]) if the underlying reader signals EOF before the declared length is reached. +/// +/// `ExactLen` is the BENDL-layer guarantee that `payload_len` and `stream_len` are exact byte +/// counts of the on-disk range; a backing file shorter than declared is a corrupt bundle, not a +/// short successful read. +pub struct ExactLen { + inner: R, + remaining: u64, + flag: ShortRangeFlag, +} + +impl ExactLen { + pub(crate) fn new(inner: R, declared: u64, flag: ShortRangeFlag) -> Self { + Self { + inner, + remaining: declared, + flag, + } + } +} + +impl Read for ExactLen { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.remaining == 0 || buf.is_empty() { + return Ok(0); + } + let max = (buf.len() as u64).min(self.remaining) as usize; + let n = self.inner.read(&mut buf[..max])?; + if n == 0 { + // Underlying reader hit EOF before our declared length. Set the shared flag so a + // wrapper above a codec can recognise this as a bundle-range failure, and surface as + // UnexpectedEof carrying the marker. + let remaining = self.remaining; + self.flag.set(); + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining }, + )); + } + self.remaining -= n as u64; + Ok(n) + } +} + +/// Wraps a reader sitting above an [`ExactLen`]-bounded source. If the underlying reader returns +/// an error and the shared `ShortRangeFlag` is set, the error is replaced with an `UnexpectedEof` +/// carrying a [`ShortRangeMarker`] so callers see a bundle-layer short-range failure rather than a +/// codec-specific error message. +pub(crate) struct ShortRangeAwareReader { + inner: R, + short_flag: ShortRangeFlag, +} + +impl ShortRangeAwareReader { + pub(crate) fn new(inner: R, short_flag: ShortRangeFlag) -> Self { + Self { inner, short_flag } + } +} + +impl Read for ShortRangeAwareReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.inner.read(buf) { + Ok(n) => Ok(n), + Err(e) => { + if self.short_flag.get() { + Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )) + } else { + Err(e) + } + } + } + } +} + +/// Scan an exact on-disk byte range and return its CRC32C, without decoding. +/// +/// Seeks to `offset`, reads exactly `len` bytes in 64 KiB chunks, and returns the accumulated +/// CRC32C. A backing range shorter than `len` surfaces as +/// `io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker)` so callers can distinguish a +/// truncated bundle from any other failure. This is the shared core behind both +/// [`super::reader::BendlReader::verify_asset_checksum`] and +/// [`super::reader::BendlReader::verify_stream_checksum`]. +pub(crate) fn scan_range_crc32c( + inner: &mut R, + offset: u64, + len: u64, +) -> io::Result { + inner.seek(SeekFrom::Start(offset))?; + let mut remaining = len; + let mut buf = [0u8; 64 * 1024]; + let mut hasher: u32 = 0; + while remaining > 0 { + let want = remaining.min(buf.len() as u64) as usize; + let n = inner.read(&mut buf[..want])?; + if n == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining }, + )); + } + hasher = crc32c::crc32c_append(hasher, &buf[..n]); + remaining -= n as u64; + } + Ok(hasher) +} + +// ===================================================================== +// CRC-verifying reader plumbing +// ===================================================================== + +/// Build the `io::Error` used to surface a CRC mismatch through a `Read` or `Iterator` boundary. +/// The single definition keeps the kind (`InvalidData`) and inner [`ChecksumError`] shape identical +/// across every verify path. +pub(crate) fn crc_mismatch_error(target: ChecksumTarget, computed: u32, expected: u32) -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + ChecksumError::Mismatch { + target, + computed, + expected, + }, + ) +} + +/// CRC accumulator that sits between a byte source and its consumer. It never substitutes an error +/// for raw EOF — the surrounding [`VerifyingReader`] (for uncompressed assets) or the post-decoder +/// [`VerifyingReader`] (for xz assets) decides when and whether to check the accumulated hash. +pub(crate) struct CrcTeeReader { + inner: R, + hasher: u32, +} + +impl CrcTeeReader { + pub(crate) fn new(inner: R) -> Self { + Self { inner, hasher: 0 } + } +} + +impl Read for CrcTeeReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let n = self.inner.read(buf)?; + if n > 0 { + self.hasher = crc32c::crc32c_append(self.hasher, &buf[..n]); + } + Ok(n) + } +} + +/// A byte source that can report the CRC32C accumulated over the raw on-disk payload bytes it has +/// passed through so far. Implemented for both the uncompressed source (`CrcTeeReader` directly) and +/// the xz-compressed source (`XzDecoder` over a `CrcTeeReader`), so a single [`VerifyingReader`] +/// serves both. +pub(crate) trait CrcSource { + /// CRC32C of the raw on-disk bytes consumed so far. + fn crc(&self) -> u32; +} + +impl CrcSource for CrcTeeReader { + fn crc(&self) -> u32 { + self.hasher + } +} + +impl CrcSource for XzDecoder> { + fn crc(&self) -> u32 { + self.get_ref().hasher + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum VerifyState { + /// Still feeding bytes from the underlying reader. + Reading, + /// The source reached EOF and the CRC matched. Subsequent reads return `Ok(0)` (normal EOF). + EofChecked, + /// A terminal error (CRC mismatch or short range) was reported once. Subsequent reads return + /// `Ok(0)` so the reader stays well-behaved if the caller re-polls after the error. + Failed, +} + +/// Verifying reader for asset payloads. Forwards decoded bytes from a [`CrcSource`] and, at the +/// source's natural EOF, confirms the stored CRC32C or returns [`ChecksumError::Mismatch`] in place +/// of `Ok(0)`. +/// +/// For an uncompressed asset the source is `CrcTeeReader>`, so the bytes read and the +/// bytes hashed are the same payload bytes. For an xz-compressed asset the source is +/// `XzDecoder>>`: the tee accumulates over the raw *compressed* bytes +/// (verification happens before decompression) while the caller reads decompressed output, and the +/// hash is finalized once the decoder reaches its own EOF. +/// +/// If the underlying [`ExactLen`] flagged a short range, an error from the source is rewritten to a +/// bundle-layer `UnexpectedEof`/[`ShortRangeMarker`] so the structural truncation is the reported +/// failure rather than a codec-specific error. +pub(crate) struct VerifyingReader { + source: S, + expected: u32, + target: ChecksumTarget, + short_flag: ShortRangeFlag, + state: VerifyState, +} + +impl VerifyingReader { + pub(crate) fn new( + source: S, + expected: u32, + target: ChecksumTarget, + short_flag: ShortRangeFlag, + ) -> Self { + Self { + source, + expected, + target, + short_flag, + state: VerifyState::Reading, + } + } +} + +impl Read for VerifyingReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.state { + VerifyState::EofChecked | VerifyState::Failed => return Ok(0), + VerifyState::Reading => {} + } + match self.source.read(buf) { + Ok(0) => { + let computed = self.source.crc(); + if computed == self.expected { + self.state = VerifyState::EofChecked; + Ok(0) + } else { + self.state = VerifyState::Failed; + Err(crc_mismatch_error( + self.target.clone(), + computed, + self.expected, + )) + } + } + Ok(n) => Ok(n), + Err(e) => { + self.state = VerifyState::Failed; + // A short read from ExactLen already carries the real remaining count; pass it + // through untouched. Otherwise, if the source (e.g. an xz decoder) swallowed the + // short read in favor of its own error, the shared flag lets us still surface the + // structural truncation. + if e.get_ref().is_some_and(|inner| inner.is::()) { + Err(e) + } else if self.short_flag.get() { + Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )) + } else { + Err(e) + } + } + } + } +} + +// ===================================================================== +// Verified assignment-stream reader +// ===================================================================== + +/// CRC accumulator that shares its running hash via an `Arc`. Used as the source reader +/// for [`BendlVerifiedStreamReader`]: the `Arc` lets the outer wrapper read the final hash after a +/// consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. +/// +/// Unlike [`CrcTeeReader`], this type never substitutes a checksum error for raw EOF — it is always +/// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. The type is +/// exposed because it leaks through the return signatures of the wrapper's intentionally-partial +/// APIs (`into_frames`, `into_subsample_by_*`); callers should treat it as an opaque reader. +pub struct ArcHasher { + inner: R, + state: Arc, +} + +impl Read for ArcHasher { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let n = self.inner.read(buf)?; + if n > 0 { + let prev = self.state.load(Ordering::Relaxed); + self.state + .store(crc32c::crc32c_append(prev, &buf[..n]), Ordering::Relaxed); + } + Ok(n) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum StreamVerifyState { + Running, + /// A CRC mismatch was returned once as `Some(Err(...))`. Subsequent iterator calls return + /// `None`. + MismatchReported, + /// A non-CRC terminal error (codec failure, bundle-layer short range, etc.) was returned once + /// as `Some(Err(...))`. Subsequent iterator calls return `None`. Kept distinct from + /// `MismatchReported` so the state machine self-documents which class of failure tripped it. + Errored, + /// CRC matched after natural EOF. Subsequent iterator calls return `None`. + Verified, +} + +/// Source reader stack underneath a [`BendlVerifiedStreamReader`]. +pub(crate) type VerifiedStreamSource<'a, R> = ArcHasher>; + +/// Verified decoded assignment reader returned by +/// [`super::reader::BendlReader::open_assignment_reader`]. +/// +/// Wraps a [`BenStreamReader`] over a CRC-accumulating source and checks the stored stream CRC32C +/// after the codec reaches natural EOF. CRC mismatch surfaces from [`Iterator::next`] as +/// `Some(Err(io::ErrorKind::InvalidData))` — returned once after the last decoded record, then +/// `None`. Consuming methods (`count_samples`, `write_all_jsonl`, `for_each_assignment` when driven +/// to natural EOF) also fold the CRC check into their return value. +/// +/// **Intentionally partial APIs** (`into_frames`, `into_subsample_by_*`) are forwarded for +/// ergonomics but do not automatically verify — the underlying reader is stopped short of raw EOF +/// so the CRC tee is never finalized. Callers that need integrity for partial reads must call +/// [`super::reader::BendlReader::verify_stream_checksum`] separately. +pub struct BendlVerifiedStreamReader<'a, R: Read + Seek> { + inner: BenStreamReader>, + expected: u32, + arc_hasher: Arc, + short_flag: ShortRangeFlag, + state: StreamVerifyState, +} + +impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { + /// Construct a verified stream reader over a bounded assignment-stream range. + /// + /// `raw` is the bounded `ExactLen` over the on-disk stream bytes; `short_flag` is its shared + /// short-range flag. `init` builds the BEN/XBEN decoder over the CRC-accumulating source. On a + /// decoder-init failure caused by a truncated range (short flag set), the structural truncation + /// is surfaced as a bundle-layer `UnexpectedEof` rather than a banner-parse error. + pub(crate) fn new( + raw: ExactLen<&'a mut R>, + short_flag: ShortRangeFlag, + expected: u32, + init: impl FnOnce( + VerifiedStreamSource<'a, R>, + ) + -> Result>, BendlReadError>, + ) -> Result { + let arc_hasher = Arc::new(AtomicU32::new(0)); + let source = ArcHasher { + inner: raw, + state: Arc::clone(&arc_hasher), + }; + let inner = match init(source) { + Ok(inner) => inner, + Err(e) => { + if short_flag.get() { + return Err(BendlReadError::Io(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + ))); + } + return Err(e); + } + }; + Ok(Self { + inner, + expected, + arc_hasher, + short_flag, + state: StreamVerifyState::Running, + }) + } + + /// Return the BEN variant detected from the stream banner. + pub fn variant(&self) -> BenVariant { + self.inner.variant() + } + + /// Return the wire format (BEN vs XBEN) of this stream. + pub fn wire_format(&self) -> BenWireFormat { + self.inner.wire_format() + } + + /// Suppress progress output from the decoder. + pub fn silent(mut self, silent: bool) -> Self { + self.inner = self.inner.silent(silent); + self + } + + /// Compare the finalized stream hash against the stored CRC, mapping a mismatch to an + /// `InvalidData` error. Called by the consuming methods after they have driven the decoder to + /// raw EOF. + fn finalize_checksum(&self) -> io::Result<()> { + let computed = self.arc_hasher.load(Ordering::Relaxed); + if computed == self.expected { + Ok(()) + } else { + Err(crc_mismatch_error( + ChecksumTarget::Stream, + computed, + self.expected, + )) + } + } + + /// Map an error returned by a consuming inner call into the bundle-layer short-range error when + /// the shared flag fired, otherwise pass it through. + fn map_terminal_error(&self, e: io::Error) -> io::Error { + if self.short_flag.get() { + io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker { remaining: 0 }) + } else { + e + } + } + + /// Count the number of samples in the stream and verify the stream CRC32C. + /// + /// Drives the decoder to raw EOF as a side effect, finalizing the CRC accumulator. If the + /// count succeeds but the CRC does not match, the CRC mismatch is returned instead of the + /// count. + pub fn count_samples(self) -> io::Result { + // `count_samples` consumes `self.inner`, so capture the pieces the post-EOF check needs + // before the move rather than borrowing `self` afterwards. + let arc = Arc::clone(&self.arc_hasher); + let expected = self.expected; + let short_flag = self.short_flag.clone(); + let count = match self.inner.count_samples() { + Ok(count) => count, + Err(_) if short_flag.get() => { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + )); + } + Err(e) => return Err(e), + }; + let computed = arc.load(Ordering::Relaxed); + if computed == expected { + Ok(count) + } else { + Err(crc_mismatch_error(ChecksumTarget::Stream, computed, expected)) + } + } + + /// Decode assignments and pass each one to a callback by reference. + /// + /// When the callback drives the reader to natural EOF, the stream CRC is verified and a + /// mismatch is returned as an error. When the callback stops early (`f` returns `Ok(false)`), + /// the CRC is not checked — only a full traversal can verify the whole stream. + pub fn for_each_assignment(&mut self, mut f: F) -> io::Result<()> + where + F: FnMut(&[u16], u16) -> io::Result, + { + loop { + match self.next() { + Some(Ok((ref assignment, count))) => { + if !f(assignment, count)? { + return Ok(()); + } + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + } + } + } + + /// Decode the remaining stream, write it as JSONL, and verify the stream CRC32C. + /// + /// Each decoded sample is written as a JSON object containing an `assignment` vector and a + /// 1-based `sample` index. After all records are written, the stream CRC is checked; a + /// mismatch is returned instead of `Ok(())`. + pub fn write_all_jsonl(&mut self, mut writer: impl Write) -> io::Result<()> { + let mut sample_number = 0usize; + self.for_each_assignment(|assignment, count| { + for _ in 0..count { + sample_number += 1; + let line = json!({ + "assignment": assignment, + "sample": sample_number, + }) + .to_string() + + "\n"; + writer.write_all(line.as_bytes())?; + } + Ok(true) + }) + } + + /// Consume the wrapper and iterate over raw BEN/ben32 frames instead of materialized + /// assignments. + /// + /// Frame iteration is intentionally partial: callers typically stop short of EOF, so the CRC + /// tee is never finalized and the stream is **not verified** by this path. Callers needing + /// integrity for partial reads should call + /// [`super::reader::BendlReader::verify_stream_checksum`] separately. + pub fn into_frames(self) -> BenStreamFrameReader> { + self.inner.into_frames() + } +} + +impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { + /// Convert into a subsampling iterator over explicit 1-based indices. + /// + /// Subsampling is intentionally partial: the underlying reader is stopped short of raw EOF, so + /// the CRC tee is never finalized and the stream is **not verified** by this path. Use + /// [`super::reader::BendlReader::verify_stream_checksum`] for an explicit full-stream integrity + /// check. + pub fn into_subsample_by_indices( + self, + indices: T, + ) -> SubsampleFrameDecoder>> + where + T: IntoIterator, + { + self.inner.into_subsample_by_indices(indices) + } + + /// Convert into a subsampling iterator over the inclusive 1-based range `[start, end]`. + /// + /// Subsampling is intentionally partial and is **not verified** by this path; see + /// [`Self::into_subsample_by_indices`]. + pub fn into_subsample_by_range( + self, + start: usize, + end: usize, + ) -> SubsampleFrameDecoder>> { + self.inner.into_subsample_by_range(start, end) + } + + /// Convert into a subsampling iterator that selects every `step` samples from the 1-based + /// `offset`. + /// + /// Subsampling is intentionally partial and is **not verified** by this path; see + /// [`Self::into_subsample_by_indices`]. + pub fn into_subsample_every( + self, + step: usize, + offset: usize, + ) -> SubsampleFrameDecoder>> { + self.inner.into_subsample_every(step, offset) + } +} + +impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { + type Item = io::Result<(Vec, u16)>; + + fn next(&mut self) -> Option { + match self.state { + StreamVerifyState::MismatchReported + | StreamVerifyState::Errored + | StreamVerifyState::Verified => return None, + StreamVerifyState::Running => {} + } + match self.inner.next() { + Some(Err(e)) => { + // Non-CRC terminal error: codec failure, bundle-layer short range, or anything else + // the inner reader returned. CRC mismatch lives in the `None` branch below. + self.state = StreamVerifyState::Errored; + Some(Err(self.map_terminal_error(e))) + } + Some(item) => Some(item), + None => { + // Inner reached natural EOF — finalize the CRC check. + match self.finalize_checksum() { + Ok(()) => { + self.state = StreamVerifyState::Verified; + None + } + Err(e) => { + self.state = StreamVerifyState::MismatchReported; + Some(Err(e)) + } + } + } + } + } +} From ff2d3ec5f7e6ae6a6f27757c9456d0c4fda9588b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 30 May 2026 12:32:19 -0600 Subject: [PATCH 122/221] create asset registry --- ben/src/io/bundle/writer.rs | 279 +++++++++++++++++------------------- 1 file changed, 133 insertions(+), 146 deletions(-) diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index d54730b..998e55b 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -102,13 +102,115 @@ impl AddAssetOptions { } } +/// An asset payload prepared for on-disk storage: the (optionally xz-compressed) bytes, the +/// directory-entry flags describing them, and the CRC32C over those exact bytes. +struct EncodedAsset { + bytes: Vec, + asset_flags: u16, + checksum: Vec, +} + +/// Compress (if requested), checksum, and assemble the directory-entry flags for one asset payload. +/// +/// This is the single encode path shared by [`BendlWriter::add_asset`] and +/// [`BendlAppender::commit`], so the create and append routes can never drift on compression, flag +/// assembly, or CRC coverage. It is pure (in-memory), so a failure leaves any backing file +/// untouched. The CRC32C is over the **on-disk** bytes — the compressed bytes when xz is applied, so +/// verification happens before decompression (see [`ASSET_FLAG_CHECKSUM`]). +fn encode_asset_payload(payload: Vec, compress: bool, is_json: bool) -> io::Result { + let bytes = if compress { + let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); + encoder.write_all(&payload)?; + encoder.finish()? + } else { + payload + }; + + let mut asset_flags: u16 = ASSET_FLAG_CHECKSUM; + if is_json { + asset_flags |= ASSET_FLAG_JSON; + } + if compress { + asset_flags |= ASSET_FLAG_XZ; + } + + let checksum = crc32c::crc32c(&bytes).to_le_bytes().to_vec(); + Ok(EncodedAsset { + bytes, + asset_flags, + checksum, + }) +} + +/// Tracks the asset names and singleton asset-types already claimed in a bundle, and enforces the +/// canonical-name + uniqueness rules shared by the create and append paths. +/// +/// [`Self::claim`] validates fully before mutating, so a rejected asset never leaves the registry in +/// a half-updated state — there is nothing to roll back. +#[derive(Default)] +struct AssetNameRegistry { + names: HashSet, + singleton_types: HashSet, +} + +impl AssetNameRegistry { + /// An empty registry, for a fresh bundle. + fn new() -> Self { + Self::default() + } + + /// Seed a registry from the directory entries of an existing finalized bundle (append path). + fn from_entries(entries: &[BendlDirectoryEntry]) -> Self { + let mut registry = Self::new(); + for entry in entries { + registry.names.insert(entry.name.clone()); + if standardized_name_for(entry.asset_type).is_some() { + registry.singleton_types.insert(entry.asset_type); + } + } + registry + } + + /// Validate the canonical-name and uniqueness rules for a candidate asset **without** mutating + /// state. A known singleton type must use its standardized name and may appear only once; every + /// asset name must be unique. + fn check(&self, asset_type: u16, name: &str) -> Result<(), BendlWriteError> { + if let Some(canonical) = standardized_name_for(asset_type) { + if name != canonical { + return Err(BendlWriteError::WrongCanonicalName { + asset_type, + expected: canonical.to_string(), + found: name.to_string(), + }); + } + if self.singleton_types.contains(&asset_type) { + return Err(BendlWriteError::DuplicateSingletonType(asset_type)); + } + } + if self.names.contains(name) { + return Err(BendlWriteError::DuplicateName(name.to_string())); + } + Ok(()) + } + + /// Validate via [`Self::check`] and, on success, reserve the name and (for singleton types) the + /// asset-type so subsequent claims see it as taken. + fn claim(&mut self, asset_type: u16, name: &str) -> Result<(), BendlWriteError> { + self.check(asset_type, name)?; + self.names.insert(name.to_string()); + if standardized_name_for(asset_type).is_some() { + self.singleton_types.insert(asset_type); + } + Ok(()) + } +} + /// Writer for a single `.bendl` file. pub struct BendlWriter { inner: W, header: BendlHeader, entries: Vec, - names: HashSet, - singleton_types: HashSet, + registry: AssetNameRegistry, state: WriterState, } @@ -141,8 +243,7 @@ impl BendlWriter { inner, header, entries: Vec::new(), - names: HashSet::new(), - singleton_types: HashSet::new(), + registry: AssetNameRegistry::new(), state: WriterState::Assets, }) } @@ -166,72 +267,25 @@ impl BendlWriter { return Err(BendlWriteError::AssetsAfterStream); } - // Canonical-name rule for known singleton types. - if let Some(canonical) = standardized_name_for(asset_type) { - if name != canonical { - return Err(BendlWriteError::WrongCanonicalName { - asset_type, - expected: canonical.to_string(), - found: name.to_string(), - }); - } - if !self.singleton_types.insert(asset_type) { - return Err(BendlWriteError::DuplicateSingletonType(asset_type)); - } - } - - // Unique name rule. - if !self.names.insert(name.to_string()) { - // Roll back the singleton insertion before returning, so the writer remains in a - // consistent state. (Only known singleton types would have been inserted above.) - if standardized_name_for(asset_type).is_some() { - self.singleton_types.remove(&asset_type); - } - return Err(BendlWriteError::DuplicateName(name.to_string())); - } + // Validate and reserve the name/type up front, so a rejected asset writes no bytes. + self.registry.claim(asset_type, name)?; - // Decide compression. let compress = options .compress .unwrap_or_else(|| default_compresses_by_type(asset_type)); - - // Compute final payload bytes. - let payload_bytes: Vec = if compress { - let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); - encoder.write_all(payload).map_err(BendlWriteError::Io)?; - encoder.finish().map_err(BendlWriteError::Io)? - } else { - payload.to_vec() - }; - - // CRC32C over the on-disk payload bytes. For compressed assets this is the compressed bytes - // (verification happens before decompression). See ASSET_FLAG_CHECKSUM for the wire-format - // pin. - let crc = crc32c::crc32c(&payload_bytes); - let checksum_bytes = crc.to_le_bytes().to_vec(); - - let mut asset_flags: u16 = ASSET_FLAG_CHECKSUM; - if options.is_json { - asset_flags |= ASSET_FLAG_JSON; - } - if compress { - asset_flags |= ASSET_FLAG_XZ; - } + let encoded = encode_asset_payload(payload.to_vec(), compress, options.is_json)?; // Write at current file position. let payload_offset = self.inner.seek(SeekFrom::Current(0))?; - self.inner - .write_all(&payload_bytes) - .map_err(BendlWriteError::Io)?; - let payload_len = payload_bytes.len() as u64; + self.inner.write_all(&encoded.bytes)?; self.entries.push(BendlDirectoryEntry { asset_type, - asset_flags, + asset_flags: encoded.asset_flags, name: name.to_string(), payload_offset, - payload_len, - checksum: Some(checksum_bytes), + payload_len: encoded.bytes.len() as u64, + checksum: Some(encoded.checksum), }); Ok(()) @@ -310,8 +364,7 @@ impl BendlWriter { parent: Some(ParentState { header: self.header, entries: self.entries, - names: self.names, - singleton_types: self.singleton_types, + registry: self.registry, }), start_offset: stream_offset, bytes_written: 0, @@ -370,8 +423,7 @@ impl BendlWriter { struct ParentState { header: BendlHeader, entries: Vec, - names: HashSet, - singleton_types: HashSet, + registry: AssetNameRegistry, } /// Owned stream-phase session. Holds the underlying writer and the parent [`BendlWriter`]'s @@ -421,8 +473,7 @@ impl BendlStreamSession { inner, header: parent.header, entries: parent.entries, - names: parent.names, - singleton_types: parent.singleton_types, + registry: parent.registry, state: WriterState::StreamWritten { stream_len: self.bytes_written, sample_count, @@ -532,11 +583,10 @@ pub struct BendlAppender { inner: W, header: BendlHeader, existing_entries: Vec, - existing_names: HashSet, - existing_singleton_types: HashSet, pending: Vec, - pending_names: HashSet, - pending_singleton_types: HashSet, + /// Names and singleton types claimed by the existing directory plus any pending adds. Seeded + /// from the existing entries at open time, then extended as each pending asset is enqueued. + registry: AssetNameRegistry, } /// An asset queued for append but not yet written to disk. @@ -578,24 +628,14 @@ impl BendlAppender { BendlWriteError::Format(BendlFormatError::MalformedDirectory(e.to_string())) })?; - let mut existing_names = HashSet::new(); - let mut existing_singleton_types = HashSet::new(); - for entry in &existing_entries { - existing_names.insert(entry.name.clone()); - if standardized_name_for(entry.asset_type).is_some() { - existing_singleton_types.insert(entry.asset_type); - } - } + let registry = AssetNameRegistry::from_entries(&existing_entries); Ok(BendlAppender { inner, header, existing_entries, - existing_names, - existing_singleton_types, pending: Vec::new(), - pending_names: HashSet::new(), - pending_singleton_types: HashSet::new(), + registry, }) } @@ -611,35 +651,14 @@ impl BendlAppender { payload: &[u8], options: AddAssetOptions, ) -> Result<(), BendlWriteError> { - // Canonical-name rule. - if let Some(canonical) = standardized_name_for(asset_type) { - if name != canonical { - return Err(BendlWriteError::WrongCanonicalName { - asset_type, - expected: canonical.to_string(), - found: name.to_string(), - }); - } - if self.existing_singleton_types.contains(&asset_type) - || self.pending_singleton_types.contains(&asset_type) - { - return Err(BendlWriteError::DuplicateSingletonType(asset_type)); - } - } - - // Uniqueness rule against both existing and pending assets. - if self.existing_names.contains(name) || self.pending_names.contains(name) { - return Err(BendlWriteError::DuplicateName(name.to_string())); - } + // Validate against both the loaded directory and previously-enqueued pending assets, and + // reserve the name/type on success. Nothing is buffered if validation fails. + self.registry.claim(asset_type, name)?; let compress = options .compress .unwrap_or_else(|| default_compresses_by_type(asset_type)); - self.pending_names.insert(name.to_string()); - if standardized_name_for(asset_type).is_some() { - self.pending_singleton_types.insert(asset_type); - } self.pending.push(PendingAsset { asset_type, name: name.to_string(), @@ -705,45 +724,13 @@ impl BendlAppender { return Ok(self.inner); } - // Phase 1: compress any pending payloads and build new entries with placeholder offsets. Do - // this entirely in memory so failures here leave the file untouched. - struct EncodedPending { - asset_type: u16, - name: String, - bytes: Vec, - asset_flags: u16, - checksum: Option>, - } - - let mut encoded: Vec = Vec::with_capacity(self.pending.len()); + // Phase 1: compress any pending payloads through the shared encode path and pair each with + // its identifying name/type. Done entirely in memory so failures here leave the file + // untouched. + let mut encoded: Vec<(u16, String, EncodedAsset)> = Vec::with_capacity(self.pending.len()); for asset in self.pending.drain(..) { - let bytes = if asset.compress { - let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); - encoder.write_all(&asset.raw_payload)?; - encoder.finish()? - } else { - asset.raw_payload - }; - - // CRC32C over on-disk payload bytes (compressed if XZ). - let crc = crc32c::crc32c(&bytes); - let checksum_bytes = crc.to_le_bytes().to_vec(); - - let mut asset_flags: u16 = ASSET_FLAG_CHECKSUM; - if asset.is_json { - asset_flags |= ASSET_FLAG_JSON; - } - if asset.compress { - asset_flags |= ASSET_FLAG_XZ; - } - - encoded.push(EncodedPending { - asset_type: asset.asset_type, - name: asset.name, - bytes, - asset_flags, - checksum: Some(checksum_bytes), - }); + let enc = encode_asset_payload(asset.raw_payload, asset.compress, asset.is_json)?; + encoded.push((asset.asset_type, asset.name, enc)); } // Phase 2: file mutation. From this point forward, a failure leaves the bundle in a damaged @@ -761,16 +748,16 @@ impl BendlAppender { Vec::with_capacity(self.existing_entries.len() + encoded.len()); new_entries.extend(self.existing_entries.iter().cloned()); - for enc in encoded { + for (asset_type, name, enc) in encoded { let payload_offset = self.inner.seek(SeekFrom::Current(0))?; self.inner.write_all(&enc.bytes)?; new_entries.push(BendlDirectoryEntry { - asset_type: enc.asset_type, + asset_type, asset_flags: enc.asset_flags, - name: enc.name, + name, payload_offset, payload_len: enc.bytes.len() as u64, - checksum: enc.checksum, + checksum: Some(enc.checksum), }); } From 4de88837464bda39960d5580869d1286c4882bd4 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 30 May 2026 13:34:27 -0600 Subject: [PATCH 123/221] Improve crash safety of BENDL and tighten verified stream APIs --- ben/src/cli/bendl/helpers.rs | 8 +-- ben/src/io/bundle/format.rs | 6 +- ben/src/io/bundle/reader.rs | 37 +++++++++-- ben/src/io/bundle/tests/writer.rs | 103 ++++++++++++++++++++-------- ben/src/io/bundle/verify.rs | 107 +++++++++--------------------- ben/src/io/bundle/writer.rs | 95 ++++++++++++-------------- ben/tests/test_stress_edges.rs | 30 +++------ 7 files changed, 194 insertions(+), 192 deletions(-) diff --git a/ben/src/cli/bendl/helpers.rs b/ben/src/cli/bendl/helpers.rs index f9bd8a8..2e05fb6 100644 --- a/ben/src/cli/bendl/helpers.rs +++ b/ben/src/cli/bendl/helpers.rs @@ -41,9 +41,7 @@ pub(super) fn add_custom_file_asset( .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) } -pub(super) fn append_known_file_asset< - W: Read + Write + Seek + crate::io::bundle::writer::BendlTruncate, ->( +pub(super) fn append_known_file_asset( appender: &mut BendlAppender, kind: KnownAssetKind, path: &Path, @@ -56,9 +54,7 @@ pub(super) fn append_known_file_asset< .map_err(|e: BendlWriteError| format!("failed to add asset {name:?}: {e}")) } -pub(super) fn append_custom_file_asset< - W: Read + Write + Seek + crate::io::bundle::writer::BendlTruncate, ->( +pub(super) fn append_custom_file_asset( appender: &mut BendlAppender, name: &str, path: &Path, diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 3e2c3d6..cbd00f9 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -200,8 +200,10 @@ pub struct BendlHeader { /// `HEADER_FLAG_STREAM_CHECKSUM` is set in `flags`. Writers set this to zero while the /// bundle is unfinalized and patch it on finalization. pub stream_checksum: u32, - /// Absolute byte offset of the directory table, or `0` if no directory has been written yet. - /// In a finalized bundle the directory lives at the end of the file. + /// Absolute byte offset of the authoritative directory table, or `0` if no directory has been + /// written yet. Successful finalization writes this directory after the assignment stream; a + /// failed post-finalize append may leave newer orphaned bytes after the old authoritative + /// directory until the header is patched. pub directory_offset: u64, /// Byte length of the directory table, or `0` if absent. pub directory_len: u64, diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 3007efb..7846385 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -130,9 +130,9 @@ impl BendlReader { /// /// Returns `Err(BundleIncomplete)` for unfinalized bundles (the stored `stream_checksum` is not /// authoritative until the bundle is finalized) and `Err(Unavailable)` when - /// `HEADER_FLAG_STREAM_CHECKSUM` is clear (foreign or hand-built bytes; the library writer always - /// sets this flag). The finalization check comes first by design: reporting `Unavailable` for an - /// unfinalized bundle would be misleading. + /// `HEADER_FLAG_STREAM_CHECKSUM` is clear (foreign or hand-built bytes; the library writer + /// always sets this flag). The finalization check comes first by design: reporting + /// `Unavailable` for an unfinalized bundle would be misleading. fn require_stream_checksum(&self) -> Result { if !self.header.is_finalized() { return Err(BendlReadError::Checksum(ChecksumError::BundleIncomplete { @@ -205,8 +205,8 @@ impl BendlReader { } /// Construct a verified decoded assignment reader that checks the stream CRC32C after the - /// codec reaches EOF. The returned [`BendlVerifiedStreamReader`] forwards the full - /// [`BenStreamReader`] API surface and folds the CRC check into consuming methods. + /// codec reaches EOF. The returned [`BendlVerifiedStreamReader`] exposes only full-consumption + /// APIs, because partial frame/subsample iteration cannot prove the whole stream checksum. /// /// Returns `Err(BundleIncomplete)` for unfinalized bundles and `Err(Unavailable)` when the /// stream checksum flag is clear. @@ -230,6 +230,33 @@ impl BendlReader { }) } + /// Construct a decoded assignment reader without CRC verification. + /// + /// This is the explicit escape hatch for partial/random-access decode paths such as + /// `into_frames` and `into_subsample_by_*`: those operations intentionally stop before raw EOF, + /// so they cannot verify the whole stream checksum. Call [`Self::verify_stream_checksum`] + /// separately when whole-stream integrity matters. + pub fn open_assignment_reader_unverified( + &mut self, + ) -> Result>, BendlReadError> + where + R: Send, + { + let format = self.assignment_format().ok_or_else(|| { + BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat( + self.header.assignment_format, + )) + })?; + let (offset, len) = self.assignment_stream_range()?; + self.inner.seek(SeekFrom::Start(offset))?; + let raw: Box = + Box::new(ExactLen::new(&mut self.inner, len, ShortRangeFlag::new())); + match format { + AssignmentFormat::Ben => BenStreamReader::from_ben(raw).map_err(Into::into), + AssignmentFormat::Xben => BenStreamReader::from_xben(raw).map_err(Into::into), + } + } + /// Verify the stored stream CRC32C by scanning the raw on-disk bytes of the assignment stream. /// /// This is the explicit full-scan verifier for callers that want to check integrity without diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index f19fc33..b454d58 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -405,8 +405,8 @@ fn append_rejects_incomplete_bundle() { } #[test] -fn append_rejects_complete_bundle_with_zero_directory() { - // Header claims complete but has directory_offset=0 — hits the second BundleIncomplete check. +fn append_rejects_finalized_bundle_with_zero_directory() { + // Header claims finalized but has directory_offset=0 — hits the second BundleIncomplete check. let header = BendlHeader { magic: BENDL_MAGIC, major_version: BENDL_MAJOR_VERSION, @@ -1138,7 +1138,7 @@ fn five_successive_appends_preserve_everything() { #[test] fn randomized_append_sequence_preserves_all_prior_entries() { // Independent coverage for append: random number of rounds, random payload sizes. Catches any - // bookkeeping drift in the appender's directory-rewrite path. + // bookkeeping drift in the appender's append-only replacement-directory path. use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; @@ -1734,13 +1734,10 @@ fn two_parallel_readers_against_the_same_bundle_agree() { // shared mutable state internally (e.g., no static caches, no thread-local position // tracking) that would let one thread's reads scramble the other's. // - // Reader-during-append is intentionally not covered here: today's append path truncates - // the old trailing directory before writing the new one, while the header still points at - // the old directory offset until the final patch. A concurrent reader during that window - // would observe a torn state. Whether the contract should weaken to "errors cleanly - // during torn states" or strengthen to "snapshot-style readers" is a design decision - // (see the coverage plan tier 0.12 for the design question), and the right pin here is - // not a test against the current behavior. + // Reader-during-append is intentionally not covered here. The append path preserves the old + // authoritative directory until the final header patch, so payload/directory writes alone do + // not create a torn reader state; concurrent access to the same mutable file handle is still an + // integration-level filesystem contract rather than a property of immutable reader state. use std::sync::Arc; use std::thread; @@ -1846,7 +1843,7 @@ fn appender_preserves_unknown_asset_flag_bits_on_existing_entries() { } // ===================================================================== -// rollback paths and accessors +// validation-failure paths and accessors // ===================================================================== #[test] @@ -1869,13 +1866,70 @@ fn stream_session_start_offset_returns_recorded_value() { } #[test] -fn writer_duplicate_name_after_singleton_insert_rolls_back_singleton_state() { - // Trigger the rare DuplicateName-after-canonical-singleton-insert branch in BendlWriter:: - // add_asset (the `singleton_types.remove(&asset_type)` rollback path). Reach it by adding a - // custom asset that happens to take the canonical name of a known singleton type, then - // attempting to add the actual singleton: the canonical-name check passes, singleton_types - // accepts the new type, then names.insert fails because the custom asset already claimed - // that name. The rollback keeps the writer state consistent for a future retry. +fn writer_failed_asset_write_does_not_poison_registry() { + struct FailOnceAfterHeader { + inner: Cursor>, + failed: bool, + } + + impl FailOnceAfterHeader { + fn new() -> Self { + Self { + inner: Cursor::new(Vec::new()), + failed: false, + } + } + } + + impl Write for FailOnceAfterHeader { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if !self.failed && self.inner.position() >= HEADER_SIZE as u64 { + self.failed = true; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "simulated payload write failure", + )); + } + self.inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } + } + + impl Seek for FailOnceAfterHeader { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { + self.inner.seek(pos) + } + } + + let mut writer = BendlWriter::new(FailOnceAfterHeader::new(), AssignmentFormat::Ben).unwrap(); + let err = writer + .add_asset( + ASSET_TYPE_CUSTOM, + "retry.bin", + b"payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap_err(); + assert!(matches!(err, BendlWriteError::Io(_))); + + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "retry.bin", + b"payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); +} + +#[test] +fn writer_duplicate_name_after_singleton_check_leaves_writer_usable() { + // A custom asset can claim the standardized name of a known singleton type. A later attempt to + // add the actual singleton must fail cleanly during validation, without reserving any + // singleton state or making the writer unusable for unrelated additions. let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer .add_asset( @@ -1898,9 +1952,6 @@ fn writer_duplicate_name_after_singleton_insert_rolls_back_singleton_state() { "expected DuplicateName, got {err:?}" ); - // The rollback contract: a second attempt at adding ASSET_TYPE_GRAPH must NOT see a stale - // entry in singleton_types from the previous attempt. The writer is also expected to - // remain usable for non-conflicting additions. writer .add_asset( ASSET_TYPE_METADATA, @@ -1912,11 +1963,9 @@ fn writer_duplicate_name_after_singleton_insert_rolls_back_singleton_state() { } #[test] -fn appender_duplicate_name_after_singleton_insert_rolls_back_pending_state() { - // Same rollback contract for BendlAppender (rather than BendlWriter): a successful canonical- - // name singleton insert into pending_singleton_types must be undone if the name collides - // with an existing entry. Reach it by appending a custom asset that takes a canonical name, - // committing, then opening the appender and attempting the singleton add. +fn appender_duplicate_name_after_singleton_check_leaves_appender_usable() { + // Same validation contract for BendlAppender: a singleton-name collision must fail without + // reserving pending singleton state, so the appender remains usable for unrelated additions. let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer .add_asset( @@ -1944,8 +1993,6 @@ fn appender_duplicate_name_after_singleton_insert_rolls_back_pending_state() { "expected DuplicateName, got {err:?}" ); - // After the rejection, the appender must still be usable for non-conflicting additions - // (the rollback removed the stale pending_singleton_types entry). appender .add_asset( ASSET_TYPE_METADATA, diff --git a/ben/src/io/bundle/verify.rs b/ben/src/io/bundle/verify.rs index cd93ec1..2e378a4 100644 --- a/ben/src/io/bundle/verify.rs +++ b/ben/src/io/bundle/verify.rs @@ -8,12 +8,13 @@ //! - [`CrcTeeReader`] accumulates a CRC32C over the bytes that flow through it, without ever //! substituting an error for raw EOF. //! - [`VerifyingReader`] wraps a CRC-accumulating byte source and, at the source's natural EOF, -//! either confirms the stored CRC32C or surfaces [`ChecksumError::Mismatch`] in place of the usual -//! `Ok(0)`. The same wrapper serves uncompressed assets (source = `CrcTeeReader>`) and -//! xz-compressed assets (source = `XzDecoder>>`): the only difference is -//! *where* the tee sits, which the [`CrcSource`] trait abstracts. -//! - [`BendlVerifiedStreamReader`] folds the same verify-at-EOF discipline into the assignment-stream -//! iterator API. +//! either confirms the stored CRC32C or surfaces [`ChecksumError::Mismatch`] in place of the +//! usual `Ok(0)`. The same wrapper serves uncompressed assets (source = +//! `CrcTeeReader>`) and xz-compressed assets (source = +//! `XzDecoder>>`): the only difference is *where* the tee sits, which +//! the [`CrcSource`] trait abstracts. +//! - [`BendlVerifiedStreamReader`] folds the same verify-at-EOF discipline into full-consumption +//! assignment-stream APIs. use std::fmt; use std::io::{self, Read, Seek, SeekFrom, Write}; @@ -26,7 +27,7 @@ use serde_json::json; use xz2::read::XzDecoder; use super::error::{BendlReadError, ChecksumError, ChecksumTarget}; -use crate::io::reader::{BenStreamFrameReader, BenStreamReader, BenWireFormat, SubsampleFrameDecoder}; +use crate::io::reader::{BenStreamReader, BenWireFormat}; use crate::BenVariant; // ===================================================================== @@ -192,7 +193,11 @@ pub(crate) fn scan_range_crc32c( /// Build the `io::Error` used to surface a CRC mismatch through a `Read` or `Iterator` boundary. /// The single definition keeps the kind (`InvalidData`) and inner [`ChecksumError`] shape identical /// across every verify path. -pub(crate) fn crc_mismatch_error(target: ChecksumTarget, computed: u32, expected: u32) -> io::Error { +pub(crate) fn crc_mismatch_error( + target: ChecksumTarget, + computed: u32, + expected: u32, +) -> io::Error { io::Error::new( io::ErrorKind::InvalidData, ChecksumError::Mismatch { @@ -228,9 +233,9 @@ impl Read for CrcTeeReader { } /// A byte source that can report the CRC32C accumulated over the raw on-disk payload bytes it has -/// passed through so far. Implemented for both the uncompressed source (`CrcTeeReader` directly) and -/// the xz-compressed source (`XzDecoder` over a `CrcTeeReader`), so a single [`VerifyingReader`] -/// serves both. +/// passed through so far. Implemented for both the uncompressed source (`CrcTeeReader` directly) +/// and the xz-compressed source (`XzDecoder` over a `CrcTeeReader`), so a single +/// [`VerifyingReader`] serves both. pub(crate) trait CrcSource { /// CRC32C of the raw on-disk bytes consumed so far. fn crc(&self) -> u32; @@ -325,7 +330,9 @@ impl Read for VerifyingReader { // through untouched. Otherwise, if the source (e.g. an xz decoder) swallowed the // short read in favor of its own error, the shared flag lets us still surface the // structural truncation. - if e.get_ref().is_some_and(|inner| inner.is::()) { + if e.get_ref() + .is_some_and(|inner| inner.is::()) + { Err(e) } else if self.short_flag.get() { Err(io::Error::new( @@ -349,10 +356,8 @@ impl Read for VerifyingReader { /// consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. /// /// Unlike [`CrcTeeReader`], this type never substitutes a checksum error for raw EOF — it is always -/// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. The type is -/// exposed because it leaks through the return signatures of the wrapper's intentionally-partial -/// APIs (`into_frames`, `into_subsample_by_*`); callers should treat it as an opaque reader. -pub struct ArcHasher { +/// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. +pub(crate) struct ArcHasher { inner: R, state: Arc, } @@ -394,11 +399,6 @@ pub(crate) type VerifiedStreamSource<'a, R> = ArcHasher>; /// `Some(Err(io::ErrorKind::InvalidData))` — returned once after the last decoded record, then /// `None`. Consuming methods (`count_samples`, `write_all_jsonl`, `for_each_assignment` when driven /// to natural EOF) also fold the CRC check into their return value. -/// -/// **Intentionally partial APIs** (`into_frames`, `into_subsample_by_*`) are forwarded for -/// ergonomics but do not automatically verify — the underlying reader is stopped short of raw EOF -/// so the CRC tee is never finalized. Callers that need integrity for partial reads must call -/// [`super::reader::BendlReader::verify_stream_checksum`] separately. pub struct BendlVerifiedStreamReader<'a, R: Read + Seek> { inner: BenStreamReader>, expected: u32, @@ -485,7 +485,10 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { /// the shared flag fired, otherwise pass it through. fn map_terminal_error(&self, e: io::Error) -> io::Error { if self.short_flag.get() { - io::Error::new(io::ErrorKind::UnexpectedEof, ShortRangeMarker { remaining: 0 }) + io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + ) } else { e } @@ -516,7 +519,11 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { if computed == expected { Ok(count) } else { - Err(crc_mismatch_error(ChecksumTarget::Stream, computed, expected)) + Err(crc_mismatch_error( + ChecksumTarget::Stream, + computed, + expected, + )) } } @@ -563,60 +570,6 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { Ok(true) }) } - - /// Consume the wrapper and iterate over raw BEN/ben32 frames instead of materialized - /// assignments. - /// - /// Frame iteration is intentionally partial: callers typically stop short of EOF, so the CRC - /// tee is never finalized and the stream is **not verified** by this path. Callers needing - /// integrity for partial reads should call - /// [`super::reader::BendlReader::verify_stream_checksum`] separately. - pub fn into_frames(self) -> BenStreamFrameReader> { - self.inner.into_frames() - } -} - -impl<'a, R: Read + Seek + Send> BendlVerifiedStreamReader<'a, R> { - /// Convert into a subsampling iterator over explicit 1-based indices. - /// - /// Subsampling is intentionally partial: the underlying reader is stopped short of raw EOF, so - /// the CRC tee is never finalized and the stream is **not verified** by this path. Use - /// [`super::reader::BendlReader::verify_stream_checksum`] for an explicit full-stream integrity - /// check. - pub fn into_subsample_by_indices( - self, - indices: T, - ) -> SubsampleFrameDecoder>> - where - T: IntoIterator, - { - self.inner.into_subsample_by_indices(indices) - } - - /// Convert into a subsampling iterator over the inclusive 1-based range `[start, end]`. - /// - /// Subsampling is intentionally partial and is **not verified** by this path; see - /// [`Self::into_subsample_by_indices`]. - pub fn into_subsample_by_range( - self, - start: usize, - end: usize, - ) -> SubsampleFrameDecoder>> { - self.inner.into_subsample_by_range(start, end) - } - - /// Convert into a subsampling iterator that selects every `step` samples from the 1-based - /// `offset`. - /// - /// Subsampling is intentionally partial and is **not verified** by this path; see - /// [`Self::into_subsample_by_indices`]. - pub fn into_subsample_every( - self, - step: usize, - offset: usize, - ) -> SubsampleFrameDecoder>> { - self.inner.into_subsample_every(step, offset) - } } impl<'a, R: Read + Seek> Iterator for BendlVerifiedStreamReader<'a, R> { diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 998e55b..fbf4d0f 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -19,9 +19,9 @@ //! 3. **finalize phase** — [`BendlWriter::finish`] writes the trailing directory and patches the //! header. //! -//! The writer requires `Write + Seek` because the header is patched twice: once with the stream -//! offset (implicitly, by having reserved its slot at construction) and once with the finalized -//! stream length, sample count, directory offset, directory length, and `complete` flag. +//! The writer requires `Write + Seek` because the header is written provisionally at construction +//! and patched on finalization with the stream checksum, stream length, sample count, directory +//! offset, directory length, and `finalized` flag. use std::collections::HashSet; use std::io::{self, Read, Seek, SeekFrom, Write}; @@ -36,31 +36,6 @@ use super::format::{ FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; -/// Ability to truncate an underlying seekable target to a given length. -/// -/// This is not part of `std::io`, so `BendlAppender` takes a trait bound that abstracts it and is -/// implemented below for `std::fs::File` and `std::io::Cursor>`. -pub trait BendlTruncate { - /// Truncate or extend the underlying target to exactly `len` bytes. - fn truncate_at(&mut self, len: u64) -> io::Result<()>; -} - -impl BendlTruncate for std::fs::File { - fn truncate_at(&mut self, len: u64) -> io::Result<()> { - self.set_len(len) - } -} - -impl BendlTruncate for std::io::Cursor> { - fn truncate_at(&mut self, len: u64) -> io::Result<()> { - let target = len as usize; - let vec = self.get_mut(); - debug_assert!(vec.len() >= target, "truncate_at called past end of buffer"); - vec.truncate(target); - Ok(()) - } -} - /// Options passed alongside each [`BendlWriter::add_asset`] call. /// /// There is no "checksum opt-in/opt-out" knob: every asset written through the library carries a @@ -115,9 +90,13 @@ struct EncodedAsset { /// This is the single encode path shared by [`BendlWriter::add_asset`] and /// [`BendlAppender::commit`], so the create and append routes can never drift on compression, flag /// assembly, or CRC coverage. It is pure (in-memory), so a failure leaves any backing file -/// untouched. The CRC32C is over the **on-disk** bytes — the compressed bytes when xz is applied, so -/// verification happens before decompression (see [`ASSET_FLAG_CHECKSUM`]). -fn encode_asset_payload(payload: Vec, compress: bool, is_json: bool) -> io::Result { +/// untouched. The CRC32C is over the **on-disk** bytes — the compressed bytes when xz is applied, +/// so verification happens before decompression (see [`ASSET_FLAG_CHECKSUM`]). +fn encode_asset_payload( + payload: Vec, + compress: bool, + is_json: bool, +) -> io::Result { let bytes = if compress { let mut encoder = XzEncoder::new(Vec::new(), DEFAULT_XZ_PRESET); encoder.write_all(&payload)?; @@ -145,8 +124,8 @@ fn encode_asset_payload(payload: Vec, compress: bool, is_json: bool) -> io:: /// Tracks the asset names and singleton asset-types already claimed in a bundle, and enforces the /// canonical-name + uniqueness rules shared by the create and append paths. /// -/// [`Self::claim`] validates fully before mutating, so a rejected asset never leaves the registry in -/// a half-updated state — there is nothing to roll back. +/// [`Self::claim`] validates fully before mutating, so a rejected asset never leaves the registry +/// in a half-updated state — there is nothing to roll back. #[derive(Default)] struct AssetNameRegistry { names: HashSet, @@ -267,8 +246,10 @@ impl BendlWriter { return Err(BendlWriteError::AssetsAfterStream); } - // Validate and reserve the name/type up front, so a rejected asset writes no bytes. - self.registry.claim(asset_type, name)?; + // Validate before any expensive work, but do not reserve the name/type until the fallible + // encoding and write have both succeeded. A failed compression or write should not poison + // the in-memory registry and make a retry look like a duplicate. + self.registry.check(asset_type, name)?; let compress = options .compress @@ -279,6 +260,7 @@ impl BendlWriter { let payload_offset = self.inner.seek(SeekFrom::Current(0))?; self.inner.write_all(&encoded.bytes)?; + self.registry.claim(asset_type, name)?; self.entries.push(BendlDirectoryEntry { asset_type, asset_flags: encoded.asset_flags, @@ -539,7 +521,7 @@ pub enum BendlWriteError { AssetsAfterStream, /// Tried to append to a bundle that is not finalized. - #[error("cannot append to a bundle whose header does not have complete == 1")] + #[error("cannot append to a bundle whose header does not have finalized == 1")] BundleIncomplete, /// The writer was asked to perform an operation in the wrong state. @@ -573,13 +555,14 @@ pub enum BendlWriteError { /// 2. [`BendlAppender::add_asset`] (or [`BendlAppender::add_json_asset`]) validates and buffers /// each new asset. Validation happens up front, so duplicate singletons or names are rejected /// **before** any file mutation, and a rejected add_asset leaves the file unchanged. -/// 3. [`BendlAppender::commit`] compresses the buffered assets (if any), truncates the file at the -/// old directory offset, writes the new asset payloads, writes a new directory at the new EOF, -/// and patches the header. +/// 3. [`BendlAppender::commit`] compresses the buffered assets (if any), appends the new asset +/// payloads after the old EOF, writes a new directory, and patches the header. The old directory +/// is left in place as orphaned bytes until a future compact/rewrite operation; this keeps the +/// old header valid until the final header patch. /// /// A [`BendlAppender`] that is dropped without calling `commit` leaves the underlying file /// unchanged. -pub struct BendlAppender { +pub struct BendlAppender { inner: W, header: BendlHeader, existing_entries: Vec, @@ -600,10 +583,10 @@ struct PendingAsset { is_json: bool, } -impl BendlAppender { +impl BendlAppender { /// Open a finalized bundle for append. /// - /// Returns [`BendlWriteError::BundleIncomplete`] if the header's `complete` flag is not set — + /// Returns [`BendlWriteError::BundleIncomplete`] if the header's `finalized` flag is not set — /// append is unsafe on unfinalized bundles because the stream region has no authoritative end. pub fn open(mut inner: W) -> Result { inner.seek(SeekFrom::Start(0))?; @@ -714,8 +697,8 @@ impl BendlAppender { /// Commit all pending appends. /// /// This compresses any buffered payloads that need it (entirely in memory), then performs the - /// file mutation in a single burst: truncate at the old directory offset, write new payloads, - /// write a new directory, and patch the header. + /// file mutation in one append-only burst: seek to old EOF, write new payloads, write a new + /// directory, and patch the header. /// /// If compression fails, the file is left unchanged. pub fn commit(mut self) -> Result { @@ -733,15 +716,21 @@ impl BendlAppender { encoded.push((asset.asset_type, asset.name, enc)); } - // Phase 2: file mutation. From this point forward, a failure leaves the bundle in a damaged - // state. We do everything in the order (truncate, write payloads, write directory, patch - // header) so that even if we crash mid-way, the header still points at the old directory - // until the very last write. - let old_directory_offset = self.header.directory_offset; - - // Truncate at the old directory offset. - self.inner.truncate_at(old_directory_offset)?; - self.inner.seek(SeekFrom::Start(old_directory_offset))?; + // Phase 2: append-only file mutation. Until the final header patch, the old header still + // points at the old directory, which remains intact. A crash before the patch leaves the + // previous bundle readable with trailing orphaned bytes. + let old_directory_end = self + .header + .directory_offset + .checked_add(self.header.directory_len) + .ok_or_else(|| { + BendlWriteError::Io(io::Error::new( + io::ErrorKind::InvalidData, + "directory_offset + directory_len overflowed while appending", + )) + })?; + + self.inner.seek(SeekFrom::Start(old_directory_end))?; // Compute new entries with real offsets as we write. let mut new_entries: Vec = diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 463ffcb..3bcf0f0 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -12,9 +12,7 @@ use binary_ensemble::io::bundle::format::{ ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; -use binary_ensemble::io::bundle::writer::{ - AddAssetOptions, BendlAppender, BendlTruncate, BendlWriter, -}; +use binary_ensemble::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriter}; use binary_ensemble::io::bundle::{BendlReadError, BendlReader, ChecksumError, ChecksumTarget}; use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::BenStreamWriter; @@ -78,7 +76,7 @@ fn expect_bendl_open_err(bytes: Vec) -> binary_ensemble::io::bundle::format: struct CrashState { bytes: Vec, pos: u64, - truncated: bool, + initial_len: usize, } #[derive(Debug, Clone)] @@ -89,9 +87,9 @@ struct HeaderPatchCrashCursor { impl HeaderPatchCrashCursor { fn new(bytes: Vec) -> (Self, Rc>) { let state = Rc::new(RefCell::new(CrashState { + initial_len: bytes.len(), bytes, pos: 0, - truncated: false, })); ( Self { @@ -119,7 +117,7 @@ impl Read for HeaderPatchCrashCursor { impl Write for HeaderPatchCrashCursor { fn write(&mut self, buf: &[u8]) -> std::io::Result { let mut state = self.state.borrow_mut(); - if state.truncated && state.pos < HEADER_SIZE as u64 { + if state.bytes.len() > state.initial_len && state.pos < HEADER_SIZE as u64 { return Err(std::io::Error::new( std::io::ErrorKind::Other, "simulated crash while patching bundle header", @@ -159,18 +157,6 @@ impl Seek for HeaderPatchCrashCursor { } } -impl BendlTruncate for HeaderPatchCrashCursor { - fn truncate_at(&mut self, len: u64) -> std::io::Result<()> { - let mut state = self.state.borrow_mut(); - state.truncated = true; - state.bytes.truncate(len as usize); - if state.pos > len { - state.pos = len; - } - Ok(()) - } -} - fn tiny_bendl_bundle() -> Vec { let mut writer = BendlWriter::new(Cursor::new(Vec::new()), AssignmentFormat::Ben).unwrap(); writer @@ -625,7 +611,7 @@ fn seeded_malformed_xben_bytes_do_not_panic() { } #[test] -fn bendl_append_header_patch_crash_is_rejected_on_reopen() { +fn bendl_append_header_patch_crash_preserves_old_directory() { let base = tiny_bendl_bundle(); assert!(BendlReader::open(Cursor::new(base.clone())).is_ok()); @@ -647,8 +633,10 @@ fn bendl_append_header_patch_crash_is_rejected_on_reopen() { assert!(err.to_string().contains("simulated crash")); let damaged = state.borrow().bytes.clone(); - assert!(BendlReader::open(Cursor::new(damaged.clone())).is_err()); - assert!(BendlAppender::open(Cursor::new(damaged)).is_err()); + let reader = BendlReader::open(Cursor::new(damaged.clone())).unwrap(); + assert!(reader.find_asset_by_name("base.bin").is_some()); + assert!(reader.find_asset_by_name("after-crash.bin").is_none()); + assert!(BendlAppender::open(Cursor::new(damaged)).is_ok()); } #[test] From 39a6ee8453757e2bc0c74837e68ace60a8ba0e80 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 09:18:55 -0600 Subject: [PATCH 124/221] Fix maximum number of directory entries --- ben/src/io/bundle/format.rs | 46 ++++++++++++++++++++++++++++++- ben/src/io/bundle/tests/reader.rs | 39 ++++++++++++++++++++------ ben/src/io/bundle/tests/writer.rs | 6 ++-- 3 files changed, 79 insertions(+), 12 deletions(-) diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index cbd00f9..7b14869 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -325,6 +325,20 @@ impl BendlHeader { /// optional `checksum` bytes. pub const DIRECTORY_ENTRY_HEADER_SIZE: usize = 28; +/// Upper bound on the number of directory entries a single bundle may declare. +/// +/// A real bundle carries only a handful of assets — typically `graph.json`, a node-permutation +/// map, `metadata.json`, and at most a few small custom blobs — so this ceiling sits far above any +/// legitimate use while keeping the worst-case directory read bounded. The assignment stream is +/// stored outside the directory and does not count toward this limit, so a large ensemble does not +/// push against it. +/// +/// [`read_directory`] rejects an inflated `entry_count` against this bound **before** allocating, +/// so a corrupt or adversarial header cannot trigger a multi-gigabyte reservation; [`encode_directory`] +/// enforces the same bound on the write side so the library never produces a bundle it would refuse +/// to read back. +pub const MAX_DIRECTORY_ENTRIES: u32 = 256; + /// In-memory representation of a single directory entry. #[derive(Debug, Clone, PartialEq, Eq)] pub struct BendlDirectoryEntry { @@ -458,7 +472,19 @@ pub fn read_directory( ) -> Result, BendlFormatError> { let mut count_buf = [0u8; 4]; reader.read_exact(&mut count_buf)?; - let entry_count = u32::from_le_bytes(count_buf) as usize; + let entry_count = u32::from_le_bytes(count_buf); + + // Reject an inflated count before allocating: `entry_count` is untrusted on-disk data, and + // `Vec::with_capacity` would otherwise reserve `entry_count * size_of::()` + // bytes up front — a `u32::MAX` count aborts the process on the allocation rather than failing + // gracefully on the missing entry bytes. + if entry_count > MAX_DIRECTORY_ENTRIES { + return Err(BendlFormatError::TooManyDirectoryEntries { + count: entry_count as u64, + max: MAX_DIRECTORY_ENTRIES, + }); + } + let entry_count = entry_count as usize; let mut entries = Vec::with_capacity(entry_count); for _ in 0..entry_count { @@ -469,6 +495,14 @@ pub fn read_directory( /// Serialize a directory table into a byte vector. pub fn encode_directory(entries: &[BendlDirectoryEntry]) -> Result, BendlFormatError> { + // Enforce the same ceiling the reader applies, so the library never writes a bundle it would + // refuse to read back. + if entries.len() > MAX_DIRECTORY_ENTRIES as usize { + return Err(BendlFormatError::TooManyDirectoryEntries { + count: entries.len() as u64, + max: MAX_DIRECTORY_ENTRIES, + }); + } let entry_count = entries.len() as u32; let body_len: usize = entries.iter().map(|e| e.encoded_len()).sum(); @@ -518,6 +552,16 @@ pub enum BendlFormatError { remaining: u64, }, + /// A directory declared more entries than [`MAX_DIRECTORY_ENTRIES`] allows. Rejected before any + /// allocation so an inflated on-disk count cannot trigger a huge reservation. + #[error("directory declares {count} entries, which exceeds the maximum of {max}")] + TooManyDirectoryEntries { + /// The entry count declared in the directory header (read path) or requested by the writer. + count: u64, + /// The maximum permitted entry count ([`MAX_DIRECTORY_ENTRIES`]). + max: u32, + }, + /// A directory table violated bundle-level validation rules. #[error("malformed directory: {0}")] MalformedDirectory(String), diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 05bf86f..76b3a8e 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -7,6 +7,7 @@ use crate::io::bundle::format::{ ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, BENDL_MAGIC, BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, + MAX_DIRECTORY_ENTRIES, }; use crate::io::bundle::reader::{validate_directory_entries, BendlReader, BundleValidationError}; @@ -333,13 +334,32 @@ fn open_rejects_unsupported_major_version() { } #[test] -fn open_rejects_directory_with_inflated_entry_count() { +fn open_rejects_directory_with_count_over_max() { + // An entry count above MAX_DIRECTORY_ENTRIES must be rejected before any allocation, so a + // `u32::MAX` count fails gracefully instead of aborting on a multi-gigabyte reservation. let mut bytes = build_basic_finalized_bundle(); // Read directory_offset from the header (bytes 24..32). let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; - // Blow up the entry count at the start of the directory to a value that cannot possibly fit in - // the remaining file bytes. - bytes[directory_offset..directory_offset + 4].copy_from_slice(&9999u32.to_le_bytes()); + bytes[directory_offset..directory_offset + 4].copy_from_slice(&u32::MAX.to_le_bytes()); + match BendlReader::open(Cursor::new(bytes)) { + Err(BendlFormatError::TooManyDirectoryEntries { count, max }) => { + assert_eq!(count, u32::MAX as u64); + assert_eq!(max, MAX_DIRECTORY_ENTRIES); + } + Err(other) => panic!("expected TooManyDirectoryEntries, got {other:?}"), + Ok(_) => panic!("expected error, got Ok"), + } +} + +#[test] +fn open_rejects_directory_with_truncated_entries() { + // A count within the cap but larger than the directory region can supply must still fail — here + // it surfaces as an Io error when read_exact runs out of bytes mid-directory. + let mut bytes = build_basic_finalized_bundle(); + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let inflated = MAX_DIRECTORY_ENTRIES - 1; + assert!(inflated <= MAX_DIRECTORY_ENTRIES); + bytes[directory_offset..directory_offset + 4].copy_from_slice(&inflated.to_le_bytes()); match BendlReader::open(Cursor::new(bytes)) { Err(BendlFormatError::Io(_)) => {} Err(other) => panic!("expected Io, got {other:?}"), @@ -586,11 +606,12 @@ fn validate_directory_accepts_well_formed_multi_singleton_bundle() { } #[test] -fn stress_thousand_custom_assets_round_trip() { - // Build a directory with 1000 small custom assets, each with a unique payload derived from its +fn stress_many_custom_assets_round_trip() { + // Build a directory with many small custom assets, each with a unique payload derived from its // index, and confirm they all round-trip via `asset_bytes`. This catches any off-by-one or - // seek-caching bugs that might only show up with many entries. - const N: usize = 1000; + // seek-caching bugs that might only show up with many entries. `N` stays under + // `MAX_DIRECTORY_ENTRIES` so the directory is well-formed. + const N: usize = 200; let mut bytes = Vec::new(); bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); @@ -645,7 +666,7 @@ fn stress_thousand_custom_assets_round_trip() { assert_eq!(reader.assets().len(), N); reader.validate_directory().unwrap(); // Access in scrambled order to exercise seeking. - for &idx in &[0usize, N - 1, 1, N / 2, N / 3, 2 * N / 3, 7, 999] { + for &idx in &[0usize, N - 1, 1, N / 2, N / 3, 2 * N / 3, 7, N - 2] { let name = format!("blob-{idx:04}.bin"); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let got = reader.asset_bytes(&entry).unwrap(); diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index b454d58..eb55c08 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -694,7 +694,9 @@ fn into_stream_session_after_stream_written_returns_wrong_state() { #[test] fn stress_many_custom_assets_round_trip() { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); - let count = 500usize; + // Stays under MAX_DIRECTORY_ENTRIES so the directory is well-formed while still exercising the + // many-entry seek/round-trip paths. + let count = 200usize; for i in 0..count { let name = format!("blob_{i:05}"); let payload = vec![(i & 0xFF) as u8; (i % 17) + 1]; @@ -713,7 +715,7 @@ fn stress_many_custom_assets_round_trip() { let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); assert_eq!(reader.assets().len(), count); // Spot-check a handful of entries by reading their payload bytes back. - for i in [0usize, 1, 42, 199, 499] { + for i in [0usize, 1, 42, 150, 199] { let name = format!("blob_{i:05}"); let entry = reader.find_asset_by_name(&name).cloned().unwrap(); let got = reader.asset_bytes(&entry).unwrap(); From e343f1f568c34e5ccc3939be283fae6d12bf70e7 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 09:20:01 -0600 Subject: [PATCH 125/221] remove repetition in verify --- ben/src/io/bundle/verify.rs | 106 ++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 58 deletions(-) diff --git a/ben/src/io/bundle/verify.rs b/ben/src/io/bundle/verify.rs index 2e378a4..d5d566a 100644 --- a/ben/src/io/bundle/verify.rs +++ b/ben/src/io/bundle/verify.rs @@ -140,16 +140,7 @@ impl Read for ShortRangeAwareReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { match self.inner.read(buf) { Ok(n) => Ok(n), - Err(e) => { - if self.short_flag.get() { - Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )) - } else { - Err(e) - } - } + Err(e) => Err(override_if_short(&self.short_flag, e)), } } } @@ -208,6 +199,43 @@ pub(crate) fn crc_mismatch_error( ) } +/// Build the bundle-layer short-range EOF error used whenever a wrapper above a codec detects (via +/// the shared [`ShortRangeFlag`]) that the backing range ended early but the codec reported its own +/// error instead. The exact remaining count is unknown at this layer, so it is reported as zero; a +/// raw [`ExactLen`] short read that survives untouched still carries the precise count in its own +/// [`ShortRangeMarker`]. +pub(crate) fn short_range_eof() -> io::Error { + io::Error::new( + io::ErrorKind::UnexpectedEof, + ShortRangeMarker { remaining: 0 }, + ) +} + +/// If `flag` fired, replace `err` with a bundle-layer [`short_range_eof`]; otherwise pass `err` +/// through unchanged. Centralizes the "codec swallowed the short read in favor of its own error" +/// rewrite shared by every reader that sits above an [`ExactLen`]. +pub(crate) fn override_if_short(flag: &ShortRangeFlag, err: io::Error) -> io::Error { + if flag.get() { + short_range_eof() + } else { + err + } +} + +/// Compare a finalized stream CRC32C against the stored value, mapping a mismatch to the standard +/// `InvalidData`/[`ChecksumError::Mismatch`] error used across every stream verify path. +fn check_stream_crc(computed: u32, expected: u32) -> io::Result<()> { + if computed == expected { + Ok(()) + } else { + Err(crc_mismatch_error( + ChecksumTarget::Stream, + computed, + expected, + )) + } +} + /// CRC accumulator that sits between a byte source and its consumer. It never substitutes an error /// for raw EOF — the surrounding [`VerifyingReader`] (for uncompressed assets) or the post-decoder /// [`VerifyingReader`] (for xz assets) decides when and whether to check the accumulated hash. @@ -334,13 +362,8 @@ impl Read for VerifyingReader { .is_some_and(|inner| inner.is::()) { Err(e) - } else if self.short_flag.get() { - Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )) } else { - Err(e) + Err(override_if_short(&self.short_flag, e)) } } } @@ -432,10 +455,7 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { Ok(inner) => inner, Err(e) => { if short_flag.get() { - return Err(BendlReadError::Io(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - ))); + return Err(BendlReadError::Io(short_range_eof())); } return Err(e); } @@ -469,29 +489,13 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { /// `InvalidData` error. Called by the consuming methods after they have driven the decoder to /// raw EOF. fn finalize_checksum(&self) -> io::Result<()> { - let computed = self.arc_hasher.load(Ordering::Relaxed); - if computed == self.expected { - Ok(()) - } else { - Err(crc_mismatch_error( - ChecksumTarget::Stream, - computed, - self.expected, - )) - } + check_stream_crc(self.arc_hasher.load(Ordering::Relaxed), self.expected) } /// Map an error returned by a consuming inner call into the bundle-layer short-range error when /// the shared flag fired, otherwise pass it through. fn map_terminal_error(&self, e: io::Error) -> io::Error { - if self.short_flag.get() { - io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - ) - } else { - e - } + override_if_short(&self.short_flag, e) } /// Count the number of samples in the stream and verify the stream CRC32C. @@ -505,26 +509,12 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { let arc = Arc::clone(&self.arc_hasher); let expected = self.expected; let short_flag = self.short_flag.clone(); - let count = match self.inner.count_samples() { - Ok(count) => count, - Err(_) if short_flag.get() => { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - ShortRangeMarker { remaining: 0 }, - )); - } - Err(e) => return Err(e), - }; - let computed = arc.load(Ordering::Relaxed); - if computed == expected { - Ok(count) - } else { - Err(crc_mismatch_error( - ChecksumTarget::Stream, - computed, - expected, - )) - } + let count = self + .inner + .count_samples() + .map_err(|e| override_if_short(&short_flag, e))?; + check_stream_crc(arc.load(Ordering::Relaxed), expected)?; + Ok(count) } /// Decode assignments and pass each one to a callback by reference. From 841d16bdb76306ce9f2138513ed74c4c3c5f4833 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 09:20:22 -0600 Subject: [PATCH 126/221] More consistent progress bar semantics --- ben/src/json/graph/mlc.rs | 56 ++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index b25cf38..3887157 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -17,11 +17,16 @@ use std::time::Duration; /// /// Spinners auto-hide when stderr is not a terminal (e.g. under `cargo test` or when output is /// piped), so no config is needed for CI/test environments. +/// One recursion depth's spinner plus its running item counts. +struct DepthBar { + bar: ProgressBar, + total: usize, + done: usize, +} + struct MlcProgress { multi: MultiProgress, - bars: Vec, - totals: Vec, - dones: Vec, + depths: Vec, } impl MlcProgress { @@ -29,15 +34,13 @@ impl MlcProgress { fn new() -> Self { Self { multi: MultiProgress::new(), - bars: Vec::new(), - totals: Vec::new(), - dones: Vec::new(), + depths: Vec::new(), } } /// Make sure a bar exists for `depth`, creating any intermediate bars that don't exist yet. fn ensure_depth(&mut self, depth: usize) { - while self.bars.len() <= depth { + while self.depths.len() <= depth { let bar = self.multi.add(ProgressBar::new_spinner()); bar.set_style( ProgressStyle::default_spinner() @@ -45,10 +48,12 @@ impl MlcProgress { .unwrap(), ); bar.enable_steady_tick(Duration::from_millis(100)); - self.bars.push(bar); - self.totals.push(0); - self.dones.push(0); - let d = self.bars.len() - 1; + self.depths.push(DepthBar { + bar, + total: 0, + done: 0, + }); + let d = self.depths.len() - 1; self.refresh(d); } } @@ -56,26 +61,29 @@ impl MlcProgress { /// Record that `n` more items will be processed at `depth`. fn add_total(&mut self, depth: usize, n: usize) { self.ensure_depth(depth); - self.totals[depth] += n; + self.depths[depth].total += n; self.refresh(depth); } /// Record that `n` more items at `depth` have been finalized. fn add_done(&mut self, depth: usize, n: usize) { self.ensure_depth(depth); - self.dones[depth] += n; + self.depths[depth].done += n; self.refresh(depth); } fn refresh(&self, depth: usize) { - let done = self.dones[depth]; - let total = self.totals[depth]; - let pct = if total == 0 { 0 } else { done * 100 / total }; - self.bars[depth].set_message(format!( + let d = &self.depths[depth]; + let pct = if d.total == 0 { + 0 + } else { + d.done * 100 / d.total + }; + d.bar.set_message(format!( "MLC phase {}: {}/{} {} ({}%)", depth + 1, - done, - total, + d.done, + d.total, Self::unit_for_depth(depth), pct )); @@ -91,12 +99,12 @@ impl MlcProgress { /// Stop all spinners, leaving a final "complete" message on each. fn finish(&self) { - for (d, bar) in self.bars.iter().enumerate() { - bar.finish_with_message(format!( + for (depth, d) in self.depths.iter().enumerate() { + d.bar.finish_with_message(format!( "MLC phase {}: complete ({} {})", - d + 1, - self.totals[d], - Self::unit_for_depth(d) + depth + 1, + d.total, + Self::unit_for_depth(depth) )); } } From d78b27111ad8ece2a4a63e2fdc0910a59be8d36d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 09:21:07 -0600 Subject: [PATCH 127/221] clean up fast path in relabel --- ben/src/ops/relabel/mod.rs | 87 ++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index d4c2585..ca5c02d 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -92,12 +92,16 @@ impl RelabelOptions { } } + /// Set a concrete sample limit. Convenience form for call sites that hold a plain `usize`; for + /// an already-optional value (e.g. a parsed CLI argument) use [`Self::with_max_samples_opt`] + /// instead of unwrapping. pub fn with_max_samples(mut self, n: usize) -> Self { self.max_samples = Some(n); self } - /// Set the sample limit. `Some(n)` sets the limit; `None` clears it. + /// Set the sample limit from an optional value: `Some(n)` sets the limit, `None` clears it. Lets + /// CLI argument plumbing pass an `Option` straight through. pub fn with_max_samples_opt(mut self, n: Option) -> Self { self.max_samples = n; self @@ -198,8 +202,9 @@ pub fn convert_ben_file( /// True when the driver may take the byte-walking RLE fast path. /// -/// The predicate is one boolean computed once. See `risks` in the plan for why it is its own pure -/// function and gets a dedicated unit-test matrix. +/// Kept as a single pure predicate (rather than inlined into [`relabel_ben_file`]) so the exact +/// conditions under which the fast path is safe are stated in one place and can be exhaustively +/// covered by a dedicated unit-test matrix. fn can_use_first_seen_fast_path( transform: &RelabelTransform, target_variant: Option, @@ -231,58 +236,46 @@ where F: FnMut(&[u16]) -> io::Result>, { let mut decoder = BenStreamReader::from_ben(reader)?.silent(true); + let mut writer = BenStreamWriter::for_ben(writer, target_variant)?; let mut sample_number = 0usize; let spinner = Spinner::new("Relabeling line"); - match run_policy { - RunPolicy::CollapseAdjacentEqualAssignments => { - let mut encoder = BenStreamWriter::for_ben(writer, target_variant)?; - decoder.for_each_assignment(|assignment, count| { - if max_samples.is_some_and(|limit| sample_number >= limit) { - return Ok(false); - } - - let relabeled = transform(assignment)?; - let out_count = max_samples - .map(|limit| (limit - sample_number).min(count as usize)) - .unwrap_or(count as usize); - - for _ in 1..out_count { - encoder.write_assignment(relabeled.clone())?; - } - if out_count > 0 { - encoder.write_assignment(relabeled)?; - } - - sample_number += out_count; - spinner.set_count(sample_number as u64); - Ok(true) - })?; - encoder.finish()?; + // Both run policies share the same per-frame bookkeeping (sample limit, transform, output count, + // progress); they differ only in how the relabeled assignment is emitted. `out_count` is bounded + // by the input frame's `count` (a `u16`), so the `as u16` cast on the preserve path cannot + // truncate. + decoder.for_each_assignment(|assignment, count| { + if max_samples.is_some_and(|limit| sample_number >= limit) { + return Ok(false); } - RunPolicy::PreserveFrameBoundaries => { - let mut writer = BenStreamWriter::for_ben(writer, target_variant)?; - decoder.for_each_assignment(|assignment, count| { - if max_samples.is_some_and(|limit| sample_number >= limit) { - return Ok(false); - } - let relabeled = transform(assignment)?; - let out_count = max_samples - .map(|limit| (limit - sample_number).min(count as usize)) - .unwrap_or(count as usize); - - if out_count > 0 { + let relabeled = transform(assignment)?; + let out_count = max_samples + .map(|limit| (limit - sample_number).min(count as usize)) + .unwrap_or(count as usize); + + if out_count > 0 { + match run_policy { + // Emit `out_count` separate assignments; the writer merges adjacent equal ones into + // a single counted frame where the target variant can encode counts. + RunPolicy::CollapseAdjacentEqualAssignments => { + for _ in 1..out_count { + writer.write_assignment(relabeled.clone())?; + } + writer.write_assignment(relabeled)?; + } + // Emit one counted frame, never merging across input frame boundaries. + RunPolicy::PreserveFrameBoundaries => { writer.write_frame(relabeled, out_count as u16)?; } - - sample_number += out_count; - spinner.set_count(sample_number as u64); - Ok(true) - })?; - writer.finish()?; + } } - } + + sample_number += out_count; + spinner.set_count(sample_number as u64); + Ok(true) + })?; + writer.finish()?; Ok(()) } From 189b2d6a4648b5d343d83c34aad333ba7786db71 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 13:26:40 -0600 Subject: [PATCH 128/221] improve readability --- Taskfile.yml | 85 +++- ben-py/docs/user/using_pyben.ipynb | 89 ++-- ben-py/pyproject.toml | 1 + ben-py/src/decode/decoder.rs | 33 +- ben-py/tests/test_bundle.py | 183 ++++---- ben-py/tests/test_python_pipelines.py | 92 ++-- ben-py/uv.lock | 27 ++ ben/src/cli/ben/args.rs | 16 +- ben/src/cli/ben/bundle.rs | 1 - ben/src/cli/bendl/args.rs | 4 + ben/src/cli/bendl/extract.rs | 28 +- ben/src/cli/bendl/tests.rs | 55 +++ ben/src/cli/pcben/args.rs | 4 +- ben/src/codec/encode/tests.rs | 10 +- ben/src/codec/encode/twodelta.rs | 169 +++++--- ben/src/io/bundle/format.rs | 13 +- ben/src/io/bundle/reader.rs | 36 +- ben/src/io/bundle/tests/reader.rs | 5 +- ben/src/io/bundle/verify.rs | 14 +- ben/src/io/bundle/writer.rs | 49 ++- ben/src/io/writer/options.rs | 20 +- ben/src/io/writer/tests.rs | 8 +- ben/src/json/graph/mlc.rs | 208 +++++---- ben/src/json/graph/petxgraph/nx_convert.rs | 91 ++-- ben/src/ops/relabel/mod.rs | 12 +- ben/src/test_utils.rs | 287 ++++++++++++- ben/tests/test_bendl_append_proptest.rs | 464 +++++++++++++-------- ben/tests/test_cli.rs | 12 +- ben/tests/test_coverage.rs | 4 +- ben/tests/test_format_stability.rs | 31 +- ben/tests/test_ops_equivalence_proptest.rs | 26 +- ben/tests/test_stress_edges.rs | 212 +++++----- 32 files changed, 1554 insertions(+), 735 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 1d6d572..d956777 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -146,6 +146,51 @@ tasks: - task: test-rust - task: test-python + format-rust: + desc: Format Rust code + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo fmt --all + + format-python: + desc: Format ben-py Python code + silent: true + deps: + - ben-py-sync + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: ben-py + cmds: + - uv run ruff format . + + format: + desc: Format Rust and Python code + silent: true + cmds: + - task: format-rust + - task: format-python + + lint-python: + desc: Lint ben-py Python code + silent: true + deps: + - ben-py-sync + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: ben-py + cmds: + - uv run ruff check . + + lint: + desc: Run linters + silent: true + cmds: + - task: lint-python + coverage-ben: desc: Run Rust coverage for the ben crate silent: true @@ -165,6 +210,13 @@ tasks: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - cargo llvm-cov clean --workspace + - >- + bash -lc 'if [ -d "{{.ROOT_DIR}}/target" ]; then + find "{{.ROOT_DIR}}/target" -maxdepth 1 -name "*.profraw" -delete; + fi; + if [ -d "{{.COV_TARGET_DIR}}" ]; then + find "{{.COV_TARGET_DIR}}" -maxdepth 1 -name "*.profraw" -delete; + fi' - cargo llvm-cov -p ben-py --no-report - >- bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; @@ -179,7 +231,7 @@ tasks: -o /tmp/ben-py.profdata - >- {{.LLVM_BIN}}/llvm-cov report - {{.COV_TARGET_DIR}}/debug/libpyben_core.so + {{.ROOT_DIR}}/ben-py/binary_ensemble/_core.abi3.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='/.cargo/registry|/rustc/|^/mnt/.*/ben/src/' @@ -192,6 +244,13 @@ tasks: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - cargo llvm-cov clean --workspace + - >- + bash -lc 'if [ -d "{{.ROOT_DIR}}/target" ]; then + find "{{.ROOT_DIR}}/target" -maxdepth 1 -name "*.profraw" -delete; + fi; + if [ -d "{{.COV_TARGET_DIR}}" ]; then + find "{{.COV_TARGET_DIR}}" -maxdepth 1 -name "*.profraw" -delete; + fi' - cargo llvm-cov -p ben-py --no-report - >- bash -lc 'eval "$(cargo llvm-cov show-env --sh)"; @@ -205,7 +264,7 @@ tasks: {{.COV_TARGET_DIR}}/*.profraw -o /tmp/ben-py.profdata - >- - bash -lc '{{.LLVM_BIN}}/llvm-cov show {{.COV_TARGET_DIR}}/debug/libpyben_core.so + bash -lc '{{.LLVM_BIN}}/llvm-cov show {{.ROOT_DIR}}/ben-py/binary_ensemble/_core.abi3.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='\"'\"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'\"'\"' --format=html > /tmp/ben-py-coverage.html' @@ -222,22 +281,34 @@ tasks: cmds: - >- bash -lc ' + set -euo pipefail; ben_report_file=/tmp/ben-coverage-report.txt; ben_py_report_file=/tmp/ben-py-coverage-report.txt; + clean_cov_artifacts() { + cargo llvm-cov clean --workspace >/dev/null; + if [ -d "{{.ROOT_DIR}}/target" ]; then + find "{{.ROOT_DIR}}/target" -maxdepth 1 -name "*.profraw" -delete; + fi; + if [ -d "{{.COV_TARGET_DIR}}" ]; then + find "{{.COV_TARGET_DIR}}" -maxdepth 1 -name "*.profraw" -delete; + fi; + }; + clean_cov_artifacts; cargo llvm-cov --color always --package binary-ensemble --summary-only --ignore-filename-regex '"'"'(^|/)bin/'"'"' > "$ben_report_file"; ben_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_report_file")"; - cargo llvm-cov clean --workspace >/dev/null; + clean_cov_artifacts; cargo llvm-cov -p ben-py --no-report >/dev/null; eval "$(cargo llvm-cov show-env --sh)"; export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}" >/dev/null; - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} >/dev/null; + pytest_status=0; + uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} >/tmp/ben-py-pytest.log || pytest_status="$?"; cd "{{.ROOT_DIR}}"; {{.LLVM_BIN}}/llvm-profdata merge -sparse target/*.profraw {{.COV_TARGET_DIR}}/*.profraw -o /tmp/ben-py.profdata >/dev/null; - {{.LLVM_BIN}}/llvm-cov report {{.COV_TARGET_DIR}}/debug/libpyben_core.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$ben_py_report_file"; + {{.LLVM_BIN}}/llvm-cov report {{.ROOT_DIR}}/ben-py/binary_ensemble/_core.abi3.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$ben_py_report_file"; ben_py_total="$(awk '"'"'$1=="TOTAL"{print $10}'"'"' "$ben_py_report_file")"; printf "\n%s\n\n" "BEN COVERAGE"; @@ -247,6 +318,10 @@ tasks: printf "\n%-10s %-10s\n" "Target" "Lines"; printf "%-10s %-10s\n" "ben" "${ben_total:-n/a}"; printf "%-10s %-10s\n" "ben-py" "${ben_py_total:-n/a}"; + if [ "$pytest_status" -ne 0 ]; then + printf "\nben-py pytest failed with exit status %s; see /tmp/ben-py-pytest.log\n" "$pytest_status"; + exit "$pytest_status"; + fi; ' clean-linux: &clean-unix diff --git a/ben-py/docs/user/using_pyben.ipynb b/ben-py/docs/user/using_pyben.ipynb index 24902b5..1df43b7 100644 --- a/ben-py/docs/user/using_pyben.ipynb +++ b/ben-py/docs/user/using_pyben.ipynb @@ -92,6 +92,7 @@ " break\n", " f.write(buf)\n", "\n", + "\n", "url_base = \"https://raw.githubusercontent.com/peterrrock2/binary-ensemble/main/example\"\n", "for file_name in [\n", " \"CO_small.json\",\n", @@ -117,7 +118,9 @@ " print(f\"{file_name} already exists, skipping download.\")\n", "\n", "\n", - "url_base = \"https://raw.githubusercontent.com/mggg/GerryChain/refs/heads/main/docs/_static\"\n", + "url_base = (\n", + " \"https://raw.githubusercontent.com/mggg/GerryChain/refs/heads/main/docs/_static\"\n", + ")\n", "for file_name in [\n", " \"gerrymandria.json\",\n", "]:\n", @@ -148,7 +151,12 @@ "outputs": [], "source": [ "from binary_ensemble import (\n", - " compress_jsonl_to_ben, compress_jsonl_to_xben, compress_ben_to_xben, decompress_ben_to_jsonl, decompress_xben_to_jsonl, decompress_xben_to_ben\n", + " compress_jsonl_to_ben,\n", + " compress_jsonl_to_xben,\n", + " compress_ben_to_xben,\n", + " decompress_ben_to_jsonl,\n", + " decompress_xben_to_jsonl,\n", + " decompress_xben_to_ben,\n", ")" ] }, @@ -171,7 +179,7 @@ "outputs": [], "source": [ "compress_jsonl_to_ben(\n", - " in_file=\"example_data/small_example.jsonl\", \n", + " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", ")" ] @@ -202,7 +210,7 @@ "source": [ "try:\n", " compress_jsonl_to_ben(\n", - " in_file=\"example_data/small_example.jsonl\", \n", + " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " )\n", "except OSError as e:\n", @@ -248,7 +256,7 @@ "outputs": [], "source": [ "compress_jsonl_to_xben(\n", - " in_file=\"example_data/small_example.jsonl\", \n", + " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", " overwrite=True,\n", " variant=\"mkv_chain\",\n", @@ -257,7 +265,7 @@ ")\n", "\n", "compress_ben_to_xben(\n", - " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\", \n", + " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " out_file=\"example_data/small_example_jsonl_to_ben_to_xben.jsonl.xben\",\n", " overwrite=True,\n", " n_threads=1,\n", @@ -307,10 +315,10 @@ "outputs": [], "source": [ "decompress_ben_to_jsonl(\n", - " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\", \n", + " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " out_file=\"example_data/small_example_jsonl_to_ben_to_jsonl.jsonl\",\n", " overwrite=True,\n", - ") \n", + ")\n", "\n", "decompress_xben_to_jsonl(\n", " in_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", @@ -366,22 +374,18 @@ "\n", "graph = Graph.from_json(\"./example_data/gerrymandria.json\")\n", "\n", - "my_updaters = { \"population\": updaters.Tally(\"TOTPOP\"), }\n", + "my_updaters = {\n", + " \"population\": updaters.Tally(\"TOTPOP\"),\n", + "}\n", "\n", - "initial_partition = Partition(\n", - " graph,\n", - " assignment=\"district\",\n", - " updaters=my_updaters\n", - ")\n", + "initial_partition = Partition(graph, assignment=\"district\", updaters=my_updaters)\n", "\n", - "ideal_population = sum(initial_partition[\"population\"].values()) / len(initial_partition)\n", + "ideal_population = sum(initial_partition[\"population\"].values()) / len(\n", + " initial_partition\n", + ")\n", "\n", "proposal = partial(\n", - " recom,\n", - " pop_col=\"TOTPOP\",\n", - " pop_target=ideal_population,\n", - " epsilon=0.01,\n", - " node_repeats=2\n", + " recom, pop_col=\"TOTPOP\", pop_target=ideal_population, epsilon=0.01, node_repeats=2\n", ")\n", "\n", "recom_chain = MarkovChain(\n", @@ -389,7 +393,7 @@ " constraints=[contiguous],\n", " accept=accept.always_accept,\n", " initial_state=initial_partition,\n", - " total_steps=10_000\n", + " total_steps=10_000,\n", ")" ] }, @@ -435,9 +439,10 @@ " for partition in recom_chain.with_progress_bar():\n", " assignment_series = partition.assignment.to_series()\n", " # Assignment vectors must be lists of integers\n", - " ordered_assignment = assignment_series.loc[graph_node_order].astype(int).tolist() \n", - " encoder.write(ordered_assignment)\n", - "\n" + " ordered_assignment = (\n", + " assignment_series.loc[graph_node_order].astype(int).tolist()\n", + " )\n", + " encoder.write(ordered_assignment)" ] }, { @@ -483,13 +488,9 @@ "\n", "for i, assignment in enumerate(PyBenDecoder(\"example_data/gerrychain_10000.jsonl.ben\")):\n", " assignment = pd.Series(assignment, index=graph_node_order_series)\n", - " partition = Partition(\n", - " graph,\n", - " assignment=assignment,\n", - " updaters=my_updaters\n", - " )\n", + " partition = Partition(graph, assignment=assignment, updaters=my_updaters)\n", " if i % 1000 == 0:\n", - " print(f\"Sample: {i+1}, Cut Edge Count: {len(partition['cut_edges'])}\")" + " print(f\"Sample: {i + 1}, Cut Edge Count: {len(partition['cut_edges'])}\")" ] }, { @@ -539,8 +540,10 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_indices([1, 23978, 100000]):\n", - " print(assignment[:10])\n" + "for assignment in PyBenDecoder(\n", + " \"example_data/100k_CO_chain.jsonl.ben\"\n", + ").subsample_indices([1, 23978, 100000]):\n", + " print(assignment[:10])" ] }, { @@ -563,7 +566,9 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(1000,1005):\n", + "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(\n", + " 1000, 1005\n", + "):\n", " print(assignment[:10])" ] }, @@ -592,7 +597,9 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(10000):\n", + "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(\n", + " 10000\n", + "):\n", " print(assignment[:10])" ] }, @@ -630,8 +637,10 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_indices([1, 23978, 100000]):\n", - " print(assignment[:10])\n" + "for assignment in PyBenDecoder(\n", + " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", + ").subsample_indices([1, 23978, 100000]):\n", + " print(assignment[:10])" ] }, { @@ -662,7 +671,9 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_range(1000,1005):\n", + "for assignment in PyBenDecoder(\n", + " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", + ").subsample_range(1000, 1005):\n", " print(assignment[:10])" ] }, @@ -699,7 +710,9 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n", + "for assignment in PyBenDecoder(\n", + " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", + ").subsample_every(10000):\n", " print(assignment[:10])" ] }, diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index 821d0db..4bda97e 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -40,5 +40,6 @@ dev = [ "ipywidgets>=8.1.7", "maturin>=1.9.6", "pytest>=8.4.2", + "ruff>=0.11.0", "tqdm>=4.67.1", ] diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 2bb7a40..5e0cef7 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -21,9 +21,9 @@ pub struct PyBenDecoder { path: PathBuf, mode: DecoderMode, backend: DecoderBackend, - /// Lazily-constructed frame iterator. We defer construction so opening a bundle whose stream is - /// empty or truncated still succeeds — only methods that actually walk the stream need a live - /// iterator. + /// Lazily-constructed frame iterator. We defer construction so opening a bundle whose stream + /// is empty or truncated still succeeds — only methods that actually walk the stream need + /// a live iterator. iter: Option, current_assignment: Option>, remaining_count: u16, @@ -471,9 +471,14 @@ impl PyBenDecoder { /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be /// opened directly with `PyBenDecoder(out_path, mode=dec.assignment_format())`. Errors on plain /// streams. - #[pyo3(signature = (out_path, overwrite=false))] - #[pyo3(text_signature = "(self, out_path, overwrite=False)")] - fn extract_stream(&mut self, out_path: PathBuf, overwrite: bool) -> PyResult<()> { + #[pyo3(signature = (out_path, overwrite=false, allow_unfinalized=false))] + #[pyo3(text_signature = "(self, out_path, overwrite=False, allow_unfinalized=False)")] + fn extract_stream( + &mut self, + out_path: PathBuf, + overwrite: bool, + allow_unfinalized: bool, + ) -> PyResult<()> { let state = self.require_bundle_mut("extract_stream()")?; if out_path.exists() && !overwrite { return Err(PyIOError::new_err(format!( @@ -481,6 +486,18 @@ impl PyBenDecoder { out_path.display() ))); } + let mut stream = if allow_unfinalized && !state.reader.is_finalized() { + state + .reader + .assignment_stream_reader_unverified() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? + } else { + state + .reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? + }; + let out = if overwrite { OpenOptions::new() .write(true) @@ -496,10 +513,6 @@ impl PyBenDecoder { .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; let mut out = BufWriter::new(out); - let mut stream = state - .reader - .assignment_stream_reader() - .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; io::copy(&mut stream, &mut out) .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; out.flush() diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index ffb6f00..058a6f1 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -56,6 +56,19 @@ ASSET_FLAG_XZ = 1 << 1 ASSET_FLAG_CHECKSUM = 1 << 2 +HEADER_FLAG_STREAM_CHECKSUM = 1 << 0 + + +def _crc32c(data: bytes) -> int: + """Compute CRC32C (Castagnoli), matching the Rust bundle checksum contract.""" + crc = 0xFFFFFFFF + for byte in data: + crc ^= byte + for _ in range(8): + mask = -(crc & 1) + crc = (crc >> 1) ^ (0x82F63B78 & mask) + return (~crc) & 0xFFFFFFFF + # --------------------------------------------------------------------------- # Byte-level bundle construction @@ -75,26 +88,25 @@ def _pack_header( major_version: int = BENDL_MAJOR_VERSION, minor_version: int = BENDL_MINOR_VERSION, flags: int = 0, + stream_checksum: int = 0, reserved_0: int = 0, ) -> bytes: if len(magic) != 8: raise ValueError("magic must be 8 bytes") - return ( - magic - + struct.pack( - " bytes: return _xz(self.raw_payload) if self.compress else self.raw_payload - def flags(self) -> int: + def flags(self, *, has_checksum: bool) -> int: flags = 0 if self.is_json: flags |= ASSET_FLAG_JSON if self.compress: flags |= ASSET_FLAG_XZ - if self.checksum is not None: + if has_checksum: flags |= ASSET_FLAG_CHECKSUM return flags @@ -175,13 +187,17 @@ def build_bundle( complete: int = COMPLETE_YES, magic: bytes = BENDL_MAGIC, major_version: int = BENDL_MAJOR_VERSION, + checksums: bool = True, ) -> bytes: """Construct the bytes of a `.bendl` file from pieces. The layout is ``[header][asset payloads][stream][directory]``. This helper mirrors the writer's finalize path closely enough to produce bundles that the Rust reader accepts, while also exposing enough knobs - to generate deliberately broken bundles for negative tests. + to generate deliberately broken bundles for negative tests. By default + it mirrors the current writer and stores CRC32C checksums for finalized + streams and assets; pass ``checksums=False`` for foreign/no-checksum + fixtures. """ assets = list(assets) @@ -203,21 +219,30 @@ def build_bundle( directory_offset = len(buf) entries_bytes: List[bytes] = [] - for (offset, length, _enc), asset in zip(encoded_assets, assets): + for (offset, length, encoded), asset in zip(encoded_assets, assets): + checksum = asset.checksum + if checksums and checksum is None: + checksum = struct.pack(" None: f.write("\n") -def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: +def _ben_bytes_for( + samples: List[List[int]], tmp: Path, variant: str = "standard" +) -> bytes: """Produce real BEN bytes for ``samples`` via ``BenEncoder``.""" ben_path = tmp / "inner.ben" with BenEncoder( @@ -256,7 +285,9 @@ def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard return ben_path.read_bytes() -def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: +def _xben_bytes_for( + samples: List[List[int]], tmp: Path, variant: str = "standard" +) -> bytes: src = tmp / "src.jsonl" _write_jsonl(samples, src) out = tmp / "inner.xben" @@ -284,7 +315,9 @@ def test_module_exports_decoder_and_encoder() -> None: def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: rng = random.Random(4242) - samples = [[rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40)] + samples = [ + [rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40) + ] graph_json = b'{"nodes":[0,1,2,3],"edges":[[0,1],[1,2],[2,3]]}' metadata_json = b'{"note":"hello bundle","seed":4242}' @@ -336,7 +369,12 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: assert reader.assignment_format() == "ben" names = reader.asset_names() - assert names == ["metadata.json", "graph.json", "node_permutation_map.json", "notes.bin"] + assert names == [ + "metadata.json", + "graph.json", + "node_permutation_map.json", + "notes.bin", + ] assets = reader.list_assets() assert [a["name"] for a in assets] == names @@ -346,7 +384,7 @@ def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: assert "json" in by_name["graph.json"]["flags"] assert "xz" not in by_name["metadata.json"]["flags"] assert "json" in by_name["metadata.json"]["flags"] - assert by_name["notes.bin"]["flags"] == [] + assert by_name["notes.bin"]["flags"] == ["checksum"] # payload_offset must sit at or past the end of the header. for entry in assets: assert entry["offset"] >= HEADER_SIZE @@ -399,7 +437,9 @@ def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: assert list(BenDecoder(extracted, mode="xben")) == samples -def test_bundle_reader_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: +def test_bundle_reader_canonical_helpers_return_none_when_absent( + tmp_path: Path, +) -> None: samples = [[1, 2, 3]] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -675,7 +715,9 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: BenDecoder(path) -def test_open_rejects_declared_directory_len_with_trailing_bytes(tmp_path: Path) -> None: +def test_open_rejects_declared_directory_len_with_trailing_bytes( + tmp_path: Path, +) -> None: bundle = bytearray( build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), @@ -711,9 +753,11 @@ def test_incomplete_bundle_scans_stream_for_sample_count(tmp_path: Path) -> None assert reader.is_complete() is False assert reader.count_samples() == 1 assert reader.asset_names() == [] - # extract_stream should still write out bytes that decode as BEN. + # Verified extraction requires a finalized stream checksum. out = tmp_path / "extracted.ben" - reader.extract_stream(out) + with pytest.raises(Exception, match="unfinalized"): + reader.extract_stream(out) + reader.extract_stream(out, overwrite=True, allow_unfinalized=True) assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] @@ -840,9 +884,12 @@ def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) - except Exception: pass - # extract_stream should write exactly the partial byte sequence. + # Verified extraction refuses unfinalized streams because their checksum is + # not authoritative yet. extracted = tmp_path / "partial.ben" - reader.extract_stream(extracted) + with pytest.raises(Exception, match="unfinalized"): + reader.extract_stream(extracted) + reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) assert extracted.read_bytes() == partial # The extracted file opens as a BEN stream (banner is intact). @@ -871,7 +918,9 @@ def test_interrupted_ben_stream_inside_banner_fails_to_open_decoder( assert reader.is_complete() is False extracted = tmp_path / "head_cut.ben" - reader.extract_stream(extracted) + with pytest.raises(Exception, match="unfinalized"): + reader.extract_stream(extracted) + reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) # The decoder must reject a BEN file whose banner is incomplete. with pytest.raises(Exception, match="Failed to create BenDecoder"): BenDecoder(extracted, mode="ben") @@ -890,14 +939,18 @@ def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: reader.count_samples() extracted = tmp_path / "zero.ben" - reader.extract_stream(extracted) + with pytest.raises(Exception, match="unfinalized"): + reader.extract_stream(extracted) + reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) assert extracted.read_bytes() == b"" # A zero-byte .ben has no banner → decoder construction must fail. with pytest.raises(Exception, match="Failed to create BenDecoder"): BenDecoder(extracted, mode="ben") -def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) -> None: +def test_finalized_bundle_with_inflated_stream_len_survives_open( + tmp_path: Path, +) -> None: # Build a valid finalized bundle, then patch stream_len to a value # larger than the actual stream payload. This simulates the narrow # window where the writer updated the header but was killed before @@ -976,9 +1029,7 @@ def test_long_asset_name_near_u16_max(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name=long_name, payload=payload) - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name=long_name, payload=payload)], ) path = _write_bundle(tmp_path / "long.bendl", bundle) reader = BenDecoder(path) @@ -1027,6 +1078,7 @@ def test_list_assets_flag_fidelity(tmp_path: Path) -> None: stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, assets=assets, + checksums=False, ) path = _write_bundle(tmp_path / "flags.bendl", bundle) reader = BenDecoder(path) @@ -1065,9 +1117,9 @@ def test_read_asset_bytes_is_idempotent(tmp_path: Path) -> None: def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: - # 500 custom assets with rotating flags. This exercises directory + # A full directory with rotating flags. This exercises directory # scaling, offset bookkeeping, and name lookup on a non-trivial directory. - N = 500 + N = 256 assets: List[_Asset] = [] expected: List[Tuple[str, bytes]] = [] rng = random.Random(0xBEEF) @@ -1268,9 +1320,7 @@ def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: out = tmp_path / "with_graph.bendl" samples = [[1, 1, 2, 2], [1, 1, 3, 3]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: for a in samples: enc.write(a) @@ -1296,9 +1346,7 @@ def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: out = tmp_path / "with_graph_path.bendl" samples = [[0, 0, 1, 1]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=graph_path - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=graph_path) as enc: for a in samples: enc.write(a) @@ -1329,9 +1377,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: raw = json.dumps(SAMPLE_GRAPH).encode("utf-8") out = tmp_path / "via-bytes.bendl" samples = [[2, 2, 2, 2]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=raw - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=raw) as enc: for a in samples: enc.write(a) @@ -1343,9 +1389,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: buf = io.BytesIO(json.dumps(SAMPLE_GRAPH).encode("utf-8")) out = tmp_path / "via-bytesio.bendl" samples = [[1, 2, 1, 2]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=buf - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=buf) as enc: for a in samples: enc.write(a) @@ -1357,9 +1401,7 @@ def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: buf = io.StringIO(json.dumps(SAMPLE_GRAPH)) out = tmp_path / "via-stringio.bendl" samples = [[3, 3, 3, 3]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=buf - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=buf) as enc: for a in samples: enc.write(a) @@ -1371,9 +1413,7 @@ def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> No out = tmp_path / "full.bendl" rng = random.Random(0xCAFE) samples = [[rng.randint(1, 8) for _ in range(12)] for _ in range(15)] - with BenEncoder( - out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: for a in samples: enc.write(a) @@ -1402,9 +1442,7 @@ def test_pybenencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: # A ben_file_only=True output should be byte-identical to the legacy # plain-BEN path, so the header has no BENDL magic. out = tmp_path / "legacy.ben" - with BenEncoder( - out, overwrite=True, variant="standard", ben_file_only=True - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", ben_file_only=True) as enc: enc.write([1, 2, 3]) blob = out.read_bytes() assert not blob.startswith(BENDL_MAGIC) @@ -1528,7 +1566,7 @@ def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: by_name = {a["name"]: a for a in assets} assert "xz" in by_name["graph.json"]["flags"] assert "json" in by_name["graph.json"]["flags"] - assert by_name["notes.bin"]["flags"] == [] + assert by_name["notes.bin"]["flags"] == ["checksum"] # Raw and JSON asset access assert dec.read_asset_bytes("metadata.json") == metadata_json @@ -1553,9 +1591,7 @@ def test_pybendecoder_bundle_canonical_helpers_return_none_when_absent( bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples), - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name="custom.bin", payload=b"x") - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="custom.bin", payload=b"x")], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) dec = BenDecoder(path) @@ -1683,9 +1719,7 @@ def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> N # must round-trip through a single BenDecoder call — no need to # extract the stream first. out = tmp_path / "e2e.bendl" - with BenEncoder( - out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: for a in [[1, 2, 3], [2, 3, 4]]: enc.write(a) @@ -1943,9 +1977,7 @@ def test_pybendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> No # A bundle built with BenEncoder compresses the graph asset as xz; # read_graph() on BenDecoder must still return the decoded JSON. out = tmp_path / "xz_graph.bendl" - with BenEncoder( - out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH - ) as enc: + with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: enc.write([1, 2, 3]) dec = BenDecoder(out) # Spot-check that graph.json was actually stored compressed. @@ -2006,7 +2038,12 @@ def test_pybendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: _write_jsonl(samples, src) xben_path = tmp_path / "plain.xben" encode_jsonl_to_xben( - src, xben_path, overwrite=True, variant="standard", n_threads=1, compression_level=1 + src, + xben_path, + overwrite=True, + variant="standard", + n_threads=1, + compression_level=1, ) with pytest.warns(UserWarning): dec = BenDecoder(xben_path, mode="xben") diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index 5f8ffea..c687bda 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -303,9 +303,7 @@ def test_pybenencoder_roundtrip(tmp_path: Path) -> None: seq = gen_sequence_standard(rng, n_samples) ben = tmp_path / "out.ben" - with BenEncoder( - ben, overwrite=True, variant="standard", ben_file_only=True - ) as enc: + with BenEncoder(ben, overwrite=True, variant="standard", ben_file_only=True) as enc: for a in seq: enc.write(a) @@ -379,7 +377,9 @@ def test_compress_helpers_reject_unknown_variants(tmp_path: Path) -> None: encode_jsonl_to_ben(src, tmp_path / "out.ben", overwrite=True, variant="weird") with pytest.raises(ValueError, match="Unknown variant"): - encode_jsonl_to_xben(src, tmp_path / "out.xben", overwrite=True, variant="weird") + encode_jsonl_to_xben( + src, tmp_path / "out.xben", overwrite=True, variant="weird" + ) def test_module_exports_are_exposed() -> None: @@ -428,9 +428,7 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: out = tmp_path / "out.ben" - enc = BenEncoder( - out, overwrite=True, variant="standard", ben_file_only=True - ) + enc = BenEncoder(out, overwrite=True, variant="standard", ben_file_only=True) enc.write([1, 2, 3]) enc.close() enc.close() @@ -472,7 +470,9 @@ def test_pybenencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> N ) -def test_compress_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: Path) -> None: +def test_compress_helpers_reject_same_path_missing_input_and_bad_json( + tmp_path: Path, +) -> None: src = tmp_path / "src.jsonl" write_jsonl([[1, 1, 2]], src) @@ -663,7 +663,9 @@ def test_decoder_subsample_validations_and_warning_paths(tmp_path: Path) -> None assert list(BenDecoder(ben, mode="ben").subsample_every(2, 2)) == samples[1::2] -def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears(tmp_path: Path) -> None: +def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears( + tmp_path: Path, +) -> None: src = tmp_path / "src.jsonl" write_jsonl([[1], [2], [3]], src) @@ -900,9 +902,7 @@ def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: gpath = tmp_path / "g2.json" gpath.write_text(json.dumps(graph), encoding="utf-8") path = tmp_path / "str_path_graph.bendl" - with BenEncoder( - path, overwrite=True, variant="standard", graph=str(gpath) - ) as enc: + with BenEncoder(path, overwrite=True, variant="standard", graph=str(gpath)) as enc: enc.write([1]) assert BenDecoder(path).read_graph() == graph @@ -1083,7 +1083,9 @@ def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> N def test_pybendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: path = tmp_path / "plain.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: enc.write([1, 2]) dec = BenDecoder(path) @@ -1133,8 +1135,12 @@ def test_pybendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: xben_path = tmp_path / "samples.xben" encode_jsonl_to_xben( - src, xben_path, overwrite=True, variant="standard", - n_threads=1, compression_level=1, + src, + xben_path, + overwrite=True, + variant="standard", + n_threads=1, + compression_level=1, ) bendl_path = tmp_path / "xben_bundle.bendl" @@ -1154,8 +1160,12 @@ def test_pybendecoder_xben_plain_stream(tmp_path: Path) -> None: xben_path = tmp_path / "plain.xben" encode_jsonl_to_xben( - src, xben_path, overwrite=True, variant="standard", - n_threads=1, compression_level=1, + src, + xben_path, + overwrite=True, + variant="standard", + n_threads=1, + compression_level=1, ) dec = BenDecoder(xben_path, mode="xben") @@ -1249,7 +1259,9 @@ def test_pybendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1261,7 +1273,9 @@ def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_range.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1273,7 +1287,9 @@ def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5], [6]] path = tmp_path / "plain_every.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1290,7 +1306,9 @@ def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: samples = [[1], [2], [3]] path = tmp_path / "plain_len.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1302,7 +1320,9 @@ def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub_len.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1321,7 +1341,9 @@ def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "multi_iter.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1334,7 +1356,9 @@ def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] path = tmp_path / "plain_re_sub.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1353,7 +1377,9 @@ def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> No def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: samples = [[10, 20, 30], [40, 50, 60]] path = tmp_path / "ben_only.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1365,7 +1391,9 @@ def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: def test_pybenencoder_ben_file_only_mkv(tmp_path: Path) -> None: samples = [[1, 2], [1, 2], [3, 4]] path = tmp_path / "ben_mkv.ben" - with BenEncoder(path, overwrite=True, variant="mkv_chain", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="mkv_chain", ben_file_only=True + ) as enc: for a in samples: enc.write(a) @@ -1408,7 +1436,9 @@ def test_pybenencoder_bundle_with_metadata(tmp_path: Path) -> None: def test_pybendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: path = tmp_path / "plain_extract.ben" - with BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder( + path, overwrite=True, variant="standard", ben_file_only=True + ) as enc: enc.write([1, 2]) dec = BenDecoder(path, mode="ben") @@ -1443,8 +1473,12 @@ def test_decode_xben_to_jsonl_roundtrip(tmp_path: Path) -> None: xben = tmp_path / "out.xben" encode_jsonl_to_xben( - src, xben, overwrite=True, variant="standard", - n_threads=1, compression_level=1, + src, + xben, + overwrite=True, + variant="standard", + n_threads=1, + compression_level=1, ) out = tmp_path / "round.jsonl" diff --git a/ben-py/uv.lock b/ben-py/uv.lock index a92bd5d..173008d 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -111,6 +111,7 @@ dev = [ { name = "ipywidgets" }, { name = "maturin" }, { name = "pytest" }, + { name = "ruff" }, { name = "tqdm" }, ] @@ -134,6 +135,7 @@ dev = [ { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "maturin", specifier = ">=1.9.6" }, { name = "pytest", specifier = ">=8.4.2" }, + { name = "ruff", specifier = ">=0.11.0" }, { name = "tqdm", specifier = ">=4.67.1" }, ] @@ -2065,6 +2067,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/08/4349bdd5c64d9d193c360aa9db89adeee6f6682ab8825dca0a3f535f434f/rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:dc23e6820e3b40847e2f4a7726462ba0cf53089512abe9ee16318c366494c17a", size = 556523, upload-time = "2025-08-27T12:16:12.188Z" }, ] +[[package]] +name = "ruff" +version = "0.15.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/84/6f/a76f7d96e5c962f5b69cee865e49c15c1116897c01990faa8a57edb62e7f/ruff-0.15.15.tar.gz", hash = "sha256:b8dff018130b46d8e5bf0f926ef6b60cf871d6d5ae45fc9334e09632daa741d6", size = 4706985, upload-time = "2026-05-28T14:16:57.784Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/9d/3a45c05b8ab04b4705989de70a79008e27c8003296a0feaee9edc18dd7e9/ruff-0.15.15-py3-none-linux_armv6l.whl", hash = "sha256:cf93e5388f412e1b108b1f8b34a6e036b70fe8aff89393befad96fe48670311b", size = 10710652, upload-time = "2026-05-28T14:16:06.701Z" }, + { url = "https://files.pythonhosted.org/packages/05/66/da974431624bf3b49f6ee1f9543c02d929ff1cba78b0d5a79c38cf21f744/ruff-0.15.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ac5a646d1f6a7dadd5d50842dae2c1f9862ac887ef5d1b1375e02def791fde6e", size = 11096615, upload-time = "2026-05-28T14:16:23.313Z" }, + { url = "https://files.pythonhosted.org/packages/8c/09/7443452e5d290230a712103f2fdceeef7184f3ec99a2bd01c8be78aaceb5/ruff-0.15.15-py3-none-macosx_11_0_arm64.whl", hash = "sha256:77d955a431430c66f72dd94e379ad38a16daea3d25094872ac4edf9e797be530", size = 10436683, upload-time = "2026-05-28T14:16:40.974Z" }, + { url = "https://files.pythonhosted.org/packages/53/01/d330c26a57fa4f3943a14424904027428315b700fe4d14a84bb123a649e5/ruff-0.15.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7614ee79c69788cf6cedd568069ade9cecc22a1ad20494efe8d0c9ebb4b622d4", size = 10769064, upload-time = "2026-05-28T14:16:28.905Z" }, + { url = "https://files.pythonhosted.org/packages/1d/85/cc8770f8bdff541b1da8392d1634141fe4a0e3f4ee596605959b7906c27f/ruff-0.15.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3cdb1679e06a1f6b47bc384714ae96f6e2fb65ca441eb78c43d2ca554176ce1f", size = 10511987, upload-time = "2026-05-28T14:16:43.732Z" }, + { url = "https://files.pythonhosted.org/packages/7c/29/8c190c1472b63013583ba391f3342036e02010544c1270455ed8e519bdf3/ruff-0.15.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2728b93d7b23a603ea2c0ac6eb73d760bd38ec9de35f35fb41e18f7a3fee7622", size = 11275100, upload-time = "2026-05-28T14:16:55.244Z" }, + { url = "https://files.pythonhosted.org/packages/9f/6b/7e145ce2cc8e63d6834eca03d83a0e18d121def5c69f91b4cf4011ed4879/ruff-0.15.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be582fcc0db438902c7792b08d6ddf6c9b9e21addaa10092c2c741cfb09e5a45", size = 12176903, upload-time = "2026-05-28T14:16:14.368Z" }, + { url = "https://files.pythonhosted.org/packages/80/a3/d5974637f68e451f7fadf015cf3101d1cd7d8ba5027cffe0b9e3826ebe6b/ruff-0.15.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7aa77465b8ecaf1a27bea098d696f7fed5e1eccbd10b321b682d6de586ae5627", size = 11404550, upload-time = "2026-05-28T14:16:20.138Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1c/e6e5e568f22be4fb05d6244234aba384c06b451252453b821e1a529263cf/ruff-0.15.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48decfa11d740de4889de623be1463308346312f2409a56e24aa280c86162dc4", size = 11382027, upload-time = "2026-05-28T14:16:46.615Z" }, + { url = "https://files.pythonhosted.org/packages/1d/01/170921b49fcd2e8858825593f91cf7146c3e40a5c3e6df763e4bb0484dde/ruff-0.15.15-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a5015088452ca0081387063649ec67f06d3d1d6b8b936a1f836b5e9657ecd48c", size = 11366041, upload-time = "2026-05-28T14:16:26.247Z" }, + { url = "https://files.pythonhosted.org/packages/87/54/a7bad711d7de93254e15e06a4c375b89a03d18de45d3e5dcc86a4472fb1a/ruff-0.15.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:f5294aab6356c81600fcdea3a62bb1b924dfd5e91767c12318d3f68f86af57cd", size = 10741795, upload-time = "2026-05-28T14:16:17.11Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/38c075963668f8b41c6914ee0f6f318727fbe30ab9145cb29e6df464c5fa/ruff-0.15.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:db5bd4d802415cca656dc1616070b725952d6ae95eb5d4831e49fbd94a38f75f", size = 10511117, upload-time = "2026-05-28T14:16:31.767Z" }, + { url = "https://files.pythonhosted.org/packages/9d/96/6ff689e1f7e375d1d97075eca022f74c2bab59554a432fe4d2e6f091986a/ruff-0.15.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:587a6278ed42059191c1a466e490bd7930fb50bd2e255398bc29616c895a61cb", size = 10994867, upload-time = "2026-05-28T14:16:35.149Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c2/5dce0ab9f92a8d534fa62b9bf9caca3eddb8c1a81b616f5e195ada4f0d6e/ruff-0.15.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df0c1c084f5f4be9812f61518a45c440d3c30d69ce4bf6c5270e66d38338f02a", size = 11482101, upload-time = "2026-05-28T14:16:49.598Z" }, + { url = "https://files.pythonhosted.org/packages/b1/c0/1003b60edd697c649faf61f1a34094b1abb38fb3d1181e3f895781250a08/ruff-0.15.15-py3-none-win32.whl", hash = "sha256:29428ea79694afbe756d45fd59b36f22b6b020dc0443cf7de0173046236964b9", size = 10716774, upload-time = "2026-05-28T14:16:52.337Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/1269eddd6945a06c23f055ef7848886e37cf9d6a8bebb386a3115f01470c/ruff-0.15.15-py3-none-win_amd64.whl", hash = "sha256:8df0323902e15e24bc4bf246da830573d3cf3352bd0b9a164eab335d111ff4a4", size = 11868463, upload-time = "2026-05-28T14:16:11.333Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b2/920464c907b191e37469d477a1aa8bc048b8f36c4c1610dfa4ab87b39e18/ruff-0.15.15-py3-none-win_arm64.whl", hash = "sha256:3c8ceca6792f38196b8f589bc92eccd03eef286602da92e5dc05cc42ef6441b7", size = 11138498, upload-time = "2026-05-28T14:16:38.425Z" }, +] + [[package]] name = "scipy" version = "1.16.2" diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index bcb7462..9f07451 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -60,8 +60,8 @@ pub(super) struct Args { /// Input file to read from. #[arg()] pub input_file: Option, - /// Output file to write to. Optional. If not provided, the output file will be determined based - /// on the input file and the mode of operation. + /// Output file to write to. Optional. If not provided, the output file will be determined + /// based on the input file and the mode of operation. #[arg(short, long)] pub output_file: Option, /// The standard behaviour is to try and derive the output file name from the input file name. @@ -92,9 +92,9 @@ pub(super) struct Args { /// `--variant standard`. Ignored if `--variant` is set. #[arg(short = 'a', long)] pub save_all: bool, - /// BEN variant to use when encoding. Possible values: standard, mkvchain, twodelta. Defaults to - /// mkvchain if neither this nor --save-all is given. Takes precedence over --save-all when both - /// are provided. + /// BEN variant to use when encoding. Possible values: standard, mkvchain, twodelta. Defaults + /// to mkvchain if neither this nor --save-all is given. Takes precedence over --save-all + /// when both are provided. #[arg(short = 't', long, value_enum)] pub variant: Option, /// If the output file already exists, this flag will cause the program to overwrite it without @@ -125,9 +125,9 @@ pub(super) struct Args { #[arg(long)] pub chunk_size: Option, /// Per-block size in bytes for the multithreaded XZ encoder. liblzma needs a non-zero block - /// size to actually fan compression out across worker threads; smaller blocks scale parallelism - /// better at a slight compression-ratio cost. Defaults to 16 MiB when `--n-cpus > 1`, or 0 - /// (liblzma auto, ~192 MiB at preset 9) for single-thread runs. + /// size to actually fan compression out across worker threads; smaller blocks scale + /// parallelism better at a slight compression-ratio cost. Defaults to 16 MiB when + /// `--n-cpus > 1`, or 0 (liblzma auto, ~192 MiB at preset 9) for single-thread runs. #[arg(long)] pub xz_block_size: Option, /// Embed a graph JSON asset alongside the assignment stream and emit the result as a `.bendl` diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 05e5682..47eb6da 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -144,4 +144,3 @@ pub(super) fn run_xencode_bundle_with_graph( append_graph_asset(out_path, graph_path) } - diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs index c10acdf..f073ff3 100644 --- a/ben/src/cli/bendl/args.rs +++ b/ben/src/cli/bendl/args.rs @@ -102,6 +102,10 @@ pub(super) struct ExtractArgs { /// Extract the embedded assignment stream region verbatim. Mutually exclusive with `--asset`. #[arg(long, conflicts_with = "asset")] pub stream: bool, + /// Allow `--stream` extraction from an unfinalized bundle. This skips stream checksum + /// verification because an unfinalized stream checksum is not authoritative. + #[arg(long, requires = "stream")] + pub allow_unfinalized: bool, /// Name of the asset to extract (e.g. `graph.json`). If the asset is xz-compressed, the /// extracted file contains the decompressed bytes. #[arg(long)] diff --git a/ben/src/cli/bendl/extract.rs b/ben/src/cli/bendl/extract.rs index 52eff65..6deaf18 100644 --- a/ben/src/cli/bendl/extract.rs +++ b/ben/src/cli/bendl/extract.rs @@ -19,16 +19,22 @@ pub(super) fn run_extract(args: ExtractArgs) -> Result<(), String> { let mut reader = BendlReader::open(BufReader::new(file)) .map_err(|e| format!("failed to parse bundle header: {e}"))?; - let mut out = BufWriter::new( - File::create(&args.output) - .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, - ); - if args.stream { - let mut stream = reader - .assignment_stream_reader() - .map_err(|e| format!("failed to open stream region: {e}"))?; + let mut stream = if args.allow_unfinalized && !reader.is_finalized() { + reader + .assignment_stream_reader_unverified() + .map_err(|e| format!("failed to open stream region: {e}"))? + } else { + reader + .assignment_stream_reader() + .map_err(|e| format!("failed to open stream region: {e}"))? + }; + let mut out = BufWriter::new( + File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, + ); io::copy(&mut stream, &mut out).map_err(|e| format!("failed to copy stream bytes: {e}"))?; + out.flush().map_err(|e| format!("flush failed: {e}"))?; } else { // asset is Some — validated by the early return above. let name = args.asset.unwrap(); @@ -39,10 +45,14 @@ pub(super) fn run_extract(args: ExtractArgs) -> Result<(), String> { let mut asset = reader .asset_reader(&entry) .map_err(|e| format!("failed to open asset {name:?}: {e}"))?; + let mut out = BufWriter::new( + File::create(&args.output) + .map_err(|e| format!("failed to create {:?}: {e}", args.output))?, + ); io::copy(&mut asset, &mut out) .map_err(|e| format!("failed to copy asset {name:?} bytes: {e}"))?; + out.flush().map_err(|e| format!("flush failed: {e}"))?; } - out.flush().map_err(|e| format!("flush failed: {e}"))?; Ok(()) } diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index f799722..041fb8a 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -502,6 +502,61 @@ fn run_extract_stream_writes_raw_assignment_bytes() { let _ = std::fs::remove_file(&out); } +#[test] +fn run_extract_stream_allows_unfinalized_when_requested() { + use crate::io::bundle::format::{AssignmentFormat, BendlHeader, FINALIZED_NO, HEADER_SIZE}; + + let known_stream = b"STANDARD BEN FILE\x00partial stream bytes"; + let header = BendlHeader { + magic: crate::io::bundle::format::BENDL_MAGIC, + major_version: crate::io::bundle::format::BENDL_MAJOR_VERSION, + minor_version: crate::io::bundle::format::BENDL_MINOR_VERSION, + finalized: FINALIZED_NO, + assignment_format: AssignmentFormat::Ben.to_u8(), + alignment_padding: 0, + flags: 0, + stream_checksum: 0, + directory_offset: 0, + directory_len: 0, + stream_offset: HEADER_SIZE as u64, + stream_len: 0, + sample_count: -1, + }; + let mut buf = Vec::from(header.to_bytes()); + buf.extend_from_slice(known_stream); + + let bendl = unique_path("extract_unfinalized_stream.bendl"); + std::fs::write(&bendl, &buf).unwrap(); + let out = unique_path("extract_unfinalized_stream_out.bin"); + + let default_args = ExtractArgs::try_parse_from([ + "extract", + "--stream", + "--output", + out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + let err = run_extract(default_args).unwrap_err(); + assert!(err.contains("unfinalized"), "unexpected error: {err}"); + assert!(!out.exists(), "failed extraction must not create output"); + + let allow_args = ExtractArgs::try_parse_from([ + "extract", + "--stream", + "--allow-unfinalized", + "--output", + out.to_str().unwrap(), + bendl.to_str().unwrap(), + ]) + .unwrap(); + run_extract(allow_args).unwrap(); + assert_eq!(std::fs::read(&out).unwrap(), known_stream); + + let _ = std::fs::remove_file(&bendl); + let _ = std::fs::remove_file(&out); +} + #[test] fn run_extract_asset_with_unknown_name_errors_cleanly() { // Pin the no-asset-named-X branch of extract.rs — find_asset_by_name returns None and the diff --git a/ben/src/cli/pcben/args.rs b/ben/src/cli/pcben/args.rs index 7c11fed..caa6de0 100644 --- a/ben/src/cli/pcben/args.rs +++ b/ben/src/cli/pcben/args.rs @@ -27,8 +27,8 @@ pub(super) struct Args { /// Input file to read from. #[arg(short, long)] pub(super) input_file: Option, - /// Output file to write to. Optional. If not provided, the output file will be determined based - /// on the input file and the mode of operation. + /// Output file to write to. Optional. If not provided, the output file will be determined + /// based on the input file and the mode of operation. #[arg(short, long)] pub(super) output_file: Option, /// If the output file already exists, this flag will cause the program to overwrite it without diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 9caa5a3..4d7af9c 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1390,8 +1390,14 @@ fn twodelta_round_trip_with_label_zero_pairs() { use std::io::Cursor; let fixtures = vec![ - ("pair (0, 1)", vec![vec![0u16, 0, 1, 1], vec![0u16, 1, 0, 1]]), - ("pair (1, 0)", vec![vec![1u16, 1, 0, 0], vec![1u16, 0, 1, 0]]), + ( + "pair (0, 1)", + vec![vec![0u16, 0, 1, 1], vec![0u16, 1, 0, 1]], + ), + ( + "pair (1, 0)", + vec![vec![1u16, 1, 0, 0], vec![1u16, 0, 1, 0]], + ), ]; for (label, assignments) in fixtures { diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index dab3816..504d2be 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -103,12 +103,59 @@ pub(crate) fn encode_twodelta_frame_with_hint( ), _ => construct_twodelta_frame_from_scratch(previous_assignment, new_assignment, count), } +} + +/// A district pair ordered so that the first element is the district occupying the **first pair +/// position in the current assignment** — i.e. the district whose run is emitted first. +/// +/// This ordering is not mere numeric or positional sorting of the two ids; it is the +/// round-trip-determinism invariant TwoDelta depends on. The decoder replays the alternating runs +/// starting from that same first position, so a pair ordered the other way would silently decode to +/// a different assignment. Constructing the pair only through [`Self::from_first_pair_position`] +/// makes that broken ordering unrepresentable. +#[derive(Clone, Copy)] +struct FirstRunDistrictPair { + first_run_district: u16, + second_run_district: u16, +} + +impl FirstRunDistrictPair { + /// Order `pair` so that the first-run district is whichever id the current assignment places at + /// `first_pair_pos` (the lowest position held by either id). `current[first_pair_pos]` must be + /// one of the two ids in `pair`. + fn from_first_pair_position(pair: (u16, u16), first_pair_pos: usize, current: &[u16]) -> Self { + if current[first_pair_pos] == pair.0 { + FirstRunDistrictPair { + first_run_district: pair.0, + second_run_district: pair.1, + } + } else { + FirstRunDistrictPair { + first_run_district: pair.1, + second_run_district: pair.0, + } + } + } + + /// The district whose run is emitted first (it holds the lowest pair position in `current`). + fn first_run_district(&self) -> u16 { + self.first_run_district + } + + /// The other district in the pair. + fn second_run_district(&self) -> u16 { + self.second_run_district + } - // Ok(BenEncodeFrame::from_run_lengths(ordered_pair, run_lengths)) + /// The ordered `(first_run_district, second_run_district)` tuple expected by + /// [`BenEncodeFrame::from_run_lengths`]. + fn as_ordered_pair(&self) -> (u16, u16) { + (self.first_run_district, self.second_run_district) + } } -/// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return the -/// pair ordered so that `pair.0` occupies a lower index than `pair.1`. +/// Validate that `previous_masks` contains non-empty entries for both ids in `pair` and return them +/// as a [`FirstRunDistrictPair`] ordered by their first position in `current`. /// /// Ordering by first position ensures that the run-length sequence produced during encoding always /// begins with the id whose positions come first in the assignment vector, which is required for @@ -121,14 +168,13 @@ pub(crate) fn encode_twodelta_frame_with_hint( /// /// # Returns /// -/// The pair reordered so that `pair.0` has a smaller first position in the current vector than -/// `pair.1`, or an error if either id is absent from `previous_masks` or has an empty position -/// list. +/// A [`FirstRunDistrictPair`] whose first-run district has a smaller first position in `current`, +/// or an error if either id is absent from `previous_masks` or has an empty position list. fn validate_masks_and_order_pairs_for_twodelta( pair: (u16, u16), masks: &HashMap>, current: &[u16], -) -> Result<(u16, u16)> { +) -> Result { let mask_a = match masks.get(&pair.0) { Some(m) => m, None => return Err(Error::from(EncodeError::TwoDeltaMissingMask { id: pair.0 })), @@ -139,23 +185,23 @@ fn validate_masks_and_order_pairs_for_twodelta( None => return Err(Error::from(EncodeError::TwoDeltaMissingMask { id: pair.1 })), }; - if mask_a.len() == 0 { + if mask_a.is_empty() { return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.0 })); - }; + } - if mask_b.len() == 0 { + if mask_b.is_empty() { return Err(Error::from(EncodeError::TwoDeltaEmptyMask { id: pair.1 })); - }; - - // Order so that pair.0 is the value the new assignment places at the first pair position (the - // lowest index held by either mask). This guarantees run_lengths[0] >= 1 with no leading-zero - // sentinel. - let first_pos = mask_a[0].min(mask_b[0]); - if current[first_pos] == pair.0 { - Ok((pair.0, pair.1)) - } else { - Ok((pair.1, pair.0)) } + + // Order so that the first-run district is the value the new assignment places at the first pair + // position (the lowest index held by either mask). This guarantees run_lengths[0] >= 1 with no + // leading-zero sentinel. + let first_pair_pos = mask_a[0].min(mask_b[0]); + Ok(FirstRunDistrictPair::from_first_pair_position( + pair, + first_pair_pos, + current, + )) } /// Build a TwoDelta frame using both a known pair and pre-computed position masks. @@ -204,11 +250,11 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( }; let mask_a = previous_masks - .get(&pair.0) - .expect("Failed to get mask for pair.0 after validation"); + .get(&pair.first_run_district()) + .expect("Failed to get mask for first-run district after validation"); let mask_b = previous_masks - .get(&pair.1) - .expect("Failed to get mask for pair.1 after validation"); + .get(&pair.second_run_district()) + .expect("Failed to get mask for second-run district after validation"); let new_capacity = mask_a.len() + mask_b.len(); let mut run_lengths = Vec::with_capacity(new_capacity); @@ -217,13 +263,13 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let mut new_mask_b = Vec::with_capacity(new_capacity); let (mut i, mut j) = (0usize, 0usize); - // pair.0 is guaranteed to equal current[first_pos] by + // The first-run district is guaranteed to equal current[first_pair_pos] by // validate_masks_and_order_pairs_for_twodelta, so the first iteration always hits the - // `new_val == run_value` branch and increments the count — no special-case initialization - // needed. - let mut run_value = pair.0; - let mut current_mask_count = 0u16; - let mut found_assignment_change = false; + // `new_val == active_district` branch and increments the run length — no special-case + // initialization needed. + let mut active_district = pair.first_run_district(); + let mut active_run_length = 0u16; + let mut saw_changed_assignment_position = false; while i < mask_a.len() || j < mask_b.len() { // Pick the next position from whichever mask is lower. @@ -238,56 +284,61 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let previous_value = previous[idx]; let new_val = current[idx]; - if previous_value != pair.0 && previous_value != pair.1 { + if previous_value != pair.first_run_district() && previous_value != pair.second_run_district() + { return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { pos: idx, actual: previous_value, - a: pair.0, - b: pair.1, + a: pair.first_run_district(), + b: pair.second_run_district(), })); } - if new_val != pair.0 && new_val != pair.1 { + if new_val != pair.first_run_district() && new_val != pair.second_run_district() { return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { pos: idx, actual: new_val, - a: pair.0, - b: pair.1, + a: pair.first_run_district(), + b: pair.second_run_district(), })); } if new_val != previous_value { - found_assignment_change = true; + saw_changed_assignment_position = true; } - if new_val == run_value { - if current_mask_count == u16::MAX { + if new_val == active_district { + if active_run_length == u16::MAX { return Err(Error::new( ErrorKind::InvalidInput, "TwoDelta run length exceeds u16::MAX", )); } - current_mask_count += 1; + active_run_length += 1; } else { - run_lengths.push(current_mask_count); - run_value = new_val; - current_mask_count = 1; + run_lengths.push(active_run_length); + active_district = new_val; + active_run_length = 1; } - if new_val == pair.0 { + if new_val == pair.first_run_district() { new_mask_a.push(idx); } else { new_mask_b.push(idx); } } - run_lengths.push(current_mask_count); + run_lengths.push(active_run_length); // Special error that signals that we can reuse the last TwoDelta frame - if !found_assignment_change { + if !saw_changed_assignment_position { return Err(Error::from(EncodeError::TwoDeltaIdentical)); } - previous_masks.insert(pair.0, new_mask_a); - previous_masks.insert(pair.1, new_mask_b); - Ok(BenEncodeFrame::from_run_lengths(pair, run_lengths, count)) + previous_masks.insert(pair.first_run_district(), new_mask_a); + previous_masks.insert(pair.second_run_district(), new_mask_b); + Ok(BenEncodeFrame::from_run_lengths( + pair.as_ordered_pair(), + run_lengths, + count, + )) } /// Build a TwoDelta frame using only pre-computed position masks, inferring the pair from the first @@ -366,8 +417,8 @@ fn construct_twodelta_frame_from_scratch( let mut enc_pair = (0u16, 0u16); let mut enc_pair_known = false; let mut run_lengths: Vec = Vec::new(); - let mut run_value = 0u16; - let mut run_count = 0u16; + let mut active_district = 0u16; + let mut active_run_length = 0u16; for (&prev_val, &curr_val) in previous.iter().zip(current.iter()) { if prev_val == a || prev_val == b { @@ -376,27 +427,27 @@ fn construct_twodelta_frame_from_scratch( } if !enc_pair_known { enc_pair = (curr_val, if curr_val == a { b } else { a }); - run_value = enc_pair.0; + active_district = enc_pair.0; enc_pair_known = true; } - if curr_val == run_value { - if run_count == u16::MAX { + if curr_val == active_district { + if active_run_length == u16::MAX { return Err(Error::new( ErrorKind::InvalidInput, "TwoDelta run length exceeds u16::MAX", )); } - run_count += 1; + active_run_length += 1; } else { - run_lengths.push(run_count); - run_value = curr_val; - run_count = 1; + run_lengths.push(active_run_length); + active_district = curr_val; + active_run_length = 1; } } else if prev_val != curr_val { return Err(Error::from(EncodeError::TwoDeltaTooManyIds)); } } - run_lengths.push(run_count); + run_lengths.push(active_run_length); Ok(BenEncodeFrame::from_run_lengths( enc_pair, diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 7b14869..749240a 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -334,9 +334,9 @@ pub const DIRECTORY_ENTRY_HEADER_SIZE: usize = 28; /// push against it. /// /// [`read_directory`] rejects an inflated `entry_count` against this bound **before** allocating, -/// so a corrupt or adversarial header cannot trigger a multi-gigabyte reservation; [`encode_directory`] -/// enforces the same bound on the write side so the library never produces a bundle it would refuse -/// to read back. +/// so a corrupt or adversarial header cannot trigger a multi-gigabyte reservation; +/// [`encode_directory`] enforces the same bound on the write side so the library never produces a +/// bundle it would refuse to read back. pub const MAX_DIRECTORY_ENTRIES: u32 = 256; /// In-memory representation of a single directory entry. @@ -552,11 +552,12 @@ pub enum BendlFormatError { remaining: u64, }, - /// A directory declared more entries than [`MAX_DIRECTORY_ENTRIES`] allows. Rejected before any - /// allocation so an inflated on-disk count cannot trigger a huge reservation. + /// A directory declared more entries than [`MAX_DIRECTORY_ENTRIES`] allows. Rejected before + /// any allocation so an inflated on-disk count cannot trigger a huge reservation. #[error("directory declares {count} entries, which exceeds the maximum of {max}")] TooManyDirectoryEntries { - /// The entry count declared in the directory header (read path) or requested by the writer. + /// The entry count declared in the directory header (read path) or requested by the + /// writer. count: u64, /// The maximum permitted entry count ([`MAX_DIRECTORY_ENTRIES`]). max: u32, diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 7846385..634f116 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -279,6 +279,24 @@ impl BendlReader { Ok(()) } + /// Seek to an asset's `payload_offset` and return a reader bounded to its declared + /// `payload_len`, paired with the [`ShortRangeFlag`] that reader will set if the backing range + /// is shorter than declared. + /// + /// This is the raw on-disk byte range shared by every asset read mode (verified/unverified, + /// decoded/raw); the codec and CRC layering is applied by each caller on top of the returned + /// range. It is scoped to `entry`-based reads and intentionally does not cover the + /// assignment-stream readers, which seek to a separately computed `(offset, len)`. + fn open_asset_payload_range( + &mut self, + entry: &BendlDirectoryEntry, + ) -> io::Result<(ExactLen<&mut R>, ShortRangeFlag)> { + self.inner.seek(SeekFrom::Start(entry.payload_offset))?; + let short_flag = ShortRangeFlag::new(); + let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); + Ok((raw, short_flag)) + } + /// Read the fully-decoded bytes of an asset by directory entry, verifying its CRC32C before /// returning. /// @@ -345,9 +363,7 @@ impl BendlReader { }; let target = ChecksumTarget::Asset(entry.name.clone()); - self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - let short_flag = ShortRangeFlag::new(); - let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); + let (raw, short_flag) = self.open_asset_payload_range(entry)?; // The CRC tee always sits at the raw on-disk layer (over the compressed bytes for xz // assets, so verification happens before decompression). For xz assets the decoder sits @@ -378,9 +394,7 @@ impl BendlReader { &'a mut self, entry: &BendlDirectoryEntry, ) -> Result, BendlReadError> { - self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - let short_flag = ShortRangeFlag::new(); - let raw = ExactLen::new(&mut self.inner, entry.payload_len, short_flag.clone()); + let (raw, short_flag) = self.open_asset_payload_range(entry)?; if entry.asset_flags & ASSET_FLAG_XZ != 0 { // Wrap the decoder so that if xz reports a runtime error while the underlying // ExactLen has flagged a short read, the surface is a short-range UnexpectedEof @@ -405,12 +419,10 @@ impl BendlReader { &'a mut self, entry: &BendlDirectoryEntry, ) -> Result, BendlReadError> { - self.inner.seek(SeekFrom::Start(entry.payload_offset))?; - Ok(Box::new(ExactLen::new( - &mut self.inner, - entry.payload_len, - ShortRangeFlag::new(), - ))) + // No codec or CRC layer sits above this range, so the short-range flag has nothing to + // observe it — a short read surfaces directly as the ExactLen's own marker. + let (raw, _short_flag) = self.open_asset_payload_range(entry)?; + Ok(Box::new(raw)) } /// Verify the stored CRC32C of a single asset without returning any decoded bytes. diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 76b3a8e..6199e29 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -1482,7 +1482,10 @@ fn asset_payload_reader_unverified_returns_compressed_bytes_for_xz_asset() { let mut out = Vec::new(); payload_reader.read_to_end(&mut out).unwrap(); drop(payload_reader); - assert_eq!(out, compressed, "payload reader returns raw compressed bytes"); + assert_eq!( + out, compressed, + "payload reader returns raw compressed bytes" + ); assert_ne!(out, raw, "payload reader did NOT decompress"); // For an uncompressed asset, the payload reader and the decoded unverified reader produce the diff --git a/ben/src/io/bundle/verify.rs b/ben/src/io/bundle/verify.rs index d5d566a..b7d63e5 100644 --- a/ben/src/io/bundle/verify.rs +++ b/ben/src/io/bundle/verify.rs @@ -374,18 +374,18 @@ impl Read for VerifyingReader { // Verified assignment-stream reader // ===================================================================== -/// CRC accumulator that shares its running hash via an `Arc`. Used as the source reader -/// for [`BendlVerifiedStreamReader`]: the `Arc` lets the outer wrapper read the final hash after a -/// consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. +/// CRC32C accumulator that shares its running hash via an `Arc`. Used as the source +/// reader for [`BendlVerifiedStreamReader`]: the `Arc` lets the outer wrapper read the final hash +/// after a consuming inner method (e.g. `count_samples`) moves ownership away from the wrapper. /// /// Unlike [`CrcTeeReader`], this type never substitutes a checksum error for raw EOF — it is always /// the outer [`BendlVerifiedStreamReader`] that decides when and whether to check. -pub(crate) struct ArcHasher { +pub(crate) struct SharedCrc32cAccumulatorReader { inner: R, state: Arc, } -impl Read for ArcHasher { +impl Read for SharedCrc32cAccumulatorReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { let n = self.inner.read(buf)?; if n > 0 { @@ -412,7 +412,7 @@ enum StreamVerifyState { } /// Source reader stack underneath a [`BendlVerifiedStreamReader`]. -pub(crate) type VerifiedStreamSource<'a, R> = ArcHasher>; +pub(crate) type VerifiedStreamSource<'a, R> = SharedCrc32cAccumulatorReader>; /// Verified decoded assignment reader returned by /// [`super::reader::BendlReader::open_assignment_reader`]. @@ -447,7 +447,7 @@ impl<'a, R: Read + Seek> BendlVerifiedStreamReader<'a, R> { -> Result>, BendlReadError>, ) -> Result { let arc_hasher = Arc::new(AtomicU32::new(0)); - let source = ArcHasher { + let source = SharedCrc32cAccumulatorReader { inner: raw, state: Arc::clone(&arc_hasher), }; diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index fbf4d0f..c9f2c46 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -583,6 +583,17 @@ struct PendingAsset { is_json: bool, } +/// A pending asset whose payload has been encoded in memory and is ready to be written to disk. +/// +/// One element per prepared asset — this is the output of the pure, in-memory compression phase of +/// [`BendlAppender::commit`], carrying everything the subsequent file-mutation phase needs to write +/// the payload and its directory entry. +struct PreparedAppendAsset { + asset_type: u16, + asset_name: String, + encoded_asset: EncodedAsset, +} + impl BendlAppender { /// Open a finalized bundle for append. /// @@ -694,6 +705,26 @@ impl BendlAppender { self.add_asset(ASSET_TYPE_CUSTOM, name, payload, options) } + /// Phase 1 of [`Self::commit`]: drain the pending queue and encode each payload through the + /// shared encode path, entirely in memory. + /// + /// This is pure with respect to the file — it has no ordering constraint against the + /// append-only mutation in `commit`, so a failure here returns before any byte is written and + /// leaves the bundle untouched. + fn prepare_pending_assets(&mut self) -> Result, BendlWriteError> { + let mut prepared = Vec::with_capacity(self.pending.len()); + for asset in self.pending.drain(..) { + let encoded_asset = + encode_asset_payload(asset.raw_payload, asset.compress, asset.is_json)?; + prepared.push(PreparedAppendAsset { + asset_type: asset.asset_type, + asset_name: asset.name, + encoded_asset, + }); + } + Ok(prepared) + } + /// Commit all pending appends. /// /// This compresses any buffered payloads that need it (entirely in memory), then performs the @@ -707,14 +738,9 @@ impl BendlAppender { return Ok(self.inner); } - // Phase 1: compress any pending payloads through the shared encode path and pair each with - // its identifying name/type. Done entirely in memory so failures here leave the file - // untouched. - let mut encoded: Vec<(u16, String, EncodedAsset)> = Vec::with_capacity(self.pending.len()); - for asset in self.pending.drain(..) { - let enc = encode_asset_payload(asset.raw_payload, asset.compress, asset.is_json)?; - encoded.push((asset.asset_type, asset.name, enc)); - } + // Phase 1: compress any pending payloads in memory. This has no ordering constraint against + // the file mutation below — a failure here leaves the file untouched. + let encoded = self.prepare_pending_assets()?; // Phase 2: append-only file mutation. Until the final header patch, the old header still // points at the old directory, which remains intact. A crash before the patch leaves the @@ -737,13 +763,14 @@ impl BendlAppender { Vec::with_capacity(self.existing_entries.len() + encoded.len()); new_entries.extend(self.existing_entries.iter().cloned()); - for (asset_type, name, enc) in encoded { + for prepared in encoded { + let enc = prepared.encoded_asset; let payload_offset = self.inner.seek(SeekFrom::Current(0))?; self.inner.write_all(&enc.bytes)?; new_entries.push(BendlDirectoryEntry { - asset_type, + asset_type: prepared.asset_type, asset_flags: enc.asset_flags, - name, + name: prepared.asset_name, payload_offset, payload_len: enc.bytes.len() as u64, checksum: Some(enc.checksum), diff --git a/ben/src/io/writer/options.rs b/ben/src/io/writer/options.rs index 34d8741..571f1e1 100644 --- a/ben/src/io/writer/options.rs +++ b/ben/src/io/writer/options.rs @@ -93,16 +93,22 @@ mod tests { #[test] fn with_compression_level_clamps_to_nine() { assert_eq!( - XzEncodeOptions::new().with_compression_level(99).compression_level, + XzEncodeOptions::new() + .with_compression_level(99) + .compression_level, Some(9) ); // Level 0 (store-mode) is a legitimate setting and must be preserved as-is. assert_eq!( - XzEncodeOptions::new().with_compression_level(0).compression_level, + XzEncodeOptions::new() + .with_compression_level(0) + .compression_level, Some(0) ); assert_eq!( - XzEncodeOptions::new().with_compression_level(6).compression_level, + XzEncodeOptions::new() + .with_compression_level(6) + .compression_level, Some(6) ); } @@ -116,11 +122,15 @@ mod tests { #[test] fn with_twodelta_chunk_size_clamps_zero_to_one() { assert_eq!( - XzEncodeOptions::new().with_twodelta_chunk_size(0).twodelta_chunk_size, + XzEncodeOptions::new() + .with_twodelta_chunk_size(0) + .twodelta_chunk_size, 1 ); assert_eq!( - XzEncodeOptions::new().with_twodelta_chunk_size(7).twodelta_chunk_size, + XzEncodeOptions::new() + .with_twodelta_chunk_size(7) + .twodelta_chunk_size, 7 ); } diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index fe9cb62..0b64cf0 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -279,7 +279,13 @@ fn writer_twodelta_chunk_boundary_off_by_one_grid() { continue; } let assignments: Vec> = (0..n_samples) - .map(|i| if i % 2 == 0 { anchor.clone() } else { delta.clone() }) + .map(|i| { + if i % 2 == 0 { + anchor.clone() + } else { + delta.clone() + } + }) .collect(); let mut xben = Vec::new(); diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index 3887157..ca4de07 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -8,6 +8,27 @@ use std::cmp::Reverse; use std::collections::HashSet; use std::time::Duration; +/// A single cluster: the original nodes that get contracted into one coarse node. +type Cluster = Vec; + +/// A partition of a component into clusters. Every node in the component appears in exactly one. +type ClusterPartition = Vec; + +/// The coarse graph produced by contracting each cluster to a node; always undirected. +type CoarseGraph = Graph<(), (), petgraph::Undirected>; + +/// Per-node tie-breaking keys, indexed by `NodeIndex::index()`. At depth 0 these are the original +/// node identities; at deeper levels each entry is the minimum original key in a coarse node's +/// cluster, so the same vector type carries whatever level's keys the recursion is working on. +type TieBreakKeys = Vec; + +/// One recursion depth's spinner plus its running item counts. +struct DepthBar { + bar: ProgressBar, + total: usize, + done: usize, +} + /// Per-phase progress tracker for MLC, with one spinner line per recursion depth. /// /// Phase 1 (depth 0) processes the original nodes; phase 2 processes the level-1 clusters produced @@ -17,13 +38,6 @@ use std::time::Duration; /// /// Spinners auto-hide when stderr is not a terminal (e.g. under `cargo test` or when output is /// piped), so no config is needed for CI/test environments. -/// One recursion depth's spinner plus its running item counts. -struct DepthBar { - bar: ProgressBar, - total: usize, - done: usize, -} - struct MlcProgress { multi: MultiProgress, depths: Vec, @@ -53,39 +67,39 @@ impl MlcProgress { total: 0, done: 0, }); - let d = self.depths.len() - 1; - self.refresh(d); + let new_depth = self.depths.len() - 1; + self.refresh(new_depth); } } - /// Record that `n` more items will be processed at `depth`. - fn add_total(&mut self, depth: usize, n: usize) { + /// Record that `additional_items` more items will be processed at `depth`. + fn add_total(&mut self, depth: usize, additional_items: usize) { self.ensure_depth(depth); - self.depths[depth].total += n; + self.depths[depth].total += additional_items; self.refresh(depth); } - /// Record that `n` more items at `depth` have been finalized. - fn add_done(&mut self, depth: usize, n: usize) { + /// Record that `additional_items` more items at `depth` have been finalized. + fn add_done(&mut self, depth: usize, additional_items: usize) { self.ensure_depth(depth); - self.depths[depth].done += n; + self.depths[depth].done += additional_items; self.refresh(depth); } fn refresh(&self, depth: usize) { - let d = &self.depths[depth]; - let pct = if d.total == 0 { + let depth_bar = &self.depths[depth]; + let percent_complete = if depth_bar.total == 0 { 0 } else { - d.done * 100 / d.total + depth_bar.done * 100 / depth_bar.total }; - d.bar.set_message(format!( + depth_bar.bar.set_message(format!( "MLC phase {}: {}/{} {} ({}%)", depth + 1, - d.done, - d.total, + depth_bar.done, + depth_bar.total, Self::unit_for_depth(depth), - pct + percent_complete )); } @@ -99,11 +113,11 @@ impl MlcProgress { /// Stop all spinners, leaving a final "complete" message on each. fn finish(&self) { - for (depth, d) in self.depths.iter().enumerate() { - d.bar.finish_with_message(format!( + for (depth, depth_bar) in self.depths.iter().enumerate() { + depth_bar.bar.finish_with_message(format!( "MLC phase {}: complete ({} {})", depth + 1, - d.total, + depth_bar.total, Self::unit_for_depth(depth) )); } @@ -129,9 +143,14 @@ pub(super) fn apply_multi_level_clustering(petx_graph: &mut PetxGraph) - where Ty: petgraph::EdgeType, { - let labels: Vec = (0..petx_graph.graph.node_bound()).collect(); + let original_node_tie_break_keys: TieBreakKeys = (0..petx_graph.graph.node_bound()).collect(); let mut progress = MlcProgress::new(); - let order = mlc_order_inner(&petx_graph.graph, &labels, &mut progress, 0); + let order = mlc_order_inner( + &petx_graph.graph, + &original_node_tie_break_keys, + &mut progress, + 0, + ); *petx_graph = apply_permutation(petx_graph, &order); progress.finish(); @@ -141,7 +160,7 @@ where /// Recursively order each connected component via multilevel clustering, then concatenate the /// results. /// -/// Components are sorted by decreasing size (ties broken by minimum label) so that larger +/// Components are sorted by decreasing size (ties broken by minimum tie-break key) so that larger /// components occupy the beginning of the output. Each component is ordered independently by /// [`mlc_component`]. /// @@ -149,8 +168,9 @@ where /// /// * `graph` - The input graph to order. Generic over node/edge weights and edge type so it also /// works with the coarse graph during recursion. -/// * `labels` - A per-node label vector used for tie-breaking when choosing seeds and sorting -/// neighbors. Indexed by `NodeIndex::index()`. +/// * `tie_break_keys` - Per-node keys used to break ties when choosing seeds and ordering +/// components. Indexed by `NodeIndex::index()`. Carries whatever level's keys the recursion is +/// at. /// * `progress` - Progress tracker for the multi-phase spinner display. /// * `depth` - Recursion depth (0 at the top level). Used to route progress updates to the correct /// phase bar. @@ -161,7 +181,7 @@ where /// position `new_index`. fn mlc_order_inner( graph: &Graph, - labels: &[usize], + tie_break_keys: &[usize], progress: &mut MlcProgress, depth: usize, ) -> Vec @@ -174,18 +194,24 @@ where .into_iter() .map(|set| set.into_iter().collect()) .collect(); - components.sort_by_key(|c| { - let min_label = c + components.sort_by_key(|component| { + let min_key = component .iter() - .map(|n| labels[n.index()]) + .map(|node| tie_break_keys[node.index()]) .min() .unwrap_or(usize::MAX); - (Reverse(c.len()), min_label) + (Reverse(component.len()), min_key) }); let mut order = Vec::with_capacity(graph.node_count()); for component in components { - order.extend(mlc_component(graph, labels, &component, progress, depth)); + order.extend(mlc_component( + graph, + tie_break_keys, + &component, + progress, + depth, + )); } order } @@ -209,7 +235,7 @@ where /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by `NodeIndex::index()`. +/// * `tie_break_keys` - Per-node tie-breaking keys, indexed by `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to order. /// * `progress` - Progress tracker for the multi-phase spinner display. /// * `depth` - Recursion depth; routes progress updates to the correct phase bar. @@ -219,7 +245,7 @@ where /// A permutation of the nodes in `component` representing their new order. fn mlc_component( graph: &Graph, - labels: &[usize], + tie_break_keys: &[usize], component: &[NodeIndex], progress: &mut MlcProgress, depth: usize, @@ -234,14 +260,14 @@ where // `greedy_cluster_partition` ticks this depth's progress per cluster, so every node in // `component` contributes to phase `depth+1` exactly once. - let mut clusters = greedy_cluster_partition(graph, labels, component, progress, depth); + let mut clusters = greedy_cluster_partition(graph, tie_break_keys, component, progress, depth); // Reorder each cluster internally via RCM on the subgraph induced by its members. This puts // peripheral (degree-1) nodes at both ends of the cluster and the high-degree seed near the // middle/end, which keeps cluster boundaries "loose" and avoids stranding the most- connected // node next to the previous cluster. for cluster in clusters.iter_mut() { - *cluster = rcm_component(graph, labels, cluster); + *cluster = rcm_component(graph, tie_break_keys, cluster); } // Single-cluster case: the whole component is one star. @@ -251,8 +277,14 @@ where // Multi-cluster case: recurse on the coarse graph to decide the order in which the clusters // appear. - let (coarse_graph, coarse_labels) = build_coarse_graph(graph, labels, &clusters); - let coarse_order = mlc_order_inner(&coarse_graph, &coarse_labels, progress, depth + 1); + let (coarse_graph, coarse_node_tie_break_keys) = + build_coarse_graph(graph, tie_break_keys, &clusters); + let coarse_order = mlc_order_inner( + &coarse_graph, + &coarse_node_tie_break_keys, + progress, + depth + 1, + ); let mut order = Vec::with_capacity(component.len()); for coarse_node in coarse_order { @@ -263,10 +295,10 @@ where /// Partition a component into star-shaped clusters using a greedy seed-expansion strategy. /// -/// At each step, the lowest-degree unassigned node (ties broken by label) is chosen as a seed, and -/// the seed together with all of its unassigned neighbors becomes the next cluster. Local degrees -/// are then decremented for every unassigned node adjacent to a newly-assigned one, so subsequent -/// seed selections reflect the residual graph. +/// At each step, the lowest-degree unassigned node (ties broken by tie-break key) is chosen as a +/// seed, and the seed together with all of its unassigned neighbors becomes the next cluster. Local +/// degrees are then decremented for every unassigned node adjacent to a newly-assigned one, so +/// subsequent seed selections reflect the residual graph. /// /// Only cluster *membership* is meaningful here; the internal order of each returned cluster is not /// final and is expected to be overwritten by the caller (e.g. via [`rcm_component`]). @@ -274,7 +306,7 @@ where /// # Arguments /// /// * `graph` - The full graph (only edges within `component` are relevant). -/// * `labels` - Per-node labels for tie-breaking, indexed by `NodeIndex::index()`. +/// * `tie_break_keys` - Per-node tie-breaking keys, indexed by `NodeIndex::index()`. /// * `component` - The subset of `NodeIndex` values to partition. /// * `progress` - Progress tracker; `depth`'s done counter is advanced by each cluster's size as /// the cluster is formed, so the caller's phase bar fills up gradually during large partitions. @@ -286,34 +318,35 @@ where /// `component` appears in exactly one cluster. fn greedy_cluster_partition( graph: &Graph, - labels: &[usize], + tie_break_keys: &[usize], component: &[NodeIndex], progress: &mut MlcProgress, depth: usize, -) -> Vec> +) -> ClusterPartition where Ty: petgraph::EdgeType, { let component_set: HashSet = component.iter().copied().collect(); - let mut local_deg = local_degree_in_component(graph, &component_set, component); + let mut local_degree = local_degree_in_component(graph, &component_set, component); - let mut assigned = vec![false; graph.node_bound()]; - let mut remaining: Vec = component.to_vec(); + let mut node_is_assigned = vec![false; graph.node_bound()]; + let mut unassigned_nodes: Vec = component.to_vec(); let mut clusters = Vec::new(); - while !remaining.is_empty() { - remaining.sort_by_key(|&node| (local_deg[node.index()], labels[node.index()])); - let seed = remaining[0]; + while !unassigned_nodes.is_empty() { + unassigned_nodes + .sort_by_key(|&node| (local_degree[node.index()], tie_break_keys[node.index()])); + let seed = unassigned_nodes[0]; let mut cluster = vec![seed]; - assigned[seed.index()] = true; + node_is_assigned[seed.index()] = true; // Cluster membership is seed + every unassigned in-component neighbor. Internal order here // is irrelevant: the caller (`mlc_component`) overwrites it with an RCM ordering on the // cluster's induced subgraph. for neighbor in graph.neighbors(seed) { - if component_set.contains(&neighbor) && !assigned[neighbor.index()] { - assigned[neighbor.index()] = true; + if component_set.contains(&neighbor) && !node_is_assigned[neighbor.index()] { + node_is_assigned[neighbor.index()] = true; cluster.push(neighbor); } } @@ -321,13 +354,13 @@ where // Decrement degrees of unassigned nodes adjacent to the new cluster. for &node in &cluster { for neighbor in graph.neighbors(node) { - if component_set.contains(&neighbor) && !assigned[neighbor.index()] { - local_deg[neighbor.index()] -= 1; + if component_set.contains(&neighbor) && !node_is_assigned[neighbor.index()] { + local_degree[neighbor.index()] -= 1; } } } - remaining.retain(|&n| !assigned[n.index()]); + unassigned_nodes.retain(|&node| !node_is_assigned[node.index()]); progress.add_done(depth, cluster.len()); clusters.push(cluster); } @@ -338,68 +371,77 @@ where /// Build a coarse graph where each cluster is contracted into a single node. /// /// The coarse graph is always undirected: an edge exists between two coarse nodes whenever any -/// original-graph edge connects their clusters. Each coarse node's label is the minimum original -/// label among its cluster members. +/// original-graph edge connects their clusters. Each coarse node's tie-break key is the minimum +/// original key among its cluster members. /// /// # Arguments /// /// * `graph` - The full graph containing the original edges. -/// * `labels` - Per-node labels for the original graph, indexed by `NodeIndex::index()`. +/// * `tie_break_keys` - Per-node tie-breaking keys for the original graph, indexed by +/// `NodeIndex::index()`. /// * `clusters` - The partition produced by [`greedy_cluster_partition`]. Cluster `i` maps to /// coarse node `i`. /// /// # Returns /// /// A tuple of: -/// * The coarse `Graph<(), (), Undirected>` with one node per cluster and one edge per -/// inter-cluster connection. -/// * A label vector for the coarse graph (one entry per cluster), where each label is the minimum -/// original label in that cluster. +/// * The coarse [`CoarseGraph`] with one node per cluster and one edge per inter-cluster +/// connection. +/// * The coarse graph's tie-break keys (one entry per cluster), where each is the minimum original +/// key in that cluster. fn build_coarse_graph( graph: &Graph, - labels: &[usize], - clusters: &[Vec], -) -> (Graph<(), (), petgraph::Undirected>, Vec) + tie_break_keys: &[usize], + clusters: &[Cluster], +) -> (CoarseGraph, TieBreakKeys) where Ty: petgraph::EdgeType, { - let mut cluster_of = vec![usize::MAX; graph.node_bound()]; - for (ci, cluster) in clusters.iter().enumerate() { + let mut coarse_node_by_original_index = vec![usize::MAX; graph.node_bound()]; + for (cluster_idx, cluster) in clusters.iter().enumerate() { for &node in cluster { - cluster_of[node.index()] = ci; + coarse_node_by_original_index[node.index()] = cluster_idx; } } - let mut coarse_graph = Graph::<(), (), petgraph::Undirected>::with_capacity(clusters.len(), 0); + let mut coarse_graph = CoarseGraph::with_capacity(clusters.len(), 0); for _ in 0..clusters.len() { coarse_graph.add_node(()); } let mut seen_edges: HashSet<(usize, usize)> = HashSet::new(); - for (ci, cluster) in clusters.iter().enumerate() { + for (cluster_idx, cluster) in clusters.iter().enumerate() { for &node in cluster { for neighbor in graph.neighbors(node) { - let nc = cluster_of[neighbor.index()]; - if nc != ci && nc != usize::MAX { - let canonical = if ci < nc { (ci, nc) } else { (nc, ci) }; + let neighbors_cluster = coarse_node_by_original_index[neighbor.index()]; + if neighbors_cluster != cluster_idx && neighbors_cluster != usize::MAX { + let canonical = if cluster_idx < neighbors_cluster { + (cluster_idx, neighbors_cluster) + } else { + (neighbors_cluster, cluster_idx) + }; if seen_edges.insert(canonical) { - coarse_graph.add_edge(NodeIndex::new(ci), NodeIndex::new(nc), ()); + coarse_graph.add_edge( + NodeIndex::new(cluster_idx), + NodeIndex::new(neighbors_cluster), + (), + ); } } } } } - let coarse_labels: Vec = clusters + let coarse_node_tie_break_keys: TieBreakKeys = clusters .iter() .map(|cluster| { cluster .iter() - .map(|n| labels[n.index()]) + .map(|node| tie_break_keys[node.index()]) .min() .unwrap_or(usize::MAX) }) .collect(); - (coarse_graph, coarse_labels) + (coarse_graph, coarse_node_tie_break_keys) } diff --git a/ben/src/json/graph/petxgraph/nx_convert.rs b/ben/src/json/graph/petxgraph/nx_convert.rs index 905cb09..bc173f5 100644 --- a/ben/src/json/graph/petxgraph/nx_convert.rs +++ b/ben/src/json/graph/petxgraph/nx_convert.rs @@ -6,6 +6,10 @@ use petgraph::visit::{EdgeRef, IntoNodeReferences}; use petgraph::{Directed, Undirected}; use std::collections::{HashMap, HashSet}; +/// Reserved attribute key under which a NetworkX node's original `id` is stashed while the node +/// lives in petgraph form, so it can be recovered on the round trip back to NetworkX. +const NETWORKX_ID_ATTR: &str = "__networkx_id__"; + /// Convert an [`NxNode`] into a [`PetxNode`]. /// /// The node's `id` field is moved into the attribute map under the reserved key `"__networkx_id__"` @@ -20,7 +24,7 @@ use std::collections::{HashMap, HashSet}; /// A [`PetxNode`] whose `attrs` map contains all original attributes plus `"__networkx_id__"`. pub(in crate::json::graph) fn nx_node_to_petx_node(nx_node: NxNode) -> PetxNode { let mut attrs = nx_node.attrs; - attrs.insert("__networkx_id__".to_string(), nx_node.id); + attrs.insert(NETWORKX_ID_ATTR.to_string(), nx_node.id); PetxNode { attrs } } @@ -44,8 +48,8 @@ pub(in crate::json::graph) fn petx_node_to_nx_node( petx_node: &PetxNode, ) -> Result { let mut attrs = petx_node.attrs.clone(); - let id = attrs.remove("__networkx_id__").ok_or_else(|| { - NxPetgraphError::Other("missing __networkx_id__ on petgraph node".to_string()) + let id = attrs.remove(NETWORKX_ID_ATTR).ok_or_else(|| { + NxPetgraphError::Other(format!("missing {NETWORKX_ID_ATTR} on petgraph node")) })?; Ok(NxNode { id, attrs }) @@ -102,6 +106,21 @@ where } = nx_graph; let mut graph = Graph::::with_capacity(nodes.len(), 0); + let node_id_to_index = add_networkx_nodes(&mut graph, nodes)?; + add_networkx_adjacency_edges(&mut graph, &node_id_to_index, adjacency, is_directed)?; + + Ok(PetxGraph { graph_attrs, graph }) +} + +/// Add every NetworkX node to `graph`, returning a map from each node's original id to its assigned +/// [`NodeIndex`]. Errors with [`NxPetgraphError::DuplicateNodeId`] if any id appears twice. +fn add_networkx_nodes( + graph: &mut Graph, + nodes: Vec, +) -> Result, NxPetgraphError> +where + Ty: petgraph::EdgeType, +{ let mut node_id_to_index: HashMap = HashMap::with_capacity(nodes.len()); @@ -116,24 +135,37 @@ where node_id_to_index.insert(node_id, index); } - // NetworkX adjacency format is a list of adjacency lists, where the i-th adjacency list - // corresponds to the i-th node in the nodes list. - // - // For undirected graphs, the format may contain both (u, v) and (v, u), so we track - // canonicalized edge endpoint pairs and only add each undirected edge once. + Ok(node_id_to_index) +} + +/// Add edges from the NetworkX adjacency lists to `graph`. +/// +/// The NetworkX adjacency format is a list of adjacency lists, where the i-th list holds the +/// out-neighbors of the i-th node. For undirected graphs the format may list both `(u, v)` and +/// `(v, u)`, so endpoint pairs are canonicalized via [`canonical_undirected_edge_key`] and each +/// undirected edge is added only once. Errors with [`NxPetgraphError::MissingNeighborNode`] if an +/// adjacency entry references an id absent from `node_id_to_index`. +fn add_networkx_adjacency_edges( + graph: &mut Graph, + node_id_to_index: &HashMap, + adjacency: Vec>, + is_directed: bool, +) -> Result<(), NxPetgraphError> +where + Ty: petgraph::EdgeType, +{ let mut seen_undirected_edges: HashSet<(String, String, Option)> = HashSet::new(); - for (source_idx_orig, neighbors) in adjacency.into_iter().enumerate() { - let source_idx = NodeIndex::new(source_idx_orig); - // Adjacency length was validated against nodes length above. + for (source_index, neighbors) in adjacency.into_iter().enumerate() { + let source_idx = NodeIndex::new(source_index); + // Adjacency length was validated against nodes length by the caller. let source_node = graph .node_weight(source_idx) .expect("adjacency length validated against nodes length"); - // __networkx_id__ is always inserted by nx_node_to_petx_node. let source_id = source_node .attrs - .get("__networkx_id__") + .get(NETWORKX_ID_ATTR) .expect("__networkx_id__ always set by nx_node_to_petx_node"); // serde_json::Value is always serializable. @@ -142,36 +174,45 @@ where for edge in neighbors { let target_id = &edge.id; - let target_idx = node_id_to_index + let target_idx = *node_id_to_index .get(target_id) .ok_or_else(|| NxPetgraphError::MissingNeighborNode(target_id.clone()))?; if is_directed { - graph.add_edge(source_idx, *target_idx, edge); + graph.add_edge(source_idx, target_idx, edge); } else { // serde_json::Value is always serializable. let target_key = serde_json::to_string(target_id).expect("serde_json::Value always serializes"); - - let edge_key_str = edge + let edge_key = edge .key .as_ref() .and_then(|key| serde_json::to_string(key).ok()); - let canonical = if source_key <= target_key { - (source_key.clone(), target_key, edge_key_str) - } else { - (target_key, source_key.clone(), edge_key_str) - }; - + let canonical = canonical_undirected_edge_key(&source_key, target_key, edge_key); if seen_undirected_edges.insert(canonical) { - graph.add_edge(source_idx, *target_idx, edge); + graph.add_edge(source_idx, target_idx, edge); } } } } - Ok(PetxGraph { graph_attrs, graph }) + Ok(()) +} + +/// Order an undirected edge's endpoint keys so that `(u, v)` and `(v, u)` map to the same tuple, +/// letting a `HashSet` deduplicate the two directions NetworkX may list. The optional edge key is +/// carried through unchanged so parallel edges between the same endpoints stay distinct. +fn canonical_undirected_edge_key( + source_key: &str, + target_key: String, + edge_key: Option, +) -> (String, String, Option) { + if source_key <= target_key.as_str() { + (source_key.to_string(), target_key, edge_key) + } else { + (target_key, source_key.to_string(), edge_key) + } } /// Check whether a graph contains parallel (multi) edges. diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index ca5c02d..36376a1 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -100,8 +100,8 @@ impl RelabelOptions { self } - /// Set the sample limit from an optional value: `Some(n)` sets the limit, `None` clears it. Lets - /// CLI argument plumbing pass an `Option` straight through. + /// Set the sample limit from an optional value: `Some(n)` sets the limit, `None` clears it. + /// Lets CLI argument plumbing pass an `Option` straight through. pub fn with_max_samples_opt(mut self, n: Option) -> Self { self.max_samples = n; self @@ -240,10 +240,10 @@ where let mut sample_number = 0usize; let spinner = Spinner::new("Relabeling line"); - // Both run policies share the same per-frame bookkeeping (sample limit, transform, output count, - // progress); they differ only in how the relabeled assignment is emitted. `out_count` is bounded - // by the input frame's `count` (a `u16`), so the `as u16` cast on the preserve path cannot - // truncate. + // Both run policies share the same per-frame bookkeeping (sample limit, transform, output + // count, progress); they differ only in how the relabeled assignment is emitted. + // `out_count` is bounded by the input frame's `count` (a `u16`), so the `as u16` cast on + // the preserve path cannot truncate. decoder.for_each_assignment(|assignment, count| { if max_samples.is_some_and(|limit| sample_number >= limit) { return Ok(false); diff --git a/ben/src/test_utils.rs b/ben/src/test_utils.rs index f757aa1..39b856f 100644 --- a/ben/src/test_utils.rs +++ b/ben/src/test_utils.rs @@ -10,8 +10,10 @@ use std::time::{SystemTime, UNIX_EPOCH}; use serde_json::json; +use std::ops::Range; + use crate::codec::encode::encode_jsonl_to_ben; -use crate::io::bundle::format::AssignmentFormat; +use crate::io::bundle::format::{AssignmentFormat, DIRECTORY_ENTRY_HEADER_SIZE}; use crate::io::bundle::BendlWriter; use crate::BenVariant; @@ -70,6 +72,207 @@ pub fn sample_bendl_bytes(stream: &[u8], format: AssignmentFormat) -> Vec { buf } +/// A field of the fixed `.bendl` header, identified by name rather than by raw byte offset. +/// +/// The associated [`HeaderField::range`] is the field's byte range inside the 64-byte header, the +/// single source of truth that adversarial fixtures patch against instead of hard-coding offsets. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum HeaderField { + /// `alignment_padding` (`u16`). + AlignmentPadding, + /// `flags` (`u32`). + Flags, + /// `stream_checksum` (`u32`). + StreamChecksum, + /// `directory_offset` (`u64`). + DirectoryOffset, + /// `directory_len` (`u64`). + DirectoryLen, + /// `stream_offset` (`u64`). + StreamOffset, + /// `stream_len` (`u64`). + StreamLen, + /// `sample_count` (`i64`). + SampleCount, +} + +impl HeaderField { + /// Byte range this field occupies within the fixed 64-byte header. + pub fn range(self) -> Range { + match self { + HeaderField::AlignmentPadding => 14..16, + HeaderField::Flags => 16..20, + HeaderField::StreamChecksum => 20..24, + HeaderField::DirectoryOffset => 24..32, + HeaderField::DirectoryLen => 32..40, + HeaderField::StreamOffset => 40..48, + HeaderField::StreamLen => 48..56, + HeaderField::SampleCount => 56..64, + } + } +} + +/// A field of a directory entry's fixed header, identified by name rather than by raw byte offset. +/// +/// The associated [`DirectoryEntryField::range`] is the field's byte range *relative to the start +/// of the entry* (the entry begins after the directory's leading `u32` entry count). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum DirectoryEntryField { + /// `asset_type` (`u16`). + AssetType, + /// `asset_flags` (`u16`). + AssetFlags, + /// `name_len` (`u16`). + NameLen, + /// `payload_offset` (`u64`). + PayloadOffset, + /// `payload_len` (`u64`). + PayloadLen, + /// `checksum_len` (`u32`). + ChecksumLen, +} + +impl DirectoryEntryField { + /// Byte range this field occupies relative to the start of a directory entry. + pub fn range(self) -> Range { + match self { + DirectoryEntryField::AssetType => 0..2, + DirectoryEntryField::AssetFlags => 2..4, + DirectoryEntryField::NameLen => 4..6, + DirectoryEntryField::PayloadOffset => 8..16, + DirectoryEntryField::PayloadLen => 16..24, + DirectoryEntryField::ChecksumLen => 24..28, + } + } +} + +/// A mutable wrapper over raw `.bendl` bytes for building adversarial fixtures by *named field* +/// instead of by magic byte offset. +/// +/// The builder methods (`with_*`, `corrupt_*`) consume `self` and return it so patches can be +/// chained; the reader methods (`header_u64`, `entry_count`) inspect the current bytes so a fixture +/// can patch a field whose location depends on another (e.g. walking entries from +/// `directory_offset`). Field locations come from [`HeaderField`] / [`DirectoryEntryField`], so the +/// on-disk layout is named in exactly one place. +#[derive(Clone, Debug)] +pub struct BendlBytes { + bytes: Vec, +} + +impl BendlBytes { + /// Wrap an existing byte vector (typically a valid bundle seed) for patching. + pub fn new(bytes: Vec) -> Self { + Self { bytes } + } + + /// Borrow the current bytes. + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Consume the builder and return the patched bytes. + pub fn into_bytes(self) -> Vec { + self.bytes + } + + /// Read a header field as a little-endian `u64` (reading only the field's own width). + pub fn header_u64(&self, field: HeaderField) -> u64 { + let range = field.range(); + let mut buf = [0u8; 8]; + buf[..range.len()].copy_from_slice(&self.bytes[range]); + u64::from_le_bytes(buf) + } + + /// Patch a header field to the low bytes of `value` (the field's own width), returning `self` + /// for chaining. Works for the `u16`/`u32`/`u64` header fields alike: only + /// `field.range().len()` little-endian bytes are written, so e.g. patching + /// `AlignmentPadding` writes two bytes. + pub fn with_header_u64(mut self, field: HeaderField, value: u64) -> Self { + let range = field.range(); + let width = range.len(); + self.bytes[range].copy_from_slice(&value.to_le_bytes()[..width]); + self + } + + /// The directory's leading `u32` entry count. + pub fn entry_count(&self) -> u32 { + let dir = self.header_u64(HeaderField::DirectoryOffset) as usize; + u32::from_le_bytes(self.bytes[dir..dir + 4].try_into().unwrap()) + } + + /// Patch the directory's leading `u32` entry count, returning `self` for chaining. + pub fn with_entry_count(mut self, count: u32) -> Self { + let dir = self.header_u64(HeaderField::DirectoryOffset) as usize; + self.bytes[dir..dir + 4].copy_from_slice(&count.to_le_bytes()); + self + } + + /// Byte offset where directory entry `index` begins, walking the variable-length entries from + /// `directory_offset` using each entry's own `name_len` / `checksum_len`. + fn directory_entry_offset(&self, index: usize) -> usize { + let dir = self.header_u64(HeaderField::DirectoryOffset) as usize; + let mut cursor = dir + 4; // skip the u32 entry count + for _ in 0..index { + let name_len = self.entry_field_u64(cursor, DirectoryEntryField::NameLen) as usize; + let checksum_len = + self.entry_field_u64(cursor, DirectoryEntryField::ChecksumLen) as usize; + cursor += DIRECTORY_ENTRY_HEADER_SIZE + name_len + checksum_len; + } + cursor + } + + /// Read a directory-entry field (relative to `entry_start`) as a little-endian `u64`. + fn entry_field_u64(&self, entry_start: usize, field: DirectoryEntryField) -> u64 { + let range = field.range(); + let mut buf = [0u8; 8]; + buf[..range.len()] + .copy_from_slice(&self.bytes[entry_start + range.start..entry_start + range.end]); + u64::from_le_bytes(buf) + } + + /// Patch a field of directory entry `index` to the low bytes of `value` (the field's own + /// width), returning `self` for chaining. + pub fn with_directory_entry_field( + mut self, + index: usize, + field: DirectoryEntryField, + value: u64, + ) -> Self { + let base = self.directory_entry_offset(index); + let range = field.range(); + let width = range.len(); + self.bytes[base + range.start..base + range.start + width] + .copy_from_slice(&value.to_le_bytes()[..width]); + self + } + + /// Flip a byte of directory entry `index`'s on-disk payload, simulating payload corruption that + /// a stored CRC should catch. `byte_within_payload` indexes from the entry's `payload_offset`. + pub fn corrupt_asset_payload(mut self, index: usize, byte_within_payload: usize) -> Self { + let base = self.directory_entry_offset(index); + let payload_offset = + self.entry_field_u64(base, DirectoryEntryField::PayloadOffset) as usize; + self.bytes[payload_offset + byte_within_payload] ^= 0xFF; + self + } + + /// Flip the first byte of directory entry `index`'s stored trailing checksum, simulating a + /// corrupt stored CRC. The entry must carry a trailing checksum (the bytes after its name). + pub fn corrupt_stored_asset_crc(mut self, index: usize) -> Self { + let base = self.directory_entry_offset(index); + let name_len = self.entry_field_u64(base, DirectoryEntryField::NameLen) as usize; + let checksum_start = base + DIRECTORY_ENTRY_HEADER_SIZE + name_len; + self.bytes[checksum_start] ^= 0xFF; + self + } +} + +impl From for Vec { + fn from(b: BendlBytes) -> Vec { + b.bytes + } +} + #[cfg(test)] mod tests { use super::*; @@ -124,4 +327,86 @@ mod tests { let reader = BendlReader::open(BufReader::new(Cursor::new(bytes))).unwrap(); assert!(reader.is_finalized()); } + + #[test] + fn bendl_bytes_reads_and_patches_named_fields() { + use crate::io::bundle::format::ASSET_TYPE_CUSTOM; + use crate::io::bundle::writer::AddAssetOptions; + use crate::io::bundle::BendlReader; + + let mut buf = Vec::new(); + { + let mut writer = + BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "first.bin", + b"first payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "second.bin", + b"second payload bytes", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + session.write_all(b"STANDARD BEN FILE\x00fake").unwrap(); + let writer = session.finish_into_writer(1); + writer.finish().unwrap(); + } + + let reader = BendlReader::open(Cursor::new(buf.clone())).unwrap(); + let entries = reader.assets().to_vec(); + drop(reader); + assert_eq!(entries.len(), 2); + + let bb = BendlBytes::new(buf); + assert_eq!(bb.entry_count(), 2); + + // Field reads agree with the parsed directory for every entry, proving the entry walk. + for (i, entry) in entries.iter().enumerate() { + let base = bb.directory_entry_offset(i); + assert_eq!( + bb.entry_field_u64(base, DirectoryEntryField::PayloadOffset), + entry.payload_offset + ); + assert_eq!( + bb.entry_field_u64(base, DirectoryEntryField::PayloadLen), + entry.payload_len + ); + assert_eq!( + bb.entry_field_u64(base, DirectoryEntryField::NameLen) as usize, + entry.name.len() + ); + } + + // with_header_u64 round-trips through header_u64. + let relabeled = bb.clone().with_header_u64(HeaderField::DirectoryLen, 4242); + assert_eq!(relabeled.header_u64(HeaderField::DirectoryLen), 4242); + + // corrupt_asset_payload flips exactly one byte, at the chosen entry's payload offset. + let original = bb.as_bytes().to_vec(); + let payload_corrupted = bb.clone().corrupt_asset_payload(1, 0).into_bytes(); + let second_payload_offset = entries[1].payload_offset as usize; + assert_eq!( + payload_corrupted[second_payload_offset], + original[second_payload_offset] ^ 0xFF + ); + assert_eq!(count_differing_bytes(&original, &payload_corrupted), 1); + + // corrupt_stored_asset_crc also flips exactly one byte (the entry's stored CRC start), + // which a default-written entry carries as four trailing bytes. + let crc_corrupted = bb.corrupt_stored_asset_crc(1).into_bytes(); + assert_eq!(count_differing_bytes(&original, &crc_corrupted), 1); + } + + fn count_differing_bytes(a: &[u8], b: &[u8]) -> usize { + assert_eq!(a.len(), b.len(), "fixtures should be the same length"); + a.iter().zip(b.iter()).filter(|(x, y)| x != y).count() + } } diff --git a/ben/tests/test_bendl_append_proptest.rs b/ben/tests/test_bendl_append_proptest.rs index 8ee2945..d070339 100644 --- a/ben/tests/test_bendl_append_proptest.rs +++ b/ben/tests/test_bendl_append_proptest.rs @@ -13,8 +13,8 @@ //! 2. After every `Commit`, every existing directory entry's `(payload_offset, payload_len)` is //! unchanged, and the raw bytes at those offsets are byte-for-byte identical to before the //! commit. This is the strong append-only invariant. -//! 3. After every `Abort` or drop-without-commit, the file is byte-identical to before the -//! appender was opened. +//! 3. After every `Abort` or drop-without-commit, the file is byte-identical to before the appender +//! was opened. use binary_ensemble::io::bundle::format::{ AssignmentFormat, BendlDirectoryEntry, ASSET_TYPE_CUSTOM, @@ -27,10 +27,7 @@ use std::io::{Cursor, Read, Seek, SeekFrom}; #[derive(Debug, Clone)] enum Op { /// Open an appender (if none is open) and enqueue a pending asset. - AddAsset { - payload: Vec, - compress: bool, - }, + AddAsset { payload: Vec, compress: bool }, /// Commit the currently-open appender, if any. Commit, /// Abort the currently-open appender via the explicit `.abort()` API, if any. @@ -53,6 +50,272 @@ fn op_strategy() -> impl Strategy { ] } +/// A reader-side *expectation*: an asset that has been committed to disk, paired with the decoded +/// bytes the reader must return for it. Decoded bytes equal the originally-added payload regardless +/// of whether the asset was stored compressed. +#[derive(Debug, Clone)] +struct CommittedAsset { + asset_name: String, + decoded_payload_bytes: Vec, +} + +/// One asset enqueued against the currently-open appender but not yet committed. These are the +/// append-side *inputs*: the raw bytes handed to `add_asset` and whether compression was requested. +#[derive(Debug, Clone)] +struct PendingAsset { + asset_name: String, + raw_payload_bytes: Vec, + compress_payload: bool, +} + +/// The batch of pending assets accumulated against a single open appender, plus the round-local +/// name index used (alongside the model's global counter) to mint stable, unique asset names. +#[derive(Debug, Default)] +struct PendingAppendRound { + assets: Vec, + next_name_index: usize, +} + +/// A snapshot of one existing directory entry's physical placement, taken before a commit. The +/// strong append-only invariant is that `(payload_offset, payload_len)` *and* the raw bytes at that +/// range (`on_disk_payload_bytes`) are byte-for-byte identical after the commit. +#[derive(Debug, Clone)] +struct EntrySnapshot { + asset_name: String, + payload_offset: u64, + payload_len: u64, + on_disk_payload_bytes: Vec, +} + +/// The model of expected bundle state as a sequence of append-grammar operations is applied. Each +/// operation mutates the model and asserts the relevant BENDL append invariant inline. +struct AppendModel { + /// Every asset committed to disk, in commit order, with the decoded bytes the reader must + /// return for it. + committed: Vec, + /// The current on-disk bundle bytes. + current_bytes: Vec, + /// `sample_count()` of the seed bundle; must never drift across appends. + baseline_samples: Option, + /// The pending round against the currently-open appender, if any. + round: Option, + /// Monotonic counter that makes every minted asset name unique across rounds. + name_counter: usize, +} + +impl AppendModel { + /// Start from the seed bundle, recording its baseline sample count and seed asset. + fn new(seed: Vec) -> Self { + let reader = BendlReader::open(Cursor::new(&seed)).unwrap(); + let baseline_samples = reader.sample_count(); + drop(reader); + AppendModel { + committed: vec![CommittedAsset { + asset_name: "seed.bin".to_string(), + decoded_payload_bytes: b"seed payload bytes".to_vec(), + }], + current_bytes: seed, + baseline_samples, + round: None, + name_counter: 0, + } + } + + /// Apply one generated operation. + fn apply(&mut self, op: &Op) { + match op { + Op::AddAsset { payload, compress } => self.enqueue_asset(payload, *compress), + Op::Commit => self.commit_round(), + Op::Abort => self.abort_round(), + Op::DropWithoutCommit => self.drop_round_without_commit(), + } + } + + /// `AddAsset`: open a round if none is open and enqueue a pending asset with a freshly minted + /// name. The global counter guarantees uniqueness across rounds, and the round-local index is + /// embedded so a successful commit lands a stable name. + fn enqueue_asset(&mut self, payload: &[u8], compress: bool) { + let name_counter = self.name_counter; + self.name_counter += 1; + let round = self.round.get_or_insert_with(PendingAppendRound::default); + let asset_name = format!("asset-{}-{}.bin", name_counter, round.next_name_index); + round.next_name_index += 1; + round.assets.push(PendingAsset { + asset_name, + raw_payload_bytes: payload.to_vec(), + compress_payload: compress, + }); + } + + /// `Commit`: replay the pending round through a real appender, commit it, and assert every + /// append invariant against the resulting bytes. + fn commit_round(&mut self) { + let Some(round) = self.round.take() else { + return; + }; + let snapshot = snapshot_existing_entries(&self.current_bytes); + + let mut appender = BendlAppender::open(Cursor::new(self.current_bytes.clone())).unwrap(); + for asset in &round.assets { + let opts = pending_asset_options(asset); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + &asset.asset_name, + &asset.raw_payload_bytes, + opts, + ) + .expect("add_asset on pending entry should succeed"); + } + let new_bytes = appender.commit().unwrap().into_inner(); + + // An open round always holds at least one pending asset (AddAsset is the only way to enter + // the round state), so the file can only grow, never shrink. + assert!( + new_bytes.len() >= self.current_bytes.len(), + "file shrank after commit" + ); + + let new_reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); + let new_entries: Vec = new_reader.assets().to_vec(); + drop(new_reader); + Self::assert_existing_entries_unchanged(&snapshot, &new_entries, &new_bytes); + + for asset in &round.assets { + self.committed.push(CommittedAsset { + asset_name: asset.asset_name.clone(), + decoded_payload_bytes: asset.raw_payload_bytes.clone(), + }); + } + + let mut reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); + assert_eq!( + reader.assets().len(), + self.committed.len(), + "directory size mismatch after commit" + ); + assert_eq!( + reader.sample_count(), + self.baseline_samples, + "sample_count drifted across append" + ); + self.assert_committed_assets_readable(&mut reader); + + self.current_bytes = new_bytes; + } + + /// `Abort`: open an appender on a clone, abort it via the explicit API, and confirm the bytes + /// returned by `.abort()` equal the pre-abort bytes (nothing was written at the writer level). + fn abort_round(&mut self) { + let Some(_round) = self.round.take() else { + return; + }; + let pre_bytes = self.current_bytes.clone(); + let appender = BendlAppender::open(Cursor::new(self.current_bytes.clone())).unwrap(); + let cursor = appender.abort(); + let post_bytes = cursor.into_inner(); + assert_eq!( + post_bytes, pre_bytes, + "Abort modified the file (it must be a no-op)" + ); + } + + /// `DropWithoutCommit`: re-enqueue the pending assets on a fresh appender over a clone, then + /// let it drop without `commit()`. The appender owns a clone, so the master bytes are + /// untouched regardless; the assertion pins that intent. + fn drop_round_without_commit(&mut self) { + let Some(round) = self.round.take() else { + return; + }; + let pre_bytes = self.current_bytes.clone(); + { + let mut appender = + BendlAppender::open(Cursor::new(self.current_bytes.clone())).unwrap(); + for (i, asset) in round.assets.iter().enumerate() { + let opts = pending_asset_options(asset); + let name = format!("dropped-{}-{}.bin", self.name_counter, i); + let _ = + appender.add_asset(ASSET_TYPE_CUSTOM, &name, &asset.raw_payload_bytes, opts); + } + // appender drops here without commit(). + } + assert_eq!( + self.current_bytes, pre_bytes, + "DropWithoutCommit modified the master file (it must be a no-op)" + ); + } + + /// Final consistency check: reopen the file, validate the directory, confirm every committed + /// asset is still readable, and drive a raw read to EOF to confirm structural soundness. Any + /// pending round at end-of-sequence is implicitly dropped, which must not affect the bytes. + fn assert_final_consistency(&self) { + let mut reader = BendlReader::open(Cursor::new(&self.current_bytes)).unwrap(); + reader.validate_directory().unwrap(); + assert_eq!( + reader.assets().len(), + self.committed.len(), + "final directory size mismatch" + ); + self.assert_committed_assets_readable(&mut reader); + + let mut tail = Vec::new(); + let mut cursor = Cursor::new(&self.current_bytes); + cursor.seek(SeekFrom::Start(0)).unwrap(); + cursor.read_to_end(&mut tail).unwrap(); + assert_eq!(tail.len(), self.current_bytes.len()); + } + + /// Assert every committed asset is present in `reader` and decodes to its expected bytes. + fn assert_committed_assets_readable(&self, reader: &mut BendlReader) { + for asset in &self.committed { + let entry = reader + .find_asset_by_name(&asset.asset_name) + .cloned() + .unwrap(); + let got = reader.asset_bytes(&entry).unwrap(); + assert_eq!( + got, asset.decoded_payload_bytes, + "decoded payload mismatch for {}", + asset.asset_name + ); + } + } + + /// Assert the strong append-only invariant: every snapshotted entry kept its offset, length, + /// and raw on-disk payload bytes after the commit. + fn assert_existing_entries_unchanged( + snapshot: &[EntrySnapshot], + new_entries: &[BendlDirectoryEntry], + new_bytes: &[u8], + ) { + for snap in snapshot { + let entry = find_entry(new_entries, &snap.asset_name); + assert_eq!( + (entry.payload_offset, entry.payload_len), + (snap.payload_offset, snap.payload_len), + "directory entry {} (offset, len) drifted after commit", + snap.asset_name + ); + let new_raw = raw_bytes_at(new_bytes, entry.payload_offset, entry.payload_len); + assert_eq!( + new_raw, snap.on_disk_payload_bytes, + "directory entry {} raw payload bytes drifted after commit", + snap.asset_name + ); + } + } +} + +/// Translate a pending asset's `compress_payload` flag into the matching writer options. +fn pending_asset_options(asset: &PendingAsset) -> AddAssetOptions { + let opts = AddAssetOptions::defaults(); + if asset.compress_payload { + opts.compress() + } else { + opts.raw() + } +} + /// Build the seed bundle used by every proptest case: a finalized bundle with one initial custom /// asset and a short stream so there's something to preserve across appends. fn build_seed_bundle() -> Vec { @@ -80,16 +343,18 @@ fn raw_bytes_at(buf: &[u8], offset: u64, len: u64) -> Vec { buf[start..end].to_vec() } -/// Snapshot the (offset, len, raw payload bytes) for every directory entry in `bytes`. The -/// invariant is that these tuples must be unchanged after an append-only commit. -fn snapshot_existing_entries(bytes: &[u8]) -> Vec<(String, u64, u64, Vec)> { +/// Snapshot the physical placement of every directory entry in `bytes`, for the append-only +/// invariant check after a commit. +fn snapshot_existing_entries(bytes: &[u8]) -> Vec { let reader = BendlReader::open(Cursor::new(bytes)).unwrap(); reader .assets() .iter() - .map(|e| { - let payload = raw_bytes_at(bytes, e.payload_offset, e.payload_len); - (e.name.clone(), e.payload_offset, e.payload_len, payload) + .map(|e| EntrySnapshot { + asset_name: e.name.clone(), + payload_offset: e.payload_offset, + payload_len: e.payload_len, + on_disk_payload_bytes: raw_bytes_at(bytes, e.payload_offset, e.payload_len), }) .collect() } @@ -102,179 +367,14 @@ fn find_entry<'a>(entries: &'a [BendlDirectoryEntry], name: &str) -> &'a BendlDi .unwrap_or_else(|| panic!("entry {name:?} not found in directory")) } -/// Run a single sequence of ops against the seed bundle. Returns the final state for an outer -/// `prop_assert!` to inspect, but most assertions fire inline as the ops execute. +/// Run a single sequence of ops against the seed bundle. All invariants are asserted inline by the +/// model as the ops execute. fn run_sequence(ops: &[Op]) { - let seed = build_seed_bundle(); - let baseline_reader = BendlReader::open(Cursor::new(&seed)).unwrap(); - let baseline_samples = baseline_reader.sample_count(); - drop(baseline_reader); - - // The "model" of every asset that has been committed to disk, in commit order. Each entry is - // (name, raw_payload_bytes_as_added, compress_flag). The decoded asset bytes returned by the - // reader must equal `raw_payload_bytes_as_added` regardless of compression. - let mut committed: Vec<(String, Vec, bool)> = vec![( - "seed.bin".to_string(), - b"seed payload bytes".to_vec(), - false, - )]; - - let mut current_bytes = seed.clone(); - - // Per-appender state: when an appender is open we hold its Vec of pending payloads alongside - // the snapshot we'll diff against if it commits. We don't keep the appender itself in this - // structure because moving it through closures with a snapshot is awkward; instead we - // construct/consume the appender inline at the next Op that uses it. We do, however, track - // the names allocated so the appender doesn't get hit with DuplicateName on the second - // AddAsset in the same round. - struct PendingRound { - pending: Vec<(String, Vec, bool)>, - next_name_index: usize, - } - let mut round: Option = None; - let mut name_counter: usize = 0; - + let mut model = AppendModel::new(build_seed_bundle()); for op in ops { - match op { - Op::AddAsset { payload, compress } => { - let r = round.get_or_insert(PendingRound { - pending: Vec::new(), - next_name_index: 0, - }); - // Name allocation: use a global counter to guarantee uniqueness across rounds, - // and embed the round-local index so a successful Commit lands a stable name. - let name = format!("asset-{}-{}.bin", name_counter, r.next_name_index); - r.next_name_index += 1; - name_counter += 1; - r.pending.push((name, payload.clone(), *compress)); - } - Op::Commit => { - let Some(r) = round.take() else { continue }; - let snapshot = snapshot_existing_entries(¤t_bytes); - - let mut appender = BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); - for (name, payload, compress) in &r.pending { - let mut opts = AddAssetOptions::defaults(); - opts = if *compress { opts.compress() } else { opts.raw() }; - appender - .add_asset(ASSET_TYPE_CUSTOM, name, payload, opts) - .expect("add_asset on pending entry should succeed"); - } - let new_bytes = appender.commit().unwrap().into_inner(); - - // File must have grown (or stayed equal if pending was empty — but an empty - // round only happens when the user inserts nothing before Commit, which isn't a - // generated op here since AddAsset is the only way to enter Pending state). - assert!( - new_bytes.len() >= current_bytes.len(), - "file shrank after commit" - ); - - // Strong invariant: every previously-committed directory entry kept its offset, - // length, and raw payload bytes. - let new_reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); - let new_entries: Vec = new_reader.assets().to_vec(); - drop(new_reader); - for (name, old_offset, old_len, old_payload) in &snapshot { - let entry = find_entry(&new_entries, name); - assert_eq!( - (entry.payload_offset, entry.payload_len), - (*old_offset, *old_len), - "directory entry {name} (offset, len) drifted after commit" - ); - let new_raw = raw_bytes_at(&new_bytes, entry.payload_offset, entry.payload_len); - assert_eq!( - new_raw, *old_payload, - "directory entry {name} raw payload bytes drifted after commit" - ); - } - - // Append model: every previously-committed asset + every freshly-committed - // pending one is readable and decodes to the right bytes. - for (name, payload, _compress) in &r.pending { - committed.push((name.clone(), payload.clone(), false)); - } - let mut reader = BendlReader::open(Cursor::new(&new_bytes)).unwrap(); - assert_eq!( - reader.assets().len(), - committed.len(), - "directory size mismatch after commit" - ); - assert_eq!( - reader.sample_count(), - baseline_samples, - "sample_count drifted across append" - ); - for (name, want, _) in &committed { - let entry = reader.find_asset_by_name(name).cloned().unwrap(); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(&got, want, "decoded payload mismatch for {name}"); - } - - current_bytes = new_bytes; - } - Op::Abort => { - let Some(_r) = round.take() else { continue }; - let pre_bytes = current_bytes.clone(); - let appender = BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); - // .abort() consumes the appender and returns the underlying cursor. We never - // wrote anything to it (we never entered the pending state at the writer - // level), so the bytes must equal pre_bytes. - let cursor = appender.abort(); - let post_bytes = cursor.into_inner(); - assert_eq!( - post_bytes, pre_bytes, - "Abort modified the file (it must be a no-op)" - ); - } - Op::DropWithoutCommit => { - let Some(_r) = round.take() else { continue }; - let pre_bytes = current_bytes.clone(); - { - let mut appender = - BendlAppender::open(Cursor::new(current_bytes.clone())).unwrap(); - // Re-enqueue the pending ops on this appender, then let it drop without - // committing. The file underlying `appender` is a clone of `current_bytes`, - // so dropping it can't affect `current_bytes` either way — but the - // assertion below pins that intent for clarity. - for (i, (_, payload, compress)) in _r.pending.iter().enumerate() { - let mut opts = AddAssetOptions::defaults(); - opts = if *compress { opts.compress() } else { opts.raw() }; - let name = format!("dropped-{name_counter}-{i}.bin"); - let _ = appender.add_asset(ASSET_TYPE_CUSTOM, &name, payload, opts); - } - // appender drops here without commit(). - } - assert_eq!( - current_bytes, pre_bytes, - "DropWithoutCommit modified the master file (it must be a no-op)" - ); - } - } + model.apply(op); } - - // Final consistency check: open the file one last time, validate the directory, and confirm - // every committed asset is still readable. Any pending round at end-of-sequence is implicitly - // dropped (no commit), which must not affect `current_bytes`. - let mut reader = BendlReader::open(Cursor::new(¤t_bytes)).unwrap(); - reader.validate_directory().unwrap(); - assert_eq!( - reader.assets().len(), - committed.len(), - "final directory size mismatch" - ); - for (name, want, _) in &committed { - let entry = reader.find_asset_by_name(name).cloned().unwrap(); - let got = reader.asset_bytes(&entry).unwrap(); - assert_eq!(&got, want, "final decoded payload mismatch for {name}"); - } - - // Also drive a raw seek to EOF to confirm the file is structurally sound. - let mut tail = Vec::new(); - let mut cursor = Cursor::new(¤t_bytes); - cursor.seek(SeekFrom::Start(0)).unwrap(); - cursor.read_to_end(&mut tail).unwrap(); - assert_eq!(tail.len(), current_bytes.len()); + model.assert_final_consistency(); } proptest! { diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index e52fb86..31f75ec 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1759,12 +1759,7 @@ fn ben_encode_graph_requires_input_file_not_stdin() { let out = run( "ben", - &[ - "--mode", - "encode", - "--graph", - graph_path.to_str().unwrap(), - ], + &["--mode", "encode", "--graph", graph_path.to_str().unwrap()], temp.path(), ); assert_failure(&out); @@ -1882,7 +1877,10 @@ fn ben_encode_graph_happy_path_produces_bendl() { temp.path(), ); assert_success(&extract_graph); - assert_eq!(fs::read_to_string(&recovered_graph).unwrap(), sample_graph()); + assert_eq!( + fs::read_to_string(&recovered_graph).unwrap(), + sample_graph() + ); } #[test] diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 4bf0515..8c35fb1 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -10,8 +10,8 @@ use binary_ensemble::codec::encode::{ }; use binary_ensemble::codec::BenEncodeFrame; use binary_ensemble::format::banners::{ - banner_for_variant, has_known_banner_prefix, variant_from_banner, - MKVCHAIN_BEN_BANNER, STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, + banner_for_variant, has_known_banner_prefix, variant_from_banner, MKVCHAIN_BEN_BANNER, + STANDARD_BEN_BANNER, TWODELTA_BEN_BANNER, }; use binary_ensemble::io::reader::{ BenStreamFrameReader, BenStreamReader, DecodeFrame, DecoderInitError, diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs index a026609..887f987 100644 --- a/ben/tests/test_format_stability.rs +++ b/ben/tests/test_format_stability.rs @@ -324,16 +324,16 @@ fn mint_flags_set_bendl() -> Vec { /// Returns a copy of `bytes` with reserved bits set on both the header flags and the custom /// asset's asset_flags. Used to mint the `unknown_flags.bendl` fixture from a known-good bundle. fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { - // 1. Set bit 1 of the header flags (offset 16..20). Bit 0 is HEADER_FLAG_STREAM_CHECKSUM; bit - // 1 is currently reserved. + // 1. Set bit 1 of the header flags (offset 16..20). Bit 0 is HEADER_FLAG_STREAM_CHECKSUM; bit 1 + // is currently reserved. let mut header_flags = u32::from_le_bytes(bytes[16..20].try_into().unwrap()); header_flags |= 1 << 1; bytes[16..20].copy_from_slice(&header_flags.to_le_bytes()); // 2. Add a custom asset entry's asset_flags reserved bit. Since the writer-minted bundle does // not include a custom asset, append one to the directory before flipping. Rather than - // surgery, do the simpler thing: reopen the bundle, append a custom asset via the - // appender API, then flip a reserved bit on its directory entry. + // surgery, do the simpler thing: reopen the bundle, append a custom asset via the appender + // API, then flip a reserved bit on its directory entry. let mut appender = binary_ensemble::io::bundle::writer::BendlAppender::open(Cursor::new(bytes)) .expect("open appender"); appender @@ -346,12 +346,11 @@ fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { let cursor = appender.commit().expect("commit appender"); let mut bytes = cursor.into_inner(); - // 3. Locate the custom asset's directory entry and flip bit 7 of its asset_flags. - // Directory entry layout per `BendlDirectoryEntry::to_bytes`: - // [u16 asset_type][u16 asset_flags][u16 name_len][u16 reserved][u64 payload_offset] - // [u64 payload_len][u32 checksum_len][name bytes][checksum bytes] - // asset_flags is at byte offset 2 within each entry. We scan the directory and patch the - // entry whose asset_type is ASSET_TYPE_CUSTOM. + // 3. Locate the custom asset's directory entry and flip bit 7 of its asset_flags. Directory + // entry layout per `BendlDirectoryEntry::to_bytes`: [u16 asset_type][u16 asset_flags][u16 + // name_len][u16 reserved][u64 payload_offset] [u64 payload_len][u32 checksum_len][name + // bytes][checksum bytes] asset_flags is at byte offset 2 within each entry. We scan the + // directory and patch the entry whose asset_type is ASSET_TYPE_CUSTOM. let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; let entry_count_offset = directory_offset; let entry_count = u32::from_le_bytes( @@ -363,14 +362,14 @@ fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { let mut cursor = directory_offset + 4; for _ in 0..entry_count { let asset_type = u16::from_le_bytes(bytes[cursor..cursor + 2].try_into().unwrap()); - let name_len = u16::from_le_bytes(bytes[cursor + 4..cursor + 6].try_into().unwrap()) as usize; - let checksum_len = u32::from_le_bytes(bytes[cursor + 24..cursor + 28].try_into().unwrap()) - as usize; + let name_len = + u16::from_le_bytes(bytes[cursor + 4..cursor + 6].try_into().unwrap()) as usize; + let checksum_len = + u32::from_le_bytes(bytes[cursor + 24..cursor + 28].try_into().unwrap()) as usize; if asset_type == ASSET_TYPE_CUSTOM { let flags_offset = cursor + 2; - let mut asset_flags = u16::from_le_bytes( - bytes[flags_offset..flags_offset + 2].try_into().unwrap(), - ); + let mut asset_flags = + u16::from_le_bytes(bytes[flags_offset..flags_offset + 2].try_into().unwrap()); asset_flags |= 1 << 7; // currently reserved bytes[flags_offset..flags_offset + 2].copy_from_slice(&asset_flags.to_le_bytes()); return bytes; diff --git a/ben/tests/test_ops_equivalence_proptest.rs b/ben/tests/test_ops_equivalence_proptest.rs index c3c2dfa..eeec6c5 100644 --- a/ben/tests/test_ops_equivalence_proptest.rs +++ b/ben/tests/test_ops_equivalence_proptest.rs @@ -4,13 +4,13 @@ //! (`translate` direction). The complementary properties here pin the algebraic identities of //! the post-decode operations: //! -//! - **relabel composition:** for any node permutation `P` of length `L`, -//! `relabel(P^-1, relabel(P, x)) == x`. -//! - **extract correctness:** for any sample index `i` in `1..=N`, -//! `extract(i, encode(x)) == x[i-1]`. -//! - **convert variant round-trip:** for any variant pair `(A, B)`, -//! `convert(A, convert(B, x)) == x` (compared at the decoded-assignment level, since BEN -//! variants differ in frame structure and counts but not assignment data). +//! - **relabel composition:** for any node permutation `P` of length `L`, `relabel(P^-1, relabel(P, +//! x)) == x`. +//! - **extract correctness:** for any sample index `i` in `1..=N`, `extract(i, encode(x)) == +//! x[i-1]`. +//! - **convert variant round-trip:** for any variant pair `(A, B)`, `convert(A, convert(B, x)) == +//! x` (compared at the decoded-assignment level, since BEN variants differ in frame structure and +//! counts but not assignment data). use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::codec::encode::encode_jsonl_to_ben; @@ -26,12 +26,7 @@ use std::io::{BufReader, Cursor, Write}; fn jsonl_from(seq: &[Vec]) -> Vec { let mut buf = Vec::new(); for (i, a) in seq.iter().enumerate() { - writeln!( - &mut buf, - "{}", - json!({"assignment": a, "sample": i + 1}) - ) - .unwrap(); + writeln!(&mut buf, "{}", json!({"assignment": a, "sample": i + 1})).unwrap(); } buf } @@ -42,10 +37,7 @@ fn strat_fixed_length_seq( len: usize, max_samples: usize, ) -> impl Strategy>> { - prop::collection::vec( - prop::collection::vec(1u16..=max_val, len), - 1..=max_samples, - ) + prop::collection::vec(prop::collection::vec(1u16..=max_val, len), 1..=max_samples) } /// Invert a permutation `P` (new_idx → old_idx). The inverse maps `old_idx → new_idx`. Given diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 3bcf0f0..deb6a59 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -17,6 +17,7 @@ use binary_ensemble::io::bundle::{BendlReadError, BendlReader, ChecksumError, Ch use binary_ensemble::io::reader::BenStreamReader; use binary_ensemble::io::writer::BenStreamWriter; use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; +use binary_ensemble::test_utils::{BendlBytes, DirectoryEntryField, HeaderField}; use binary_ensemble::BenVariant; use std::cell::RefCell; use std::collections::HashMap; @@ -65,8 +66,10 @@ fn minimal_bendl_with_entries( bytes } -fn expect_bendl_open_err(bytes: Vec) -> binary_ensemble::io::bundle::format::BendlFormatError { - match BendlReader::open(Cursor::new(bytes)) { +fn expect_bendl_open_err( + bytes: impl Into>, +) -> binary_ensemble::io::bundle::format::BendlFormatError { + match BendlReader::open(Cursor::new(bytes.into())) { Ok(_) => panic!("expected BendlReader::open to fail"), Err(err) => err, } @@ -715,7 +718,8 @@ fn valid_bendl_seed() -> Vec { /// Open the bundle and drive every public read accessor. Any panic from any reader path fails the /// test loudly. Errors are expected (the input is adversarial) and are silently discarded; only /// panics matter here. -fn assert_bendl_bytes_do_not_panic(bytes: Vec) { +fn assert_bendl_bytes_do_not_panic(bytes: impl Into>) { + let bytes = bytes.into(); let outcome = std::panic::catch_unwind(|| { let mut reader = match BendlReader::open(Cursor::new(bytes)) { Ok(r) => r, @@ -826,97 +830,75 @@ fn seeded_malformed_bendl_bytes_do_not_panic() { assert_bendl_bytes_do_not_panic(mutated); } - // Length-field inflation seeds. Header field offsets per the v1.0.0 spec: - // bytes 16..20 : flags (u32) - // bytes 20..24 : stream_checksum (u32) - // bytes 24..32 : directory_offset (u64) - // bytes 32..40 : directory_len (u64) - // bytes 40..48 : stream_offset (u64) - // bytes 48..56 : stream_len (u64) - let make_inflated = |range: std::ops::Range, value: u64| -> Vec { - let len = range.end - range.start; - let mut bytes = seed.clone(); - bytes[range].copy_from_slice(&value.to_le_bytes()[..len]); - bytes + // Header length-field inflation. Each fixture patches one named header field; the capped + // values keep the "value far past end of input" paths reachable without turning this into an + // OOM stress test. + let inflate_header = |field: HeaderField, value: u64| { + BendlBytes::new(seed.clone()).with_header_u64(field, value) }; // directory_offset past EOF. - assert_bendl_bytes_do_not_panic(make_inflated(24..32, u64::MAX)); + assert_bendl_bytes_do_not_panic(inflate_header(HeaderField::DirectoryOffset, u64::MAX)); // directory_len past EOF (capped to avoid OOM if the implementation pre-allocates). - assert_bendl_bytes_do_not_panic(make_inflated(32..40, ADVERSARIAL_LEN_CAP as u64)); + assert_bendl_bytes_do_not_panic(inflate_header( + HeaderField::DirectoryLen, + ADVERSARIAL_LEN_CAP as u64, + )); // stream_offset past EOF. - assert_bendl_bytes_do_not_panic(make_inflated(40..48, u64::MAX)); + assert_bendl_bytes_do_not_panic(inflate_header(HeaderField::StreamOffset, u64::MAX)); // stream_len past EOF (capped). - assert_bendl_bytes_do_not_panic(make_inflated(48..56, ADVERSARIAL_LEN_CAP as u64)); + assert_bendl_bytes_do_not_panic(inflate_header( + HeaderField::StreamLen, + ADVERSARIAL_LEN_CAP as u64, + )); // stream_offset + stream_len overflowing u64. - let mut overflow_bundle = seed.clone(); - overflow_bundle[40..48].copy_from_slice(&(u64::MAX - 1).to_le_bytes()); - overflow_bundle[48..56].copy_from_slice(&u64::MAX.to_le_bytes()); - assert_bendl_bytes_do_not_panic(overflow_bundle); - // Reserved header flag bits set. - assert_bendl_bytes_do_not_panic(make_inflated(16..20, u32::MAX as u64)); - // Non-zero alignment_padding at bytes 14..16; we don't have a make_inflated for u16 so do it - // inline. - let mut padded = seed.clone(); - padded[14..16].copy_from_slice(&u16::MAX.to_le_bytes()); - assert_bendl_bytes_do_not_panic(padded); - - // Directory-entry length-field inflation. The directory starts at directory_offset and begins - // with a u32 entry_count followed by the entries themselves. Each entry header is 28 bytes: - // u16 asset_type | u16 asset_flags | u16 name_len | u16 reserved - // u64 payload_offset | u64 payload_len | u32 checksum_len - let directory_offset = u64::from_le_bytes(seed[24..32].try_into().unwrap()) as usize; - let entry_count = u32::from_le_bytes( - seed[directory_offset..directory_offset + 4] - .try_into() - .unwrap(), + assert_bendl_bytes_do_not_panic( + BendlBytes::new(seed.clone()) + .with_header_u64(HeaderField::StreamOffset, u64::MAX - 1) + .with_header_u64(HeaderField::StreamLen, u64::MAX), ); + // Reserved header flag bits set. + assert_bendl_bytes_do_not_panic(inflate_header(HeaderField::Flags, u32::MAX as u64)); + // Non-zero alignment_padding (writers zero it; readers must ignore non-zero bytes there). + assert_bendl_bytes_do_not_panic(inflate_header( + HeaderField::AlignmentPadding, + u16::MAX as u64, + )); + + // Directory-entry length-field inflation: walk each entry and inflate its per-entry length + // fields one at a time, plus an inflated entry count. + let entry_count = BendlBytes::new(seed.clone()).entry_count(); assert!(entry_count > 0, "valid_bendl_seed must contain entries"); // entry_count inflation (capped to keep test runtime bounded — the reader must not try to // pre-allocate a Vec with u32::MAX capacity, but we don't want to find out the hard way here). - let mut inflated_entry_count = seed.clone(); - inflated_entry_count[directory_offset..directory_offset + 4] - .copy_from_slice(&ADVERSARIAL_LEN_CAP.to_le_bytes()); - assert_bendl_bytes_do_not_panic(inflated_entry_count); - - // Walk each entry and inflate its per-entry length fields one at a time. - let mut entry_cursor = directory_offset + 4; - for _ in 0..entry_count { - let name_len = - u16::from_le_bytes(seed[entry_cursor + 4..entry_cursor + 6].try_into().unwrap()) - as usize; - let checksum_len = u32::from_le_bytes( - seed[entry_cursor + 24..entry_cursor + 28] - .try_into() - .unwrap(), - ) as usize; - let entry_size = 28 + name_len + checksum_len; + assert_bendl_bytes_do_not_panic( + BendlBytes::new(seed.clone()).with_entry_count(ADVERSARIAL_LEN_CAP), + ); - // name_len inflation (capped). - let mut inflated = seed.clone(); - inflated[entry_cursor + 4..entry_cursor + 6] - .copy_from_slice(&(ADVERSARIAL_LEN_CAP as u16).to_le_bytes()); - assert_bendl_bytes_do_not_panic(inflated); + for index in 0..entry_count as usize { + let inflate_entry = |field: DirectoryEntryField, value: u64| { + BendlBytes::new(seed.clone()).with_directory_entry_field(index, field, value) + }; + // name_len inflation (capped). + assert_bendl_bytes_do_not_panic(inflate_entry( + DirectoryEntryField::NameLen, + ADVERSARIAL_LEN_CAP as u64, + )); // checksum_len inflation (capped). - let mut inflated = seed.clone(); - inflated[entry_cursor + 24..entry_cursor + 28] - .copy_from_slice(&ADVERSARIAL_LEN_CAP.to_le_bytes()); - assert_bendl_bytes_do_not_panic(inflated); - - // payload_len inflation to u64::MAX. ExactLen at read time, plus the per-frame decode - // cap, prevent any actual allocation. - let mut inflated = seed.clone(); - inflated[entry_cursor + 16..entry_cursor + 24].copy_from_slice(&u64::MAX.to_le_bytes()); - assert_bendl_bytes_do_not_panic(inflated); - + assert_bendl_bytes_do_not_panic(inflate_entry( + DirectoryEntryField::ChecksumLen, + ADVERSARIAL_LEN_CAP as u64, + )); + // payload_len inflation to u64::MAX. ExactLen at read time, plus the per-frame decode cap, + // prevent any actual allocation. + assert_bendl_bytes_do_not_panic(inflate_entry(DirectoryEntryField::PayloadLen, u64::MAX)); // payload_offset past EOF. - let mut inflated = seed.clone(); - inflated[entry_cursor + 8..entry_cursor + 16].copy_from_slice(&u64::MAX.to_le_bytes()); - assert_bendl_bytes_do_not_panic(inflated); - - entry_cursor += entry_size; + assert_bendl_bytes_do_not_panic(inflate_entry( + DirectoryEntryField::PayloadOffset, + u64::MAX, + )); } } @@ -930,9 +912,9 @@ fn bendl_open_rejects_directory_offset_past_eof() { // directory_offset claims a position well past the actual file. Cursor seek succeeds (its // position is u64) but the subsequent read returns Ok(0); read_directory's read_exact for the // entry count fails with UnexpectedEof, which becomes BendlFormatError::Io. - let mut bytes = valid_bendl_seed(); - let past_eof = (bytes.len() as u64) + 4096; - bytes[24..32].copy_from_slice(&past_eof.to_le_bytes()); + let seed = valid_bendl_seed(); + let past_eof = seed.len() as u64 + 4096; + let bytes = BendlBytes::new(seed).with_header_u64(HeaderField::DirectoryOffset, past_eof); let err = expect_bendl_open_err(bytes); assert!( matches!(err, BendlFormatError::Io(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof), @@ -944,9 +926,9 @@ fn bendl_open_rejects_directory_offset_past_eof() { fn bendl_open_rejects_directory_offset_plus_directory_len_overflow() { // directory_offset + directory_len overflows u64. The reader has no chance to read anything at // u64::MAX - 4; the failure surface is the same UnexpectedEof from the bounded read attempt. - let mut bytes = valid_bendl_seed(); - bytes[24..32].copy_from_slice(&(u64::MAX - 4).to_le_bytes()); - bytes[32..40].copy_from_slice(&100u64.to_le_bytes()); + let bytes = BendlBytes::new(valid_bendl_seed()) + .with_header_u64(HeaderField::DirectoryOffset, u64::MAX - 4) + .with_header_u64(HeaderField::DirectoryLen, 100); let err = expect_bendl_open_err(bytes); assert!( matches!(err, BendlFormatError::Io(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof), @@ -967,13 +949,13 @@ fn bendl_open_rejects_name_len_longer_than_remaining_directory_bytes() { payload_len: 0, checksum: None, }]; - let mut bytes = minimal_bendl_with_entries(entries, 0); - - // Directory layout in the bundle starts at HEADER_SIZE: [u32 count][entry_header (28 bytes) - // including u16 name_len at offset +4][name bytes][checksum bytes]. - // Patch name_len from 2 to a huge value that exceeds the directory's declared length. - let name_len_offset = HEADER_SIZE + 4 + 4; - bytes[name_len_offset..name_len_offset + 2].copy_from_slice(&u16::MAX.to_le_bytes()); + // Patch the sole entry's name_len from 2 to a huge value that exceeds the directory's declared + // length, so read_exact for the name buffer fails inside the bounded directory region. + let bytes = BendlBytes::new(minimal_bendl_with_entries(entries, 0)).with_directory_entry_field( + 0, + DirectoryEntryField::NameLen, + u16::MAX as u64, + ); let err = expect_bendl_open_err(bytes); assert!( @@ -992,19 +974,17 @@ fn bendl_unknown_header_flag_bits_are_ignored() { // Forward-compat contract: bits 1..31 of `flags` are reserved. Setting them on a finalized // bundle must not change anything observable — open succeeds, directory entries are intact, // verify_stream_checksum passes, asset access works. - let mut bytes = valid_bendl_seed(); - let flags_offset = 16; - let original_flags = - u32::from_le_bytes(bytes[flags_offset..flags_offset + 4].try_into().unwrap()); + let seed = BendlBytes::new(valid_bendl_seed()); + let original_flags = seed.header_u64(HeaderField::Flags) as u32; assert!( original_flags & HEADER_FLAG_STREAM_CHECKSUM != 0, "seed must have STREAM_CHECKSUM set; otherwise this test is testing the wrong contract" ); let polluted_flags = original_flags | (1u32 << 5) | (1u32 << 31); - bytes[flags_offset..flags_offset + 4].copy_from_slice(&polluted_flags.to_le_bytes()); + let bytes = seed.with_header_u64(HeaderField::Flags, polluted_flags as u64); - let mut reader = - BendlReader::open(Cursor::new(bytes)).expect("unknown flag bits must not block open"); + let mut reader = BendlReader::open(Cursor::new(bytes.into_bytes())) + .expect("unknown flag bits must not block open"); assert!(reader.is_finalized()); assert_eq!( reader.assets().len(), @@ -1035,15 +1015,13 @@ fn bendl_clear_stream_checksum_flag_with_nonzero_bytes_returns_unavailable_not_m // but leaving non-zero garbage in the stream_checksum slot — a buggy reader that interpreted // bytes 20..24 unconditionally would return Mismatch (since the garbage would not match the // actual CRC). - let mut bytes = valid_bendl_seed(); - let flags_offset = 16; - let cleared_flags = - u32::from_le_bytes(bytes[flags_offset..flags_offset + 4].try_into().unwrap()) - & !HEADER_FLAG_STREAM_CHECKSUM; - bytes[flags_offset..flags_offset + 4].copy_from_slice(&cleared_flags.to_le_bytes()); - bytes[20..24].copy_from_slice(&0xDEADBEEFu32.to_le_bytes()); + let seed = BendlBytes::new(valid_bendl_seed()); + let cleared_flags = seed.header_u64(HeaderField::Flags) as u32 & !HEADER_FLAG_STREAM_CHECKSUM; + let bytes = seed + .with_header_u64(HeaderField::Flags, cleared_flags as u64) + .with_header_u64(HeaderField::StreamChecksum, 0xDEADBEEF); - let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + let mut reader = BendlReader::open(Cursor::new(bytes.into_bytes())).expect("open must succeed"); let expect_unavailable = |result: Result<_, BendlReadError>| match result { Err(BendlReadError::Checksum(ChecksumError::Unavailable { @@ -1070,10 +1048,10 @@ fn bendl_nonzero_alignment_padding_is_ignored() { // alignment_padding occupies bytes 14..16. Writers zero it; readers must ignore non-zero bytes // there. Forward-compat insurance: a future writer that accidentally stamps something into the // padding region must not break readers. - let mut bytes = valid_bendl_seed(); - bytes[14..16].copy_from_slice(&u16::MAX.to_le_bytes()); + let bytes = BendlBytes::new(valid_bendl_seed()) + .with_header_u64(HeaderField::AlignmentPadding, u16::MAX as u64); - let mut reader = BendlReader::open(Cursor::new(bytes)) + let mut reader = BendlReader::open(Cursor::new(bytes.into_bytes())) .expect("non-zero alignment_padding must not block open"); reader .verify_stream_checksum() @@ -1093,11 +1071,11 @@ fn bendl_stream_offset_plus_stream_len_overflow_surfaces_short_range() { // read; verify_stream_checksum returns BendlReadError::Io(UnexpectedEof); // open_assignment_reader either fails at construction or surfaces UnexpectedEof during // iteration; assignment_stream_reader_unverified surfaces UnexpectedEof on read. - let mut bytes = valid_bendl_seed(); - bytes[40..48].copy_from_slice(&(u64::MAX - 5).to_le_bytes()); - bytes[48..56].copy_from_slice(&u64::MAX.to_le_bytes()); + let bytes = BendlBytes::new(valid_bendl_seed()) + .with_header_u64(HeaderField::StreamOffset, u64::MAX - 5) + .with_header_u64(HeaderField::StreamLen, u64::MAX); - let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + let mut reader = BendlReader::open(Cursor::new(bytes.into_bytes())).expect("open must succeed"); match reader.verify_stream_checksum() { Err(BendlReadError::Io(ref e)) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), @@ -1146,11 +1124,11 @@ fn bendl_stream_offset_plus_stream_len_overflow_surfaces_short_range() { fn bendl_stream_offset_past_eof_surfaces_short_range() { // stream_offset alone points past EOF. Same surface contract as the overflow case — open // succeeds; every stream accessor reports UnexpectedEof on read. - let mut bytes = valid_bendl_seed(); - let past_eof = (bytes.len() as u64) + 4096; - bytes[40..48].copy_from_slice(&past_eof.to_le_bytes()); + let seed = valid_bendl_seed(); + let past_eof = seed.len() as u64 + 4096; + let bytes = BendlBytes::new(seed).with_header_u64(HeaderField::StreamOffset, past_eof); - let mut reader = BendlReader::open(Cursor::new(bytes)).expect("open must succeed"); + let mut reader = BendlReader::open(Cursor::new(bytes.into_bytes())).expect("open must succeed"); match reader.verify_stream_checksum() { Err(BendlReadError::Io(ref e)) => assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof), From c53baf05ed5b782cc593fe4faee4e8c7817be814 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 16:12:56 -0600 Subject: [PATCH 129/221] make clippy happy --- ben/src/cli/ben/bundle.rs | 28 +++++++++++-------------- ben/src/cli/bendl/tests.rs | 2 +- ben/src/cli/common/error.rs | 2 +- ben/src/cli/pcben/translate.rs | 4 ++-- ben/src/codec/decode/jsonl.rs | 4 +--- ben/src/codec/decode/tests/mkvchain.rs | 5 +++++ ben/src/codec/decode/tests/standard.rs | 5 +++++ ben/src/codec/decode/tests/twodelta.rs | 7 ++++++- ben/src/codec/decode/xz.rs | 4 +--- ben/src/codec/encode/tests.rs | 13 ++++++++---- ben/src/codec/encode/twodelta.rs | 5 ++--- ben/src/codec/frames/tests.rs | 8 +++++-- ben/src/codec/translate/tests.rs | 21 +++++++------------ ben/src/io/bundle/reader.rs | 4 ++-- ben/src/io/bundle/tests/reader.rs | 25 +++++++++++----------- ben/src/io/bundle/tests/writer.rs | 15 +++++++------ ben/src/io/bundle/writer.rs | 10 ++++----- ben/src/io/reader/errors.rs | 2 +- ben/src/io/reader/stream_reader/xben.rs | 17 ++++++++------- ben/src/io/reader/subsample.rs | 4 +--- ben/src/io/reader/tests.rs | 11 +++++----- ben/src/io/writer/tests.rs | 18 ++++++++-------- ben/src/json/graph/mlc.rs | 8 +++---- ben/src/json/graph/tests/test_io.rs | 4 ++-- ben/src/ops/extract/tests.rs | 5 +++++ ben/src/ops/relabel/tests.rs | 14 ++++++------- ben/tests/test_assignment_reader.rs | 8 +++---- ben/tests/test_coverage.rs | 21 +++++++++---------- ben/tests/test_impls_pipeline.rs | 11 +++++----- ben/tests/test_pipeline.rs | 2 +- ben/tests/test_stress_edges.rs | 11 +++++----- 31 files changed, 152 insertions(+), 146 deletions(-) diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 47eb6da..dd92ed9 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -16,15 +16,14 @@ use std::path::Path; pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { eprintln!("Adding graph..."); let graph_bytes = std::fs::read(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, + io::Error::other( format!("failed to read graph {graph_path:?}: {e}"), ) })?; let file = OpenOptions::new().read(true).write(true).open(out_path)?; let mut appender = BendlAppender::open(file) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; appender .add_asset( ASSET_TYPE_GRAPH, @@ -33,14 +32,13 @@ pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<() AddAssetOptions::defaults().json(), ) .map_err(|e| { - io::Error::new( - io::ErrorKind::Other, + io::Error::other( format!("failed to add graph asset: {e}"), ) })?; appender .commit() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; Ok(()) } @@ -55,8 +53,7 @@ pub(super) fn run_encode_bundle_with_graph( // Validate the graph file is readable before we do any real work, so a bad --graph path doesn't // leave a half-written bundle behind. std::fs::metadata(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, + io::Error::other( format!("failed to stat graph {graph_path:?}: {e}"), ) })?; @@ -65,10 +62,10 @@ pub(super) fn run_encode_bundle_with_graph( let out_file = File::create(out_path)?; let bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Ben) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; let mut session = bendl_writer .into_stream_session() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; { let reader = BufReader::new(File::open(input_path)?); encode_jsonl_to_ben(reader, &mut session, variant)?; @@ -76,7 +73,7 @@ pub(super) fn run_encode_bundle_with_graph( let bendl_writer = session.finish_into_writer(sample_count); bendl_writer .finish() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; append_graph_asset(out_path, graph_path) } @@ -96,8 +93,7 @@ pub(super) fn run_xencode_bundle_with_graph( graph_path: &Path, ) -> Result<()> { std::fs::metadata(graph_path).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, + io::Error::other( format!("failed to stat graph {graph_path:?}: {e}"), ) })?; @@ -110,10 +106,10 @@ pub(super) fn run_xencode_bundle_with_graph( let out_file = File::create(out_path)?; let bendl_writer = BendlWriter::new(out_file, AssignmentFormat::Xben) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; let mut session = bendl_writer .into_stream_session() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; { let reader = BufReader::new(File::open(input_path)?); if from_ben { @@ -140,7 +136,7 @@ pub(super) fn run_xencode_bundle_with_graph( let bendl_writer = session.finish_into_writer(sample_count); bendl_writer .finish() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{e}")))?; + .map_err(|e| io::Error::other(format!("{e}")))?; append_graph_asset(out_path, graph_path) } diff --git a/ben/src/cli/bendl/tests.rs b/ben/src/cli/bendl/tests.rs index 041fb8a..5629f82 100644 --- a/ben/src/cli/bendl/tests.rs +++ b/ben/src/cli/bendl/tests.rs @@ -234,7 +234,7 @@ fn run_inspect_unknown_format_and_no_sample_count() { header[56..64].copy_from_slice(&sample_count.to_le_bytes()); let path = unique_path("inspect_unknown.bendl"); - std::fs::write(&path, &header).unwrap(); + std::fs::write(&path, header).unwrap(); run_inspect(InspectArgs { input: path.clone(), }) diff --git a/ben/src/cli/common/error.rs b/ben/src/cli/common/error.rs index eb6cf14..ed6832f 100644 --- a/ben/src/cli/common/error.rs +++ b/ben/src/cli/common/error.rs @@ -102,7 +102,7 @@ mod tests { #[test] fn io_source_propagates() { use std::error::Error; - let original = io::Error::new(io::ErrorKind::Other, "deep"); + let original = io::Error::other("deep"); let cli = CliError::Io(original); assert!(cli.source().is_some()); } diff --git a/ben/src/cli/pcben/translate.rs b/ben/src/cli/pcben/translate.rs index b86ff1c..ef09537 100644 --- a/ben/src/cli/pcben/translate.rs +++ b/ben/src/cli/pcben/translate.rs @@ -57,7 +57,7 @@ pub(super) fn assignment_encode_ben( let assignment: Vec = serde_json::from_str::>(&line.unwrap()) .unwrap() .into_iter() - .map(|x| x as u16 + 1) + .map(|x| x + 1) .collect(); ben_writer.write_assignment(assignment)?; } @@ -78,7 +78,7 @@ pub(super) fn assignment_encode_xben( let assignment: Vec = serde_json::from_str::>(&line.unwrap()) .unwrap() .into_iter() - .map(|x| x as u16 + 1) + .map(|x| x + 1) .collect(); xben_writer.write_json_value(json!({ "assignment": assignment }))?; } diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index f9c68c5..2a7e25d 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -41,9 +41,7 @@ pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> i let mut first_buffer = [0u8; BANNER_LEN]; - if let Err(e) = decoder.read_exact(&mut first_buffer) { - return Err(e); - } + decoder.read_exact(&mut first_buffer)?; let variant = match variant_from_banner(&first_buffer) { Some(BenVariant::Standard) => BenVariant::Standard, diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs index 5d56864..881b80e 100644 --- a/ben/src/codec/decode/tests/mkvchain.rs +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -1,3 +1,8 @@ +// Binary literals here are grouped by BEN bit-field boundaries (e.g. `0b01100_100` is a 5-bit +// value followed by a 3-bit value), not by even nibbles, so the grouping documents the packed +// layout under test. +#![allow(clippy::unusual_byte_groupings)] + use crate::codec::decode::jsonl_decode_ben32; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use crate::codec::encode::{encode_ben_to_xben, xz_compress}; diff --git a/ben/src/codec/decode/tests/standard.rs b/ben/src/codec/decode/tests/standard.rs index 34c8dba..4ea688d 100644 --- a/ben/src/codec/decode/tests/standard.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -1,3 +1,8 @@ +// Binary literals here are grouped by BEN bit-field boundaries (e.g. `0b01100_100` is a 5-bit +// value followed by a 3-bit value), not by even nibbles, so the grouping documents the packed +// layout under test. +#![allow(clippy::unusual_byte_groupings)] + use crate::codec::decode::jsonl_decode_ben32; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use crate::codec::encode::xz_compress; diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index 2de7a85..679a5c4 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -1,3 +1,8 @@ +// Binary literals here are grouped by BEN bit-field boundaries (e.g. `0b01100_100` is a 5-bit +// value followed by a 3-bit value), not by even nibbles, so the grouping documents the packed +// layout under test. +#![allow(clippy::unusual_byte_groupings)] + use crate::codec::decode::{ apply_twodelta_runs_to_assignment, decode_ben_to_jsonl, decode_twodelta_frame, decode_xben_to_jsonl, @@ -480,7 +485,7 @@ fn decode_ben_to_jsonl_three_frames_byte_level() { #[test] fn decode_xben_to_jsonl_twodelta_anchor_only() { let anchor = vec![1u16, 2, 1, 2]; - let ben = make_twodelta_ben(&[anchor.clone()]); + let ben = make_twodelta_ben(std::slice::from_ref(&anchor)); let mut xben = Vec::new(); encode_ben_to_xben( BufReader::new(ben.as_slice()), diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index 5bbf80d..c5f21be 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -25,9 +25,7 @@ pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io: let mut first_buffer = [0u8; BANNER_LEN]; - if let Err(e) = decoder.read_exact(&mut first_buffer) { - return Err(e); - } + decoder.read_exact(&mut first_buffer)?; let variant: XBenVariant = match variant_from_banner(&first_buffer) { Some(BenVariant::Standard) => { diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 4d7af9c..7d56d4b 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -1,3 +1,8 @@ +// Binary literals here are grouped by BEN bit-field boundaries (e.g. `0b01100_100` is a 5-bit +// value followed by a 3-bit value), not by even nibbles, so the grouping documents the packed +// layout under test. +#![allow(clippy::unusual_byte_groupings)] + use super::*; use crate::codec::frames::BenEncodeFrame; use crate::util::rle::rle_to_vec; @@ -1365,7 +1370,7 @@ fn mkvchain_round_trip_with_label_zero() { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!(decoded, assignments); @@ -1414,7 +1419,7 @@ fn twodelta_round_trip_with_label_zero_pairs() { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!( @@ -1450,7 +1455,7 @@ fn twodelta_round_trip_all_zero_assignment() { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!(decoded, assignments); @@ -1485,7 +1490,7 @@ fn assert_ben_round_trip(assignment: Vec, variant: BenVariant) { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); let expected = if matches!(variant, BenVariant::TwoDelta) { diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 504d2be..8fba898 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -75,11 +75,10 @@ pub(crate) fn encode_twodelta_frame_with_hint( })); } - if delta_pair.is_some() { + if let Some(pair) = delta_pair { if previous_masks.is_none() { return Err(Error::from(EncodeError::TwoDeltaHintWithoutMasks)); } - let pair = delta_pair.unwrap(); if pair.0 == pair.1 { return Err(Error::from(EncodeError::TwoDeltaIdenticalPairHint { value: pair.0, @@ -377,7 +376,7 @@ fn construct_twodelta_frame_from_mask_hint( } } - return Err(Error::from(EncodeError::TwoDeltaIdentical)); + Err(Error::from(EncodeError::TwoDeltaIdentical)) } /// Build a TwoDelta frame by scanning both assignment vectors from scratch, with no hints from the diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index 25c1036..c40bee5 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -78,7 +78,11 @@ fn unwrap_encode_standard(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u3 } } -fn unwrap_encode_mkv(frame: BenEncodeFrame) -> (Vec<(u16, u16)>, u8, u8, u32, Vec, u16) { +/// The destructured fields of a [`BenEncodeFrame::MkvChain`] arm: `(runs, max_val_bit_count, +/// max_len_bit_count, n_bytes, raw_bytes, count)`. +type MkvChainEncodeFields = (Vec<(u16, u16)>, u8, u8, u32, Vec, u16); + +fn unwrap_encode_mkv(frame: BenEncodeFrame) -> MkvChainEncodeFields { match frame { BenEncodeFrame::MkvChain { runs, @@ -621,7 +625,7 @@ fn encode_partial_eq_vec_both_directions() { #[test] fn decode_expand_standard_assignment() { // An assignment of [1, 1, 2, 2, 3] becomes RLE [(1,2),(2,2),(3,1)]. - let encoded = BenEncodeFrame::from_assignment(&[1u16, 1, 2, 2, 3], BenVariant::Standard, None); + let encoded = BenEncodeFrame::from_assignment([1u16, 1, 2, 2, 3], BenVariant::Standard, None); let mut cursor = io::Cursor::new(encoded.into_bytes()); let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index ac0d335..705f44a 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -75,9 +75,8 @@ fn test_simple_translation_ben32_to_ben() { let mut output: Vec = Vec::new(); let mut writer = &mut output; - if let Err(_) = translate_ben32_to_ben_file(&mut reader, &mut writer) { - assert!(false) - } + translate_ben32_to_ben_file(&mut reader, &mut writer) + .expect("ben32-to-ben translation should succeed"); let mut buffer: Vec = Vec::new(); let writer2 = &mut buffer; @@ -129,9 +128,8 @@ fn test_random_translation_ben32_to_ben() { let mut output: Vec = Vec::new(); let mut writer = &mut output; - if let Err(_) = translate_ben32_to_ben_file(&mut reader, &mut writer) { - assert!(false) - } + translate_ben32_to_ben_file(&mut reader, &mut writer) + .expect("ben32-to-ben translation should succeed"); let mut buffer: Vec = Vec::new(); let writer2 = &mut buffer; @@ -167,10 +165,8 @@ fn test_simple_translation_ben_to_ben32() { let mut output: Vec = Vec::new(); let mut writer = &mut output; - if let Err(e) = translate_ben_to_ben32_file(&mut reader, &mut writer) { - eprintln!("{:?}", e); - assert!(false) - } + translate_ben_to_ben32_file(&mut reader, &mut writer) + .expect("ben-to-ben32 translation should succeed"); let mut buffer: Vec = Vec::new(); let writer2 = &mut buffer; @@ -222,9 +218,8 @@ fn test_random_translation_ben_to_ben32() { let mut output: Vec = Vec::new(); let mut writer = &mut output; - if let Err(_) = translate_ben_to_ben32_file(&mut reader, &mut writer) { - assert!(false) - } + translate_ben_to_ben32_file(&mut reader, &mut writer) + .expect("ben-to-ben32 translation should succeed"); let mut buffer: Vec = Vec::new(); let writer2 = &mut buffer; diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 634f116..2d78f9b 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -214,7 +214,7 @@ impl BendlReader { &mut self, ) -> Result, BendlReadError> { let expected = self.require_stream_checksum()?; - let format = self.assignment_format().ok_or_else(|| { + let format = self.assignment_format().ok_or({ BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat( self.header.assignment_format, )) @@ -242,7 +242,7 @@ impl BendlReader { where R: Send, { - let format = self.assignment_format().ok_or_else(|| { + let format = self.assignment_format().ok_or({ BendlReadError::Format(BendlFormatError::UnknownAssignmentFormat( self.header.assignment_format, )) diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 6199e29..011a82e 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -42,7 +42,7 @@ fn build_finalized_bundle() -> (Vec, Vec, Vec, Vec) { // [directory_offset .. EOF) directory let mut bundle = Vec::new(); // Reserve space for header; fill later. - bundle.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bundle.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let graph_offset = bundle.len() as u64; bundle.extend_from_slice(&compressed_graph); @@ -262,7 +262,7 @@ fn validate_directory_catches_wrong_canonical_name() { /// validation pitfalls. Useful as a base that tests can mutate byte-by-byte. fn build_basic_finalized_bundle() -> Vec { let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); // One raw metadata asset right after the header. let metadata_payload = br#"{"k":"v"}"#.to_vec(); @@ -614,7 +614,7 @@ fn stress_many_custom_assets_round_trip() { const N: usize = 200; let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let mut entries = Vec::with_capacity(N); let mut expected = Vec::with_capacity(N); @@ -679,7 +679,7 @@ fn xz_flagged_asset_with_corrupt_payload_surfaces_io_error() { // Hand-build a bundle with a single asset flagged ASSET_FLAG_XZ whose payload bytes are not a // valid xz container. `asset_bytes` must surface an io::Error rather than panicking. let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let bad_payload = vec![0xFFu8, 0xFE, 0xFD, 0xFC, 0xFB]; let payload_offset = bytes.len() as u64; @@ -839,7 +839,7 @@ use crate::io::bundle::error::{BendlReadError, ChecksumError, ChecksumTarget}; /// Returns `(bundle_bytes, asset_name, directory_offset, payload_offset)` for hand-patching tests. fn make_single_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, u64, u64) { let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(payload); @@ -888,7 +888,7 @@ fn make_single_xz_asset_bundle(name: &str, payload: &[u8]) -> (Vec, String, let compressed = encoder.finish().unwrap(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(&compressed); @@ -1023,7 +1023,7 @@ fn verify_asset_checksum_returns_unavailable_when_flag_clear() { // Hand-build a foreign bundle whose entry has the flag clear. let payload = b"orphan".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(&payload); let stream_offset = bytes.len() as u64; @@ -1151,7 +1151,7 @@ fn asset_bytes_returns_unavailable_when_flag_clear() { // Same hand-built foreign bundle as in the verifier test. let payload = b"orphan".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(&payload); let directory_offset = bytes.len() as u64; @@ -1236,7 +1236,7 @@ fn verify_all_asset_checksums_reports_first_mismatch_in_directory_order() { let p1 = b"first".to_vec(); let p2 = b"second".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let off1 = bytes.len() as u64; bytes.extend_from_slice(&p1); let off2 = bytes.len() as u64; @@ -1336,7 +1336,7 @@ fn crc32c_polynomial_pin_against_known_vectors() { fn make_unflagged_stream_bundle() -> Vec { let fake_stream = b"hello stream".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let stream_offset = bytes.len() as u64; bytes.extend_from_slice(&fake_stream); let directory_offset = bytes.len() as u64; @@ -1566,8 +1566,7 @@ struct FailWhenArmed { impl Read for FailWhenArmed { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { if self.armed.load(std::sync::atomic::Ordering::SeqCst) { - return Err(std::io::Error::new( - std::io::ErrorKind::Other, + return Err(std::io::Error::other( "forced read failure", )); } @@ -1894,7 +1893,7 @@ fn asset_with_unknown_flag_bit_opens_and_verifies_checksum() { let payload = b"asset bytes with reserved bit".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(&payload); diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index eb55c08..2a87ca7 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -537,7 +537,7 @@ fn bundle_ben_stream_round_trips_through_assignment_reader() { .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) + std::iter::repeat_n(assign, count as usize) }) .collect(); assert_eq!(decoded, samples); @@ -579,7 +579,7 @@ fn bundle_xben_stream_round_trips_through_assignment_reader() { .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) + std::iter::repeat_n(assign, count as usize) }) .collect(); assert_eq!(decoded, samples); @@ -626,7 +626,7 @@ fn bundle_ben_stream_alongside_front_loaded_asset() { .silent(true) .flat_map(|r| { let (assign, count) = r.unwrap(); - std::iter::repeat(assign).take(count as usize) + std::iter::repeat_n(assign, count as usize) }) .collect(); assert_eq!(decoded, samples); @@ -1419,12 +1419,12 @@ fn make_ben_stream_bundle(count: usize) -> (Vec, Vec>) { } /// Corrupt the stored `stream_checksum` field in-place by flipping a byte at header offset 20. -fn corrupt_stream_checksum(bytes: &mut Vec) { +fn corrupt_stream_checksum(bytes: &mut [u8]) { bytes[20] ^= 0xFF; } /// Flip a byte in the stream payload to corrupt the stream contents without changing its length. -fn corrupt_stream_payload(bytes: &mut Vec, reader: &mut BendlReader>>) { +fn corrupt_stream_payload(bytes: &mut [u8], reader: &mut BendlReader>>) { let (offset, len) = reader.assignment_stream_range().unwrap(); assert!( len > 0, @@ -1688,7 +1688,7 @@ fn bundle_with_reserved_asset_flag_bit() -> (Vec, u16) { const RESERVED_BIT_7: u16 = 1 << 7; let payload = b"forward-compat asset".to_vec(); let mut bytes = Vec::new(); - bytes.extend(std::iter::repeat(0u8).take(HEADER_SIZE)); + bytes.extend(std::iter::repeat_n(0u8, HEADER_SIZE)); let payload_offset = bytes.len() as u64; bytes.extend_from_slice(&payload); @@ -1887,8 +1887,7 @@ fn writer_failed_asset_write_does_not_poison_registry() { fn write(&mut self, buf: &[u8]) -> std::io::Result { if !self.failed && self.inner.position() >= HEADER_SIZE as u64 { self.failed = true; - return Err(std::io::Error::new( - std::io::ErrorKind::Other, + return Err(std::io::Error::other( "simulated payload write failure", )); } diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index c9f2c46..3b2a7c6 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -257,7 +257,7 @@ impl BendlWriter { let encoded = encode_asset_payload(payload.to_vec(), compress, options.is_json)?; // Write at current file position. - let payload_offset = self.inner.seek(SeekFrom::Current(0))?; + let payload_offset = self.inner.stream_position()?; self.inner.write_all(&encoded.bytes)?; self.registry.claim(asset_type, name)?; @@ -338,7 +338,7 @@ impl BendlWriter { } } - let stream_offset = self.inner.seek(SeekFrom::Current(0))?; + let stream_offset = self.inner.stream_position()?; self.header.stream_offset = stream_offset; Ok(BendlStreamSession { @@ -363,7 +363,7 @@ impl BendlWriter { } => (stream_len, sample_count), WriterState::Assets => { // No stream written; treat as empty stream located just after the asset region. - let stream_offset = self.inner.seek(SeekFrom::Current(0))?; + let stream_offset = self.inner.stream_position()?; self.header.stream_offset = stream_offset; // CRC32C of an empty byte sequence is 0x00000000. self.header.stream_checksum = 0; @@ -765,7 +765,7 @@ impl BendlAppender { for prepared in encoded { let enc = prepared.encoded_asset; - let payload_offset = self.inner.seek(SeekFrom::Current(0))?; + let payload_offset = self.inner.stream_position()?; self.inner.write_all(&enc.bytes)?; new_entries.push(BendlDirectoryEntry { asset_type: prepared.asset_type, @@ -778,7 +778,7 @@ impl BendlAppender { } // Write the new directory at the new EOF. - let new_directory_offset = self.inner.seek(SeekFrom::Current(0))?; + let new_directory_offset = self.inner.stream_position()?; let directory_bytes = encode_directory(&new_entries).map_err(BendlWriteError::Format)?; self.inner.write_all(&directory_bytes)?; let new_directory_len = directory_bytes.len() as u64; diff --git a/ben/src/io/reader/errors.rs b/ben/src/io/reader/errors.rs index 8abbb2f..4609313 100644 --- a/ben/src/io/reader/errors.rs +++ b/ben/src/io/reader/errors.rs @@ -32,7 +32,7 @@ fn to_hex(bytes: &[u8]) -> String { } /// Format an `InvalidFileFormat` byte header into a human-readable error message. -fn format_invalid_file_format(header: &Vec) -> String { +fn format_invalid_file_format(header: &[u8]) -> String { if is_xz_header(header) { format!( "Invalid file format: Compressed header detected (hex: {}). \ diff --git a/ben/src/io/reader/stream_reader/xben.rs b/ben/src/io/reader/stream_reader/xben.rs index 6e39970..773351b 100644 --- a/ben/src/io/reader/stream_reader/xben.rs +++ b/ben/src/io/reader/stream_reader/xben.rs @@ -14,10 +14,10 @@ use crate::BenVariant; /// /// Scans `overflow` for a four-byte zero sentinel that terminates a ben32 frame and, for MkvChain /// streams, reads the trailing repetition count. -pub(super) fn pop_frame_from_overflow<'a>( +pub(super) fn pop_frame_from_overflow( variant: BenVariant, - overflow: &'a [u8], -) -> Option<(&'a [u8], usize, u16)> { + overflow: &[u8], +) -> Option<(&[u8], usize, u16)> { if variant == BenVariant::Standard { if overflow.len() < 4 { return None; @@ -48,10 +48,12 @@ pub(super) fn pop_frame_from_overflow<'a>( } } +/// A TwoDelta frame popped from the overflow buffer: its `(value, run_length)` pairs, the number of +/// overflow bytes the frame consumed, and its repetition count. +type PoppedTwoDeltaFrame = (Vec<(u16, u16)>, usize, u16); + /// Try to extract one complete TwoDelta frame from the buffered overflow. -fn pop_twodelta_frame_from_overflow( - overflow: &[u8], -) -> Option, usize, u16)>> { +fn pop_twodelta_frame_from_overflow(overflow: &[u8]) -> Option> { let tag = *overflow.first()?; match tag { XBEN_TWODELTA_FULL_TAG => { @@ -140,7 +142,7 @@ fn try_parse_twodelta_chunk(inner: &mut XBenInner) -> bool { let run_data_start = run_counts_start + run_counts_len; let mut run_cursor = run_data_start; - for i in 0..n_frames { + for (i, &rc) in run_counts.iter().enumerate() { let po = pairs_start + i * 4; let pair = ( u16::from_be_bytes([inner.overflow[po], inner.overflow[po + 1]]), @@ -149,7 +151,6 @@ fn try_parse_twodelta_chunk(inner: &mut XBenInner) -> bool { let co = counts_start + i * 2; let count = u16::from_be_bytes([inner.overflow[co], inner.overflow[co + 1]]); - let rc = run_counts[i]; let mut run_lengths = Vec::with_capacity(rc); for _ in 0..rc { run_lengths.push(u16::from_be_bytes([ diff --git a/ben/src/io/reader/subsample.rs b/ben/src/io/reader/subsample.rs index 025a90c..bdd1087 100644 --- a/ben/src/io/reader/subsample.rs +++ b/ben/src/io/reader/subsample.rs @@ -161,9 +161,7 @@ where } } if let Selection::Indices(ref mut it) = self.selection { - if it.peek().is_none() { - return None; - } + it.peek()?; } let (frame, count) = match self.inner.next()? { diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 50f4645..4ee31fd 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -576,8 +576,7 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader .for_each_assignment(|_assignment, _count| { - Err(std::io::Error::new( - std::io::ErrorKind::Other, + Err(std::io::Error::other( "callback failed", )) }) @@ -591,7 +590,7 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { #[test] fn xz_reader_large_assignment_roundtrip() { let big_assign: Vec = (1..=1000).collect(); - let xben = make_xben_from_assignments(&[big_assign.clone()], BenVariant::Standard); + let xben = make_xben_from_assignments(std::slice::from_ref(&big_assign), BenVariant::Standard); let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results.len(), 1); @@ -783,11 +782,11 @@ fn xz_twodelta_chunk_boundary_roundtrip() { let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); assert_eq!(results.len(), 21); assert_eq!(results[0], anchor); - for i in 1..=20 { + for (i, sample) in results.iter().enumerate().skip(1) { if i % 2 == 1 { - assert_eq!(results[i], delta); + assert_eq!(*sample, delta); } else { - assert_eq!(results[i], anchor); + assert_eq!(*sample, anchor); } } } diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 0b64cf0..02672ad 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -6,11 +6,11 @@ use xz2::write::XzEncoder; /// Build a `BenStreamWriter` over an explicit single-thread XZ encoder so the resulting xben byte /// stream is deterministic and small. -fn build_xben_writer<'a>( - out: &'a mut Vec, +fn build_xben_writer( + out: &mut Vec, variant: BenVariant, chunk_size: Option, -) -> BenStreamWriter<&'a mut Vec> { +) -> BenStreamWriter<&mut Vec> { let encoder = XzEncoder::new(out, 1); BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size).unwrap() } @@ -162,7 +162,7 @@ fn assert_ben_round_trip(assignments: &[Vec], variant: BenVariant) { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!( @@ -186,7 +186,7 @@ fn assert_xben_round_trip(assignments: &[Vec], variant: BenVariant) { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!( @@ -232,7 +232,7 @@ fn writer_ben_one_sample_round_trip_per_variant() { BenVariant::MkvChain, BenVariant::TwoDelta, ] { - assert_ben_round_trip(&[assignment.clone()], variant); + assert_ben_round_trip(std::slice::from_ref(&assignment), variant); } } @@ -247,7 +247,7 @@ fn writer_xben_one_sample_round_trip_per_variant() { BenVariant::MkvChain, BenVariant::TwoDelta, ] { - assert_xben_round_trip(&[assignment.clone()], variant); + assert_xben_round_trip(std::slice::from_ref(&assignment), variant); } } @@ -302,7 +302,7 @@ fn writer_twodelta_chunk_boundary_off_by_one_grid() { let decoded: Vec> = reader .flat_map(|r| { let (a, count) = r.unwrap(); - std::iter::repeat(a).take(count as usize) + std::iter::repeat_n(a, count as usize) }) .collect(); assert_eq!( @@ -1011,7 +1011,7 @@ fn for_xben_top_level_constructor_round_trips_per_variant() { .silent(true) .flat_map(|r| { let (a, c) = r.unwrap(); - std::iter::repeat(a).take(c as usize) + std::iter::repeat_n(a, c as usize) }) .collect(); assert_eq!(decoded, vec![assignment.clone()], "variant={variant:?}"); diff --git a/ben/src/json/graph/mlc.rs b/ben/src/json/graph/mlc.rs index ca4de07..f30d609 100644 --- a/ben/src/json/graph/mlc.rs +++ b/ben/src/json/graph/mlc.rs @@ -88,11 +88,9 @@ impl MlcProgress { fn refresh(&self, depth: usize) { let depth_bar = &self.depths[depth]; - let percent_complete = if depth_bar.total == 0 { - 0 - } else { - depth_bar.done * 100 / depth_bar.total - }; + let percent_complete = (depth_bar.done * 100) + .checked_div(depth_bar.total) + .unwrap_or(0); depth_bar.bar.set_message(format!( "MLC phase {}: {}/{} {} ({}%)", depth + 1, diff --git a/ben/src/json/graph/tests/test_io.rs b/ben/src/json/graph/tests/test_io.rs index 7435687..5696122 100644 --- a/ben/src/json/graph/tests/test_io.rs +++ b/ben/src/json/graph/tests/test_io.rs @@ -476,12 +476,12 @@ fn petx_to_nx_node_restores_id() { let petx = PetxNode { attrs: BTreeMap::from([ ("__networkx_id__".into(), json!("node_a")), - ("weight".into(), json!(3.14)), + ("weight".into(), json!(1.5)), ]), }; let nx = petx_node_to_nx_node(&petx).unwrap(); assert_eq!(nx.id, json!("node_a")); - assert_eq!(nx.attrs.get("weight"), Some(&json!(3.14))); + assert_eq!(nx.attrs.get("weight"), Some(&json!(1.5))); assert!(!nx.attrs.contains_key("__networkx_id__")); } diff --git a/ben/src/ops/extract/tests.rs b/ben/src/ops/extract/tests.rs index 5ecb669..9744a82 100644 --- a/ben/src/ops/extract/tests.rs +++ b/ben/src/ops/extract/tests.rs @@ -1,3 +1,8 @@ +// Binary literals here are grouped by BEN bit-field boundaries (e.g. `0b01100_100` is a 5-bit +// value followed by a 3-bit value), not by even nibbles, so the grouping documents the packed +// layout under test. +#![allow(clippy::unusual_byte_groupings)] + use super::*; use crate::codec::encode::encode_jsonl_to_xben; use crate::BenVariant; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 164d344..7eb1715 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -26,12 +26,12 @@ impl Read for ErrorAfterOneByte { } } -fn shuffle_with_mapping(vec: &mut Vec) -> HashMap +fn shuffle_with_mapping(vec: &mut [T]) -> HashMap where T: Clone + std::cmp::PartialEq, { let mut rng = ChaCha8Rng::seed_from_u64(42); - let original_vec = vec.clone(); + let original_vec = vec.to_vec(); vec.shuffle(&mut rng); let mut map = HashMap::new(); @@ -318,7 +318,7 @@ fn test_relabel_ben_line_with_large_shuffle() { .collect::>(); let mut out_assign = in_assign.clone(); - let in_rle = assign_to_rle(in_assign.to_vec()); + let in_rle = assign_to_rle(&in_assign); let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); let new_to_old_map = shuffle_with_mapping(&mut out_assign); @@ -1231,8 +1231,8 @@ fn run_policy_pins_frame_preservation_and_collapse() { { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } @@ -1307,8 +1307,8 @@ fn standard_target_cross_policy_byte_identity() { { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = BenEncodeFrame::from_assignment(&[1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)); + let frame_b = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } diff --git a/ben/tests/test_assignment_reader.rs b/ben/tests/test_assignment_reader.rs index 4f34c69..0dd1cab 100644 --- a/ben/tests/test_assignment_reader.rs +++ b/ben/tests/test_assignment_reader.rs @@ -77,7 +77,7 @@ mod mkvchain { #[test] fn single_assignment_round_trip() { let assignment = vec![3u16, 3, 1, 2, 2, 1]; - let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); + let ben = encode_ben(std::slice::from_ref(&assignment), BenVariant::MkvChain); let mut decoder = BenStreamReader::from_ben(ben.as_slice()) .unwrap() @@ -429,7 +429,7 @@ mod mkvchain { #[test] fn frame_reader_bytes_decode_back_to_original_assignment() { let assignment = vec![3u16, 3, 1, 2]; - let ben = encode_ben(&[assignment.clone()], BenVariant::MkvChain); + let ben = encode_ben(std::slice::from_ref(&assignment), BenVariant::MkvChain); let (frame, _count) = BenStreamReader::from_ben(ben.as_slice()) .unwrap() @@ -650,7 +650,7 @@ mod twodelta { fn single_anchor_frame_round_trip() { // A stream with only one assignment contains just the anchor frame. let assignment = vec![1u16, 1, 2, 2, 3, 3]; - let ben = encode_twodelta(&[assignment.clone()]); + let ben = encode_twodelta(std::slice::from_ref(&assignment)); assert_eq!(expand_assignments(&ben), vec![assignment]); } @@ -839,7 +839,7 @@ mod twodelta { #[test] fn write_all_jsonl_single_anchor() { let assignment = vec![1u16, 2, 3]; - let ben = encode_twodelta(&[assignment.clone()]); + let ben = encode_twodelta(std::slice::from_ref(&assignment)); let mut out = Vec::new(); BenStreamReader::from_ben(ben.as_slice()) diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 8c35fb1..625b349 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -284,7 +284,7 @@ fn ben_decoder_rejects_xz_data_with_helpful_message() { #[test] fn ben_decoder_standard_single_assignment_round_trip() { let assignment = vec![1u16, 1, 2, 3, 3, 3]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let mut decoder = BenStreamReader::from_ben(ben.as_slice()).unwrap(); let (decoded, count) = decoder.next().unwrap().unwrap(); @@ -501,8 +501,7 @@ fn ben_encoder_finish_is_idempotent() { let mut enc = BenStreamWriter::for_ben(&mut out, BenVariant::MkvChain).unwrap(); enc.write_assignment(vec![1u16, 2]).unwrap(); enc.finish().unwrap(); - let len_after_first_finish = enc.finish().unwrap(); // second call - let _ = len_after_first_finish; + enc.finish().unwrap(); // second call } // The output should decode to exactly one sample (not duplicated). let mut decoded = Vec::new(); @@ -638,7 +637,7 @@ fn encode_ben_vec_from_assign_and_rle_are_equivalent() { #[test] fn encode_ben_vec_from_assign_single_element() { - let frame = BenEncodeFrame::from_assignment(&[42u16], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_assignment([42u16], BenVariant::Standard, None); assert!(!frame.as_slice().is_empty()); } @@ -1136,7 +1135,7 @@ fn encode_and_decode_empty_assignment_standard() { #[test] fn encode_and_decode_max_u16_values_standard() { let assignment = vec![0u16, 65535, 32768, 1, 65534]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let decoded_str = decode_ben_to_string(&ben); assert!( decoded_str.contains("\"assignment\":[0,65535,32768,1,65534]"), @@ -1151,7 +1150,7 @@ fn encode_and_decode_max_u16_values_standard() { #[test] fn single_sample_standard_round_trip() { let assignment = vec![42u16; 1000]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let decoded_str = decode_ben_to_string(&ben); assert_eq!(decoded_str.lines().count(), 1); assert!(decoded_str.contains("\"sample\":1")); @@ -1395,7 +1394,7 @@ fn relabel_ben_file_standard_is_idempotent() { #[test] fn single_unique_label_assignment_round_trips() { let assignment = vec![42u16; 50]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let decoded_str = decode_ben_to_string(&ben); assert!( decoded_str.contains("\"assignment\":[42,42,42"), @@ -1427,7 +1426,7 @@ fn single_unique_label_relabeled_to_one() { fn encode_decode_max_run_length_standard() { // A run of 65535 identical values. let assignment = vec![7u16; 65535]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let decoded_str = decode_ben_to_string(&ben); assert!(decoded_str.contains("\"sample\":1")); @@ -1447,7 +1446,7 @@ fn encode_decode_max_run_length_standard() { fn ben_variant_clone_and_copy() { let v = BenVariant::MkvChain; let v2 = v; // Copy - let v3 = v.clone(); // Clone + let v3 = v; // Clone assert_eq!(v2, v3); assert_eq!(v, BenVariant::MkvChain); } @@ -1465,7 +1464,7 @@ fn ben_variant_debug() { #[test] fn ben_decoder_accepts_cursor_reader() { let assignment = vec![1u16, 2, 3]; - let ben = encode_standard_ben(&[assignment.clone()]); + let ben = encode_standard_ben(std::slice::from_ref(&assignment)); let cursor = Cursor::new(ben); let mut decoder = BenStreamReader::from_ben(cursor).unwrap().silent(true); let (decoded, _) = decoder.next().unwrap().unwrap(); @@ -1555,7 +1554,7 @@ fn encode_ben_frame_from_assignment() { let assignment = vec![1u16, 1, 2, 2, 3]; let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); // Frame from assignment should produce runs - let runs = &frame.runs().unwrap()[..]; + let runs = frame.runs().unwrap(); assert_eq!(runs, &[(1u16, 2u16), (2u16, 2u16), (3u16, 1u16)]); } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index acea7c6..faa4dbe 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -838,9 +838,9 @@ fn xben_truncated_frame_reports_unexpected_eof() { // Trim the last byte to force partial frame after decompress let trimmed = &xz[..xz.len() - 1]; // Iterating should surface UnexpectedEof (partial frame) - let mut it = BenStreamReader::from_xben(trimmed).unwrap(); + let it = BenStreamReader::from_xben(trimmed).unwrap(); // Drain until error - while let Some(res) = it.next() { + for res in it { if let Err(e) = res { assert_eq!(e.kind(), std::io::ErrorKind::UnexpectedEof); return; @@ -864,7 +864,7 @@ fn encode_decode_ben32_odd_bit_packing_roundtrip() { assert_eq!( decoded, rle.into_iter() - .flat_map(|(v, c)| std::iter::repeat((v, 1)).take(c as usize)) + .flat_map(|(v, c)| std::iter::repeat_n((v, 1), c as usize)) .fold(Vec::<(u16, u16)>::new(), |mut acc, (v, _)| { if let Some(last) = acc.last_mut() { if last.0 == v { @@ -889,8 +889,7 @@ fn encode_jsonl_to_ben_rejects_bad_assignment_shapes() { for s in bads { let mut out = Vec::new(); let err = encode_jsonl_to_ben(BufReader::new(s.as_bytes()), &mut out, BenVariant::Standard) - .err() - .expect("expected invalid data"); + .expect_err("expected invalid data"); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } } @@ -898,7 +897,7 @@ fn encode_jsonl_to_ben_rejects_bad_assignment_shapes() { #[test] fn subsample_by_indices_sorts_and_dedups() { // Build 5 distinct samples 1..=5 - let seq = vec![vec![1u16], vec![2], vec![3], vec![4], vec![5]]; + let seq = [vec![1u16], vec![2], vec![3], vec![4], vec![5]]; let jsonl = { let mut b = Vec::new(); for (i, a) in seq.iter().enumerate() { diff --git a/ben/tests/test_pipeline.rs b/ben/tests/test_pipeline.rs index 86b522c..7528b28 100755 --- a/ben/tests/test_pipeline.rs +++ b/ben/tests/test_pipeline.rs @@ -245,7 +245,7 @@ fn test_xben_pipeline() { .to_string() + "\n"; - sample_writer.write_all(&line.as_bytes()).unwrap(); + sample_writer.write_all(line.as_bytes()).unwrap(); } eprintln!(); diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index deb6a59..0d70093 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -30,7 +30,7 @@ fn expand_ben(bytes: &[u8]) -> Vec> { .silent(true) .flat_map(|record| { let (assignment, count) = record.unwrap(); - std::iter::repeat(assignment).take(count as usize) + std::iter::repeat_n(assignment, count as usize) }) .collect() } @@ -43,7 +43,7 @@ fn minimal_bendl_with_entries( let directory_offset = bytes.len() as u64; let mut directory = encode_directory(&entries).unwrap(); if directory_len_adjustment > 0 { - directory.extend(std::iter::repeat(0u8).take(directory_len_adjustment as usize)); + directory.extend(std::iter::repeat_n(0u8, directory_len_adjustment as usize)); } bytes.extend_from_slice(&directory); @@ -121,8 +121,7 @@ impl Write for HeaderPatchCrashCursor { fn write(&mut self, buf: &[u8]) -> std::io::Result { let mut state = self.state.borrow_mut(); if state.bytes.len() > state.initial_len && state.pos < HEADER_SIZE as u64 { - return Err(std::io::Error::new( - std::io::ErrorKind::Other, + return Err(std::io::Error::other( "simulated crash while patching bundle header", )); } @@ -352,12 +351,12 @@ fn xz_compress_propagates_input_reader_errors() { struct FailingReader; impl std::io::Read for FailingReader { fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { - Err(std::io::Error::new(std::io::ErrorKind::Other, "boom")) + Err(std::io::Error::other("boom")) } } impl std::io::BufRead for FailingReader { fn fill_buf(&mut self) -> std::io::Result<&[u8]> { - Err(std::io::Error::new(std::io::ErrorKind::Other, "boom")) + Err(std::io::Error::other("boom")) } fn consume(&mut self, _amt: usize) {} } From 912e526511eaa79aedebe4de19098feeb90268ab Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sun, 31 May 2026 16:46:40 -0600 Subject: [PATCH 130/221] better recovery for dropped stream --- ben/src/cli/ben/bundle.rs | 30 +++++++------------------- ben/src/codec/encode/twodelta.rs | 3 ++- ben/src/io/bundle/tests/reader.rs | 4 +--- ben/src/io/bundle/tests/writer.rs | 36 ++++++++++++++++++++++++++++--- ben/src/io/bundle/writer.rs | 4 ++++ ben/src/io/reader/tests.rs | 6 +----- 6 files changed, 49 insertions(+), 34 deletions(-) diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index dd92ed9..4ccaa4b 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -15,15 +15,11 @@ use std::path::Path; /// this point. pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<()> { eprintln!("Adding graph..."); - let graph_bytes = std::fs::read(graph_path).map_err(|e| { - io::Error::other( - format!("failed to read graph {graph_path:?}: {e}"), - ) - })?; + let graph_bytes = std::fs::read(graph_path) + .map_err(|e| io::Error::other(format!("failed to read graph {graph_path:?}: {e}")))?; let file = OpenOptions::new().read(true).write(true).open(out_path)?; - let mut appender = BendlAppender::open(file) - .map_err(|e| io::Error::other(format!("{e}")))?; + let mut appender = BendlAppender::open(file).map_err(|e| io::Error::other(format!("{e}")))?; appender .add_asset( ASSET_TYPE_GRAPH, @@ -31,11 +27,7 @@ pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<() &graph_bytes, AddAssetOptions::defaults().json(), ) - .map_err(|e| { - io::Error::other( - format!("failed to add graph asset: {e}"), - ) - })?; + .map_err(|e| io::Error::other(format!("failed to add graph asset: {e}")))?; appender .commit() .map_err(|e| io::Error::other(format!("{e}")))?; @@ -52,11 +44,8 @@ pub(super) fn run_encode_bundle_with_graph( ) -> Result<()> { // Validate the graph file is readable before we do any real work, so a bad --graph path doesn't // leave a half-written bundle behind. - std::fs::metadata(graph_path).map_err(|e| { - io::Error::other( - format!("failed to stat graph {graph_path:?}: {e}"), - ) - })?; + std::fs::metadata(graph_path) + .map_err(|e| io::Error::other(format!("failed to stat graph {graph_path:?}: {e}")))?; let sample_count = count_jsonl_lines(input_path)?; @@ -92,11 +81,8 @@ pub(super) fn run_xencode_bundle_with_graph( block_size: Option, graph_path: &Path, ) -> Result<()> { - std::fs::metadata(graph_path).map_err(|e| { - io::Error::other( - format!("failed to stat graph {graph_path:?}: {e}"), - ) - })?; + std::fs::metadata(graph_path) + .map_err(|e| io::Error::other(format!("failed to stat graph {graph_path:?}: {e}")))?; let sample_count: i64 = if from_ben { count_samples_from_file(input_path, BenWireFormat::Ben)? as i64 diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 8fba898..02a2482 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -283,7 +283,8 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( let previous_value = previous[idx]; let new_val = current[idx]; - if previous_value != pair.first_run_district() && previous_value != pair.second_run_district() + if previous_value != pair.first_run_district() + && previous_value != pair.second_run_district() { return Err(Error::from(EncodeError::TwoDeltaMaskOutOfPair { pos: idx, diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 011a82e..5b99c1e 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -1566,9 +1566,7 @@ struct FailWhenArmed { impl Read for FailWhenArmed { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { if self.armed.load(std::sync::atomic::Ordering::SeqCst) { - return Err(std::io::Error::other( - "forced read failure", - )); + return Err(std::io::Error::other("forced read failure")); } self.inner.read(buf) } diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 2a87ca7..3488808 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -1334,6 +1334,38 @@ fn bundle_streaming_session_drop_leaves_unfinalized() { ); } +#[test] +fn dropped_stream_session_persists_recoverable_stream_offset_after_assets() { + let asset = b"asset-before-stream"; + let stream = b"STANDARD BEN FILE\x00partial"; + let mut buf: Vec = Vec::new(); + { + let mut writer = BendlWriter::new(Cursor::new(&mut buf), AssignmentFormat::Ben).unwrap(); + writer + .add_custom_asset("asset.bin", asset, AddAssetOptions::defaults().raw()) + .unwrap(); + let mut session = writer.into_stream_session().unwrap(); + assert_eq!(session.start_offset(), (HEADER_SIZE + asset.len()) as u64); + session.write_all(stream).unwrap(); + drop(session); + } + + let mut reader = BendlReader::open(Cursor::new(&buf)).unwrap(); + assert!(!reader.is_finalized()); + assert_eq!( + reader.header().stream_offset, + (HEADER_SIZE + asset.len()) as u64 + ); + + let mut recovered = Vec::new(); + reader + .assignment_stream_reader_unverified() + .unwrap() + .read_to_end(&mut recovered) + .unwrap(); + assert_eq!(recovered, stream); +} + /// Verification #9: `BendlStreamSession::write` must increment its internal byte counter by the /// returned write count, not by the requested buffer length, so partial writes are accounted /// correctly and the finalized header's `stream_len` matches the actual byte count of the stream @@ -1887,9 +1919,7 @@ fn writer_failed_asset_write_does_not_poison_registry() { fn write(&mut self, buf: &[u8]) -> std::io::Result { if !self.failed && self.inner.position() >= HEADER_SIZE as u64 { self.failed = true; - return Err(std::io::Error::other( - "simulated payload write failure", - )); + return Err(std::io::Error::other("simulated payload write failure")); } self.inner.write(buf) } diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 3b2a7c6..a714242 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -340,6 +340,10 @@ impl BendlWriter { let stream_offset = self.inner.stream_position()?; self.header.stream_offset = stream_offset; + self.inner.seek(SeekFrom::Start(0))?; + self.header.write_to(&mut self.inner)?; + self.inner.flush()?; + self.inner.seek(SeekFrom::Start(stream_offset))?; Ok(BendlStreamSession { inner: Some(self.inner), diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 4ee31fd..4a1d951 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -575,11 +575,7 @@ fn xz_reader_for_each_assignment_callback_error_propagates() { let xben = make_xben(jsonl, BenVariant::Standard); let mut reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); let err = reader - .for_each_assignment(|_assignment, _count| { - Err(std::io::Error::other( - "callback failed", - )) - }) + .for_each_assignment(|_assignment, _count| Err(std::io::Error::other("callback failed"))) .unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::Other); assert_eq!(err.to_string(), "callback failed"); From 696038db31a1baf8e1fac4fa08d32ce028a166c5 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:40:35 -0600 Subject: [PATCH 131/221] finish benpy rename --- .github/workflows/ci_cd.yml | 6 +- .readthedocs.yaml | 4 +- Taskfile.yml | 12 +- ben-py/Cargo.toml | 2 +- ben-py/docs/conf.py | 2 +- ben-py/docs/index.rst | 10 +- .../{using_pyben.ipynb => using_ben_py.ipynb} | 42 +++---- ben-py/tests/test_bundle.py | 108 +++++++++--------- ben-py/tests/test_python_pipelines.py | 92 +++++++-------- 9 files changed, 139 insertions(+), 139 deletions(-) rename ben-py/docs/user/{using_pyben.ipynb => using_ben_py.ipynb} (92%) diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index 642294d..c4b24a7 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -1,4 +1,4 @@ -name: Build & publish pyben wheels +name: Build & publish ben-py wheels on: push: @@ -14,7 +14,7 @@ on: # options: [testpypi, pypi] env: - PKG_DIR: pyben + PKG_DIR: ben-py OUT_DIR: dist permissions: @@ -112,7 +112,7 @@ jobs: name: windows (ARM64, abi3) runs-on: windows-11-arm env: - PKG_DIR: pyben + PKG_DIR: ben-py OUT_DIR: dist steps: - uses: actions/checkout@v4 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index be7bd9c..fb83822 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,10 +9,10 @@ build: python: install: - method: pip - path: ./pyben + path: ./ben-py extra_requirements: - docs sphinx: builder: dirhtml - configuration: pyben/docs/conf.py + configuration: ben-py/docs/conf.py diff --git a/Taskfile.yml b/Taskfile.yml index d956777..31f5584 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -7,7 +7,7 @@ vars: LOCAL_BIN: '{{.HOME}}/.local/bin' LLVM_BIN: '{{.HOME}}/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/bin' COV_TARGET_DIR: '{{.ROOT_DIR}}/target/llvm-cov-target' - PYBEN_TEST_PATHS: 'tests/' + BEN_PY_TEST_PATHS: 'tests/' tasks: default: @@ -137,7 +137,7 @@ tasks: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' dir: ben-py cmds: - - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} + - uv run pytest {{.CLI_ARGS | default .BEN_PY_TEST_PATHS}} test: desc: Run Rust and Python tests @@ -223,7 +223,7 @@ tasks: export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' + uv run pytest {{.CLI_ARGS | default .BEN_PY_TEST_PATHS}}' - >- {{.LLVM_BIN}}/llvm-profdata merge -sparse {{.ROOT_DIR}}/target/*.profraw @@ -257,7 +257,7 @@ tasks: export CARGO_TARGET_DIR="{{.COV_TARGET_DIR}}"; cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}"; - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}}' + uv run pytest {{.CLI_ARGS | default .BEN_PY_TEST_PATHS}}' - >- {{.LLVM_BIN}}/llvm-profdata merge -sparse {{.ROOT_DIR}}/target/*.profraw @@ -305,7 +305,7 @@ tasks: cd "{{.ROOT_DIR}}/ben-py"; uv run maturin develop --target-dir "{{.COV_TARGET_DIR}}" >/dev/null; pytest_status=0; - uv run pytest {{.CLI_ARGS | default .PYBEN_TEST_PATHS}} >/tmp/ben-py-pytest.log || pytest_status="$?"; + uv run pytest {{.CLI_ARGS | default .BEN_PY_TEST_PATHS}} >/tmp/ben-py-pytest.log || pytest_status="$?"; cd "{{.ROOT_DIR}}"; {{.LLVM_BIN}}/llvm-profdata merge -sparse target/*.profraw {{.COV_TARGET_DIR}}/*.profraw -o /tmp/ben-py.profdata >/dev/null; {{.LLVM_BIN}}/llvm-cov report {{.ROOT_DIR}}/ben-py/binary_ensemble/_core.abi3.so -instr-profile=/tmp/ben-py.profdata --ignore-filename-regex='"'"'/.cargo/registry|/rustc/|^/mnt/.*/ben/src/'"'"' > "$ben_py_report_file"; @@ -313,7 +313,7 @@ tasks: printf "\n%s\n\n" "BEN COVERAGE"; cat "$ben_report_file"; - printf "\n%s\n\n" "PYBEN COVERAGE"; + printf "\n%s\n\n" "BEN-PY COVERAGE"; cat "$ben_py_report_file"; printf "\n%-10s %-10s\n" "Target" "Lines"; printf "%-10s %-10s\n" "ben" "${ben_total:-n/a}"; diff --git a/ben-py/Cargo.toml b/ben-py/Cargo.toml index 77dcc5d..49750e4 100755 --- a/ben-py/Cargo.toml +++ b/ben-py/Cargo.toml @@ -9,7 +9,7 @@ repository = "https://github.com/peterrrock2/binary-ensemble" description = "Python bindings for the Binary Ensemble Package. Specifically the Encoder and Decoder" [lib] -name = "pyben_core" +name = "ben_py_core" crate-type = ["cdylib"] [dependencies] diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index 98b6c49..ae821e1 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -25,7 +25,7 @@ # -- Project information ----------------------------------------------------- -project = "PyBen" +project = "binary-ensemble" copyright = "2025, Peter Rock" author = "Peter Rock" diff --git a/ben-py/docs/index.rst b/ben-py/docs/index.rst index fd21792..095dea1 100644 --- a/ben-py/docs/index.rst +++ b/ben-py/docs/index.rst @@ -1,5 +1,5 @@ -Welcome to PyBen's documentation! -================================= +Welcome to binary-ensemble's documentation! +=========================================== BEN (short for Binary-Ensemble) is a compression algorithm designed for efficient storage and access of ensembles of districting plans, and was designed to work primarily as a companion to the @@ -15,7 +15,7 @@ BEN and XBEN formats. Installation ------------ -To install PyBen, you can just use pip! +To install binary-ensemble, you can just use pip! .. code-block:: bash @@ -23,9 +23,9 @@ To install PyBen, you can just use pip! .. toctree:: - :caption: PyBEN Tutorial + :caption: binary-ensemble Tutorial - user/using_pyben + user/using_ben_py .. toctree:: diff --git a/ben-py/docs/user/using_pyben.ipynb b/ben-py/docs/user/using_ben_py.ipynb similarity index 92% rename from ben-py/docs/user/using_pyben.ipynb rename to ben-py/docs/user/using_ben_py.ipynb index 1df43b7..5ae2efb 100644 --- a/ben-py/docs/user/using_pyben.ipynb +++ b/ben-py/docs/user/using_ben_py.ipynb @@ -5,9 +5,9 @@ "id": "c7a7778b", "metadata": {}, "source": [ - "# An Introduction to Using PyBen\n", + "# An Introduction to Using binary-ensemble\n", "\n", - "This is a small tutorial that is meant to help users get to using PyBen: the Python interface\n", + "This is a small tutorial that is meant to help users get to using binary-ensemble: the Python interface\n", "for the [binary-ensemble](https://crates.io/crates/binary-ensemble) Rust package.\n", "\n", "BEN (short for Binary-Ensemble) is a compression algorithm designed for efficient storage and\n", @@ -139,7 +139,7 @@ "source": [ "## Converting between file types\n", "\n", - "PyBen comes equiped with some utility functions for users who wish to convert between different\n", + "binary-ensemble comes equiped with some utility functions for users who wish to convert between different\n", "file types." ] }, @@ -338,9 +338,9 @@ "id": "157bc601", "metadata": {}, "source": [ - "## PyBen and GerryChain\n", + "## binary-ensemble and GerryChain\n", "\n", - "As mentioned before, PyBen was originally designed to work with ensembles generated by programs\n", + "As mentioned before, binary-ensemble was originally designed to work with ensembles generated by programs\n", "like [GerryChain](https://gerrychain.readthedocs.io), and so we will give a small tutorial here.\n", "\n", "> **Note:** in the current version of GerryChain (0.3.2), there are some small peculiarities in\n", @@ -354,7 +354,7 @@ "source": [ "### Encoding\n", "\n", - "Working with the PyBen encoder should feel a lot like working with any Python object that handles\n", + "Working with the binary-ensemble encoder should feel a lot like working with any Python object that handles\n", "writing to files. In particular, we will use the context manager pattern to make sure that the\n", "file is appropriately opened and closed as we write assignment vectors to it." ] @@ -431,11 +431,11 @@ } ], "source": [ - "from binary_ensemble import PyBenEncoder\n", + "from binary_ensemble import BenEncoder\n", "\n", "graph_node_order = list(graph.nodes)\n", "\n", - "with PyBenEncoder(\"example_data/gerrychain_10000.jsonl.ben\", overwrite=True) as encoder:\n", + "with BenEncoder(\"example_data/gerrychain_10000.jsonl.ben\", overwrite=True) as encoder:\n", " for partition in recom_chain.with_progress_bar():\n", " assignment_series = partition.assignment.to_series()\n", " # Assignment vectors must be lists of integers\n", @@ -452,7 +452,7 @@ "source": [ "### Decoding\n", "\n", - "Decoding with PyBen should also feel fairly simple: just iterate over the file and pull out the \n", + "Decoding with binary-ensemble should also feel fairly simple: just iterate over the file and pull out the \n", "assignment vector that you would like to work with." ] }, @@ -480,13 +480,13 @@ } ], "source": [ - "from binary_ensemble import PyBenDecoder\n", + "from binary_ensemble import BenDecoder\n", "import pandas as pd\n", "\n", "\n", "graph_node_order_series = pd.Index(graph.nodes)\n", "\n", - "for i, assignment in enumerate(PyBenDecoder(\"example_data/gerrychain_10000.jsonl.ben\")):\n", + "for i, assignment in enumerate(BenDecoder(\"example_data/gerrychain_10000.jsonl.ben\")):\n", " assignment = pd.Series(assignment, index=graph_node_order_series)\n", " partition = Partition(graph, assignment=assignment, updaters=my_updaters)\n", " if i % 1000 == 0:\n", @@ -501,7 +501,7 @@ "### Subsampling\n", "\n", "Often times, when working with ensembles of plans, it is desirable to subsample from the ensemble\n", - "for the sake of winnowing, and the `PyBenDecoder` has native support for this.\n", + "for the sake of winnowing, and the `BenDecoder` has native support for this.\n", "\n", "\n", "We'll work with the \"100k_CO_chain.json.xben\" file which contains 100k districting plans on \n", @@ -540,7 +540,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\n", + "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.ben\"\n", ").subsample_indices([1, 23978, 100000]):\n", " print(assignment[:10])" @@ -566,7 +566,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(\n", + "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(\n", " 1000, 1005\n", "):\n", " print(assignment[:10])" @@ -597,7 +597,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(\n", + "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(\n", " 10000\n", "):\n", " print(assignment[:10])" @@ -623,7 +623,7 @@ "output_type": "stream", "text": [ "/tmp/ipykernel_239360/229284435.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_indices([1, 23978, 100000]):\n" + " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_indices([1, 23978, 100000]):\n" ] }, { @@ -637,7 +637,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\n", + "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", ").subsample_indices([1, 23978, 100000]):\n", " print(assignment[:10])" @@ -654,7 +654,7 @@ "output_type": "stream", "text": [ "/tmp/ipykernel_239360/1010090289.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_range(1000,1005):\n" + " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_range(1000,1005):\n" ] }, { @@ -671,7 +671,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\n", + "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", ").subsample_range(1000, 1005):\n", " print(assignment[:10])" @@ -688,7 +688,7 @@ "output_type": "stream", "text": [ "/tmp/ipykernel_239360/49125867.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in PyBenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n" + " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n" ] }, { @@ -710,7 +710,7 @@ } ], "source": [ - "for assignment in PyBenDecoder(\n", + "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", ").subsample_every(10000):\n", " print(assignment[:10])" diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index 058a6f1..cd13bbd 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -1296,7 +1296,7 @@ def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) - } -def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None: +def test_benencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None: out = tmp_path / "stream.bendl" samples = [[1, 1, 2, 2], [3, 3, 2, 2], [3, 3, 3, 3]] with BenEncoder(out, overwrite=True, variant="standard") as enc: @@ -1317,7 +1317,7 @@ def test_pybenencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None assert list(BenDecoder(extracted, mode="ben")) == samples -def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: out = tmp_path / "with_graph.bendl" samples = [[1, 1, 2, 2], [1, 1, 3, 3]] with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: @@ -1340,7 +1340,7 @@ def test_pybenencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: graph_path = tmp_path / "graph.json" graph_path.write_text(json.dumps(SAMPLE_GRAPH)) @@ -1355,7 +1355,7 @@ def test_pybenencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: # String paths must be accepted verbatim (same coercion Path arguments # go through elsewhere in the API). graph_path = tmp_path / "graph-str.json" @@ -1373,7 +1373,7 @@ def test_pybenencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: raw = json.dumps(SAMPLE_GRAPH).encode("utf-8") out = tmp_path / "via-bytes.bendl" samples = [[2, 2, 2, 2]] @@ -1385,7 +1385,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: buf = io.BytesIO(json.dumps(SAMPLE_GRAPH).encode("utf-8")) out = tmp_path / "via-bytesio.bendl" samples = [[1, 2, 1, 2]] @@ -1397,7 +1397,7 @@ def test_pybenencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: +def test_benencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: buf = io.StringIO(json.dumps(SAMPLE_GRAPH)) out = tmp_path / "via-stringio.bendl" samples = [[3, 3, 3, 3]] @@ -1409,7 +1409,7 @@ def test_pybenencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> None: +def test_benencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> None: out = tmp_path / "full.bendl" rng = random.Random(0xCAFE) samples = [[rng.randint(1, 8) for _ in range(12)] for _ in range(15)] @@ -1426,7 +1426,7 @@ def test_pybenencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> No assert reader.read_graph() == SAMPLE_GRAPH -def test_pybenencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: +def test_benencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: out = tmp_path / "ben-with-graph.ben" with pytest.raises(ValueError, match="ben_file_only"): BenEncoder( @@ -1438,7 +1438,7 @@ def test_pybenencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: ) -def test_pybenencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: +def test_benencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: # A ben_file_only=True output should be byte-identical to the legacy # plain-BEN path, so the header has no BENDL magic. out = tmp_path / "legacy.ben" @@ -1450,7 +1450,7 @@ def test_pybenencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] -def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: +def test_benencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: out = tmp_path / "idem.bendl" enc = BenEncoder(out, overwrite=True, variant="standard") enc.write([1, 1, 2]) @@ -1464,7 +1464,7 @@ def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: assert reader.count_samples() == 1 -def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: +def test_benencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: out = tmp_path / "bad.bendl" with pytest.raises(ValueError, match="graph must be"): BenEncoder(out, overwrite=True, variant="standard", graph=12345) @@ -1480,7 +1480,7 @@ def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: +def test_bendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: samples = [[1, 2, 3], [1, 2, 3], [4, 4, 5]] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -1498,7 +1498,7 @@ def test_pybendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: +def test_bendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: samples = [[1, 1, 2, 2], [3, 3, 4, 4]] bundle = build_bundle( stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), @@ -1514,7 +1514,7 @@ def test_pybendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: +def test_bendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: samples = [[1, 2, 3]] graph_json = b'{"nodes":[0,1],"edges":[[0,1]]}' metadata_json = b'{"note":"hello"}' @@ -1584,7 +1584,7 @@ def test_pybendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_canonical_helpers_return_none_when_absent( +def test_bendecoder_bundle_canonical_helpers_return_none_when_absent( tmp_path: Path, ) -> None: samples = [[1, 2]] @@ -1600,7 +1600,7 @@ def test_pybendecoder_bundle_canonical_helpers_return_none_when_absent( assert dec.read_relabel_map() is None -def test_pybendecoder_bundle_subsample_range(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_range(tmp_path: Path) -> None: samples = [[i, i + 1] for i in range(1, 11)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -1613,7 +1613,7 @@ def test_pybendecoder_bundle_subsample_range(tmp_path: Path) -> None: assert list(dec) == samples[2:6] -def test_pybendecoder_bundle_subsample_indices(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_indices(tmp_path: Path) -> None: samples = [[i] for i in range(1, 9)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -1626,7 +1626,7 @@ def test_pybendecoder_bundle_subsample_indices(tmp_path: Path) -> None: assert list(dec) == [samples[0], samples[3], samples[7]] -def test_pybendecoder_bundle_subsample_every(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_every(tmp_path: Path) -> None: samples = [[i, i] for i in range(1, 11)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -1639,7 +1639,7 @@ def test_pybendecoder_bundle_subsample_every(tmp_path: Path) -> None: assert list(dec) == [samples[1], samples[4], samples[7]] -def test_pybendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: +def test_bendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: # For bundles, the header decides the format — a caller-supplied # `mode="xben"` on a BEN bundle must not confuse the reader. samples = [[1, 2, 3]] @@ -1655,7 +1655,7 @@ def test_pybendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None: +def test_bendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None: # Opening a plain .ben file must still iterate unchanged; the new # bundle surface is simply unavailable. samples = [[1, 2, 3], [4, 5, 6]] @@ -1686,7 +1686,7 @@ def test_pybendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None lambda d: d.read_relabel_map(), ], ) -def test_pybendecoder_plain_stream_rejects_bundle_methods( +def test_bendecoder_plain_stream_rejects_bundle_methods( tmp_path: Path, method_call ) -> None: ben_path = tmp_path / "plain.ben" @@ -1700,7 +1700,7 @@ def test_pybendecoder_plain_stream_rejects_bundle_methods( method_call(dec) -def test_pybendecoder_plain_stream_error_mentions_ben_file_only( +def test_bendecoder_plain_stream_error_mentions_ben_file_only( tmp_path: Path, ) -> None: ben_path = tmp_path / "plain.ben" @@ -1714,7 +1714,7 @@ def test_pybendecoder_plain_stream_error_mentions_ben_file_only( dec.read_graph() -def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> None: +def test_bendecoder_opens_bundle_produced_by_benencoder(tmp_path: Path) -> None: # End-to-end: a bundle written by BenEncoder (with a graph asset) # must round-trip through a single BenDecoder call — no need to # extract the stream first. @@ -1731,7 +1731,7 @@ def test_pybendecoder_opens_bundle_produced_by_pybenencoder(tmp_path: Path) -> N assert list(dec) == [[1, 2, 3], [2, 3, 4]] -def test_pybendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: +def test_bendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: # An incomplete bundle has complete=0 and no directory — its header # carries no authoritative sample_count, so __len__ must fall back # to scanning the stream region. This exercises the @@ -1761,7 +1761,7 @@ def test_pybendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_incomplete_bundle_count_samples_matches_len( +def test_bendecoder_incomplete_bundle_count_samples_matches_len( tmp_path: Path, ) -> None: # Explicit count_samples() also flows through scan_bundle_samples @@ -1785,7 +1785,7 @@ def test_pybendecoder_incomplete_bundle_count_samples_matches_len( assert len(dec) == len(samples) -def test_pybendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: +def test_bendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: # A finalized bundle whose assignment_format byte is neither BEN # nor XBEN must surface a clear error at decoder construction, not # silently fall through. @@ -1800,7 +1800,7 @@ def test_pybendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: BenDecoder(path) -def test_pybendecoder_empty_stream_bundle(tmp_path: Path) -> None: +def test_bendecoder_empty_stream_bundle(tmp_path: Path) -> None: # A bundle containing a valid BEN banner but zero frames must be # openable and produce an empty iterator / zero-length decoder. bundle = build_bundle(stream_bytes=_ben_bytes_for([], tmp_path), sample_count=0) @@ -1815,7 +1815,7 @@ def test_pybendecoder_empty_stream_bundle(tmp_path: Path) -> None: assert dec.list_assets() == [] -def test_pybendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> None: +def test_bendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> None: # Calling TOC / asset methods in between __next__ calls must not # break the iterator — the TOC access uses a separate BendlReader, # not the file handle backing the iterator. @@ -1849,7 +1849,7 @@ def test_pybendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> N next(it) -def test_pybendecoder_bundle_subsample_range_rejects_out_of_bounds( +def test_bendecoder_bundle_subsample_range_rejects_out_of_bounds( tmp_path: Path, ) -> None: samples = [[1, 2], [3, 4], [5, 6]] @@ -1865,7 +1865,7 @@ def test_pybendecoder_bundle_subsample_range_rejects_out_of_bounds( dec.subsample_range(0, 1) -def test_pybendecoder_bundle_subsample_indices_rejects_out_of_bounds( +def test_bendecoder_bundle_subsample_indices_rejects_out_of_bounds( tmp_path: Path, ) -> None: samples = [[1, 2], [3, 4]] @@ -1883,7 +1883,7 @@ def test_pybendecoder_bundle_subsample_indices_rejects_out_of_bounds( dec2.subsample_indices([]) -def test_pybendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> None: samples = [[1], [2], [3]] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -1898,7 +1898,7 @@ def test_pybendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> dec2.subsample_every(0, 1) -def test_pybendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: +def test_bendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: # __len__ caches the scan result; calling it twice must not re-scan # but must return the same answer. samples = [[1, 2], [3, 4], [5, 6]] @@ -1915,7 +1915,7 @@ def test_pybendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: assert dec.count_samples() == len(samples) -def test_pybendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: +def test_bendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: # A 4-byte file cannot start with the BENDL magic; detect_is_bundle # must return false on UnexpectedEof, after which plain-stream # decoding fails with a banner error. @@ -1925,14 +1925,14 @@ def test_pybendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: BenDecoder(path) -def test_pybendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: +def test_bendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: path = tmp_path / "empty.ben" path.write_bytes(b"") with pytest.raises(Exception): BenDecoder(path) -def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: +def test_bendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: # read_json_asset on the decoder should reject non-UTF-8 the same as # error behavior when an asset isn't valid UTF-8. bundle = build_bundle( @@ -1954,7 +1954,7 @@ def test_pybendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> dec.read_json_asset("binary.bin") -def test_pybendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> None: +def test_bendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, @@ -1973,7 +1973,7 @@ def test_pybendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> dec.read_metadata() -def test_pybendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> None: +def test_bendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> None: # A bundle built with BenEncoder compresses the graph asset as xz; # read_graph() on BenDecoder must still return the decoded JSON. out = tmp_path / "xz_graph.bendl" @@ -1986,7 +1986,7 @@ def test_pybendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> No assert dec.read_graph() == SAMPLE_GRAPH -def test_pybendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: +def test_bendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: # XBEN bundles with TOC entries were not previously covered — only # the plain XBEN-bundle auto-detect case. Verify iteration AND TOC # access both work on an XBEN bundle. @@ -2014,7 +2014,7 @@ def test_pybendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> None: # The subsample_indices path that sorts+dedupes unsorted input also # has to work for bundles. Mixing in duplicates should still yield # the deduplicated selection. @@ -2030,7 +2030,7 @@ def test_pybendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> assert list(dec) == [[1], [4]] -def test_pybendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: +def test_bendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: # `assignment_format()` must report "xben" when opened on a plain # XBEN stream as well, not only on bundles. samples = [[1, 1, 2, 2], [2, 2, 1, 1]] @@ -2052,7 +2052,7 @@ def test_pybendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( +def test_bendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( tmp_path: Path, ) -> None: # An incomplete bundle has no directory, so there are no assets to @@ -2083,7 +2083,7 @@ def test_pybendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( assert dec.read_relabel_map() is None -def test_pybendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: +def test_bendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: # `__iter__` rebuilds the underlying frame walker so `for x in dec:` # can be used more than once against a bundle. samples = [[1, 2], [3, 4], [5, 6]] @@ -2098,7 +2098,7 @@ def test_pybendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None: +def test_bendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] ben_path = tmp_path / "twice.ben" with BenEncoder( @@ -2111,7 +2111,7 @@ def test_pybendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None assert list(dec) == samples -def test_pybendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> None: +def test_bendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> None: # Subsample selections must persist across `__iter__` calls, so # iterating the same (subsampled) decoder twice gives the same # filtered window each time. @@ -2128,7 +2128,7 @@ def test_pybendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> No assert list(dec) == expected -def test_pybendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> None: +def test_bendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -2142,7 +2142,7 @@ def test_pybendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> assert list(dec) == expected -def test_pybendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> None: +def test_bendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 11)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), @@ -2156,7 +2156,7 @@ def test_pybendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> No assert list(dec) == expected -def test_pybendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> None: +def test_bendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> None: # Calling subsample_* a second time must replace the first selection # AND survive reiteration with the new selection. samples = [[i] for i in range(1, 8)] @@ -2174,7 +2174,7 @@ def test_pybendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> assert list(dec) == expected -def test_pybendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: +def test_bendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: # Consuming part of the iterator and then calling `iter()` / `list()` # again must restart cleanly from the first sample, not resume # mid-stream. @@ -2192,7 +2192,7 @@ def test_pybendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_count_samples_after_subsample_preserves_len( +def test_bendecoder_count_samples_after_subsample_preserves_len( tmp_path: Path, ) -> None: # After `subsample_*`, `len(dec)` must reflect the filtered count. @@ -2213,7 +2213,7 @@ def test_pybendecoder_count_samples_after_subsample_preserves_len( assert list(dec) == samples[1:5] -def test_pybendecoder_count_samples_plain_after_subsample_preserves_len( +def test_bendecoder_count_samples_plain_after_subsample_preserves_len( tmp_path: Path, ) -> None: # Same contract as above, but on a plain .ben stream to cover the @@ -2234,7 +2234,7 @@ def test_pybendecoder_count_samples_plain_after_subsample_preserves_len( assert list(dec) == expected -def test_pybendecoder_subsample_then_count_samples_then_reiterate( +def test_bendecoder_subsample_then_count_samples_then_reiterate( tmp_path: Path, ) -> None: # Composing subsample → count_samples → restart iteration must keep @@ -2253,7 +2253,7 @@ def test_pybendecoder_subsample_then_count_samples_then_reiterate( assert list(dec) == expected -def test_pybendecoder_bundle_read_json_asset_missing_name_raises_keyerror( +def test_bendecoder_bundle_read_json_asset_missing_name_raises_keyerror( tmp_path: Path, ) -> None: # `read_json_asset` on a valid bundle that does not carry the named @@ -2268,7 +2268,7 @@ def test_pybendecoder_bundle_read_json_asset_missing_name_raises_keyerror( dec.read_json_asset("nope.json") -def test_pybendecoder_bundle_len_uses_header_fast_path(tmp_path: Path) -> None: +def test_bendecoder_bundle_len_uses_header_fast_path(tmp_path: Path) -> None: # For a finalized bundle, `len(dec)` should use the O(1) header # sample_count fast path rather than scanning the stream. We can't # observe the scan directly, but we can verify the result matches diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index c687bda..38b060c 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -297,7 +297,7 @@ def test_subsample_every(tmp_path: Path) -> None: # ---------- Encoder surface (context manager & write) ---------- -def test_pybenencoder_roundtrip(tmp_path: Path) -> None: +def test_benencoder_roundtrip(tmp_path: Path) -> None: rng = random.Random(777) n_samples = 60 seq = gen_sequence_standard(rng, n_samples) @@ -399,7 +399,7 @@ def test_module_exports_are_exposed() -> None: assert hasattr(binary_ensemble, "_core") -def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: +def test_benencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3]] default_ben = tmp_path / "default.ben" @@ -426,7 +426,7 @@ def test_pybenencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: assert list(BenDecoder(alias_xben, mode="xben")) == samples -def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: +def test_benencoder_close_and_write_error_paths(tmp_path: Path) -> None: out = tmp_path / "out.ben" enc = BenEncoder(out, overwrite=True, variant="standard", ben_file_only=True) enc.write([1, 2, 3]) @@ -452,7 +452,7 @@ def test_pybenencoder_close_and_write_error_paths(tmp_path: Path) -> None: invalid_enc.write([65536]) -def test_pybenencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> None: +def test_benencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> None: out = tmp_path / "out.ben" out.write_bytes(b"existing") @@ -841,7 +841,7 @@ def test_decoder_bundle_missing_asset_raises_keyerror(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybenencoder_bundle_without_graph(tmp_path: Path) -> None: +def test_benencoder_bundle_without_graph(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "no_graph.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -855,7 +855,7 @@ def test_pybenencoder_bundle_without_graph(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybenencoder_bundle_graph_from_dict(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_dict(tmp_path: Path) -> None: graph = {"test": True} path = tmp_path / "dict_graph.bendl" with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: @@ -864,7 +864,7 @@ def test_pybenencoder_bundle_graph_from_dict(tmp_path: Path) -> None: assert dec.read_graph() == graph -def test_pybenencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: graph = {"test": "bytes"} path = tmp_path / "bytes_graph.bendl" with BenEncoder( @@ -874,7 +874,7 @@ def test_pybenencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: graph = {"test": "bytearray"} path = tmp_path / "ba_graph.bendl" with BenEncoder( @@ -887,7 +887,7 @@ def test_pybenencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: graph = {"test": "path"} gpath = tmp_path / "g.json" gpath.write_text(json.dumps(graph), encoding="utf-8") @@ -897,7 +897,7 @@ def test_pybenencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: graph = {"test": "str_path"} gpath = tmp_path / "g2.json" gpath.write_text(json.dumps(graph), encoding="utf-8") @@ -907,7 +907,7 @@ def test_pybenencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: graph = {"test": "bytesio"} path = tmp_path / "bio_graph.bendl" with BenEncoder( @@ -920,7 +920,7 @@ def test_pybenencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: +def test_benencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: graph = {"test": "stringio"} path = tmp_path / "sio_graph.bendl" with BenEncoder( @@ -933,7 +933,7 @@ def test_pybenencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: assert BenDecoder(path).read_graph() == graph -def test_pybenencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> None: +def test_benencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> None: with pytest.raises(ValueError, match="graph.*cannot be combined"): BenEncoder( tmp_path / "bad.ben", @@ -944,7 +944,7 @@ def test_pybenencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> ) -def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: +def test_benencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: with pytest.raises(ValueError, match="graph must be"): BenEncoder( tmp_path / "bad.bendl", @@ -954,7 +954,7 @@ def test_pybenencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: ) -def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: +def test_benencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: path = tmp_path / "idempotent.bendl" enc = BenEncoder(path, overwrite=True, variant="standard") enc.write([1, 2]) @@ -963,7 +963,7 @@ def test_pybenencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: assert list(BenDecoder(path)) == [[1, 2]] -def test_pybenencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: +def test_benencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: path = tmp_path / "closed.bendl" enc = BenEncoder(path, overwrite=True, variant="standard") enc.write([1]) @@ -977,7 +977,7 @@ def test_pybenencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: +def test_bendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] path = tmp_path / "auto.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -988,7 +988,7 @@ def test_pybendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_toc_methods(tmp_path: Path) -> None: +def test_bendecoder_bundle_toc_methods(tmp_path: Path) -> None: graph = {"g": 1} path = tmp_path / "toc.bendl" with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: @@ -1022,7 +1022,7 @@ def test_pybendecoder_bundle_toc_methods(tmp_path: Path) -> None: assert dec.read_relabel_map() is None -def test_pybendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: samples = [[i] for i in range(1, 11)] path = tmp_path / "subsample.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1042,7 +1042,7 @@ def test_pybendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: assert list(dec3) == [samples[1], samples[4], samples[7]] -def test_pybendecoder_bundle_len_and_count(tmp_path: Path) -> None: +def test_bendecoder_bundle_len_and_count(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "len.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1055,7 +1055,7 @@ def test_pybendecoder_bundle_len_and_count(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_iteration_restart(tmp_path: Path) -> None: +def test_bendecoder_bundle_iteration_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "restart.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1067,7 +1067,7 @@ def test_pybendecoder_bundle_iteration_restart(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> None: +def test_bendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] path = tmp_path / "re_sub.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1081,7 +1081,7 @@ def test_pybendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> N assert list(dec) == expected -def test_pybendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: +def test_bendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: path = tmp_path / "plain.ben" with BenEncoder( path, overwrite=True, variant="standard", ben_file_only=True @@ -1107,7 +1107,7 @@ def test_pybendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: getattr(dec, method)(*args) -def test_pybendecoder_bundle_count_samples_preserves_subsample_len( +def test_bendecoder_bundle_count_samples_preserves_subsample_len( tmp_path: Path, ) -> None: samples = [[i] for i in range(1, 9)] @@ -1128,7 +1128,7 @@ def test_pybendecoder_bundle_count_samples_preserves_subsample_len( # --------------------------------------------------------------------------- -def test_pybendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: +def test_bendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) @@ -1153,7 +1153,7 @@ def test_pybendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_xben_plain_stream(tmp_path: Path) -> None: +def test_bendecoder_xben_plain_stream(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) @@ -1179,7 +1179,7 @@ def test_pybendecoder_xben_plain_stream(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "empty_idx.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1191,7 +1191,7 @@ def test_pybendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: dec.subsample_indices([]) -def test_pybendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_idx.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1203,7 +1203,7 @@ def test_pybendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: dec.subsample_indices([0, 1, 2]) -def test_pybendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_start.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1215,7 +1215,7 @@ def test_pybendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: dec.subsample_range(0, 2) -def test_pybendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "bad_range.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1227,7 +1227,7 @@ def test_pybendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> Non dec.subsample_range(5, 2) -def test_pybendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_step.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1239,7 +1239,7 @@ def test_pybendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: dec.subsample_every(0) -def test_pybendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None: +def test_bendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None: samples = [[1], [2]] path = tmp_path / "zero_off.bendl" with BenEncoder(path, overwrite=True, variant="standard") as enc: @@ -1256,7 +1256,7 @@ def test_pybendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None # --------------------------------------------------------------------------- -def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: +def test_bendecoder_plain_subsample_indices(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub.ben" with BenEncoder( @@ -1270,7 +1270,7 @@ def test_pybendecoder_plain_subsample_indices(tmp_path: Path) -> None: assert list(dec) == [[1], [3], [5]] -def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: +def test_bendecoder_plain_subsample_range(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_range.ben" with BenEncoder( @@ -1284,7 +1284,7 @@ def test_pybendecoder_plain_subsample_range(tmp_path: Path) -> None: assert list(dec) == [[2], [3], [4]] -def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: +def test_bendecoder_plain_subsample_every(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5], [6]] path = tmp_path / "plain_every.ben" with BenEncoder( @@ -1303,7 +1303,7 @@ def test_pybendecoder_plain_subsample_every(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: +def test_bendecoder_plain_len_and_count(tmp_path: Path) -> None: samples = [[1], [2], [3]] path = tmp_path / "plain_len.ben" with BenEncoder( @@ -1317,7 +1317,7 @@ def test_pybendecoder_plain_len_and_count(tmp_path: Path) -> None: assert len(dec) == 3 -def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: +def test_bendecoder_plain_len_after_subsample(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] path = tmp_path / "plain_sub_len.ben" with BenEncoder( @@ -1338,7 +1338,7 @@ def test_pybendecoder_plain_len_after_subsample(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: +def test_bendecoder_plain_multiple_iterations(tmp_path: Path) -> None: samples = [[1, 2], [3, 4]] path = tmp_path / "multi_iter.ben" with BenEncoder( @@ -1353,7 +1353,7 @@ def test_pybendecoder_plain_multiple_iterations(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: +def test_bendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: samples = [[i] for i in range(1, 8)] path = tmp_path / "plain_re_sub.ben" with BenEncoder( @@ -1374,7 +1374,7 @@ def test_pybendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> No # --------------------------------------------------------------------------- -def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: +def test_benencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: samples = [[10, 20, 30], [40, 50, 60]] path = tmp_path / "ben_only.ben" with BenEncoder( @@ -1388,7 +1388,7 @@ def test_pybenencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybenencoder_ben_file_only_mkv(tmp_path: Path) -> None: +def test_benencoder_ben_file_only_mkv(tmp_path: Path) -> None: samples = [[1, 2], [1, 2], [3, 4]] path = tmp_path / "ben_mkv.ben" with BenEncoder( @@ -1401,7 +1401,7 @@ def test_pybenencoder_ben_file_only_mkv(tmp_path: Path) -> None: assert list(dec) == samples -def test_pybenencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: +def test_benencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: samples = [[5, 6]] path = tmp_path / "close_reopen.ben" enc = BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) @@ -1417,7 +1417,7 @@ def test_pybenencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybenencoder_bundle_with_metadata(tmp_path: Path) -> None: +def test_benencoder_bundle_with_metadata(tmp_path: Path) -> None: samples = [[1, 2]] graph = {"nodes": [{"id": 0}], "adjacency": [[]]} path = tmp_path / "with_meta.bendl" @@ -1434,7 +1434,7 @@ def test_pybenencoder_bundle_with_metadata(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: +def test_bendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: path = tmp_path / "plain_extract.ben" with BenEncoder( path, overwrite=True, variant="standard", ben_file_only=True @@ -1516,7 +1516,7 @@ def test_encode_ben_to_xben_roundtrip(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_unknown_mode_raises(tmp_path: Path) -> None: +def test_bendecoder_unknown_mode_raises(tmp_path: Path) -> None: path = tmp_path / "dummy.ben" path.write_bytes(b"\x00" * 100) with pytest.raises(Exception): @@ -1528,7 +1528,7 @@ def test_pybendecoder_unknown_mode_raises(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_pybendecoder_mkv_plain_stream(tmp_path: Path) -> None: +def test_bendecoder_mkv_plain_stream(tmp_path: Path) -> None: samples = [[1, 2], [1, 2], [3, 4]] src = tmp_path / "mkv_src.jsonl" write_jsonl(samples, src) From b2a0a372f21b0b44dbe37524289068b6859f8e42 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 2 Jun 2026 01:33:37 -0600 Subject: [PATCH 132/221] make two-delta work better on things like SB or arbitrary ensembles --- ben/src/codec/decode/tests/twodelta.rs | 212 +++++++++++++ ben/src/codec/encode/twodelta.rs | 5 + ben/src/codec/frames/decode.rs | 8 +- ben/src/io/reader/stream_reader/ben.rs | 57 ++-- ben/src/io/reader/stream_reader/frames.rs | 4 +- ben/src/io/reader/stream_reader/mod.rs | 6 - ben/src/io/reader/tests.rs | 99 ++++++- ben/src/io/reader/twodelta.rs | 5 + ben/src/io/writer/stream_writer/ben.rs | 76 +++-- ben/src/io/writer/stream_writer/xben.rs | 280 +++++++++++------- ben/src/io/writer/tests.rs | 96 +++++- ben/src/io/writer/twodelta.rs | 88 ++++++ ben/src/lib.rs | 5 + ben/src/ops/relabel/tests.rs | 18 +- .../fixtures/v1.0.0/source_twodelta.jsonl | 5 + ben/tests/fixtures/v1.0.0/twodelta.ben | Bin 62 -> 64 bytes ben/tests/fixtures/v1.0.0/twodelta.xben | Bin 112 -> 116 bytes ben/tests/test_format_stability.rs | 92 ++++-- ben/tests/test_impls_pipeline.rs | 55 ++-- ben/tests/test_stress_edges.rs | 2 + 20 files changed, 879 insertions(+), 234 deletions(-) create mode 100644 ben/tests/fixtures/v1.0.0/source_twodelta.jsonl diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index 679a5c4..ed89806 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -9,6 +9,7 @@ use crate::codec::decode::{ }; use crate::codec::encode::{encode_ben_to_xben, encode_twodelta_frame}; use crate::codec::frames::BenEncodeFrame; +use crate::io::reader::twodelta::{BEN_TWODELTA_DELTA_TAG, BEN_TWODELTA_SNAPSHOT_TAG}; use crate::io::writer::BenStreamWriter; use crate::util::rle::rle_to_vec; use crate::BenVariant; @@ -224,6 +225,8 @@ fn decode_ben_to_jsonl_twodelta_multiple_repeated_deltas() { fn decode_ben_to_jsonl_underflow_anchor() { // Mirrors test_jsonl_decode_ben_underflow: 2-byte payload, 1 padding bit. let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[2, 3, 0, 0, 0, 2, 0b01100_100, 0b01_11011_0]); ben.extend_from_slice(&1u16.to_be_bytes()); @@ -244,6 +247,8 @@ fn decode_ben_to_jsonl_underflow_anchor() { fn decode_ben_to_jsonl_exact_anchor() { // Mirrors test_jsonl_decode_ben_exact: 5-byte payload, zero padding. let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[2, 3, 0, 0, 0, 5]); ben.extend_from_slice(&[ 0b01100_100, @@ -279,6 +284,8 @@ fn decode_ben_to_jsonl_exact_anchor() { #[test] fn decode_ben_to_jsonl_16bit_val_anchor() { let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[10, 3, 0, 0, 0, 5]); ben.extend_from_slice(&[ 0b00000000, @@ -305,6 +312,8 @@ fn decode_ben_to_jsonl_16bit_val_anchor() { #[test] fn decode_ben_to_jsonl_16bit_len_anchor() { let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[2, 10, 0, 0, 0, 5]); ben.extend_from_slice(&[ 0b01000000, @@ -331,6 +340,8 @@ fn decode_ben_to_jsonl_16bit_len_anchor() { #[test] fn decode_ben_to_jsonl_max_val_65535_anchor() { let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[16, 4, 0, 0, 0, 8]); ben.extend_from_slice(&[ 0b00000000, @@ -360,6 +371,8 @@ fn decode_ben_to_jsonl_max_val_65535_anchor() { #[test] fn decode_ben_to_jsonl_max_len_65535_anchor() { let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[6, 16, 0, 0, 0, 9]); ben.extend_from_slice(&[ 0b01011100, @@ -390,6 +403,8 @@ fn decode_ben_to_jsonl_max_len_65535_anchor() { #[test] fn decode_ben_to_jsonl_max_val_and_len_65535_anchor() { let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[16, 16, 0, 0, 0, 12]); ben.extend_from_slice(&[ 0b00000000, @@ -424,6 +439,8 @@ fn decode_ben_to_jsonl_max_val_and_len_65535_anchor() { fn decode_ben_to_jsonl_single_element_anchor() { // Anchor assignment [23], count=1. let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[5, 1, 0, 0, 0, 1, 0b101111_00]); ben.extend_from_slice(&1u16.to_be_bytes()); @@ -437,6 +454,8 @@ fn decode_ben_to_jsonl_single_element_anchor() { fn decode_ben_to_jsonl_single_one_anchor() { // Anchor assignment [1], count=1. let mut ben = b"TWODELTA BEN FILE".to_vec(); + // The anchor (first) frame carries the snapshot tag; its body is MkvChain-formatted. + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[1, 1, 0, 0, 0, 1, 0b11_000000]); ben.extend_from_slice(&1u16.to_be_bytes()); @@ -466,9 +485,14 @@ fn decode_ben_to_jsonl_three_frames_byte_level() { // Delta [2,1]→[1,2]: // pair=(1,2), run_lengths=[1,1], same encoding // raw_bytes = [0,1, 0,2, 1, 0,0,0,1, 0xC0, 0,1] + // + // Each frame is prefixed with its per-frame tag: snapshot for the anchor, delta for the rest. let mut ben = b"TWODELTA BEN FILE".to_vec(); + ben.push(BEN_TWODELTA_SNAPSHOT_TAG); ben.extend_from_slice(&[2, 1, 0, 0, 0, 1, 0x74, 0, 1]); + ben.push(BEN_TWODELTA_DELTA_TAG); ben.extend_from_slice(&[0, 2, 0, 1, 1, 0, 0, 0, 1, 0xC0, 0, 1]); + ben.push(BEN_TWODELTA_DELTA_TAG); ben.extend_from_slice(&[0, 1, 0, 2, 1, 0, 0, 0, 1, 0xC0, 0, 1]); let mut out = Vec::new(); @@ -552,3 +576,191 @@ fn decode_xben_to_jsonl_twodelta_with_repetitions() { let expected: String = (1..=3).map(|i| expected_line(&anchor, i)).collect(); assert_eq!(jsonl, expected.as_bytes()); } + +// ─── snapshot/delta fallback: tags, round-trips, error handling ───────── + +/// Walk a `TwoDelta` BEN stream past its banner and collect the per-frame tag bytes, skipping over +/// each frame body so the next tag is found. Panics on a malformed/unknown tag. +fn collect_twodelta_tags(ben: &[u8]) -> Vec { + const BANNER_LEN: usize = 17; + let mut tags = Vec::new(); + let mut pos = BANNER_LEN; + while pos < ben.len() { + let tag = ben[pos]; + pos += 1; + match tag { + BEN_TWODELTA_SNAPSHOT_TAG => { + // MkvChain body: max_val(1) max_len(1) n_bytes(4 BE) payload(n_bytes) count(2). + let n_bytes = + u32::from_be_bytes(ben[pos + 2..pos + 6].try_into().unwrap()) as usize; + pos += 1 + 1 + 4 + n_bytes + 2; + } + BEN_TWODELTA_DELTA_TAG => { + // Delta body: pair(4) max_len(1) n_bytes(4 BE) payload(n_bytes) count(2). + let n_bytes = + u32::from_be_bytes(ben[pos + 5..pos + 9].try_into().unwrap()) as usize; + pos += 4 + 1 + 4 + n_bytes + 2; + } + other => panic!("unexpected tag byte {other:#04x} at offset {}", pos - 1), + } + tags.push(tag); + } + tags +} + +fn decode_twodelta_ben_to_assignments(ben: &[u8]) -> Vec> { + let reader = crate::io::reader::BenStreamReader::from_ben(ben).unwrap(); + reader.map(|r| r.unwrap().0).collect() +} + +#[test] +fn twodelta_ben_first_frame_carries_snapshot_tag() { + let ben = make_twodelta_ben(&[vec![1u16, 1, 2, 2], vec![1u16, 2, 1, 2]]); + assert_eq!(collect_twodelta_tags(&ben).first().copied(), Some(BEN_TWODELTA_SNAPSHOT_TAG)); +} + +#[test] +fn twodelta_ben_interleaved_swap_multiswap_swap_tags_and_roundtrip() { + // anchor → 2-swap → 3-id swap (snapshot) → 2-swap. The final delta must decode correctly, + // proving masks were rebuilt across the mid-stream snapshot. + let a0 = vec![1u16, 1, 2, 2]; // snapshot (anchor) + let a1 = vec![1u16, 2, 1, 2]; // delta from a0 (swap pos1/pos2) + let a2 = vec![3u16, 3, 1, 2]; // 3 ids change vs a1 → snapshot + let a3 = vec![3u16, 3, 2, 1]; // delta from a2 (swap pos2/pos3; both ids present in a2) + let assignments = vec![a0.clone(), a1.clone(), a2.clone(), a3.clone()]; + let ben = make_twodelta_ben(&assignments); + + assert_eq!( + collect_twodelta_tags(&ben), + vec![ + BEN_TWODELTA_SNAPSHOT_TAG, + BEN_TWODELTA_DELTA_TAG, + BEN_TWODELTA_SNAPSHOT_TAG, + BEN_TWODELTA_DELTA_TAG, + ] + ); + assert_eq!(decode_twodelta_ben_to_assignments(&ben), assignments); +} + +#[test] +fn twodelta_ben_new_district_falls_back_to_snapshot_then_resumes_delta() { + // A 2-id transition that introduces a district absent from the previous assignment has no mask + // to delta against → snapshot. Once present, later 2-swaps among those ids delta normally. + let c0 = vec![1u16, 1, 1, 1]; // snapshot (anchor) + let c1 = vec![1u16, 1, 2, 2]; // introduces district 2 → snapshot fallback + let c2 = vec![1u16, 2, 1, 2]; // delta from c1 (both ids present) + let assignments = vec![c0.clone(), c1.clone(), c2.clone()]; + let ben = make_twodelta_ben(&assignments); + + assert_eq!( + collect_twodelta_tags(&ben), + vec![ + BEN_TWODELTA_SNAPSHOT_TAG, + BEN_TWODELTA_SNAPSHOT_TAG, + BEN_TWODELTA_DELTA_TAG, + ] + ); + assert_eq!(decode_twodelta_ben_to_assignments(&ben), assignments); +} + +#[test] +fn twodelta_ben_general_ensemble_roundtrip() { + // A pseudo-random mix of multi-district moves and 2-swaps over a fixed label set. + let mut assignments = Vec::new(); + let mut state: u64 = 0x9E3779B97F4A7C15; + let mut next = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + state + }; + for _ in 0..200 { + let assignment: Vec = (0..16).map(|_| (next() % 4) as u16 + 1).collect(); + assignments.push(assignment); + } + let ben = make_twodelta_ben(&assignments); + assert_eq!(decode_twodelta_ben_to_assignments(&ben), assignments); +} + +#[test] +fn twodelta_ben_count_samples_over_mixed_stream() { + let anchor = vec![1u16, 1, 2, 2]; + let assignments = vec![ + anchor.clone(), + anchor.clone(), // repeat of anchor + vec![1u16, 2, 1, 2], // delta + vec![3u16, 3, 1, 2], // snapshot + vec![3u16, 3, 1, 2], // repeat of snapshot + vec![3u16, 3, 2, 1], // delta + ]; + let ben = make_twodelta_ben(&assignments); + let reader = crate::io::reader::BenStreamReader::from_ben(ben.as_slice()).unwrap(); + assert_eq!(reader.count_samples().unwrap(), assignments.len()); +} + +#[test] +fn twodelta_ben_unknown_tag_rejected() { + let mut ben = make_twodelta_ben(&[vec![1u16, 1, 2, 2]]); + ben.push(0x07); // not a valid per-frame tag + let mut out = Vec::new(); + let err = decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); +} + +#[test] +fn twodelta_ben_truncated_after_tag_errors_not_clean_eof() { + let mut ben = make_twodelta_ben(&[vec![1u16, 1, 2, 2]]); + ben.push(BEN_TWODELTA_DELTA_TAG); // a tag with no body following + let mut out = Vec::new(); + let err = decode_ben_to_jsonl(ben.as_slice(), &mut out).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); +} + +#[test] +fn decode_xben_to_jsonl_twodelta_mixed_via_translate_roundtrip() { + // The BEN→XBEN translate path must parse the per-frame tag and emit a mid-stream full frame + // for the snapshot, which then decodes correctly. + let a0 = vec![1u16, 1, 2, 2]; + let a1 = vec![1u16, 2, 1, 2]; // delta + let a2 = vec![3u16, 3, 1, 2]; // snapshot (3 ids) + let a3 = vec![3u16, 3, 2, 1]; // delta from the snapshot + let assignments = vec![a0.clone(), a1.clone(), a2.clone(), a3.clone()]; + let ben = make_twodelta_ben(&assignments); + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + let expected = expected_line(&a0, 1) + + &expected_line(&a1, 2) + + &expected_line(&a2, 3) + + &expected_line(&a3, 4); + assert_eq!(jsonl, expected.as_bytes()); +} + +#[test] +fn decode_xben_to_jsonl_twodelta_delta_snapshot_repeat_delta_via_translate() { + // delta → snapshot → repeat → delta must round-trip through the translate path. + let a0 = vec![1u16, 1, 2, 2]; + let a1 = vec![1u16, 2, 1, 2]; // delta + let a2 = vec![3u16, 3, 1, 2]; // snapshot + let a3 = vec![3u16, 3, 1, 2]; // repeat of the snapshot + let a4 = vec![3u16, 3, 2, 1]; // delta from the snapshot + let assignments = vec![a0.clone(), a1.clone(), a2.clone(), a3.clone(), a4.clone()]; + let ben = make_twodelta_ben(&assignments); + + let mut xben = Vec::new(); + encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None) + .unwrap(); + + let mut jsonl = Vec::new(); + decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); + let expected: String = assignments + .iter() + .enumerate() + .map(|(i, a)| expected_line(a, i + 1)) + .collect(); + assert_eq!(jsonl, expected.as_bytes()); +} diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index 02a2482..e1cfb3a 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -6,6 +6,11 @@ use std::io::{Error, ErrorKind, Result}; /// Encode a transition between two assignment vectors as a TwoDelta frame, optionally using /// caller-supplied hints to accelerate encoding. /// +/// This single-frame builder is **strict**: it errors on any transition that is not a clean 2-swap +/// (see [`EncodeError::TwoDeltaTooManyIds`]). Stream-level generality — falling back to a snapshot +/// frame for multi-district or new-district transitions — is provided by the writer's transition +/// classifier, not by this builder. +/// /// # Arguments /// /// * `previous_assignment` - The full assignment vector from the preceding sample. diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index e5243f9..84532d6 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -56,9 +56,11 @@ impl BenDecodeFrame { /// Returns `Ok(None)` on a clean EOF at a frame boundary, `Ok(Some(frame))` on success, and /// `Err` on any I/O or format error. /// - /// Note: in a `TwoDelta` *stream*, the first frame is encoded in `MkvChain` wire format. The - /// caller (e.g. [`BenStreamReader`]) tracks that state and passes [`BenVariant::MkvChain`] for - /// the first frame and [`BenVariant::TwoDelta`] for the rest. + /// Note: in a `TwoDelta` *stream* the body layout is chosen per frame — snapshot frames are + /// `MkvChain`-formatted and delta frames are `TwoDelta`-formatted. That choice is carried by a + /// 1-byte tag the stream reader (e.g. [`BenStreamReader`]) consumes before calling this; it + /// resolves the tag to a [`BenVariant`] and passes it here. This function reads the body for + /// whatever variant it is given and is unaware of the tag. /// /// [`BenStreamReader`]: crate::io::reader::BenStreamReader pub fn from_reader(reader: &mut impl Read, variant: BenVariant) -> io::Result> { diff --git a/ben/src/io/reader/stream_reader/ben.rs b/ben/src/io/reader/stream_reader/ben.rs index b043919..41769e5 100644 --- a/ben/src/io/reader/stream_reader/ben.rs +++ b/ben/src/io/reader/stream_reader/ben.rs @@ -2,37 +2,59 @@ use std::io::{self, Read}; +use byteorder::ReadBytesExt; + use super::zero_count_frame_error; use crate::codec::BenDecodeFrame; use crate::io::reader::subsample::MkvRecord; +use crate::io::reader::twodelta::{BEN_TWODELTA_DELTA_TAG, BEN_TWODELTA_SNAPSHOT_TAG}; use crate::progress::Spinner; use crate::BenVariant; /// Read the next frame from the underlying BEN stream. /// -/// In a `TwoDelta` stream the first frame is encoded in `MkvChain` wire format; this helper tracks -/// that state so the frame module stays variant-clean. +/// Every frame of a `TwoDelta` stream is prefixed with a 1-byte tag selecting its body layout: a +/// `BEN_TWODELTA_SNAPSHOT_TAG` frame is `MkvChain`-formatted and a `BEN_TWODELTA_DELTA_TAG` frame is +/// a delta. The tag is consumed here so the frame module stays variant-clean. Non-`TwoDelta` +/// streams carry no tag and read their fixed body directly. pub(super) fn pop_frame_from_reader( reader: &mut R, variant: BenVariant, - twodelta_consumed_first_frame: &mut bool, ) -> Option> { - let read_variant = if variant == BenVariant::TwoDelta && !*twodelta_consumed_first_frame { - *twodelta_consumed_first_frame = true; - BenVariant::MkvChain - } else { - variant - }; + if variant != BenVariant::TwoDelta { + return BenDecodeFrame::from_reader(reader, variant).transpose(); + } - BenDecodeFrame::from_reader(reader, read_variant).transpose() + // A clean EOF *at the tag boundary* ends the stream; an EOF after the tag is a truncated frame. + let tag = match reader.read_u8() { + Ok(t) => t, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return None, + Err(e) => return Some(Err(e)), + }; + let resolved = match tag { + BEN_TWODELTA_SNAPSHOT_TAG => BenVariant::MkvChain, + BEN_TWODELTA_DELTA_TAG => BenVariant::TwoDelta, + other => { + return Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown TwoDelta frame tag byte {other:#04x}"), + ))) + } + }; + match BenDecodeFrame::from_reader(reader, resolved) { + Ok(Some(frame)) => Some(Ok(frame)), + Ok(None) => Some(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "truncated TwoDelta frame: tag byte present but frame body missing", + ))), + Err(e) => Some(Err(e)), + } } -#[allow(clippy::too_many_arguments)] pub(super) fn for_each_assignment_ben( reader: &mut R, variant: BenVariant, previous_assignment: &mut Option>, - twodelta_consumed_first_frame: &mut bool, sample_count: &mut usize, spinner: &mut Option, silent: bool, @@ -42,7 +64,7 @@ where F: FnMut(&[u16], u16) -> io::Result, { loop { - let frame = match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + let frame = match pop_frame_from_reader(reader, variant) { Some(Ok(frame)) => frame, Some(Err(e)) => return Err(e), None => return Ok(()), @@ -69,17 +91,15 @@ where } } -#[allow(clippy::too_many_arguments)] pub(super) fn next_record_ben( reader: &mut R, variant: BenVariant, previous_assignment: &mut Option>, - twodelta_consumed_first_frame: &mut bool, sample_count: &mut usize, spinner: &mut Option, silent: bool, ) -> Option> { - let frame = match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + let frame = match pop_frame_from_reader(reader, variant) { Some(Ok(frame)) => frame, Some(Err(e)) => return Some(Err(e)), None => return None, @@ -103,11 +123,8 @@ pub(super) fn next_record_ben( } pub(super) fn count_samples_ben(mut reader: R, variant: BenVariant) -> io::Result { - let mut twodelta_consumed_first_frame = false; let mut total = 0usize; - while let Some(frame_res) = - pop_frame_from_reader(&mut reader, variant, &mut twodelta_consumed_first_frame) - { + while let Some(frame_res) = pop_frame_from_reader(&mut reader, variant) { let count = frame_res?.count(); if count == 0 { return Err(zero_count_frame_error("BEN")); diff --git a/ben/src/io/reader/stream_reader/frames.rs b/ben/src/io/reader/stream_reader/frames.rs index 10f00f3..a2958dd 100644 --- a/ben/src/io/reader/stream_reader/frames.rs +++ b/ben/src/io/reader/stream_reader/frames.rs @@ -64,12 +64,11 @@ impl Iterator for BenStreamFrameReader { BenStreamInner::Ben { reader, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, } => match variant { BenVariant::Standard | BenVariant::MkvChain => { - match pop_frame_from_reader(reader, variant, twodelta_consumed_first_frame) { + match pop_frame_from_reader(reader, variant) { Some(Ok(frame)) => { let count = frame.count(); if count == 0 { @@ -86,7 +85,6 @@ impl Iterator for BenStreamFrameReader { reader, variant, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, silent, diff --git a/ben/src/io/reader/stream_reader/mod.rs b/ben/src/io/reader/stream_reader/mod.rs index 86ed4f3..ed4194f 100644 --- a/ben/src/io/reader/stream_reader/mod.rs +++ b/ben/src/io/reader/stream_reader/mod.rs @@ -55,7 +55,6 @@ pub(crate) enum BenStreamInner { Ben { reader: R, previous_assignment: Option>, - twodelta_consumed_first_frame: bool, sample_count: usize, spinner: Option, }, @@ -91,7 +90,6 @@ impl BenStreamReader { inner: BenStreamInner::Ben { reader, previous_assignment: None, - twodelta_consumed_first_frame: false, sample_count: 0, spinner: None, }, @@ -199,14 +197,12 @@ impl BenStreamReader { BenStreamInner::Ben { reader, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, } => ben::for_each_assignment_ben( reader, variant, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, silent, @@ -250,14 +246,12 @@ impl Iterator for BenStreamReader { BenStreamInner::Ben { reader, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, } => ben::next_record_ben( reader, variant, previous_assignment, - twodelta_consumed_first_frame, sample_count, spinner, silent, diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index 4a1d951..e84b83a 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -211,6 +211,88 @@ fn xz_reader_into_frames_twodelta() { assert_eq!(frames.len(), 2); } +/// Build a plain-BEN stream using BenStreamWriter directly. +fn make_ben_from_assignments(assignments: &[Vec], variant: BenVariant) -> Vec { + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, variant).unwrap(); + for a in assignments { + writer.write_assignment(a.clone()).unwrap(); + } + } + ben +} + +/// A TwoDelta sequence that exercises every framing path: anchor (full), a 2-swap delta, a +/// multi-district transition (mid-stream snapshot), a repeat of that snapshot, and a delta rebased +/// onto it. +fn mixed_twodelta_assignments() -> Vec> { + vec![ + vec![1u16, 1, 2, 2], + vec![1u16, 2, 1, 2], + vec![3u16, 3, 1, 2], + vec![3u16, 3, 1, 2], + vec![3u16, 3, 2, 1], + ] +} + +/// Drive a raw-frame iterator to completion, expanding each self-contained frame `count` times. +fn expand_raw_frames(frames: BenStreamFrameReader) -> Vec> { + let mut out = Vec::new(); + for item in frames { + let (frame, count) = item.unwrap(); + let assignment = frame.expand_self_contained().unwrap(); + for _ in 0..count { + out.push(assignment.clone()); + } + } + out +} + +#[test] +fn raw_frame_surface_roundtrips_mixed_twodelta_ben() { + // The subsample/raw-frame surface materializes each TwoDelta frame and re-encodes it as a + // self-contained Standard frame. A mixed snapshot/delta stream must round-trip across it. + let assignments = mixed_twodelta_assignments(); + let ben = make_ben_from_assignments(&assignments, BenVariant::TwoDelta); + let frames = BenStreamReader::from_ben(Cursor::new(ben)).unwrap().into_frames(); + assert_eq!(expand_raw_frames(frames), assignments); +} + +#[test] +fn raw_frame_surface_roundtrips_mixed_twodelta_xben() { + let assignments = mixed_twodelta_assignments(); + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let frames = BenStreamReader::from_xben(Cursor::new(xben)).unwrap().into_frames(); + assert_eq!(expand_raw_frames(frames), assignments); +} + +#[test] +fn subsample_mixed_twodelta_ben_selects_correct_samples() { + // Subsampling rides on the raw-frame surface; selecting across the mid-stream snapshot must + // still rebase the later delta correctly. + let assignments = mixed_twodelta_assignments(); + let ben = make_ben_from_assignments(&assignments, BenVariant::TwoDelta); + let results: Vec<_> = BenStreamReader::from_ben(Cursor::new(ben)) + .unwrap() + .into_subsample_by_indices(vec![1, 3, 5]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results, vec![assignments[0].clone(), assignments[2].clone(), assignments[4].clone()]); +} + +#[test] +fn subsample_mixed_twodelta_xben_selects_correct_samples() { + let assignments = mixed_twodelta_assignments(); + let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); + let results: Vec<_> = BenStreamReader::from_xben(Cursor::new(xben)) + .unwrap() + .into_subsample_by_indices(vec![1, 3, 5]) + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(results, vec![assignments[0].clone(), assignments[2].clone(), assignments[4].clone()]); +} + #[test] fn xz_frame_reader_new() { let jsonl = r#"{"assignment":[1,1,2,2],"sample":1} @@ -1618,17 +1700,18 @@ fn raw_frame_iter_propagates_twodelta_decode_error() { writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } - // Locate the TwoDelta delta frame start by parsing the anchor (MkvChain) frame header: - // banner(17) + max_val_bits(1) + max_len_bits(1) + n_bytes(4 BE) + payload(n_bytes) + count(2) - // = anchor_end. + // Locate the TwoDelta delta frame start by parsing the anchor (snapshot/MkvChain) frame header: + // banner(17) + snapshot_tag(1) + max_val_bits(1) + max_len_bits(1) + n_bytes(4 BE) + + // payload(n_bytes) + count(2) = anchor_end (which points at the next frame's tag byte). let banner_len = 17usize; + let anchor_start = banner_len + 1; // skip the snapshot tag let n_bytes = - u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; - let anchor_end = banner_len + 6 + n_bytes + 2; + u32::from_be_bytes(ben[anchor_start + 2..anchor_start + 6].try_into().unwrap()) as usize; + let anchor_end = anchor_start + 6 + n_bytes + 2; - // The TwoDelta delta frame: pair_a(2) + pair_b(2) + max_len_bits(1) + ... Set max_len_bits to - // 0, which triggers InvalidData during decoding. - ben[anchor_end + 4] = 0; + // The delta frame: delta_tag(1) + pair_a(2) + pair_b(2) + max_len_bits(1) + ... Set max_len_bits + // to 0, which triggers InvalidData during decoding. + ben[anchor_end + 5] = 0; let reader = BenStreamReader::from_ben(Cursor::new(ben)).unwrap(); let mut iter = reader.into_frames(); diff --git a/ben/src/io/reader/twodelta.rs b/ben/src/io/reader/twodelta.rs index adc43f1..075fd4e 100644 --- a/ben/src/io/reader/twodelta.rs +++ b/ben/src/io/reader/twodelta.rs @@ -1,2 +1,7 @@ pub(crate) const XBEN_TWODELTA_FULL_TAG: u8 = 0; pub(crate) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; + +// Per-frame discriminator prepended to every frame of a plain-BEN `TwoDelta` stream (writer copy in +// `io::writer::twodelta`). Distinct from the XBEN columnar tags above; the two copies must agree. +pub(crate) const BEN_TWODELTA_SNAPSHOT_TAG: u8 = 0x00; +pub(crate) const BEN_TWODELTA_DELTA_TAG: u8 = 0x01; diff --git a/ben/src/io/writer/stream_writer/ben.rs b/ben/src/io/writer/stream_writer/ben.rs index 0789d72..2616e3b 100644 --- a/ben/src/io/writer/stream_writer/ben.rs +++ b/ben/src/io/writer/stream_writer/ben.rs @@ -7,7 +7,10 @@ use crate::codec::encode::encode_twodelta_frame_with_hint; use crate::codec::BenEncodeFrame; use crate::BenVariant; -use super::super::twodelta::twodelta_repeat_runs; +use super::super::twodelta::{ + classify_transition, pair_has_masks, twodelta_repeat_runs, TransitionKind, + BEN_TWODELTA_DELTA_TAG, BEN_TWODELTA_SNAPSHOT_TAG, +}; /// State for the BEN arm. Variant lives here as the single source of truth. pub(super) struct BenState { @@ -62,35 +65,56 @@ impl BenState { } BenVariant::TwoDelta => { if self.previous_assignment.is_empty() { - // First frame: encode as MkvChain wire format and seed the position masks for - // subsequent delta frames. - for (idx, &val) in assignment.iter().enumerate() { - self.previous_masks.entry(val).or_default().push(idx); - } - let frame = BenEncodeFrame::from_assignment( - assignment, - BenVariant::MkvChain, - Some(count), - ); - self.writer.write_all(frame.as_slice())?; - } else if self.previous_assignment.as_slice() == assignment { - let frame = twodelta_repeat_frame(assignment, count)?; - self.writer.write_all(frame.as_slice())?; + // First frame: a snapshot. Seeds the position masks for subsequent deltas. + self.write_twodelta_snapshot(assignment, count)?; } else { - let frame = encode_twodelta_frame_with_hint( - &self.previous_assignment, - assignment, - None, - Some(&mut self.previous_masks), - Some(count), - )?; - self.writer.write_all(frame.as_slice())?; + match classify_transition(&self.previous_assignment, assignment)? { + TransitionKind::Repeat => { + let frame = twodelta_repeat_frame(assignment, count)?; + self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; + self.writer.write_all(frame.as_slice())?; + } + // Clean 2-swap where both districts already exist: cheap delta against the + // maintained masks. + TransitionKind::Delta(a, b) + if pair_has_masks(&self.previous_masks, a, b) => + { + let frame = encode_twodelta_frame_with_hint( + &self.previous_assignment, + assignment, + Some((a, b)), + Some(&mut self.previous_masks), + Some(count), + )?; + self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; + self.writer.write_all(frame.as_slice())?; + } + // A >2-district transition, or a 2-id transition that introduces a district + // absent from the previous assignment (no mask to delta against): full + // snapshot, then rebuild masks so the next delta has a correct baseline. + TransitionKind::Delta(..) | TransitionKind::Snapshot => { + self.write_twodelta_snapshot(assignment, count)?; + } + } } } } Ok(()) } + /// Write a snapshot frame (`MkvChain` wire format under the snapshot tag) and (re)seed the + /// position masks from `assignment` so any following delta frame has a correct baseline. + fn write_twodelta_snapshot(&mut self, assignment: &[u16], count: u16) -> io::Result<()> { + self.previous_masks.clear(); + for (idx, &val) in assignment.iter().enumerate() { + self.previous_masks.entry(val).or_default().push(idx); + } + let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count)); + self.writer.write_all(&[BEN_TWODELTA_SNAPSHOT_TAG])?; + self.writer.write_all(frame.as_slice())?; + Ok(()) + } + pub(super) fn write_assignment(&mut self, assign_vec: Vec) -> io::Result<()> { if self.pending_assignment.as_deref() == Some(assign_vec.as_slice()) { if self.pending_count == u16::MAX { @@ -114,9 +138,9 @@ impl BenState { self.flush_pending_frame()?; self.encode_and_write_frame(&assignment, count)?; // For TwoDelta, the next delta is encoded against the just-emitted frame. - // `encode_and_write_frame` already updated `previous_masks` when the previous_assignment - // was empty; in all variants we need to update `previous_assignment` here so a subsequent - // `write_assignment` sees the right baseline. + // `encode_and_write_frame` already updated `previous_masks` (a snapshot reseeds them, a + // delta maintains them in place); in all variants we update `previous_assignment` here so a + // subsequent `write_assignment` sees the right baseline. self.previous_assignment = assignment; Ok(()) } diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index 4698fad..95bdd7f 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -16,7 +16,9 @@ use crate::BenVariant; use super::super::frames::BufferedDeltaFrame; use super::super::twodelta::{ - twodelta_repeat_runs, XBEN_TWODELTA_CHUNK_TAG, XBEN_TWODELTA_FULL_TAG, + classify_transition, pair_has_masks, twodelta_repeat_runs, TransitionKind, + BEN_TWODELTA_DELTA_TAG, BEN_TWODELTA_SNAPSHOT_TAG, XBEN_TWODELTA_CHUNK_TAG, + XBEN_TWODELTA_FULL_TAG, }; use super::super::utils::encode_xben_twodelta_full_frame; @@ -36,8 +38,11 @@ pub(super) enum XBenState { TwoDelta { previous_assignment: Vec, previous_masks: HashMap>, - pending_initial_full_assignment: Option>, - pending_initial_full_count: u16, + /// A full frame buffered awaiting its final repetition count. Used both for the initial + /// anchor and for mid-stream snapshots (>2-district transitions). A full frame writes its + /// count *after* the payload, so it cannot be emitted until a distinct assignment arrives. + pending_full_assignment: Option>, + pending_full_count: u16, twodelta_chunk_size: usize, chunk_buffer: Vec, }, @@ -54,8 +59,8 @@ impl XBenState { BenVariant::TwoDelta => XBenState::TwoDelta { previous_assignment: Vec::new(), previous_masks: HashMap::new(), - pending_initial_full_assignment: None, - pending_initial_full_count: 0, + pending_full_assignment: None, + pending_full_count: 0, twodelta_chunk_size, chunk_buffer: Vec::new(), }, @@ -114,24 +119,24 @@ impl XBenInner { XBenState::TwoDelta { previous_assignment, previous_masks, - pending_initial_full_assignment, - pending_initial_full_count, + pending_full_assignment, + pending_full_count, twodelta_chunk_size, chunk_buffer, } => { // First assignment ever: buffer as the initial full frame. - if pending_initial_full_assignment.is_none() && previous_assignment.is_empty() { - *pending_initial_full_assignment = Some(assign_vec); - *pending_initial_full_count = 1; + if pending_full_assignment.is_none() && previous_assignment.is_empty() { + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; return Ok(()); } - // Repeat of the pending initial full frame. - if pending_initial_full_assignment.as_deref() == Some(assign_vec.as_slice()) { - if *pending_initial_full_count == u16::MAX { - flush_twodelta_initial( + // Repeat of the pending full frame (initial anchor or a mid-stream snapshot). + if pending_full_assignment.as_deref() == Some(assign_vec.as_slice()) { + if *pending_full_count == u16::MAX { + flush_twodelta_full( &mut self.encoder, - pending_initial_full_assignment, - pending_initial_full_count, + pending_full_assignment, + pending_full_count, previous_assignment, previous_masks, )?; @@ -140,7 +145,7 @@ impl XBenInner { *previous_assignment = assign_vec; return Ok(()); } - *pending_initial_full_count += 1; + *pending_full_count += 1; return Ok(()); } // Repeat of the last delta frame in the current chunk. @@ -156,42 +161,65 @@ impl XBenInner { } return Ok(()); } - // New distinct assignment: flush the initial full frame if pending. - if pending_initial_full_assignment.is_some() { - flush_twodelta_initial( + // New distinct assignment: flush a pending full frame so it precedes the new body. + if pending_full_assignment.is_some() { + flush_twodelta_full( &mut self.encoder, - pending_initial_full_assignment, - pending_initial_full_count, + pending_full_assignment, + pending_full_count, previous_assignment, previous_masks, )?; } - // Encode the delta frame and add it to the chunk buffer. - let frame = encode_twodelta_frame_with_hint( - &*previous_assignment, - &assign_vec, - None, - Some(previous_masks), - None, - )?; - let (pair, run_lengths) = match frame { - BenEncodeFrame::TwoDelta { - pair, - run_length_vector, - .. - } => (pair, run_length_vector), - _ => unreachable!( - "encode_twodelta_frame_with_hint always returns the TwoDelta arm" - ), - }; - chunk_buffer.push(BufferedDeltaFrame { - pair, - run_lengths, - count: 1, - }); - *previous_assignment = assign_vec; - if chunk_buffer.len() >= *twodelta_chunk_size { - flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + // Classify the transition; fall back to a deferred snapshot when >2 ids change. + match classify_transition(previous_assignment, &assign_vec)? { + // `previous == assign_vec` only reaches here when the chunk was just flushed + // (so the repeat-of-last-delta fast path above was skipped). Encode it as a + // repeat delta against the previous frame. + TransitionKind::Repeat => { + let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; + chunk_buffer.push(repeat); + *previous_assignment = assign_vec; + } + // Clean 2-swap where both districts already exist: cheap delta. + TransitionKind::Delta(a, b) if pair_has_masks(previous_masks, a, b) => { + let frame = encode_twodelta_frame_with_hint( + &*previous_assignment, + &assign_vec, + Some((a, b)), + Some(previous_masks), + None, + )?; + let (pair, run_lengths) = match frame { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!( + "encode_twodelta_frame_with_hint always returns the TwoDelta arm" + ), + }; + chunk_buffer.push(BufferedDeltaFrame { + pair, + run_lengths, + count: 1, + }); + *previous_assignment = assign_vec; + if chunk_buffer.len() >= *twodelta_chunk_size { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + } + } + // A >2-district transition, or a 2-id transition introducing a district absent + // from the previous assignment: defer it as a pending full frame. Flush the + // current chunk first so its deltas precede the snapshot in the stream. The + // full frame's count is written after its payload, so it cannot be emitted + // until a following distinct assignment (or `flush`) settles the count. + TransitionKind::Delta(..) | TransitionKind::Snapshot => { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } } } } @@ -209,15 +237,17 @@ impl XBenInner { XBenState::TwoDelta { previous_assignment, previous_masks, - pending_initial_full_assignment, - pending_initial_full_count, + pending_full_assignment, + pending_full_count, chunk_buffer, .. } => { - flush_twodelta_initial( + // At most one of these is non-empty: buffering a pending full frame always flushes + // the chunk first, and pushing a delta clears any pending full frame. + flush_twodelta_full( &mut self.encoder, - pending_initial_full_assignment, - pending_initial_full_count, + pending_full_assignment, + pending_full_count, previous_assignment, previous_masks, )?; @@ -241,68 +271,89 @@ impl XBenInner { _ => unreachable!(), }; - // First frame: standard BEN RLE → XBEN full frame. - let max_val_bits = reader.read_u8()?; - let max_len_bits = reader.read_u8()?; - let n_bytes = reader.read_u32::()?; - let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - let first_count = reader.read_u16::()?; - - let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); - encoded.push(XBEN_TWODELTA_FULL_TAG); - encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); - for &(value, len) in &runs { - encoded.extend_from_slice(&value.to_be_bytes()); - encoded.extend_from_slice(&len.to_be_bytes()); - } - self.encoder.write_all(&encoded)?; - self.encoder.write_all(&first_count.to_be_bytes())?; - - let mut sample_count = first_count as usize; + let mut sample_count = 0usize; let spinner = Spinner::new("Encoding line"); - spinner.set_count(sample_count as u64); - // Delta frames: unpack bitpacked run lengths and buffer into chunks. + // Each BEN frame is prefixed with a per-frame tag. This path keeps no masks and + // materializes no assignments, so there is nothing to reset across a snapshot — it simply + // mirrors the BEN framing onto the XBEN columnar layout. loop { - let pair_a = match reader.read_u16::() { - Ok(v) => v, + let tag = match reader.read_u8() { + Ok(t) => t, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, Err(e) => return Err(e), }; - let pair_b = reader.read_u16::()?; - let delta_max_len_bits = reader.read_u8()?; - let delta_n_bytes = reader.read_u32::()?; - - let mut payload = vec![0u8; delta_n_bytes as usize]; - reader.read_exact(&mut payload)?; - let count = reader.read_u16::()?; - - let (pair, run_lengths) = match BenEncodeFrame::from_parts( - (pair_a, pair_b), - delta_max_len_bits, - payload, - count, - ) { - BenEncodeFrame::TwoDelta { - pair, - run_length_vector, - .. - } => (pair, run_length_vector), - _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), - }; - chunk_buffer.push(BufferedDeltaFrame { - pair, - run_lengths, - count, - }); + match tag { + // Snapshot: a MkvChain-formatted body → XBEN full frame. Flush the current chunk + // first so its deltas precede the full frame in the stream. + BEN_TWODELTA_SNAPSHOT_TAG => { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; - if chunk_buffer.len() >= chunk_size { - flush_chunk_inner(&mut self.encoder, chunk_buffer)?; - } + let max_val_bits = reader.read_u8()?; + let max_len_bits = reader.read_u8()?; + let n_bytes = reader.read_u32::()?; + let runs = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; + let count = reader.read_u16::()?; + + let mut encoded = Vec::with_capacity(1 + 4 + runs.len() * 4); + encoded.push(XBEN_TWODELTA_FULL_TAG); + encoded.extend_from_slice(&(runs.len() as u32).to_be_bytes()); + for &(value, len) in &runs { + encoded.extend_from_slice(&value.to_be_bytes()); + encoded.extend_from_slice(&len.to_be_bytes()); + } + self.encoder.write_all(&encoded)?; + self.encoder.write_all(&count.to_be_bytes())?; + + sample_count += count as usize; + spinner.set_count(sample_count as u64); + } + // Delta: unpack the bit-packed run lengths and buffer into the current chunk. + BEN_TWODELTA_DELTA_TAG => { + let pair_a = reader.read_u16::()?; + let pair_b = reader.read_u16::()?; + let delta_max_len_bits = reader.read_u8()?; + let delta_n_bytes = reader.read_u32::()?; + + let mut payload = vec![0u8; delta_n_bytes as usize]; + reader.read_exact(&mut payload)?; + let count = reader.read_u16::()?; + + let (pair, run_lengths) = match BenEncodeFrame::from_parts( + (pair_a, pair_b), + delta_max_len_bits, + payload, + count, + ) { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), + }; + + chunk_buffer.push(BufferedDeltaFrame { + pair, + run_lengths, + count, + }); - sample_count += count as usize; - spinner.set_count(sample_count as u64); + if chunk_buffer.len() >= chunk_size { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + } + + sample_count += count as usize; + spinner.set_count(sample_count as u64); + } + other => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown TwoDelta frame tag byte {other:#04x}"), + )) + } + } } flush_chunk_inner(&mut self.encoder, chunk_buffer)?; @@ -357,20 +408,25 @@ fn flush_mkv_pending( Ok(()) } -fn flush_twodelta_initial( +/// Emit a buffered full frame (payload then trailing count) and rebase the delta state onto it. +/// +/// Used for both the initial anchor and mid-stream snapshots, so `previous_masks` is cleared before +/// reseeding rather than only pushed onto — the map is non-empty after the first frame. +fn flush_twodelta_full( encoder: &mut XzEncoder, - pending_initial_full_assignment: &mut Option>, - pending_initial_full_count: &mut u16, + pending_full_assignment: &mut Option>, + pending_full_count: &mut u16, previous_assignment: &mut Vec, previous_masks: &mut HashMap>, ) -> io::Result<()> { - let pending = match pending_initial_full_assignment.take() { + let pending = match pending_full_assignment.take() { Some(p) => p, None => return Ok(()), }; - let count = *pending_initial_full_count; - *pending_initial_full_count = 0; + let count = *pending_full_count; + *pending_full_count = 0; + previous_masks.clear(); for (idx, &val) in pending.iter().enumerate() { previous_masks.entry(val).or_default().push(idx); } diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 02672ad..f72744e 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -618,19 +618,97 @@ fn writer_translate_ben_twodelta_chunk_flush() { assert_eq!(results, assignments); } -// ── TwoDelta encoding error propagation ───────────────────────────── +// ── TwoDelta >2-district fallback ─────────────────────────────────── #[test] -fn xz_writer_twodelta_too_many_ids_propagates_on_write() { - // Writing a third assignment that changes 3 distinct IDs errors at the TwoDelta encode - // boundary. +fn xz_writer_twodelta_too_many_ids_falls_back_to_snapshot() { + // A transition that changes 3 distinct ids is no longer an error: it emits a full snapshot + // frame and still round-trips. let anchor = vec![1u16, 1, 2, 2]; - let invalid = vec![2u16, 3, 1, 3]; // 3 distinct changing ids + let multi = vec![2u16, 3, 1, 3]; // 3 distinct changing ids let mut xben = Vec::new(); - let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); - writer.write_assignment(anchor).unwrap(); - let err = writer.write_assignment(invalid).unwrap_err(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + { + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); + writer.write_assignment(anchor.clone()).unwrap(); + writer.write_assignment(multi.clone()).unwrap(); + writer.finish().unwrap(); + } + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); + let results: Vec<_> = reader.map(|r| r.unwrap().0).collect(); + assert_eq!(results, vec![anchor, multi]); +} + +#[test] +fn xz_writer_twodelta_mixed_snapshot_delta_direct_roundtrip() { + // The direct XBEN writer (not the BEN→XBEN translate path) must emit a mid-stream full frame + // for a >2-district transition and rebase later deltas onto it. + let assignments = vec![ + vec![1u16, 1, 2, 2], // anchor (full) + vec![1u16, 2, 1, 2], // delta + vec![3u16, 3, 1, 2], // 3 ids → mid-stream full + vec![3u16, 3, 2, 1], // delta from the snapshot + ]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); +} + +#[test] +fn xz_writer_twodelta_new_district_falls_back_to_snapshot_direct() { + // A 2-id transition that introduces a district absent from the previous assignment has no mask + // to delta against, so the direct XBEN writer must emit a snapshot; a later 2-swap among now + // present ids deltas normally. + let assignments = vec![ + vec![1u16, 1, 1, 1], // anchor + vec![1u16, 1, 2, 2], // introduces district 2 → snapshot + vec![1u16, 2, 1, 2], // delta (both ids present) + ]; + assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); +} + +#[test] +fn xz_writer_twodelta_delta_snapshot_repeat_delta_direct() { + // delta → snapshot → repeat → delta: the repeat increments the deferred full frame's count, + // and the following delta rebases onto the flushed snapshot. + let s = vec![3u16, 3, 1, 2]; + let assignments = vec![ + vec![1u16, 1, 2, 2], // anchor + vec![1u16, 2, 1, 2], // delta + s.clone(), // snapshot + s.clone(), // repeat of snapshot + vec![3u16, 3, 2, 1], // delta from the snapshot + ]; + assert_eq!( + roundtrip_xben_counts(&assignments, BenVariant::TwoDelta), + vec![ + (vec![1u16, 1, 2, 2], 1), + (vec![1u16, 2, 1, 2], 1), + (s.clone(), 2), + (vec![3u16, 3, 2, 1], 1), + ] + ); +} + +#[test] +fn xz_writer_twodelta_pending_full_count_overflow_u16max() { + // A snapshot repeated past u16::MAX flushes the full frame (count == u16::MAX) and emits the + // overflow repeat as a delta in the following chunk, so the total still round-trips. + let anchor = vec![1u16, 1, 2, 2]; + let delta = vec![1u16, 2, 1, 2]; // 2-swap from anchor + let snapshot = vec![3u16, 3, 3, 3]; // 3 ids change → snapshot + let repeats = u16::MAX as usize + 1; // one past the count ceiling + + let mut xben = Vec::new(); + { + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); + writer.write_assignment(anchor.clone()).unwrap(); + writer.write_assignment(delta.clone()).unwrap(); + for _ in 0..repeats { + writer.write_assignment(snapshot.clone()).unwrap(); + } + writer.finish().unwrap(); + } + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); + let total: usize = reader.map(|r| r.unwrap().1 as usize).sum(); + assert_eq!(total, 2 + repeats); } // ── MkvChain u16::MAX overflow ─────────────────────────────────────── diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 3437990..5173910 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -1,8 +1,16 @@ +use std::collections::HashMap; use std::io; pub(crate) const XBEN_TWODELTA_FULL_TAG: u8 = 0; pub(crate) const XBEN_TWODELTA_CHUNK_TAG: u8 = 2; +// Per-frame discriminator prepended to every frame of a plain-BEN `TwoDelta` stream. This is a +// distinct namespace from the XBEN columnar tags above: a BEN stream interleaves self-describing +// snapshot and delta frames, so the wire format is `[tag u8][body]`. The reader copy of these +// constants lives in `io::reader::twodelta`; the two must stay in agreement. +pub(crate) const BEN_TWODELTA_SNAPSHOT_TAG: u8 = 0x00; +pub(crate) const BEN_TWODELTA_DELTA_TAG: u8 = 0x01; + /// Default number of delta frames per columnar chunk in XBEN TwoDelta. pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; @@ -49,3 +57,83 @@ pub(crate) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16) Ok(((first, second), run_lengths)) } + +/// How an inter-sample transition should be framed in a `TwoDelta` stream. +pub(crate) enum TransitionKind { + /// No position changed value; the sample repeats the previous one. + Repeat, + /// Every changed position swaps between exactly these two district ids; encode as a delta. + Delta(u16, u16), + /// More than two distinct district ids change; fall back to a full snapshot frame. + Snapshot, +} + +/// Classify a transition between two equal-length assignment vectors in a single O(n) scan. +/// +/// Walks `previous` against `current`, collecting the set of district ids seen at changed positions +/// and short-circuiting to [`TransitionKind::Snapshot`] as soon as a third distinct id appears. A +/// union of exactly two ids is necessarily a clean 2-swap: every changed position has both its old +/// and new value within the pair, so positions outside the pair cannot have moved (that would +/// introduce a third id). No change at all is a [`TransitionKind::Repeat`]. +/// +/// A full scan is required for correctness: the mask-hint fast path only walks the inferred pair's +/// positions and would silently miss a third district changing at an out-of-pair position. +/// +/// `zip` would silently truncate to the shorter vector, so the length is checked explicitly, +/// preserving the validation the strict single-frame encoder performs. +pub(crate) fn classify_transition( + previous: &[u16], + current: &[u16], +) -> io::Result { + if previous.len() != current.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "TwoDelta transition length mismatch: previous has {} positions, current has {}", + previous.len(), + current.len() + ), + )); + } + + let mut first: Option = None; + let mut second: Option = None; + let mut changed = false; + + for (&prev_val, &curr_val) in previous.iter().zip(current.iter()) { + if prev_val == curr_val { + continue; + } + changed = true; + for val in [prev_val, curr_val] { + match (first, second) { + (None, _) => first = Some(val), + (Some(f), _) if f == val => {} + (Some(_), None) => second = Some(val), + (Some(_), Some(s)) if s == val => {} + (Some(_), Some(_)) => return Ok(TransitionKind::Snapshot), + } + } + } + + if !changed { + return Ok(TransitionKind::Repeat); + } + + match (first, second) { + // A changed position contributes two distinct ids, so once `changed` is set both slots + // are filled. + (Some(a), Some(b)) => Ok(TransitionKind::Delta(a, b)), + _ => unreachable!("a differing position yields two distinct ids"), + } +} + +/// Whether both ids of a classified delta pair have a usable position mask, i.e. both districts +/// already appear in the previous assignment. +/// +/// The mask-hint encoder requires this. A 2-id transition can still introduce a district that was +/// absent from the previous assignment (e.g. an empty district that just gained nodes); it has no +/// mask to delta against and must be encoded via a snapshot instead. +pub(crate) fn pair_has_masks(masks: &HashMap>, a: u16, b: u16) -> bool { + masks.get(&a).is_some_and(|m| !m.is_empty()) && masks.get(&b).is_some_and(|m| !m.is_empty()) +} diff --git a/ben/src/lib.rs b/ben/src/lib.rs index 2cca17d..e3e9101 100755 --- a/ben/src/lib.rs +++ b/ben/src/lib.rs @@ -50,6 +50,11 @@ pub enum BenVariant { /// Store one frame plus a repetition count for repeated consecutive samples. MkvChain, /// Store delta-encoded frames for improved compression of correlated samples. + /// + /// Works on any ensemble: a transition that swaps exactly two districts is delta-compressed, + /// and any other transition (a multi-district move, random/independent sampling, or a district + /// that was previously empty) emits a full snapshot frame. Each frame carries a 1-byte tag + /// selecting snapshot vs delta, so delta and snapshot frames interleave freely. TwoDelta, } diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 7eb1715..075e164 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -1026,11 +1026,14 @@ fn test_relabel_ben_file_twodelta_malformed_frame_error_propagates() { writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } + // banner(17) + snapshot_tag(1) precede the anchor frame; a delta_tag(1) precedes the delta + // frame, so the delta's max_len_bits sits at anchor_end + 5. let banner_len = 17usize; + let anchor_start = banner_len + 1; let n_bytes = - u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; - let anchor_end = banner_len + 6 + n_bytes + 2; - ben[anchor_end + 4] = 0; + u32::from_be_bytes(ben[anchor_start + 2..anchor_start + 6].try_into().unwrap()) as usize; + let anchor_end = anchor_start + 6 + n_bytes + 2; + ben[anchor_end + 5] = 0; let mut output = Vec::new(); let err = @@ -1047,11 +1050,14 @@ fn test_relabel_ben_file_with_map_twodelta_malformed_frame_error_propagates() { writer.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); writer.write_assignment(vec![2u16, 1, 2, 1]).unwrap(); } + // banner(17) + snapshot_tag(1) precede the anchor frame; a delta_tag(1) precedes the delta + // frame, so the delta's max_len_bits sits at anchor_end + 5. let banner_len = 17usize; + let anchor_start = banner_len + 1; let n_bytes = - u32::from_be_bytes(ben[banner_len + 2..banner_len + 6].try_into().unwrap()) as usize; - let anchor_end = banner_len + 6 + n_bytes + 2; - ben[anchor_end + 4] = 0; + u32::from_be_bytes(ben[anchor_start + 2..anchor_start + 6].try_into().unwrap()) as usize; + let anchor_end = anchor_start + 6 + n_bytes + 2; + ben[anchor_end + 5] = 0; let map: HashMap = (0..4).map(|i| (i, i)).collect(); let mut output = Vec::new(); diff --git a/ben/tests/fixtures/v1.0.0/source_twodelta.jsonl b/ben/tests/fixtures/v1.0.0/source_twodelta.jsonl new file mode 100644 index 0000000..8c5d198 --- /dev/null +++ b/ben/tests/fixtures/v1.0.0/source_twodelta.jsonl @@ -0,0 +1,5 @@ +{"assignment":[1,1,2,2],"sample":1} +{"assignment":[1,2,1,2],"sample":2} +{"assignment":[1,2,1,2],"sample":3} +{"assignment":[3,3,1,2],"sample":4} +{"assignment":[3,3,2,1],"sample":5} diff --git a/ben/tests/fixtures/v1.0.0/twodelta.ben b/ben/tests/fixtures/v1.0.0/twodelta.ben index 9f50b1fc2cbd65143ae6c1c6e8b33f647c0e3684..516c3e52556dbcaf0ac1bb7ea2f17e87049e1904 100644 GIT binary patch literal 64 zcmWFu_jhsi32{_#a`jVi^Yn3LU}9omU|`H*U}R)qWMEm!4Hmz^zyyShP&Oj~G_wb< diff --git a/ben/tests/fixtures/v1.0.0/twodelta.xben b/ben/tests/fixtures/v1.0.0/twodelta.xben index 5c1fc7d914c83c5b0cee76129c7077c729c97052..967582ca5820ce623e6a7cd568a2e510c0d6f356 100644 GIT binary patch delta 90 zcmXRYQT=OP6%ob2!0=z_rX$M%yI4g=F(3~Dt`~3r^?)IU!EB#I(iE(0rM42s>d{Ni4=qIp$}V1dXe0BTwu@Bjb+ delta 86 zcmXRZQ2lFO6%ob2!0=z_rX$M%>tIDjF(3~DxCB47JYWc7FrKKYs8zl(EnFpCyz1N~ griPzVZf9P3T?VOQ^bcPDjCCnn(Yz`~u&&4`08)P$ivR!s diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs index 887f987..083dbc9 100644 --- a/ben/tests/test_format_stability.rs +++ b/ben/tests/test_format_stability.rs @@ -43,6 +43,18 @@ const CANONICAL_JSONL: &str = "\ {\"assignment\":[2,2,2,1],\"sample\":5} "; +/// Canonical JSONL used to mint the `TwoDelta` fixtures only. `TwoDelta` is unreleased, so it gets +/// its own source that deliberately exercises mixed snapshot/delta framing: an anchor snapshot, a +/// 2-swap delta, a repeat (count), a **>2-district transition** that forces a mid-stream snapshot, +/// and a 2-swap delta rebased onto that snapshot. +const TWODELTA_CANONICAL_JSONL: &str = "\ +{\"assignment\":[1,1,2,2],\"sample\":1} +{\"assignment\":[1,2,1,2],\"sample\":2} +{\"assignment\":[1,2,1,2],\"sample\":3} +{\"assignment\":[3,3,1,2],\"sample\":4} +{\"assignment\":[3,3,2,1],\"sample\":5} +"; + /// Graph JSON committed as the `graph.json` asset inside the BENDL fixtures. Tiny but /// representative of a real adjacency-style graph. const CANONICAL_GRAPH_JSON: &str = "{\"nodes\":4,\"edges\":[[0,1],[1,2],[2,3],[3,0]]}"; @@ -68,58 +80,60 @@ fn read_fixture(name: &str) -> Vec { }) } -/// Decode a committed BEN fixture and assert the round-trip matches `CANONICAL_JSONL`. -fn assert_ben_fixture_round_trips(name: &str) { +/// Decode a committed BEN fixture and assert the round-trip matches `expected`. The `expected` +/// source is a parameter (not a hardcoded constant) so the released Standard/MkvChain fixtures and +/// the separately-sourced TwoDelta fixture can share this helper without entangling their inputs. +fn assert_ben_fixture_round_trips(name: &str, expected: &str) { let bytes = read_fixture(name); let mut out = Vec::new(); decode_ben_to_jsonl(&bytes[..], &mut out).expect("ben decode"); assert_eq!( String::from_utf8(out).expect("decoder output is utf-8"), - CANONICAL_JSONL, + expected, "fixture {name} did not round-trip" ); } -/// Decode a committed XBEN fixture and assert the round-trip matches `CANONICAL_JSONL`. -fn assert_xben_fixture_round_trips(name: &str) { +/// Decode a committed XBEN fixture and assert the round-trip matches `expected`. +fn assert_xben_fixture_round_trips(name: &str, expected: &str) { let bytes = read_fixture(name); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(&bytes[..]), &mut out).expect("xben decode"); assert_eq!( String::from_utf8(out).expect("decoder output is utf-8"), - CANONICAL_JSONL, + expected, "fixture {name} did not round-trip" ); } #[test] fn standard_ben_v1_0_0_round_trips() { - assert_ben_fixture_round_trips("standard.ben"); + assert_ben_fixture_round_trips("standard.ben", CANONICAL_JSONL); } #[test] fn mkvchain_ben_v1_0_0_round_trips() { - assert_ben_fixture_round_trips("mkvchain.ben"); + assert_ben_fixture_round_trips("mkvchain.ben", CANONICAL_JSONL); } #[test] fn twodelta_ben_v1_0_0_round_trips() { - assert_ben_fixture_round_trips("twodelta.ben"); + assert_ben_fixture_round_trips("twodelta.ben", TWODELTA_CANONICAL_JSONL); } #[test] fn standard_xben_v1_0_0_round_trips() { - assert_xben_fixture_round_trips("standard.xben"); + assert_xben_fixture_round_trips("standard.xben", CANONICAL_JSONL); } #[test] fn mkvchain_xben_v1_0_0_round_trips() { - assert_xben_fixture_round_trips("mkvchain.xben"); + assert_xben_fixture_round_trips("mkvchain.xben", CANONICAL_JSONL); } #[test] fn twodelta_xben_v1_0_0_round_trips() { - assert_xben_fixture_round_trips("twodelta.xben"); + assert_xben_fixture_round_trips("twodelta.xben", TWODELTA_CANONICAL_JSONL); } #[test] @@ -254,20 +268,19 @@ fn write_fixture(name: &str, bytes: &[u8]) { std::fs::write(&path, bytes).unwrap_or_else(|e| panic!("write {path:?}: {e}")); } -fn mint_ben(variant: BenVariant) -> Vec { +fn mint_ben(variant: BenVariant, jsonl: &str) -> Vec { let mut out = Vec::new(); - encode_jsonl_to_ben(Cursor::new(CANONICAL_JSONL.as_bytes()), &mut out, variant) - .expect("encode ben"); + encode_jsonl_to_ben(Cursor::new(jsonl.as_bytes()), &mut out, variant).expect("encode ben"); out } -fn mint_xben(variant: BenVariant) -> Vec { +fn mint_xben(variant: BenVariant, jsonl: &str) -> Vec { let mut out = Vec::new(); // Force single-threaded encoding with a fixed compression level so the bytes are deterministic. // Defaults vary across machines (n_threads = available parallelism), which would make // re-generation non-reproducible across hosts. encode_jsonl_to_xben( - Cursor::new(CANONICAL_JSONL.as_bytes()), + Cursor::new(jsonl.as_bytes()), &mut out, variant, Some(1), @@ -382,12 +395,24 @@ fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { #[test] #[ignore = "regenerates committed v1.0.0 fixtures; never run as part of normal CI"] fn generate_format_stability_fixtures() { - write_fixture("standard.ben", &mint_ben(BenVariant::Standard)); - write_fixture("mkvchain.ben", &mint_ben(BenVariant::MkvChain)); - write_fixture("twodelta.ben", &mint_ben(BenVariant::TwoDelta)); - write_fixture("standard.xben", &mint_xben(BenVariant::Standard)); - write_fixture("mkvchain.xben", &mint_xben(BenVariant::MkvChain)); - write_fixture("twodelta.xben", &mint_xben(BenVariant::TwoDelta)); + write_fixture("standard.ben", &mint_ben(BenVariant::Standard, CANONICAL_JSONL)); + write_fixture("mkvchain.ben", &mint_ben(BenVariant::MkvChain, CANONICAL_JSONL)); + write_fixture( + "twodelta.ben", + &mint_ben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), + ); + write_fixture( + "standard.xben", + &mint_xben(BenVariant::Standard, CANONICAL_JSONL), + ); + write_fixture( + "mkvchain.xben", + &mint_xben(BenVariant::MkvChain, CANONICAL_JSONL), + ); + write_fixture( + "twodelta.xben", + &mint_xben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), + ); let flags_set = mint_flags_set_bendl(); write_fixture("flags_set.bendl", &flags_set); @@ -398,9 +423,30 @@ fn generate_format_stability_fixtures() { // Also commit the canonical sources alongside so a human can read what the fixtures represent // without invoking the codec. write_fixture("source.jsonl", CANONICAL_JSONL.as_bytes()); + write_fixture("source_twodelta.jsonl", TWODELTA_CANONICAL_JSONL.as_bytes()); write_fixture("source_graph.json", CANONICAL_GRAPH_JSON.as_bytes()); write_fixture("source_metadata.json", CANONICAL_METADATA_JSON.as_bytes()); // Print a checklist so the engineer regenerating fixtures sees what landed. eprintln!("Wrote v1.0.0 fixtures to {:?}", fixtures_dir()); } + +#[test] +#[ignore = "regenerates only the (unreleased) TwoDelta fixtures; never run as part of normal CI"] +fn regenerate_twodelta_fixtures() { + // `TwoDelta` is unreleased, so its wire format may change and its fixtures may be re-minted in + // place (the "committed before any release shipped" escape hatch in `docs/format-stability.md`). + // This regenerator touches *only* the TwoDelta fixtures and their source, leaving every released + // Standard/MkvChain/BENDL fixture byte-for-byte untouched. + write_fixture( + "twodelta.ben", + &mint_ben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), + ); + write_fixture( + "twodelta.xben", + &mint_xben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), + ); + write_fixture("source_twodelta.jsonl", TWODELTA_CANONICAL_JSONL.as_bytes()); + + eprintln!("Re-minted TwoDelta fixtures in {:?}", fixtures_dir()); +} diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index faa4dbe..5f2eda1 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -1470,13 +1470,17 @@ fn twodelta_first_frame_carries_repeat_trailer() { encoder.finish().unwrap(); } + // The first frame is a snapshot: a 1-byte snapshot tag (0x00) precedes the MkvChain-formatted + // body, which is the Standard frame bytes plus a trailing u16 repetition count. let expected_first = BenEncodeFrame::from_assignment(&first, BenVariant::Standard, None); assert_eq!(&ben[..17], b"TWODELTA BEN FILE"); + assert_eq!(ben[17], 0x00, "first frame should carry the snapshot tag"); + let body_start = 18; assert_eq!( - &ben[17..17 + expected_first.as_slice().len()], + &ben[body_start..body_start + expected_first.as_slice().len()], expected_first.as_slice() ); - let count_offset = 17 + expected_first.as_slice().len(); + let count_offset = body_start + expected_first.as_slice().len(); assert_eq!( u16::from_be_bytes([ben[count_offset], ben[count_offset + 1]]), 2 @@ -1484,27 +1488,42 @@ fn twodelta_first_frame_carries_repeat_trailer() { } #[test] -fn twodelta_rejects_non_pair_transition() { +fn twodelta_non_pair_transition_falls_back_to_snapshot() { + // A >2-district transition is no longer an error: it emits a snapshot frame and round-trips. + let a0 = vec![1u16, 1, 2, 2]; + let a1 = vec![1u16, 3, 2, 4]; // 1->3 and 2->4: 4 distinct ids across changed positions let mut ben = Vec::new(); - let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); - encoder.write_assignment(vec![1u16, 1, 2, 2]).unwrap(); - encoder.write_assignment(vec![1u16, 3, 2, 4]).unwrap(); - let err = encoder.finish().err().unwrap(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + { + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + encoder.write_assignment(a0.clone()).unwrap(); + encoder.write_assignment(a1.clone()).unwrap(); + encoder.finish().unwrap(); + } + let decoded: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(decoded, vec![a0, a1]); } #[test] -fn twodelta_write_json_value_rejects_non_pair_transition() { +fn twodelta_write_json_value_non_pair_transition_falls_back_to_snapshot() { let mut ben = Vec::new(); - let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); - encoder - .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) - .unwrap(); - encoder - .write_json_value(json!({"assignment": [1u16, 3, 2, 4]})) - .unwrap(); - let err = encoder.finish().err().unwrap(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + { + let mut encoder = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + encoder + .write_json_value(json!({"assignment": [1u16, 1, 2, 2]})) + .unwrap(); + encoder + .write_json_value(json!({"assignment": [1u16, 3, 2, 4]})) + .unwrap(); + encoder.finish().unwrap(); + } + let decoded: Vec<_> = BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .map(|r| r.unwrap().0) + .collect(); + assert_eq!(decoded, vec![vec![1u16, 1, 2, 2], vec![1u16, 3, 2, 4]]); } #[test] diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 0d70093..4446c9e 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -310,7 +310,9 @@ fn malformed_ben_bit_widths_return_invalid_data() { fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { let anchor = BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)); let mut ben = TWODELTA_BEN_BANNER.to_vec(); + ben.push(0x00); // snapshot tag for the anchor (MkvChain-formatted body) ben.extend_from_slice(anchor.as_slice()); + ben.push(0x01); // delta tag; the body below has max_len_bits == 0, which is invalid ben.extend_from_slice(&[0, 1, 0, 2, 0, 0, 0, 0, 0, 1]); let mut reader = BenStreamReader::from_ben(ben.as_slice()) From b13b0ed6d1506a967d9f5af997bcdde7d630c96b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 2 Jun 2026 01:33:44 -0600 Subject: [PATCH 133/221] fix clippy --- ben-py/src/decode/decoder.rs | 2 +- ben-py/src/encode/encoder.rs | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 5e0cef7..273d468 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -249,7 +249,7 @@ impl PyBenDecoder { return Err(PyException::new_err("indices must not be empty")); } let base_len = ensure_base_len(&mut slf, py)?; - if indices[0] <= 0 { + if indices[0] == 0 { return Err(PyException::new_err("indices must be 1-based")); } if indices.last().unwrap() > &base_len { diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 70689ab..037cc6c 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -21,9 +21,11 @@ enum EncoderState { BenOnly(BenStreamWriter>), /// `.bendl` bundle path: the session owns the buffered file and the `BenStreamWriter` writes /// through it. `sample_count` is tracked alongside so it can be plumbed into - /// `finish_into_writer` at `close()` time. + /// `finish_into_writer` at `close()` time. The writer is boxed because the bundle-streaming + /// `BenStreamWriter` is much larger than the plain-BEN one, which would otherwise bloat every + /// `EncoderState` to the larger variant's size. BundleStreaming { - writer: BenStreamWriter>>, + writer: Box>>>, sample_count: i64, }, } @@ -115,7 +117,7 @@ impl PyBenEncoder { let session = writer.into_stream_session().map_err(Self::map_bundle_err)?; let writer = BenStreamWriter::for_ben(session, ben_var).map_err(Self::map_io_err)?; EncoderState::BundleStreaming { - writer, + writer: Box::new(writer), sample_count: 0, } }; From 6475871af158ed5e1b5218b5f344e36adcbf9c96 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 3 Jun 2026 12:02:37 -0600 Subject: [PATCH 134/221] reorg python side and add bundle bindings --- ben-py/binary_ensemble/__init__.py | 47 +- ben-py/binary_ensemble/_core.pyi | 520 ++----- ben-py/binary_ensemble/bundle.py | 178 +++ ben-py/binary_ensemble/bundle.pyi | 31 + ben-py/binary_ensemble/codec.py | 27 + ben-py/binary_ensemble/codec.pyi | 15 + ben-py/binary_ensemble/graph.py | 55 + ben-py/binary_ensemble/graph.pyi | 15 + ben-py/binary_ensemble/stream.py | 15 + ben-py/binary_ensemble/stream.pyi | 4 + ben-py/docs/user/using_ben_py.ipynb | 253 +-- ben-py/pyproject.toml | 3 + ben-py/src/common.rs | 93 +- ben-py/src/decode/bundle_decoder.rs | 331 ++++ ben-py/src/decode/cursor.rs | 235 +++ ben-py/src/decode/decoder.rs | 568 +------ ben-py/src/decode/helpers.rs | 126 +- ben-py/src/decode/mod.rs | 3 + ben-py/src/decode/types.rs | 47 +- ben-py/src/encode/bundle_encoder.rs | 452 ++++++ ben-py/src/encode/encoder.rs | 163 +- ben-py/src/encode/helpers.rs | 59 - ben-py/src/encode/mod.rs | 3 +- ben-py/src/encode/py_funcs.rs | 2 +- ben-py/src/graph/helpers.rs | 60 + ben-py/src/graph/mod.rs | 6 + ben-py/src/graph/py_funcs.rs | 37 + ben-py/src/lib.rs | 7 + ben-py/src/recompress.rs | 137 ++ ben-py/tests/test_bundle.py | 2057 ++++--------------------- ben-py/tests/test_bundle_api.py | 317 ++++ ben-py/tests/test_graph.py | 83 + ben-py/tests/test_python_pipelines.py | 1373 ++--------------- ben-py/tests/test_recompress.py | 113 ++ ben-py/tests/test_surface.py | 266 ++++ ben-py/uv.lock | 4 + 36 files changed, 3347 insertions(+), 4358 deletions(-) create mode 100644 ben-py/binary_ensemble/bundle.py create mode 100644 ben-py/binary_ensemble/bundle.pyi create mode 100644 ben-py/binary_ensemble/codec.py create mode 100644 ben-py/binary_ensemble/codec.pyi create mode 100644 ben-py/binary_ensemble/graph.py create mode 100644 ben-py/binary_ensemble/graph.pyi create mode 100644 ben-py/binary_ensemble/stream.py create mode 100644 ben-py/binary_ensemble/stream.pyi create mode 100644 ben-py/src/decode/bundle_decoder.rs create mode 100644 ben-py/src/decode/cursor.rs create mode 100644 ben-py/src/encode/bundle_encoder.rs delete mode 100644 ben-py/src/encode/helpers.rs create mode 100644 ben-py/src/graph/helpers.rs create mode 100644 ben-py/src/graph/mod.rs create mode 100644 ben-py/src/graph/py_funcs.rs create mode 100644 ben-py/src/recompress.rs create mode 100644 ben-py/tests/test_bundle_api.py create mode 100644 ben-py/tests/test_graph.py create mode 100644 ben-py/tests/test_recompress.py create mode 100644 ben-py/tests/test_surface.py diff --git a/ben-py/binary_ensemble/__init__.py b/ben-py/binary_ensemble/__init__.py index 68227f4..2b837b8 100644 --- a/ben-py/binary_ensemble/__init__.py +++ b/ben-py/binary_ensemble/__init__.py @@ -1,20 +1,49 @@ -from ._core import ( - BenDecoder, - BenEncoder, - encode_jsonl_to_ben, - encode_ben_to_xben, - encode_jsonl_to_xben, +"""Binary Ensemble (BEN/XBEN) Python API. + +The public surface mirrors the CLI's ``ben`` vs ``bendl`` split: + +- :mod:`binary_ensemble.bundle` — the recommended single-file ``.bendl`` format: + :class:`~binary_ensemble.bundle.BendlEncoder`, + :class:`~binary_ensemble.bundle.BendlDecoder`, and + :func:`~binary_ensemble.bundle.compress_stream`. +- :mod:`binary_ensemble.stream` — plain BEN/XBEN streams: + :class:`~binary_ensemble.stream.BenEncoder`, + :class:`~binary_ensemble.stream.BenDecoder`. +- :mod:`binary_ensemble.codec` — whole-file JSONL ↔ BEN ↔ XBEN transforms. +- :mod:`binary_ensemble.graph` — graph reordering utilities. + +All public symbols are re-exported here for convenience. +""" + +from binary_ensemble import bundle, codec, graph, stream +from binary_ensemble.bundle import BendlDecoder, BendlEncoder, compress_stream +from binary_ensemble.codec import ( decode_ben_to_jsonl, - decode_xben_to_jsonl, decode_xben_to_ben, + decode_xben_to_jsonl, + encode_ben_to_xben, + encode_jsonl_to_ben, + encode_jsonl_to_xben, ) +from binary_ensemble.stream import BenDecoder, BenEncoder __all__ = [ - "BenDecoder", + # Submodules + "stream", + "bundle", + "codec", + "graph", + # Bundle (recommended) + "BendlEncoder", + "BendlDecoder", + "compress_stream", + # Stream "BenEncoder", + "BenDecoder", + # Codec "encode_jsonl_to_ben", - "encode_ben_to_xben", "encode_jsonl_to_xben", + "encode_ben_to_xben", "decode_ben_to_jsonl", "decode_xben_to_jsonl", "decode_xben_to_ben", diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index cec8783..5254823 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -1,444 +1,168 @@ -from typing import Any, Iterable, Iterator, Literal -from pathlib import Path +"""Type stubs for the compiled ``binary_ensemble._core`` extension. -class PyBenDecoder: - """Iterator over assignments in a BEN, XBEN, or BENDL file. +These describe the raw PyO3 surface. End users should import the ergonomic +facades from :mod:`binary_ensemble.stream`, :mod:`binary_ensemble.bundle`, +:mod:`binary_ensemble.codec`, and :mod:`binary_ensemble.graph` instead. +""" - Open a decoder over a plain stream (`.ben` / `.xben`) or a bundle - (`.bendl`). The file's leading bytes are sniffed; when the BENDL magic - is present, the bundle header decides between BEN and XBEN and the - ``mode`` argument is ignored. Iteration walks only the embedded stream - region, and the bundle's table of contents / asset payloads are - available through the bundle-inspection methods. +from pathlib import Path +from typing import Any, Iterable, Iterator, Literal - Construction is lazy with respect to sample counting: opening the decoder does - not scan the whole file. The first call to :func:`len` or :meth:`count_samples` - will count samples and cache the result. +# --------------------------------------------------------------------------- +# Stream decoder / encoder (plain .ben / .xben) +# --------------------------------------------------------------------------- - Parameters - ---------- - file_path : - Path to the input file. - mode : {"ben", "xben"}, default "ben" - Select container format. Only consulted for plain streams; for - bundles the header dictates the format. +class BenDecoder: + """Iterator over assignments in a plain BEN or XBEN stream. - Raises - ------ - OSError - If the file cannot be opened. - Exception - If the underlying decoder fails to initialize. + Stream-only: opening this on a ``.bendl`` bundle raises and points at + :class:`BendlDecoder`. Sample counting is lazy and cached. """ def __init__( self, file_path: str | Path, mode: Literal["ben", "xben"] = "ben" ) -> None: ... - def __iter__(self) -> Iterator[list[int]]: - """Return an iterator over the samples, restarting from the start. - - Each call to :func:`iter` (including the implicit call made by - ``for x in dec:``) rebuilds the underlying frame walker and, if a - subsample selection has been installed via :meth:`subsample_indices`, - :meth:`subsample_range`, or :meth:`subsample_every`, reapplies it. - Iteration can therefore be performed multiple times on the same - decoder. - """ - ... + def __iter__(self) -> Iterator[list[int]]: ... def __next__(self) -> list[int]: ... - def __len__(self) -> int: - """Return the number of samples. - - Notes - ----- - The first call may require a full scan of the underlying file and can be - expensive for very large BEN/XBEN datasets. The result is cached after - the first successful count. - """ - ... - def count_samples(self) -> int: - """Count and cache the total number of samples in the source file. - - Always reports the base (unfiltered) sample count, even after a - ``subsample_*`` call has been applied. Equivalent to ``len(dec)`` - when no subsample selection is active. The first call may perform - a full-file scan; the result is cached. - """ - ... - def subsample_indices(self, indices: Iterable[int]) -> "PyBenDecoder": - """Keep only the given **1-based** sample indices. - - Duplicates are ignored and order is irrelevant; the set is sorted & deduped internally. - Returns the same decoder (fluent API). - - - Arguments - --------- - indices : - Iterable of 1-based sample indices to keep. - - Returns - ------- - PyBenDecoder - The same decoder (fluent API). - """ - ... - - def subsample_range(self, start: int, end: int) -> "PyBenDecoder": - """Keep only samples in the inclusive **1-based** range [start, end]. - - The base sample count is computed on demand if needed for bounds - validation. - - Arguments - --------- - start : - 1-based index of the first sample to keep. - end : - 1-based index of the last sample to keep. - - Returns - ------- - PyBenDecoder - The same decoder (fluent API). - """ - ... - - def subsample_every(self, step: int, offset: int = 1) -> "PyBenDecoder": - """Keep every `step`-th sample starting at **1-based** `offset`. - Returns the same decoder (fluent API). - - The base sample count is computed on demand if needed for bounds - validation. - - Arguments - --------- - step : - Step size (keep every `step`-th sample). - offset : - 1-based index of the first sample to keep (default: 1). - - Returns - ------- - PyBenDecoder - The same decoder (fluent API). - """ - ... - - # ----------------------------------------------------------------- - # Bundle-inspection surface. - # - # These methods only make sense when the decoder was opened on a - # `.bendl` file; on a plain `.ben`/`.xben` stream they raise a clear - # error pointing the user at the right tool. - # ----------------------------------------------------------------- - - def is_bundle(self) -> bool: - """Return True if the decoder was opened on a `.bendl` bundle.""" - ... - - def assignment_format(self) -> Literal["ben", "xben"]: - """Return the container format of the underlying stream.""" - ... - - def version(self) -> tuple[int, int]: - """Return the bundle's format version as ``(major, minor)``. - - Raises an error on plain streams. - """ - ... - - def is_complete(self) -> bool: - """Return whether the bundle was successfully finalized. - - Raises an error on plain streams. - """ - ... - - def asset_names(self) -> list[str]: - """Return the names of every entry in the bundle's directory. - - Raises an error on plain streams. - """ - ... - - def list_assets(self) -> list[dict[str, Any]]: - """Return the full bundle directory as a list of dicts with keys - ``name``, ``type``, ``offset``, ``len``, and ``flags``. - - Raises an error on plain streams. - """ - ... - - def read_asset_bytes(self, name: str) -> bytes: - """Return the (decoded) bytes of the named asset. - - Raises an error on plain streams, a ``KeyError`` when the asset is - absent, and an ``OSError`` when the payload cannot be read. - """ - ... - - def read_json_asset(self, name: str) -> Any: - """Parse a JSON asset into a Python object. - - Raises an error on plain streams, a ``KeyError`` when the asset is - absent, and an exception when the bytes are not valid UTF-8 JSON. - """ - ... - - def read_graph(self) -> Any | None: - """Return the bundle's ``graph.json`` asset as a parsed JSON - object, or ``None`` if absent. Raises on plain streams. - """ - ... - - def read_metadata(self) -> Any | None: - """Return the bundle's ``metadata.json`` asset as a parsed JSON - object, or ``None`` if absent. Raises on plain streams. - """ - ... - - def read_relabel_map(self) -> Any | None: - """Return the bundle's ``relabel_map.json`` asset as a parsed - JSON object, or ``None`` if absent. Raises on plain streams. - """ - ... - - def extract_stream(self, out_path: str | Path, overwrite: bool = False) -> None: - """Copy the embedded assignment stream to a file. - - The resulting file can be opened directly with - ``PyBenDecoder(out_path, mode=dec.assignment_format())``. - - Raises an error on plain streams, an ``OSError`` when the output - file already exists and *overwrite* is ``False``. - """ - ... - -class PyBenEncoder: - """Encoder for Binary Ensemble (.ben) files. - - - The encoder supports writing assignments to a BEN file using a context manager and the `write` - method. + def __len__(self) -> int: ... + def count_samples(self) -> int: ... + def subsample_indices(self, indices: Iterable[int]) -> "BenDecoder": ... + def subsample_range(self, start: int, end: int) -> "BenDecoder": ... + def subsample_every(self, step: int, offset: int = 1) -> "BenDecoder": ... + def assignment_format(self) -> Literal["ben", "xben"]: ... - - Example - ------- - - - .. code-block:: python - - from binary_ensemble import PyBenEncoder - - assignments = [ - [1, 2, 1, 1, 2, 2], - [2, 1, 1, 2, 2, 1], - [1, 1, 2, 1, 2, 2], - ] - - with PyBenEncoder("output.ben", overwrite=True) as encoder: - for assignment in assignments: - encoder.write(assignment) - - """ +class BenEncoder: + """Encoder for plain Binary Ensemble (`.ben`) streams.""" def __init__( self, file_path: str | Path, overwrite: bool = False, variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, - ) -> None: - """Initializes the encoder and opens the underlying file. - - Parameters - ---------- - file_path : - Path to the output BEN file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain", "twodelta"}, optional - Select BEN variant. If None, defaults to "mkv_chain". - - Raises - ------ - OSError - If the file cannot be opened. - Exception - If the underlying encoder fails to initialize. - """ - ... - - def write(self, assignment: list[int]) -> None: - """Write a single assignment to the BEN file. - - Parameters - ---------- - assignment : - List of integers representing the assignment. - """ - ... - - def close(self) -> None: - """Closes the encoder and the underlying file. - - Also handles flushing any buffered data. - """ - ... - - def __enter__(self) -> "PyBenEncoder": ... + ) -> None: ... + def write(self, assignment: list[int]) -> None: ... + def close(self) -> None: ... + def __enter__(self) -> "BenEncoder": ... def __exit__(self, exc_type, exc, tb) -> bool: ... -def decompress_ben_to_jsonl( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: - """Converts a BEN file to a JSONL file. +# --------------------------------------------------------------------------- +# Bundle decoder / encoder (.bendl) +# --------------------------------------------------------------------------- - Parameters - ---------- - in_file : - Path to the input BEN file. - out_file : - Path to the output JSONL file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. +class BendlDecoder: + """Reader and iterator for a ``.bendl`` bundle. - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. + Bundle-only: opening this on a plain ``.ben``/``.xben`` stream raises and + points at :class:`BenDecoder`. Iteration walks the embedded assignment + stream; the bundle directory and asset payloads are exposed through the + inspection methods. A finalized assets-only bundle (empty stream) iterates to + nothing with ``len == 0``. """ - ... -def decompress_xben_to_jsonl( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: - """Converts an XBEN file to a JSONL file. + def __init__(self, file_path: str | Path) -> None: ... + def __iter__(self) -> Iterator[list[int]]: ... + def __next__(self) -> list[int]: ... + def __len__(self) -> int: ... + def count_samples(self) -> int: ... + def subsample_indices(self, indices: Iterable[int]) -> "BendlDecoder": ... + def subsample_range(self, start: int, end: int) -> "BendlDecoder": ... + def subsample_every(self, step: int, offset: int = 1) -> "BendlDecoder": ... + def assignment_format(self) -> Literal["ben", "xben"]: ... + def version(self) -> tuple[int, int]: ... + def is_complete(self) -> bool: ... + def asset_names(self) -> list[str]: ... + def list_assets(self) -> list[dict[str, Any]]: ... + def read_asset_bytes(self, name: str) -> bytes: ... + def read_json_asset(self, name: str) -> Any: ... + # Returns a NetworkX graph (``networkx.Graph``/``MultiGraph``) rebuilt from the + # stored adjacency JSON, or ``None`` if absent. Use ``read_json_asset("graph.json")`` + # for the raw parsed dict. + def read_graph(self) -> Any | None: ... + def read_metadata(self) -> Any | None: ... + def read_node_permutation_map(self) -> Any | None: ... + def extract_stream( + self, + out_path: str | Path, + overwrite: bool = False, + allow_unfinalized: bool = False, + ) -> None: ... - Parameters - ---------- - in_file : - Path to the input XBEN file. - out_file : - Path to the output JSONL file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. +class BendlStreamSession: + """Single-use context manager over a bundle's assignment stream. - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. + Obtained from :meth:`BendlEncoder.stream`; finalizes the bundle on a clean + close and leaves it unfinalized if the context exits via an exception. """ - ... -def decompress_xben_to_ben( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: - """Converts an XBEN file to a BEN file. + def write(self, assignment: list[int]) -> None: ... + def close(self) -> None: ... + def __enter__(self) -> "BendlStreamSession": ... + def __exit__(self, exc_type, exc, tb) -> bool: ... - Parameters - ---------- - in_file : - Path to the input XBEN file. - out_file : - Path to the output BEN file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. +class BendlEncoder: + """Writer for a ``.bendl`` bundle (create mode) or appender (append mode).""" - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. - """ - ... + def __init__(self, file_path: str | Path, overwrite: bool = False) -> None: ... + @staticmethod + def append(file_path: str | Path) -> "BendlEncoder": ... + def add_asset( + self, name: str, payload: bytes, content_type: Literal["json", "text"] + ) -> None: ... + def add_metadata(self, metadata: Any) -> None: ... + # Returns the (possibly reordered) graph as a NetworkX graph, matching + # BendlDecoder.read_graph. + def add_graph(self, graph: Any, preprocess_method: str | None) -> Any: ... + def stream( + self, + format: Literal["ben"] = "ben", + variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, + ) -> BendlStreamSession: ... + def close(self) -> None: ... + def __enter__(self) -> "BendlEncoder": ... + def __exit__(self, exc_type, exc, tb) -> bool: ... + +# --------------------------------------------------------------------------- +# Whole-file stream / JSONL transforms +# --------------------------------------------------------------------------- -def compress_jsonl_to_ben( +def encode_jsonl_to_ben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, -) -> None: - """Converts a JSONL file to a BEN file. - - Parameters - ---------- - in_file : - Path to the input JSONL file. - out_file : - Path to the output BEN file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain", "twodelta"}, optional - Select BEN variant. If None, defaults to "mkv_chain". - - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. - ValueError - If the input file is not a valid JSONL file or if the variant cannot be inferred. - """ - ... - -def compress_jsonl_to_xben( + variant: Literal["standard", "mkv_chain", "twodelta"] = "mkv_chain", +) -> None: ... +def encode_jsonl_to_xben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, + variant: Literal["standard", "mkv_chain", "twodelta"] = "mkv_chain", n_threads: int | None = None, compression_level: int | None = None, -) -> None: - """Converts a JSONL file to an XBEN file. - - Parameters - ---------- - in_file : - Path to the input JSONL file. - out_file : - Path to the output XBEN file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. - variant : {"standard", "mkv_chain", "twodelta"}, optional - Select BEN variant. If None, defaults to "mkv_chain". - n_threads : - Number of threads to use for compression. If None, defaults to the number of CPU cores. - compression_level : - Compression level to use for LZMA compression (0-9). If None, defaults to 9 (highest). - - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. - ValueError - If the input file is not a valid JSONL file or if the variant cannot be inferred. - """ - ... - -def compress_ben_to_xben( + xz_block_size: int | None = None, +) -> None: ... +def encode_ben_to_xben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, n_threads: int | None = None, compression_level: int | None = None, -) -> None: - """Converts a BEN file to an XBEN file. + xz_block_size: int | None = None, +) -> None: ... +def decode_ben_to_jsonl( + in_file: str | Path, out_file: str | Path, overwrite: bool = False +) -> None: ... +def decode_xben_to_jsonl( + in_file: str | Path, out_file: str | Path, overwrite: bool = False +) -> None: ... +def decode_xben_to_ben( + in_file: str | Path, out_file: str | Path, overwrite: bool = False +) -> None: ... - Parameters - ---------- - in_file : - Path to the input BEN file. - out_file : - Path to the output XBEN file. - overwrite : - Whether to overwrite the output file if it exists. Defaults to False. - n_threads : - Number of threads to use for compression. If None, defaults to the number of CPU cores. - compression_level : - Compression level to use for LZMA compression (0-9). If None, defaults to 9 (highest). +# --------------------------------------------------------------------------- +# Graph reordering and bundle recompression +# --------------------------------------------------------------------------- - Raises - ------ - OSError - If the input file cannot be opened or the output file cannot be created. - """ - ... +def graph_reorder(graph: Any, method: str) -> tuple[Any, Any]: ... +def recompress_bundle( + in_file: str | Path, out_file: str | Path, overwrite: bool = False +) -> None: ... diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py new file mode 100644 index 0000000..edfee8c --- /dev/null +++ b/ben-py/binary_ensemble/bundle.py @@ -0,0 +1,178 @@ +"""The ``.bendl`` bundle format — the recommended single-file container. + +A bundle wraps a BEN/XBEN assignment stream together with front-loaded assets: a +dual ``graph.json``, a ``node_permutation_map.json``, a ``metadata.json``, and +arbitrary custom blobs. :class:`BendlEncoder` writes one; :class:`BendlDecoder` +reads and iterates one. + +Typical write:: + + with BendlEncoder(path, overwrite=True) as enc: + enc.add_graph(graph, preprocess_method="rcm") # None => store raw + enc.add_metadata({"seed": 1234}) + with enc.stream("ben") as stream: + for assignment in chain: + stream.write(assignment) + +Typical read:: + + dec = BendlDecoder(path) + graph = dec.read_graph() + for assignment in dec: + ... +""" + +from __future__ import annotations + +import json +import os +import tempfile +from typing import Any, Optional, Union + +from binary_ensemble._core import BendlDecoder +from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder +from binary_ensemble._core import recompress_bundle as _recompress_bundle + +__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream"] + + +def _coerce_bytes(payload: Union[bytes, bytearray, memoryview, str]) -> bytes: + """Coerce an ``add_asset`` payload to bytes (``str`` is UTF-8 encoded).""" + if isinstance(payload, str): + return payload.encode("utf-8") + if isinstance(payload, (bytes, bytearray, memoryview)): + return bytes(payload) + raise TypeError( + f"asset payload must be bytes or str, got {type(payload).__name__}" + ) + + +class BendlEncoder: + """Writer for a ``.bendl`` bundle (create mode) or an asset appender (append mode). + + In create mode (the constructor), assets may be added before or after a + single-use ``stream()``; closing finalizes the bundle. In append mode + (:meth:`append`), an existing finalized bundle is grown with new assets and + ``stream()`` is unavailable. + """ + + def __init__(self, file_path, overwrite: bool = False) -> None: + self._enc = _CoreBendlEncoder(file_path, overwrite=overwrite) + + @classmethod + def append(cls, file_path) -> "BendlEncoder": + """Open an existing *finalized* bundle to append new assets. + + ``stream()`` is unavailable in append mode; each ``add_*`` commits + immediately. + """ + self = cls.__new__(cls) + self._enc = _CoreBendlEncoder.append(file_path) + return self + + def add_graph(self, graph: Any, preprocess_method: Optional[str]) -> Any: + """Embed the dual ``graph.json`` and return the (possibly reordered) graph. + + When ``preprocess_method`` is not ``None`` the graph is reordered (e.g. + ``"rcm"``, ``"mlc"``, or a node-attribute key) and both ``graph.json`` + and ``node_permutation_map.json`` are stored; the reordered graph is + returned so the chain runs on that ordering. Reordering is pre-stream + only. ``None`` stores the graph as-is with no permutation map. + + The graph is returned as a NetworkX graph (matching + :meth:`BendlDecoder.read_graph`), so its node order is the order the + chain should write assignments in. + """ + return self._enc.add_graph(graph, preprocess_method) + + def add_metadata(self, metadata: Any) -> None: + """Embed the canonical ``metadata.json`` asset (a dict/list, bytes, or path).""" + self._enc.add_metadata(metadata) + + def add_asset( + self, + name: str, + payload: Union[bytes, bytearray, memoryview, str], + content_type: str, + ) -> None: + """Embed a custom asset under ``name``. + + ``content_type`` is ``"json"`` (payload must be valid UTF-8 JSON; the + decoder will auto-parse it) or ``"text"`` (payload must be valid UTF-8). + """ + data = _coerce_bytes(payload) + if content_type == "json": + try: + json.loads(data.decode("utf-8")) + except (UnicodeDecodeError, json.JSONDecodeError) as exc: + raise ValueError( + f"content_type='json' requires valid UTF-8 JSON: {exc}" + ) from exc + elif content_type == "text": + try: + data.decode("utf-8") + except UnicodeDecodeError as exc: + raise ValueError( + f"content_type='text' requires valid UTF-8: {exc}" + ) from exc + else: + raise ValueError( + f"content_type must be 'json' or 'text', got {content_type!r}" + ) + self._enc.add_asset(name, data, content_type) + + def stream(self, format: str = "ben", variant: Optional[str] = None): + """Open the single-use assignment stream context manager. + + Only ``"ben"`` is accepted; produce XBEN bundles via + :func:`compress_stream`. ``variant`` selects the BEN variant + (default ``"mkv_chain"``). + """ + return self._enc.stream(format, variant) + + def close(self) -> None: + """Finalize (create mode) or finish (append mode) the bundle. Idempotent.""" + self._enc.close() + + def __enter__(self) -> "BendlEncoder": + return self + + def __exit__(self, exc_type, exc, tb) -> bool: + self.close() + return False + + +def compress_stream( + path, + out_file=None, + in_place: bool = False, +) -> None: + """Recompress a bundle's embedded BEN stream to XBEN, preserving every asset. + + Provide exactly one of ``in_place=True`` (recompress to a temp file and + atomically swap it over ``path``) or ``out_file`` (write a new bundle). + Passing both, or neither, raises. + + All assets (graph, metadata, node_permutation_map, custom blobs) are + preserved by decoded payload, name, type, and JSON flag; storage compression + is normalized to the writer's default policy. An assets-only bundle (empty + stream) recompresses to an empty XBEN bundle. + """ + if in_place and out_file is not None: + raise ValueError("pass either in_place=True or out_file, not both") + if not in_place and out_file is None: + raise ValueError("pass either in_place=True or out_file") + + if in_place: + directory = os.path.dirname(os.path.abspath(os.fspath(path))) + fd, tmp = tempfile.mkstemp(suffix=".bendl", dir=directory) + os.close(fd) + try: + _recompress_bundle(path, tmp, overwrite=True) + os.replace(tmp, path) + except BaseException: + if os.path.exists(tmp): + os.remove(tmp) + raise + else: + _recompress_bundle(path, out_file, overwrite=False) diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi new file mode 100644 index 0000000..8b39104 --- /dev/null +++ b/ben-py/binary_ensemble/bundle.pyi @@ -0,0 +1,31 @@ +from typing import Any, Optional, Union + +from binary_ensemble._core import BendlDecoder as BendlDecoder +from binary_ensemble._core import BendlStreamSession + +__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream"] + +class BendlEncoder: + def __init__(self, file_path, overwrite: bool = False) -> None: ... + @classmethod + def append(cls, file_path) -> "BendlEncoder": ... + def add_graph(self, graph: Any, preprocess_method: Optional[str]) -> Any: ... + def add_metadata(self, metadata: Any) -> None: ... + def add_asset( + self, + name: str, + payload: Union[bytes, bytearray, memoryview, str], + content_type: str, + ) -> None: ... + def stream( + self, format: str = "ben", variant: Optional[str] = None + ) -> BendlStreamSession: ... + def close(self) -> None: ... + def __enter__(self) -> "BendlEncoder": ... + def __exit__(self, exc_type, exc, tb) -> bool: ... + +def compress_stream( + path, + out_file=None, + in_place: bool = False, +) -> None: ... diff --git a/ben-py/binary_ensemble/codec.py b/ben-py/binary_ensemble/codec.py new file mode 100644 index 0000000..9049816 --- /dev/null +++ b/ben-py/binary_ensemble/codec.py @@ -0,0 +1,27 @@ +"""Whole-file stream/JSONL transforms. + +These helpers convert entire files in one call, without an iterator: JSONL ↔ +BEN ↔ XBEN. For streaming sample-by-sample access use +:class:`binary_ensemble.stream.BenDecoder`; for the single-file bundle format +use :mod:`binary_ensemble.bundle`. +""" + +from __future__ import annotations + +from binary_ensemble._core import ( + decode_ben_to_jsonl, + decode_xben_to_ben, + decode_xben_to_jsonl, + encode_ben_to_xben, + encode_jsonl_to_ben, + encode_jsonl_to_xben, +) + +__all__ = [ + "encode_jsonl_to_ben", + "encode_jsonl_to_xben", + "encode_ben_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", +] diff --git a/ben-py/binary_ensemble/codec.pyi b/ben-py/binary_ensemble/codec.pyi new file mode 100644 index 0000000..f9df5d8 --- /dev/null +++ b/ben-py/binary_ensemble/codec.pyi @@ -0,0 +1,15 @@ +from binary_ensemble._core import decode_ben_to_jsonl as decode_ben_to_jsonl +from binary_ensemble._core import decode_xben_to_ben as decode_xben_to_ben +from binary_ensemble._core import decode_xben_to_jsonl as decode_xben_to_jsonl +from binary_ensemble._core import encode_ben_to_xben as encode_ben_to_xben +from binary_ensemble._core import encode_jsonl_to_ben as encode_jsonl_to_ben +from binary_ensemble._core import encode_jsonl_to_xben as encode_jsonl_to_xben + +__all__ = [ + "encode_jsonl_to_ben", + "encode_jsonl_to_xben", + "encode_ben_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", +] diff --git a/ben-py/binary_ensemble/graph.py b/ben-py/binary_ensemble/graph.py new file mode 100644 index 0000000..6a263fd --- /dev/null +++ b/ben-py/binary_ensemble/graph.py @@ -0,0 +1,55 @@ +"""Graph reordering utilities (the reben orderings). + +Reordering a dual graph before building a chain (or a bundle) can dramatically +improve BEN/XBEN compression. Each function takes a NetworkX adjacency-format +graph (a ``dict``/``list``, raw JSON ``bytes``, a file-like with ``.read()``, or +a path) and returns ``(reordered_graph, node_permutation_map)``: + +- ``reordered_graph`` is a live NetworkX graph in its new node ordering (the same + shape :meth:`binary_ensemble.bundle.BendlEncoder.add_graph` and + :meth:`binary_ensemble.bundle.BendlDecoder.read_graph` return). +- ``node_permutation_map`` is the parsed ``node_permutation_map.json`` payload — + an object with a ``node_permutation_old_to_new`` field mapping original + zero-based node positions to their new positions. + +To reorder *and* embed the result in a bundle in one step, pass +``preprocess_method`` to :meth:`binary_ensemble.bundle.BendlEncoder.add_graph`. +""" + +from __future__ import annotations + +from typing import Any, Tuple + +from binary_ensemble._core import graph_reorder + +__all__ = [ + "reorder", + "reorder_multi_level_cluster", + "reorder_reverse_cuthill_mckee", + "reorder_by_key", +] + + +def reorder(graph: Any, method: str) -> Tuple[Any, Any]: + """Reorder ``graph`` by ``method`` and return ``(reordered_graph, node_permutation_map)``. + + ``method`` is one of ``"multi-level-cluster"`` / ``"mlc"``, + ``"reverse-cuthill-mckee"`` / ``"rcm"``, or a node-attribute key (e.g. + ``"geoid"``, or the special ``"id"`` for the NetworkX node id). + """ + return graph_reorder(graph, method) + + +def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: + """Reorder ``graph`` using recursive multi-level clustering.""" + return graph_reorder(graph, "multi-level-cluster") + + +def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: + """Reorder ``graph`` using Reverse Cuthill-McKee.""" + return graph_reorder(graph, "reverse-cuthill-mckee") + + +def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: + """Reorder ``graph`` by sorting on a node-attribute ``key`` (use ``"id"`` for node id).""" + return graph_reorder(graph, key) diff --git a/ben-py/binary_ensemble/graph.pyi b/ben-py/binary_ensemble/graph.pyi new file mode 100644 index 0000000..0317d93 --- /dev/null +++ b/ben-py/binary_ensemble/graph.pyi @@ -0,0 +1,15 @@ +from typing import Any, Tuple + +__all__ = [ + "reorder", + "reorder_multi_level_cluster", + "reorder_reverse_cuthill_mckee", + "reorder_by_key", +] + +# Each helper returns (reordered_graph, node_permutation_map): the graph is a live +# NetworkX graph, the map is the parsed node_permutation_map.json dict. +def reorder(graph: Any, method: str) -> Tuple[Any, Any]: ... +def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: ... +def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: ... +def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: ... diff --git a/ben-py/binary_ensemble/stream.py b/ben-py/binary_ensemble/stream.py new file mode 100644 index 0000000..d89852a --- /dev/null +++ b/ben-py/binary_ensemble/stream.py @@ -0,0 +1,15 @@ +"""Plain BEN/XBEN stream encoding and decoding. + +``BenEncoder`` writes a plain ``.ben`` stream; ``BenDecoder`` iterates a plain +``.ben`` / ``.xben`` stream. Both are stream-only: opening a decoder on a +``.bendl`` bundle, or trying to read bundle assets, raises and points you at +:mod:`binary_ensemble.bundle`. For the recommended single-file bundle format, +use :class:`binary_ensemble.bundle.BendlEncoder` / +:class:`binary_ensemble.bundle.BendlDecoder`. +""" + +from __future__ import annotations + +from binary_ensemble._core import BenDecoder, BenEncoder + +__all__ = ["BenEncoder", "BenDecoder"] diff --git a/ben-py/binary_ensemble/stream.pyi b/ben-py/binary_ensemble/stream.pyi new file mode 100644 index 0000000..911845e --- /dev/null +++ b/ben-py/binary_ensemble/stream.pyi @@ -0,0 +1,4 @@ +from binary_ensemble._core import BenDecoder as BenDecoder +from binary_ensemble._core import BenEncoder as BenEncoder + +__all__ = ["BenEncoder", "BenDecoder"] diff --git a/ben-py/docs/user/using_ben_py.ipynb b/ben-py/docs/user/using_ben_py.ipynb index 5ae2efb..4af2348 100644 --- a/ben-py/docs/user/using_ben_py.ipynb +++ b/ben-py/docs/user/using_ben_py.ipynb @@ -145,18 +145,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "9296ca41", "metadata": {}, "outputs": [], "source": [ - "from binary_ensemble import (\n", - " compress_jsonl_to_ben,\n", - " compress_jsonl_to_xben,\n", - " compress_ben_to_xben,\n", - " decompress_ben_to_jsonl,\n", - " decompress_xben_to_jsonl,\n", - " decompress_xben_to_ben,\n", + "from binary_ensemble.codec import (\n", + " encode_jsonl_to_ben,\n", + " encode_jsonl_to_xben,\n", + " encode_ben_to_xben,\n", + " decode_ben_to_jsonl,\n", + " decode_xben_to_jsonl,\n", + " decode_xben_to_ben,\n", ")" ] }, @@ -173,12 +173,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "1e1e32b0", "metadata": {}, "outputs": [], "source": [ - "compress_jsonl_to_ben(\n", + "encode_jsonl_to_ben(\n", " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", ")" @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "2f1ce280", "metadata": {}, "outputs": [ @@ -209,7 +209,7 @@ ], "source": [ "try:\n", - " compress_jsonl_to_ben(\n", + " encode_jsonl_to_ben(\n", " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " )\n", @@ -255,7 +255,7 @@ "metadata": {}, "outputs": [], "source": [ - "compress_jsonl_to_xben(\n", + "encode_jsonl_to_xben(\n", " in_file=\"example_data/small_example.jsonl\",\n", " out_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", " overwrite=True,\n", @@ -264,7 +264,7 @@ " compression_level=9,\n", ")\n", "\n", - "compress_ben_to_xben(\n", + "encode_ben_to_xben(\n", " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " out_file=\"example_data/small_example_jsonl_to_ben_to_xben.jsonl.xben\",\n", " overwrite=True,\n", @@ -314,19 +314,19 @@ "metadata": {}, "outputs": [], "source": [ - "decompress_ben_to_jsonl(\n", + "decode_ben_to_jsonl(\n", " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", " out_file=\"example_data/small_example_jsonl_to_ben_to_jsonl.jsonl\",\n", " overwrite=True,\n", ")\n", "\n", - "decompress_xben_to_jsonl(\n", + "decode_xben_to_jsonl(\n", " in_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", " out_file=\"example_data/small_example_jsonl_to_xben_to_jsonl.jsonl\",\n", " overwrite=True,\n", ")\n", "\n", - "decompress_xben_to_ben(\n", + "decode_xben_to_ben(\n", " in_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", " out_file=\"example_data/small_example_jsonl_to_xben_to_ben.jsonl.ben\",\n", " overwrite=True,\n", @@ -402,23 +402,26 @@ "id": "a5ef02f8", "metadata": {}, "source": [ - "Okay, now it is time to write the output into a BEN format. The most important thing that\n", - "we need to keep track of here is the order of the `Assignment` returned by GerryChain. In general\n", - "GerryChain makes no guarantees about the ordering of the nodes in the output assignment, and to\n", - "write to a BEN file, we MUST make sure that the ordering of the values in the assignment vector\n", - "lines up with the order of the nodes in the graph." + "Okay, now it is time to write the output. The recommended format is a **`.bendl` bundle**: a single\n", + "self-describing file that stores the dual graph (and any metadata) alongside the assignment stream,\n", + "so a collaborator can open it without hunting down the matching graph JSON.\n", + "\n", + "The most important thing we need to keep track of is the order of the `Assignment` returned by\n", + "GerryChain. In general GerryChain makes no guarantees about the ordering of the nodes in the output\n", + "assignment, and to write the stream we MUST make sure that the ordering of the values in the\n", + "assignment vector lines up with the order of the nodes in the graph." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "dec15cda", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cece4170d7034a36bcf1626ef58d9412", + "model_id": "ffff4f4cfe644c3ea93516c8532626cd", "version_major": 2, "version_minor": 0 }, @@ -431,18 +434,24 @@ } ], "source": [ - "from binary_ensemble import BenEncoder\n", + "from binary_ensemble.bundle import BendlEncoder\n", "\n", "graph_node_order = list(graph.nodes)\n", "\n", - "with BenEncoder(\"example_data/gerrychain_10000.jsonl.ben\", overwrite=True) as encoder:\n", - " for partition in recom_chain.with_progress_bar():\n", - " assignment_series = partition.assignment.to_series()\n", - " # Assignment vectors must be lists of integers\n", - " ordered_assignment = (\n", - " assignment_series.loc[graph_node_order].astype(int).tolist()\n", - " )\n", - " encoder.write(ordered_assignment)" + "with BendlEncoder(\"example_data/gerrychain_10000.bendl\", overwrite=True) as encoder:\n", + " # Embed the dual graph so the bundle is self-describing. Pass\n", + " # preprocess_method=None to store it as-is; pass e.g. \"rcm\" to reorder the\n", + " # nodes for better compression (which also records a node_permutation_map).\n", + " encoder.add_graph(\"example_data/gerrymandria.json\", preprocess_method=None)\n", + "\n", + " with encoder.stream(\"ben\", variant=\"twodelta\") as stream:\n", + " for partition in recom_chain.with_progress_bar():\n", + " assignment_series = partition.assignment.to_series()\n", + " # Assignment vectors must be lists of integers\n", + " ordered_assignment = (\n", + " assignment_series.loc[graph_node_order].astype(int).tolist()\n", + " )\n", + " stream.write(ordered_assignment)" ] }, { @@ -452,46 +461,18 @@ "source": [ "### Decoding\n", "\n", - "Decoding with binary-ensemble should also feel fairly simple: just iterate over the file and pull out the \n", - "assignment vector that you would like to work with." + "Decoding is just as simple: open the bundle with `BendlDecoder` and iterate over it to pull out the\n", + "assignment vectors. Because the dual graph is embedded, it can be recovered directly from the bundle\n", + "with `decoder.read_graph()` — no separate graph file required." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "801c6fb7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sample: 1, Cut Edge Count: 56\n", - "Sample: 1001, Cut Edge Count: 41\n", - "Sample: 2001, Cut Edge Count: 44\n", - "Sample: 3001, Cut Edge Count: 41\n", - "Sample: 4001, Cut Edge Count: 32\n", - "Sample: 5001, Cut Edge Count: 38\n", - "Sample: 6001, Cut Edge Count: 36\n", - "Sample: 7001, Cut Edge Count: 41\n", - "Sample: 8001, Cut Edge Count: 37\n", - "Sample: 9001, Cut Edge Count: 41\n" - ] - } - ], - "source": [ - "from binary_ensemble import BenDecoder\n", - "import pandas as pd\n", - "\n", - "\n", - "graph_node_order_series = pd.Index(graph.nodes)\n", - "\n", - "for i, assignment in enumerate(BenDecoder(\"example_data/gerrychain_10000.jsonl.ben\")):\n", - " assignment = pd.Series(assignment, index=graph_node_order_series)\n", - " partition = Partition(graph, assignment=assignment, updaters=my_updaters)\n", - " if i % 1000 == 0:\n", - " print(f\"Sample: {i + 1}, Cut Edge Count: {len(partition['cut_edges'])}\")" - ] + "outputs": [], + "source": "from binary_ensemble import BendlDecoder\nimport pandas as pd\n\ndecoder = BendlDecoder(\"example_data/gerrychain_10000.bendl\")\n\n# The dual graph is embedded in the bundle, so recover it directly with\n# read_graph() — no separate graph file needed. It comes back as a live NetworkX\n# graph whose node order matches the order the assignments were written in.\npackaged_graph = decoder.read_graph()\ngraph_node_order_series = pd.Index(packaged_graph.nodes)\n\nprint(f\"Bundle carries {len(decoder)} samples; assets: {decoder.asset_names()}\")\n\nfor i, assignment in enumerate(decoder):\n assignment = pd.Series(assignment, index=graph_node_order_series)\n partition = Partition(packaged_graph, assignment=assignment, updaters=my_updaters)\n if i % 1000 == 0:\n print(f\"Sample: {i + 1}, Cut Edge Count: {len(partition['cut_edges'])}\")\n" }, { "cell_type": "markdown", @@ -510,13 +491,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "ff8e8e4a", "metadata": {}, "outputs": [], "source": [ + "from binary_ensemble.stream import BenDecoder\n", + "\n", "# Warning, this BEN file will be ~2Gb\n", - "decompress_xben_to_ben(\n", + "decode_xben_to_ben(\n", " in_file=\"example_data/100k_CO_chain.jsonl.xben\",\n", " out_file=\"example_data/100k_CO_chain.jsonl.ben\",\n", " overwrite=True,\n", @@ -525,20 +508,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "24761ca6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[8, 8, 5, 4, 5, 5, 5, 5, 3, 3]\n", - "[1, 1, 6, 5, 6, 6, 6, 6, 2, 2]\n", - "[1, 1, 3, 8, 8, 8, 3, 3, 4, 4]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.ben\"\n", @@ -548,23 +521,10 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "0a815edf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[5, 5, 6, 1, 2, 1, 6, 6, 4, 4]\n", - "[5, 5, 1, 1, 2, 1, 1, 1, 4, 4]\n", - "[7, 7, 1, 1, 2, 1, 1, 1, 4, 4]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(\n", " 1000, 1005\n", @@ -574,28 +534,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "3be48458", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[8, 8, 5, 4, 5, 5, 5, 5, 3, 3]\n", - "[7, 7, 3, 3, 3, 3, 3, 3, 2, 2]\n", - "[5, 5, 1, 3, 4, 4, 1, 1, 3, 3]\n", - "[5, 5, 3, 8, 3, 3, 3, 6, 1, 1]\n", - "[8, 8, 4, 4, 3, 4, 4, 7, 1, 1]\n", - "[5, 1, 7, 3, 3, 7, 7, 3, 6, 6]\n", - "[4, 4, 1, 3, 1, 1, 1, 1, 5, 5]\n", - "[6, 6, 7, 2, 1, 2, 7, 7, 7, 5]\n", - "[4, 4, 8, 1, 8, 8, 8, 3, 7, 7]\n", - "[1, 1, 5, 5, 5, 5, 5, 5, 6, 6]\n", - "[1, 1, 3, 8, 8, 8, 3, 3, 4, 4]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(\n", " 10000\n", @@ -614,28 +556,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "51d9f27a", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_239360/229284435.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_indices([1, 23978, 100000]):\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[8, 8, 5, 4, 5, 5, 5, 5, 3, 3]\n", - "[1, 1, 6, 5, 6, 6, 6, 6, 2, 2]\n", - "[1, 1, 3, 8, 8, 8, 3, 3, 4, 4]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", @@ -645,31 +569,10 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "a51d0019", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_239360/1010090289.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_range(1000,1005):\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[5, 5, 6, 1, 2, 1, 6, 6, 4, 4]\n", - "[5, 5, 1, 1, 2, 1, 1, 1, 4, 4]\n", - "[7, 7, 1, 1, 2, 1, 1, 1, 4, 4]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n", - "[7, 7, 4, 4, 2, 4, 4, 4, 1, 1]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", @@ -679,36 +582,10 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "eeb1c112", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_239360/49125867.py:1: UserWarning: XBEN may take a second to start decoding.\n", - " for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\").subsample_every(10000):\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[8, 8, 5, 4, 5, 5, 5, 5, 3, 3]\n", - "[7, 7, 3, 3, 3, 3, 3, 3, 2, 2]\n", - "[5, 5, 1, 3, 4, 4, 1, 1, 3, 3]\n", - "[5, 5, 3, 8, 3, 3, 3, 6, 1, 1]\n", - "[8, 8, 4, 4, 3, 4, 4, 7, 1, 1]\n", - "[5, 1, 7, 3, 3, 7, 7, 3, 6, 6]\n", - "[4, 4, 1, 3, 1, 1, 1, 1, 5, 5]\n", - "[6, 6, 7, 2, 1, 2, 7, 7, 7, 5]\n", - "[4, 4, 8, 1, 8, 8, 8, 3, 7, 7]\n", - "[1, 1, 5, 5, 5, 5, 5, 5, 6, 6]\n", - "[1, 1, 3, 8, 8, 8, 3, 3, 4, 4]\n" - ] - } - ], + "outputs": [], "source": [ "for assignment in BenDecoder(\n", " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", @@ -746,4 +623,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index 4bda97e..1829e51 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -9,6 +9,9 @@ dynamic = ["version"] description = "Python bindings for the Binary Ensemble Package." readme = "README.md" requires-python = ">=3.11" +dependencies = [ + "networkx>=3.0", +] [tool.maturin] diff --git a/ben-py/src/common.rs b/ben-py/src/common.rs index 88f5a40..f222875 100644 --- a/ben-py/src/common.rs +++ b/ben-py/src/common.rs @@ -1,6 +1,7 @@ use binary_ensemble::BenVariant; -use pyo3::exceptions::{PyIOError, PyValueError}; -use pyo3::prelude::PyResult; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyDict, PyList}; use std::fs::File; use std::io::{BufReader, BufWriter}; use std::path::PathBuf; @@ -56,3 +57,91 @@ pub fn open_output(out_file: &PathBuf, overwrite: bool) -> PyResult, obj: &Bound<'_, PyAny>) -> PyResult> { + // Dict / list → json.dumps. + if obj.is_instance_of::() || obj.is_instance_of::() { + let json_mod = py.import("json")?; + let dumped = json_mod.call_method1("dumps", (obj,))?; + let s: String = dumped.extract()?; + return Ok(s.into_bytes()); + } + + // Raw bytes / bytearray. + if let Ok(b) = obj.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = obj.extract::>() { + return Ok(b); + } + + // File-like: must have .read(). Check before str/path, since a plain `str` / `Path` has no + // `.read()` attribute and will fall through. + if obj.hasattr("read")? { + let data = obj.call_method0("read")?; + if let Ok(b) = data.downcast::() { + return Ok(b.as_bytes().to_vec()); + } + if let Ok(b) = data.extract::>() { + return Ok(b); + } + if let Ok(s) = data.extract::() { + return Ok(s.into_bytes()); + } + return Err(PyException::new_err( + "graph .read() must return bytes or str", + )); + } + + // Path / str → read the file at that path. + let path: PathBuf = obj.extract().map_err(|_| { + PyValueError::new_err( + "graph must be a dict/list, bytes, a file-like with .read(), or a path", + ) + })?; + std::fs::read(&path).map_err(|e| { + PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) + }) +} + +/// Build a live NetworkX graph from an already-parsed adjacency-format JSON object. +/// +/// The shared tail behind every API that hands a graph back to the caller — +/// `BendlEncoder.add_graph`, `BendlDecoder.read_graph`, and the `graph` reordering utilities — so +/// they all return graphs in the same shape. +pub fn networkx_graph_from_json(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult> { + let json_graph = py.import("networkx.readwrite.json_graph")?; + Ok(json_graph.call_method1("adjacency_graph", (data,))?.into()) +} + +/// Build a live NetworkX graph from adjacency-format JSON bytes. +pub fn networkx_graph_from_bytes(py: Python<'_>, bytes: &[u8]) -> PyResult> { + let json_mod = py.import("json")?; + let text = std::str::from_utf8(bytes) + .map_err(|e| PyException::new_err(format!("graph is not valid UTF-8: {e}")))?; + let data = json_mod.call_method1("loads", (text,))?; + networkx_graph_from_json(py, &data) +} + +/// Count the number of nodes declared in a NetworkX adjacency-format graph's `nodes` array. +/// +/// Used to validate that each assignment written to a bundle stream matches the embedded graph's +/// node count. +pub fn graph_node_count(graph_bytes: &[u8]) -> PyResult { + let value: serde_json::Value = serde_json::from_slice(graph_bytes) + .map_err(|e| PyValueError::new_err(format!("graph is not valid JSON: {e}")))?; + value + .get("nodes") + .and_then(|n| n.as_array()) + .map(|a| a.len()) + .ok_or_else(|| PyValueError::new_err("graph JSON has no 'nodes' array to count")) +} diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs new file mode 100644 index 0000000..2e55863 --- /dev/null +++ b/ben-py/src/decode/bundle_decoder.rs @@ -0,0 +1,331 @@ +use super::cursor::SampleCursor; +use super::helpers::{detect_is_bundle, warn_xben_startup}; +use super::types::{DecoderMode, StreamSource}; +use binary_ensemble::io::bundle::format::{ + ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + ASSET_TYPE_NODE_PERMUTATION_MAP, +}; +use binary_ensemble::io::bundle::BendlReader; +use pyo3::exceptions::{PyException, PyIOError, PyKeyError}; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufReader, BufWriter, Write}; +use std::path::PathBuf; + +/// Reader and iterator for a `.bendl` bundle. +/// +/// This decoder is bundle-only: opening it on a plain `.ben`/`.xben` stream raises and points the +/// caller at `BenDecoder`. It exposes the bundle inspection surface (`version`, `is_complete`, +/// `asset_names`, `list_assets`, canonical and generic asset getters, `extract_stream`) and iterates +/// the embedded assignment stream. +#[pyclass(module = "binary_ensemble", name = "BendlDecoder", unsendable)] +pub struct PyBendlDecoder { + path: PathBuf, + reader: BendlReader>, + cursor: SampleCursor, +} + +#[pymethods] +impl PyBendlDecoder { + /// Open a decoder on a `.bendl` bundle. + /// + /// The file's leading bytes are sniffed; a plain `.ben`/`.xben` stream is rejected with a + /// pointer at `BenDecoder`. The bundle header decides the embedded BEN/XBEN format. + #[new] + #[pyo3(signature = (file_path))] + #[pyo3(text_signature = "(file_path)")] + fn new(py: Python<'_>, file_path: PathBuf) -> PyResult { + let is_bundle = detect_is_bundle(&file_path).map_err(|e| { + PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) + })?; + if !is_bundle { + return Err(PyException::new_err(format!( + "{} is not a .bendl bundle (missing BENDL magic). Open plain BEN/XBEN \ + streams with binary_ensemble.stream.BenDecoder instead.", + file_path.display() + ))); + } + + let file = File::open(&file_path).map_err(|e| { + PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) + })?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + file_path.display() + )) + })?; + let fmt = reader.assignment_format().ok_or_else(|| { + PyException::new_err("Bundle header has an unrecognized assignment_format field.") + })?; + let mode = DecoderMode::from_assignment_format(fmt); + let (stream_offset, stream_len) = reader.assignment_stream_range().map_err(|e| { + PyException::new_err(format!( + "Failed to determine stream region in {}: {e}", + file_path.display() + )) + })?; + + // Emit the XBEN startup warning once, up front. + if matches!(mode, DecoderMode::XBen) { + warn_xben_startup(py)?; + } + + let header_sample_count = reader.sample_count(); + let empty = reader.is_finalized() && stream_len == 0; + let source = StreamSource::Bundle { + path: file_path.clone(), + stream_offset, + stream_len, + header_sample_count, + empty, + }; + + Ok(Self { + path: file_path, + reader, + cursor: SampleCursor::new(source, mode), + }) + } + + // ----------------------------------------------------------------- + // Iteration over the embedded stream. + // ----------------------------------------------------------------- + + fn __iter__(mut slf: PyRefMut) -> PyResult> { + slf.cursor.restart()?; + Ok(slf.into()) + } + + fn __next__(&mut self) -> PyResult>> { + self.cursor.next() + } + + fn __len__(&mut self, py: Python<'_>) -> PyResult { + self.cursor.len(py) + } + + #[pyo3(text_signature = "(self)")] + fn count_samples(&mut self, py: Python<'_>) -> PyResult { + self.cursor.count_samples(py) + } + + #[pyo3(text_signature = "(self, indices, /)")] + fn subsample_indices<'py>( + mut slf: PyRefMut<'py, Self>, + indices: Vec, + py: Python<'_>, + ) -> PyResult> { + slf.cursor.subsample_indices(indices, py)?; + Ok(slf.into()) + } + + #[pyo3(text_signature = "(self, start, end, /)")] + fn subsample_range<'py>( + mut slf: PyRefMut<'py, Self>, + start: usize, + end: usize, + py: Python<'_>, + ) -> PyResult> { + slf.cursor.subsample_range(start, end, py)?; + Ok(slf.into()) + } + + #[pyo3(signature = (step, offset=1))] + fn subsample_every<'py>( + mut slf: PyRefMut<'py, Self>, + step: usize, + offset: usize, + py: Python<'_>, + ) -> PyResult> { + slf.cursor.subsample_every(step, offset, py)?; + Ok(slf.into()) + } + + // ----------------------------------------------------------------- + // Bundle inspection surface. + // ----------------------------------------------------------------- + + /// Return the container format of the embedded assignment stream as `"ben"` or `"xben"`. + #[pyo3(text_signature = "(self)")] + fn assignment_format(&self) -> &'static str { + self.cursor.mode().as_str() + } + + /// Return the bundle's format version as a `(major, minor)` tuple. + #[pyo3(text_signature = "(self)")] + fn version(&self) -> (u16, u16) { + let h = self.reader.header(); + (h.major_version, h.minor_version) + } + + /// Whether the bundle was successfully finalized. + #[pyo3(text_signature = "(self)")] + fn is_complete(&self) -> bool { + self.reader.is_finalized() + } + + /// Names of every entry in the bundle's directory, in directory order. + #[pyo3(text_signature = "(self)")] + fn asset_names(&self) -> Vec { + self.reader.assets().iter().map(|e| e.name.clone()).collect() + } + + /// Return the full bundle directory as a list of dicts with keys `name`, `type`, `offset`, + /// `len`, and `flags` (a list of string tags). + #[pyo3(text_signature = "(self)")] + fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { + let entries = self.reader.assets(); + let mut out = Vec::with_capacity(entries.len()); + for entry in entries { + let d = PyDict::new(py); + d.set_item("name", &entry.name)?; + d.set_item("type", entry.asset_type)?; + d.set_item("offset", entry.payload_offset)?; + d.set_item("len", entry.payload_len)?; + let mut flags: Vec<&str> = Vec::new(); + if entry.asset_flags & ASSET_FLAG_JSON != 0 { + flags.push("json"); + } + if entry.asset_flags & ASSET_FLAG_XZ != 0 { + flags.push("xz"); + } + if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { + flags.push("checksum"); + } + d.set_item("flags", flags)?; + out.push(d); + } + Ok(out) + } + + /// Read the (decoded) bytes of a named asset as a Python `bytes` object. + #[pyo3(text_signature = "(self, name, /)")] + fn read_asset_bytes(&mut self, name: &str) -> PyResult> { + let entry = self + .reader + .find_asset_by_name(name) + .cloned() + .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; + self.reader + .asset_bytes(&entry) + .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) + } + + /// Parse a JSON asset into a Python object (dict, list, …). + #[pyo3(text_signature = "(self, name, /)")] + fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { + let bytes = self.read_asset_bytes(name)?; + json_loads(py, &bytes, name) + } + + /// Read the bundle's `graph.json` asset as a NetworkX graph, or `None` if absent. + /// + /// The stored adjacency-format JSON is rebuilt into a live graph via + /// `networkx.readwrite.json_graph.adjacency_graph`, so its node order matches the order + /// assignments were written in and it can be handed straight to consumers like GerryChain's + /// `Partition`. The raw JSON is still available through `read_json_asset("graph.json")`. + #[pyo3(text_signature = "(self)")] + fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { + let Some(data) = self.read_known_json(py, ASSET_TYPE_GRAPH, "graph.json")? else { + return Ok(None); + }; + Ok(Some(crate::common::networkx_graph_from_json( + py, + data.bind(py), + )?)) + } + + /// Read the bundle's `metadata.json` asset as parsed JSON, or `None` if absent. + #[pyo3(text_signature = "(self)")] + fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { + self.read_known_json(py, ASSET_TYPE_METADATA, "metadata.json") + } + + /// Read the bundle's `node_permutation_map.json` asset as parsed JSON, or `None` if absent. + #[pyo3(text_signature = "(self)")] + fn read_node_permutation_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { + self.read_known_json(py, ASSET_TYPE_NODE_PERMUTATION_MAP, "node_permutation_map.json") + } + + /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be + /// opened directly with `BenDecoder(out_path, mode=dec.assignment_format())`. + #[pyo3(signature = (out_path, overwrite=false, allow_unfinalized=false))] + #[pyo3(text_signature = "(self, out_path, overwrite=False, allow_unfinalized=False)")] + fn extract_stream( + &mut self, + out_path: PathBuf, + overwrite: bool, + allow_unfinalized: bool, + ) -> PyResult<()> { + if out_path.exists() && !overwrite { + return Err(PyIOError::new_err(format!( + "Output file {} already exists (use overwrite=True to replace).", + out_path.display() + ))); + } + let mut stream = if allow_unfinalized && !self.reader.is_finalized() { + self.reader + .assignment_stream_reader_unverified() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? + } else { + self.reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? + }; + + let out = if overwrite { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&out_path) + } else { + OpenOptions::new().write(true).create_new(true).open(&out_path) + } + .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; + let mut out = BufWriter::new(out); + + io::copy(&mut stream, &mut out) + .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; + out.flush() + .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; + Ok(()) + } + + fn __repr__(&self) -> String { + let h = self.reader.header(); + format!( + "BendlDecoder(path={:?}, format={:?}, complete={}, assets={})", + self.path, + self.cursor.mode().as_str(), + h.is_finalized(), + self.reader.assets().len(), + ) + } +} + +impl PyBendlDecoder { + /// Read a known singleton asset by type, returning `None` when it is absent. + fn read_known_json<'py>( + &mut self, + py: Python<'py>, + asset_type: u16, + name: &str, + ) -> PyResult>> { + if self.reader.find_asset_by_type(asset_type).is_none() { + return Ok(None); + } + Ok(Some(self.read_json_asset(py, name)?)) + } +} + +/// Parse JSON bytes into a Python object, with errors naming the asset. +fn json_loads(py: Python<'_>, bytes: &[u8], name: &str) -> PyResult> { + let json_mod = py.import("json")?; + let text = std::str::from_utf8(bytes) + .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; + let parsed = json_mod.call_method1("loads", (text,))?; + Ok(parsed.into()) +} diff --git a/ben-py/src/decode/cursor.rs b/ben-py/src/decode/cursor.rs new file mode 100644 index 0000000..f71bea8 --- /dev/null +++ b/ben-py/src/decode/cursor.rs @@ -0,0 +1,235 @@ +//! Shared iteration and subsampling core for the stream and bundle decoders. +//! +//! [`SampleCursor`] owns everything needed to walk an assignment stream and to apply a subsample +//! selection, independent of whether the bytes come from a plain `.ben`/`.xben` file or from a +//! `.bendl` bundle's embedded stream region. Both `PyBenDecoder` and `PyBendlDecoder` embed one and +//! forward their iteration / `len` / `subsample_*` methods to it, so the single-pass restart logic, +//! the `MkvRecord` run expansion, and the subsample bounds checks cannot drift between the two. + +use super::helpers::{build_frames_for_subsample, build_iter, scan_samples}; +use super::types::{ActiveSelection, DecoderMode, DynIter, StreamSource}; +use binary_ensemble::io::reader::{Selection, SubsampleFrameDecoder}; +use pyo3::exceptions::{PyException, PyUserWarning}; +use pyo3::prelude::*; +use pyo3::types::PyDict; + +/// Iteration state shared by the stream and bundle decoders. +pub(super) struct SampleCursor { + source: StreamSource, + mode: DecoderMode, + /// Lazily-constructed frame iterator. Construction is deferred so opening a bundle with an empty + /// or truncated stream still succeeds — only methods that actually walk the stream need a live + /// iterator. + iter: Option, + current_assignment: Option>, + remaining_count: u16, + base_len: Option, + len_hint: Option, + active_selection: ActiveSelection, +} + +impl SampleCursor { + pub(super) fn new(source: StreamSource, mode: DecoderMode) -> Self { + Self { + source, + mode, + iter: None, + current_assignment: None, + remaining_count: 0, + base_len: None, + len_hint: None, + active_selection: ActiveSelection::None, + } + } + + pub(super) fn mode(&self) -> DecoderMode { + self.mode + } + + /// Eagerly construct the iterator now, surfacing a malformed-banner error at open time. Used by + /// the plain-stream decoder, which can only learn the variant by opening the reader. + pub(super) fn prime_iter(&mut self) -> PyResult<()> { + self.iter = Some(build_iter(&self.source, self.mode)?); + Ok(()) + } + + /// Reset and rebuild the iterator from the start, reapplying any active subsample selection. + pub(super) fn restart(&mut self) -> PyResult<()> { + self.current_assignment = None; + self.remaining_count = 0; + + let new_iter: DynIter = match self.active_selection.clone() { + ActiveSelection::None => build_iter(&self.source, self.mode)?, + sel => { + let frames = build_frames_for_subsample(&self.source, self.mode)?; + let ben_sel = sel + .to_selection() + .expect("active subsample selection must be convertible"); + Box::new(SubsampleFrameDecoder::new(frames, ben_sel)) + } + }; + self.iter = Some(new_iter); + Ok(()) + } + + pub(super) fn next(&mut self) -> PyResult>> { + if self.remaining_count > 0 { + self.remaining_count -= 1; + let a = self.current_assignment.as_ref().unwrap().clone(); + return Ok(Some(a)); + } + // Build the iterator on first use (e.g. when iteration begins without an explicit + // `__iter__` call). For bundle sources with empty/truncated streams this is where a + // BEN-banner-required error surfaces, instead of at decoder construction. + if self.iter.is_none() { + self.iter = Some(build_iter(&self.source, self.mode)?); + } + let next = self + .iter + .as_mut() + .expect("iter populated by the lazy-init branch above") + .next(); + match next { + Some(Ok((assignment, count))) => { + if count == 0 { + return Err(PyException::new_err( + "Decoder yielded a zero-count record; data may be corrupted.", + )); + } + self.current_assignment = Some(assignment.clone()); + self.remaining_count = count - 1; + Ok(Some(assignment)) + } + Some(Err(e)) => Err(PyException::new_err(format!("Error decoding next item: {e}"))), + None => Ok(None), + } + } + + /// Report the number of samples `len(dec)` should return: the filtered count when a subsample + /// selection is active, otherwise the base count. + pub(super) fn len(&mut self, py: Python<'_>) -> PyResult { + if let Some(len_hint) = self.len_hint { + return Ok(len_hint); + } + let base_len = self.ensure_base_len(py)?; + self.len_hint = Some(base_len); + Ok(base_len) + } + + /// Always report the base (unfiltered) sample count, even after `subsample_*` has been applied. + /// Deliberately does not touch `len_hint`, which tracks the filtered count for `__len__`. + pub(super) fn count_samples(&mut self, py: Python<'_>) -> PyResult { + self.ensure_base_len(py) + } + + pub(super) fn subsample_indices( + &mut self, + mut indices: Vec, + py: Python<'_>, + ) -> PyResult<()> { + if !indices.iter().is_sorted() { + // We need to sort and deduplicate the indices. This is necessary so we can efficiently + // iterate over the underlying data. Unstable sort is fine because we do not care about + // the order of equal elements. + let warnings = py.import("warnings")?; + let kwargs = PyDict::new(py); + warnings.call_method( + "warn", + ( + "Indices must be sorted and unique; sorting and deduplicating.", + py.get_type::(), + ), + Some(&kwargs), + )?; + } + indices.sort_unstable(); + indices.dedup(); + + if indices.is_empty() { + return Err(PyException::new_err("indices must not be empty")); + } + let base_len = self.ensure_base_len(py)?; + if indices[0] == 0 { + return Err(PyException::new_err("indices must be 1-based")); + } + if indices.last().unwrap() > &base_len { + return Err(PyException::new_err(format!( + "indices must be <= number of samples in base data ({base_len})" + ))); + } + let len_hint = indices.len(); + self.active_selection = ActiveSelection::Indices(indices.clone()); + let sel = Selection::Indices(indices.into_iter().peekable()); + self.reset_with_selection(sel, len_hint) + } + + pub(super) fn subsample_range( + &mut self, + start: usize, + end: usize, + py: Python<'_>, + ) -> PyResult<()> { + if start == 0 || end < start { + return Err(PyException::new_err( + "range must be 1-based and end >= start", + )); + } + let base_len = self.ensure_base_len(py)?; + if end > base_len { + return Err(PyException::new_err(format!( + "end must be <= number of samples in base data ({base_len})" + ))); + } + self.active_selection = ActiveSelection::Range { start, end }; + let sel = Selection::Range { start, end }; + let len_hint = end - start + 1; + self.reset_with_selection(sel, len_hint) + } + + pub(super) fn subsample_every( + &mut self, + step: usize, + offset: usize, + py: Python<'_>, + ) -> PyResult<()> { + if step == 0 || offset == 0 { + return Err(PyException::new_err("step and offset must be >= 1")); + } + let base_len = self.ensure_base_len(py)?; + if offset > base_len { + return Err(PyException::new_err(format!( + "offset must be <= number of samples in base data ({base_len})" + ))); + } + self.active_selection = ActiveSelection::Every { step, offset }; + let sel = Selection::Every { step, offset }; + let len_hint = (base_len + step - 1 - (offset - 1)) / step; + self.reset_with_selection(sel, len_hint) + } + + fn reset_with_selection(&mut self, selection: Selection, len_hint: usize) -> PyResult<()> { + let frames = build_frames_for_subsample(&self.source, self.mode)?; + let frame_decoder = SubsampleFrameDecoder::new(frames, selection); + self.iter = Some(Box::new(frame_decoder)); + self.current_assignment = None; + self.remaining_count = 0; + self.len_hint = Some(len_hint); + Ok(()) + } + + fn ensure_base_len(&mut self, py: Python<'_>) -> PyResult { + if let Some(base_len) = self.base_len { + return Ok(base_len); + } + let base_len = match &self.source { + StreamSource::Bundle { empty: true, .. } => 0, + StreamSource::Bundle { + header_sample_count: Some(n), + .. + } if *n >= 0 => *n as usize, + _ => scan_samples(&self.source, self.mode, py)?, + }; + self.base_len = Some(base_len); + Ok(base_len) + } +} diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 273d468..666fe17 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -1,51 +1,31 @@ -use super::helpers::{ - build_bundle_iter, build_frames_for_subsample, build_plain_iter, detect_is_bundle, - scan_bundle_samples, warn_xben_startup, -}; -use super::types::{ActiveSelection, BundleState, DecoderBackend, DecoderMode, DynIter}; -use binary_ensemble::io::bundle::format::{ - ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, - ASSET_TYPE_NODE_PERMUTATION_MAP, -}; -use binary_ensemble::io::bundle::BendlReader; -use binary_ensemble::io::reader::{count_samples_from_file, Selection, SubsampleFrameDecoder}; -use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyUserWarning}; +use super::cursor::SampleCursor; +use super::helpers::{detect_is_bundle, warn_xben_startup}; +use super::types::{DecoderMode, StreamSource}; +use pyo3::exceptions::{PyException, PyIOError}; use pyo3::prelude::*; -use pyo3::types::PyDict; -use std::fs::{File, OpenOptions}; -use std::io::{self, BufReader, BufWriter, Write}; use std::path::PathBuf; +/// Iterator over assignments in a plain BEN or XBEN stream. +/// +/// This decoder is stream-only: opening it on a `.bendl` bundle raises and points the caller at +/// `BendlDecoder`. Bundle inspection (assets, directory, embedded-stream extraction) lives on +/// `BendlDecoder`, mirroring the `ben` vs `bendl` CLI split. #[pyclass(module = "binary_ensemble", name = "BenDecoder", unsendable)] pub struct PyBenDecoder { - path: PathBuf, - mode: DecoderMode, - backend: DecoderBackend, - /// Lazily-constructed frame iterator. We defer construction so opening a bundle whose stream - /// is empty or truncated still succeeds — only methods that actually walk the stream need - /// a live iterator. - iter: Option, - current_assignment: Option>, - remaining_count: u16, - base_len: Option, - len_hint: Option, - active_selection: ActiveSelection, + cursor: SampleCursor, } #[pymethods] impl PyBenDecoder { - /// Open a decoder on a `.ben`, `.xben`, or `.bendl` file. + /// Open a decoder on a plain `.ben` or `.xben` file. /// - /// The file's leading bytes are sniffed to decide whether it is a bundle. When the file is a - /// `.bendl`, the bundle's header decides the BEN/XBEN format and the `mode` argument is - /// ignored; when the file is a plain stream, `mode` selects between the BEN and XBEN readers - /// and defaults to `"ben"`. + /// The file's leading bytes are sniffed; a `.bendl` bundle is rejected with a pointer at + /// `BendlDecoder`. `mode` selects between the BEN and XBEN readers and defaults to `"ben"`. /// /// # Arguments /// /// * `file_path` - Path to the input file. - /// * `mode` - Either `"ben"` or `"xben"`. Only consulted for plain streams; bundles use - /// `assignment_format` from the header. + /// * `mode` - Either `"ben"` or `"xben"`. #[new] #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] @@ -58,211 +38,52 @@ impl PyBenDecoder { })?; if is_bundle { - let file = File::open(&file_path).map_err(|e| { - PyIOError::new_err(format!("Failed to open {}: {e}", file_path.display())) - })?; - let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { - PyException::new_err(format!( - "Failed to parse bundle header in {}: {e}", - file_path.display() - )) - })?; - let fmt = reader.assignment_format().ok_or_else(|| { - PyException::new_err("Bundle header has an unrecognized assignment_format field.") - })?; - let derived_mode = DecoderMode::from_assignment_format(fmt); - let (stream_offset, stream_len) = reader.assignment_stream_range().map_err(|e| { - PyException::new_err(format!( - "Failed to determine stream region in {}: {e}", - file_path.display() - )) - })?; - let state = BundleState { - reader, - stream_offset, - stream_len, - }; - - // Emit the XBEN startup warning once, up front. - if matches!(derived_mode, DecoderMode::XBen) { - warn_xben_startup(py)?; - } + return Err(PyException::new_err(format!( + "{} is a .bendl bundle, not a plain BEN/XBEN stream. Open it with \ + binary_ensemble.bundle.BendlDecoder instead.", + file_path.display() + ))); + } - // Iter construction is deferred: opening a bundle with an empty or truncated stream is - // legal (incomplete or zero-sample finalized bundles), and metadata methods like - // `count_samples`, `asset_names`, and `extract_stream` don't need a live iterator. - // Iteration paths build it on demand. - Ok(Self { - path: file_path, - mode: derived_mode, - backend: DecoderBackend::Bundle(state), - iter: None, - current_assignment: None, - remaining_count: 0, - base_len: None, - len_hint: None, - active_selection: ActiveSelection::None, - }) - } else { - if matches!(parsed_mode, DecoderMode::XBen) { - warn_xben_startup(py)?; - } - // For plain streams, opening the file as a BEN/XBEN reader is the only way to learn the - // variant — keep eager construction so we surface a malformed-banner error at open - // time, matching the documented behaviour of `BenDecoder("…", mode="ben")`. - let iter = build_plain_iter(&file_path, parsed_mode)?; - Ok(Self { - path: file_path, - mode: parsed_mode, - backend: DecoderBackend::Plain, - iter: Some(iter), - current_assignment: None, - remaining_count: 0, - base_len: None, - len_hint: None, - active_selection: ActiveSelection::None, - }) + if matches!(parsed_mode, DecoderMode::XBen) { + warn_xben_startup(py)?; } + + // For plain streams, opening the file as a BEN/XBEN reader is the only way to learn the + // variant — keep eager construction so a malformed-banner error surfaces at open time. + let mut cursor = SampleCursor::new(StreamSource::Plain { path: file_path }, parsed_mode); + cursor.prime_iter()?; + Ok(Self { cursor }) } /// Return `self` as an iterator, rebuilding the underlying frame walker so iteration can be - /// restarted. - /// - /// Calling `iter(dec)` (or using `for x in dec: …`) more than once is supported: each call - /// reopens the stream region from the start and, if a subsample selection is active, reapplies - /// it. + /// restarted. A subsample selection installed via `subsample_*` is reapplied on each restart. fn __iter__(mut slf: PyRefMut) -> PyResult> { - slf.current_assignment = None; - slf.remaining_count = 0; - - let path = slf.path.clone(); - let mode = slf.mode; - let selection = slf.active_selection.clone(); - - let new_iter: DynIter = match selection { - ActiveSelection::None => match &slf.backend { - DecoderBackend::Plain => build_plain_iter(&path, mode)?, - DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, - }, - sel => { - let frames = build_frames_for_subsample(&path, mode, &slf.backend)?; - let ben_sel = sel - .to_selection() - .expect("active subsample selection must be convertible"); - Box::new(SubsampleFrameDecoder::new(frames, ben_sel)) - } - }; - - slf.iter = Some(new_iter); + slf.cursor.restart()?; Ok(slf.into()) } - fn __next__(mut slf: PyRefMut) -> PyResult>> { - if slf.remaining_count > 0 { - slf.remaining_count -= 1; - let a = slf.current_assignment.as_ref().unwrap().clone(); - return Ok(Some(a)); - } - // Build the iterator on first use (e.g. when iteration begins without an explicit - // `__iter__` call). For bundle backends with empty/truncated streams this is where the - // BEN-banner-required error surfaces, instead of at `BenDecoder(...)` construction. - if slf.iter.is_none() { - let path = slf.path.clone(); - let mode = slf.mode; - let new_iter: DynIter = match &slf.backend { - DecoderBackend::Plain => build_plain_iter(&path, mode)?, - DecoderBackend::Bundle(state) => build_bundle_iter(&path, state, mode)?, - }; - slf.iter = Some(new_iter); - } - let next = slf - .iter - .as_mut() - .expect("iter populated by the lazy-init branch above") - .next(); - match next { - Some(Ok((assignment, count))) => { - if count == 0 { - return Err(PyException::new_err( - "Decoder yielded a zero-count record; data may be corrupted.", - )); - } - slf.current_assignment = Some(assignment.clone()); - slf.remaining_count = count - 1; - Ok(Some(assignment)) - } - Some(Err(e)) => Err(PyException::new_err(format!( - "Error decoding next item: {e}" - ))), - None => Ok(None), - } + fn __next__(&mut self) -> PyResult>> { + self.cursor.next() } // Because we want progress bars!!! - fn __len__(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - if let Some(len_hint) = slf.len_hint { - return Ok(len_hint); - } - - let base_len = ensure_base_len(&mut slf, py)?; - slf.len_hint = Some(base_len); - Ok(base_len) + fn __len__(&mut self, py: Python<'_>) -> PyResult { + self.cursor.len(py) } #[pyo3(text_signature = "(self)")] - fn count_samples(mut slf: PyRefMut, py: Python<'_>) -> PyResult { - // Always reports the total number of samples in the source file, even after `subsample_*` - // has been applied. We deliberately do not touch `len_hint` here: when a subsample - // selection is active, `len_hint` tracks the filtered count that `__len__` should return, - // and clobbering it would break `len(dec)` after a `count_samples()` call. - ensure_base_len(&mut slf, py) + fn count_samples(&mut self, py: Python<'_>) -> PyResult { + self.cursor.count_samples(py) } #[pyo3(text_signature = "(self, indices, /)")] fn subsample_indices<'py>( mut slf: PyRefMut<'py, Self>, - mut indices: Vec, + indices: Vec, py: Python<'_>, ) -> PyResult> { - if !indices.iter().is_sorted() { - // We need to sort and deduplicate the indices This is a bit annoying, but it is - // necessary to ensure that we can efficiently iterate over the underlying data. We use - // unstable sort because we don't care about the order of equal elements and it is - // faster than stable sort. - let warnings = py.import("warnings")?; - let kwargs = PyDict::new(py); - // kwargs.set_item("stacklevel", 2)?; - - warnings.call_method( - "warn", - ( - "Indices must be sorted and unique; sorting and deduplicating.", - py.get_type::(), - ), - Some(&kwargs), - )?; - } - indices.sort_unstable(); - indices.dedup(); - - if indices.is_empty() { - return Err(PyException::new_err("indices must not be empty")); - } - let base_len = ensure_base_len(&mut slf, py)?; - if indices[0] == 0 { - return Err(PyException::new_err("indices must be 1-based")); - } - if indices.last().unwrap() > &base_len { - return Err(PyException::new_err(format!( - "indices must be <= number of samples in base data ({})", - base_len - ))); - } - let len_hint = indices.len(); - - slf.active_selection = ActiveSelection::Indices(indices.clone()); - let sel = Selection::Indices(indices.into_iter().peekable()); - reset_with_selection(&mut slf, sel, len_hint)?; + slf.cursor.subsample_indices(indices, py)?; Ok(slf.into()) } @@ -273,23 +94,7 @@ impl PyBenDecoder { end: usize, py: Python<'_>, ) -> PyResult> { - if start == 0 || end < start { - return Err(PyException::new_err( - "range must be 1-based and end >= start", - )); - } - let base_len = ensure_base_len(&mut slf, py)?; - if end > base_len { - return Err(PyException::new_err(format!( - "end must be <= number of samples in base data ({})", - base_len - ))); - } - - slf.active_selection = ActiveSelection::Range { start, end }; - let sel = Selection::Range { start, end }; - let len_hint = end - start + 1; - reset_with_selection(&mut slf, sel, len_hint)?; + slf.cursor.subsample_range(start, end, py)?; Ok(slf.into()) } @@ -300,302 +105,13 @@ impl PyBenDecoder { offset: usize, py: Python<'_>, ) -> PyResult> { - if step == 0 || offset == 0 { - return Err(PyException::new_err("step and offset must be >= 1")); - } - let base_len = ensure_base_len(&mut slf, py)?; - if offset > base_len { - return Err(PyException::new_err(format!( - "offset must be <= number of samples in base data ({})", - base_len - ))); - } - slf.active_selection = ActiveSelection::Every { step, offset }; - let sel = Selection::Every { step, offset }; - let len_hint = (base_len + step - 1 - (offset - 1)) / step; - reset_with_selection(&mut slf, sel, len_hint)?; + slf.cursor.subsample_every(step, offset, py)?; Ok(slf.into()) } - // --------------------------------------------------------------------- - // Bundle-inspection surface. - // - // These methods only make sense when the decoder was opened on a `.bendl` file; on a plain - // `.ben`/`.xben` stream they raise a clear error pointing the user at the right tool. - // --------------------------------------------------------------------- - - /// Whether this decoder is backed by a `.bendl` bundle (`True`) or a plain `.ben`/`.xben` - /// stream (`False`). - #[pyo3(text_signature = "(self)")] - fn is_bundle(&self) -> bool { - self.backend.is_bundle() - } - - /// Return the container format of the underlying assignment stream as `"ben"` or `"xben"`. + /// Return the container format of the underlying stream as `"ben"` or `"xben"`. #[pyo3(text_signature = "(self)")] fn assignment_format(&self) -> &'static str { - self.mode.as_str() - } - - /// Return the bundle's format version as a `(major, minor)` tuple. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn version(&self) -> PyResult<(u16, u16)> { - let state = self.require_bundle("version()")?; - let h = state.reader.header(); - Ok((h.major_version, h.minor_version)) - } - - /// Whether the bundle was successfully finalized. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn is_complete(&self) -> PyResult { - let state = self.require_bundle("is_complete()")?; - Ok(state.reader.is_finalized()) - } - - /// Names of every entry in the bundle's directory, in directory order. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn asset_names(&self) -> PyResult> { - let state = self.require_bundle("asset_names()")?; - Ok(state - .reader - .assets() - .iter() - .map(|e| e.name.clone()) - .collect()) - } - - /// Return the full bundle directory as a list of dicts with keys `name`, `type`, `offset`, - /// `len`, and `flags` (a list of string tags). Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { - let state = self.require_bundle("list_assets()")?; - let entries = state.reader.assets(); - let mut out = Vec::with_capacity(entries.len()); - for entry in entries { - let d = PyDict::new(py); - d.set_item("name", &entry.name)?; - d.set_item("type", entry.asset_type)?; - d.set_item("offset", entry.payload_offset)?; - d.set_item("len", entry.payload_len)?; - let mut flags: Vec<&str> = Vec::new(); - if entry.asset_flags & ASSET_FLAG_JSON != 0 { - flags.push("json"); - } - if entry.asset_flags & ASSET_FLAG_XZ != 0 { - flags.push("xz"); - } - if entry.asset_flags & ASSET_FLAG_CHECKSUM != 0 { - flags.push("checksum"); - } - d.set_item("flags", flags)?; - out.push(d); - } - Ok(out) - } - - /// Read the (decoded) bytes of a named asset as a Python `bytes` object. Errors on plain - /// streams. - #[pyo3(text_signature = "(self, name, /)")] - fn read_asset_bytes(&mut self, name: &str) -> PyResult> { - let state = self.require_bundle_mut("read_asset_bytes()")?; - let entry = state - .reader - .find_asset_by_name(name) - .cloned() - .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; - state - .reader - .asset_bytes(&entry) - .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) - } - - /// Parse a JSON asset into a Python object (dict, list, …). Errors on plain streams and when - /// the asset does not exist or is not valid UTF-8 / JSON. - #[pyo3(text_signature = "(self, name, /)")] - fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { - let bytes = self.read_asset_bytes(name)?; - let json_mod = py.import("json")?; - let text = std::str::from_utf8(&bytes) - .map_err(|e| PyException::new_err(format!("asset {name:?} is not valid UTF-8: {e}")))?; - let parsed = json_mod.call_method1("loads", (text,))?; - Ok(parsed.into()) - } - - /// Read the bundle's `graph.json` asset as a parsed JSON object. Returns `None` if the bundle - /// does not carry a graph asset. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_graph()")?; - if state.reader.find_asset_by_type(ASSET_TYPE_GRAPH).is_none() { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "graph.json")?)) - } - - /// Read the bundle's `metadata.json` asset as a parsed JSON object, or `None` if absent. Errors - /// on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_metadata<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_metadata()")?; - if state - .reader - .find_asset_by_type(ASSET_TYPE_METADATA) - .is_none() - { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "metadata.json")?)) - } - - /// Read the bundle's `node_permutation_map.json` asset as a parsed JSON object, or `None` if - /// absent. Errors on plain streams. - #[pyo3(text_signature = "(self)")] - fn read_relabel_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { - { - let state = self.require_bundle_mut("read_relabel_map()")?; - if state - .reader - .find_asset_by_type(ASSET_TYPE_NODE_PERMUTATION_MAP) - .is_none() - { - return Ok(None); - } - } - Ok(Some(self.read_json_asset(py, "node_permutation_map.json")?)) - } - - /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be - /// opened directly with `PyBenDecoder(out_path, mode=dec.assignment_format())`. Errors on plain - /// streams. - #[pyo3(signature = (out_path, overwrite=false, allow_unfinalized=false))] - #[pyo3(text_signature = "(self, out_path, overwrite=False, allow_unfinalized=False)")] - fn extract_stream( - &mut self, - out_path: PathBuf, - overwrite: bool, - allow_unfinalized: bool, - ) -> PyResult<()> { - let state = self.require_bundle_mut("extract_stream()")?; - if out_path.exists() && !overwrite { - return Err(PyIOError::new_err(format!( - "Output file {} already exists (use overwrite=True to replace).", - out_path.display() - ))); - } - let mut stream = if allow_unfinalized && !state.reader.is_finalized() { - state - .reader - .assignment_stream_reader_unverified() - .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? - } else { - state - .reader - .assignment_stream_reader() - .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? - }; - - let out = if overwrite { - OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&out_path) - } else { - OpenOptions::new() - .write(true) - .create_new(true) - .open(&out_path) - } - .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; - let mut out = BufWriter::new(out); - - io::copy(&mut stream, &mut out) - .map_err(|e| PyIOError::new_err(format!("Failed to copy stream bytes: {e}")))?; - out.flush() - .map_err(|e| PyIOError::new_err(format!("Failed to flush output: {e}")))?; - Ok(()) - } -} - -impl PyBenDecoder { - /// Borrow the bundle state or raise a clear Python error explaining that the decoder was opened - /// on a plain stream. - fn require_bundle(&self, op: &str) -> PyResult<&BundleState> { - match &self.backend { - DecoderBackend::Bundle(state) => Ok(state), - DecoderBackend::Plain => Err(PyException::new_err(format!( - "{op} is only available on .bendl bundles; this decoder was opened \ - on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ - via PyBenEncoder with ben_file_only=False) to get bundle features.", - self.mode.as_str() - ))), - } + self.cursor.mode().as_str() } - - fn require_bundle_mut(&mut self, op: &str) -> PyResult<&mut BundleState> { - match &mut self.backend { - DecoderBackend::Bundle(state) => Ok(state), - DecoderBackend::Plain => Err(PyException::new_err(format!( - "{op} is only available on .bendl bundles; this decoder was opened \ - on a plain .{} file. Wrap the stream in a .bendl bundle (e.g. \ - via PyBenEncoder with ben_file_only=False) to get bundle features.", - self.mode.as_str() - ))), - } - } -} - -fn reset_with_selection( - decoder: &mut PyBenDecoder, - selection: Selection, - len_hint: usize, -) -> PyResult<()> { - let frames = build_frames_for_subsample(&decoder.path, decoder.mode, &decoder.backend)?; - let frame_decoder = SubsampleFrameDecoder::new(frames, selection); - decoder.iter = Some(Box::new(frame_decoder)); - decoder.current_assignment = None; - decoder.remaining_count = 0; - decoder.len_hint = Some(len_hint); - Ok(()) -} - -fn ensure_base_len(decoder: &mut PyBenDecoder, py: Python<'_>) -> PyResult { - if let Some(base_len) = decoder.base_len { - return Ok(base_len); - } - - let base_len = match &decoder.backend { - DecoderBackend::Plain => { - let path = decoder.path.clone(); - let format = decoder.mode.wire_format(); - py.detach(|| count_samples_from_file(&path, format)) - .map_err(|e| { - PyException::new_err(format!( - "Failed to count samples in {}: {e}", - path.display() - )) - })? - } - DecoderBackend::Bundle(state) => { - // Prefer the authoritative sample_count carried in the bundle header, which is set for - // finalized bundles and is O(1). Fall back to scanning the stream region when the - // header has no count (unfinalized append target, or a header byte we cannot - // interpret). - if let Some(n) = state.reader.sample_count() { - if n >= 0 { - n as usize - } else { - scan_bundle_samples(&decoder.path, state, decoder.mode)? - } - } else { - scan_bundle_samples(&decoder.path, state, decoder.mode)? - } - } - }; - decoder.base_len = Some(base_len); - Ok(base_len) } diff --git a/ben-py/src/decode/helpers.rs b/ben-py/src/decode/helpers.rs index eb35c2d..a4459f5 100644 --- a/ben-py/src/decode/helpers.rs +++ b/ben-py/src/decode/helpers.rs @@ -1,9 +1,9 @@ -use super::types::{BundleState, DecoderBackend, DecoderMode, DynIter}; +use super::types::{DecoderMode, DynIter, StreamSource}; use crate::common::open_input; use binary_ensemble::io::bundle::format::BENDL_MAGIC; use binary_ensemble::io::reader::{ - build_frame_iter, build_frame_iter_from_reader, count_samples_from_frame_iter, BenStreamReader, - BenWireFormat, + build_frame_iter, build_frame_iter_from_reader, count_samples_from_file, + count_samples_from_frame_iter, BenStreamReader, BenWireFormat, FrameIter, }; use pyo3::exceptions::{PyException, PyIOError, PyUserWarning}; use pyo3::prelude::*; @@ -57,51 +57,65 @@ where } } -/// Build a plain-stream iterator from `path` using `mode`. -pub(super) fn build_plain_iter(path: &Path, mode: DecoderMode) -> PyResult { - let reader = open_input(&path.to_path_buf())?; - open_stream_reader(reader, mode.wire_format()) -} - -/// Open a second file handle on the bundle path, seek to the stream region, and wrap it in the -/// appropriate assignment reader so the decoder iterator only walks the embedded stream. -pub(super) fn build_bundle_iter( - path: &Path, - state: &BundleState, - mode: DecoderMode, -) -> PyResult { - let reader = open_bundle_stream_reader(path, state)?; - open_stream_reader(reader, mode.wire_format()) -} - -/// Create a `Read`-only handle bounded to the bundle's assignment stream region. -pub(super) fn open_bundle_stream_reader( +/// Create a `Read`-only handle bounded to a bundle's assignment stream region. +fn open_bundle_stream_reader( path: &Path, - state: &BundleState, + stream_offset: u64, + stream_len: u64, ) -> PyResult>> { let file = File::open(path) .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", path.display())))?; let mut buf = BufReader::new(file); - buf.seek(SeekFrom::Start(state.stream_offset)) + buf.seek(SeekFrom::Start(stream_offset)) .map_err(|e| PyIOError::new_err(format!("Failed to seek into bundle stream: {e}")))?; - Ok(buf.take(state.stream_len)) + Ok(buf.take(stream_len)) +} + +/// Build a fresh assignment iterator for the given source. +/// +/// A finalized assets-only bundle (`StreamSource::Bundle { empty: true, .. }`) has no BEN banner to +/// parse, so it yields an empty iterator rather than failing on the missing banner. +pub(super) fn build_iter(source: &StreamSource, mode: DecoderMode) -> PyResult { + match source { + StreamSource::Plain { path } => { + let reader = open_input(&path.to_path_buf())?; + open_stream_reader(reader, mode.wire_format()) + } + StreamSource::Bundle { empty: true, .. } => Ok(Box::new(std::iter::empty())), + StreamSource::Bundle { + path, + stream_offset, + stream_len, + .. + } => { + let reader = open_bundle_stream_reader(path, *stream_offset, *stream_len)?; + open_stream_reader(reader, mode.wire_format()) + } + } } +/// Build a frame iterator for subsample selection over the given source. pub(super) fn build_frames_for_subsample( - path: &Path, + source: &StreamSource, mode: DecoderMode, - backend: &DecoderBackend, -) -> PyResult { +) -> PyResult { let format = mode.wire_format(); - match backend { - DecoderBackend::Plain => build_frame_iter(&path.to_path_buf(), format).map_err(|e| { - PyException::new_err(format!( - "Failed to create frame iterator from {}: {e}", - path.display() - )) - }), - DecoderBackend::Bundle(state) => { - let reader = open_bundle_stream_reader(path, state)?; + match source { + StreamSource::Plain { path } => { + build_frame_iter(&path.to_path_buf(), format).map_err(|e| { + PyException::new_err(format!( + "Failed to create frame iterator from {}: {e}", + path.display() + )) + }) + } + StreamSource::Bundle { + path, + stream_offset, + stream_len, + .. + } => { + let reader = open_bundle_stream_reader(path, *stream_offset, *stream_len)?; build_frame_iter_from_reader(reader, format).map_err(|e| { PyException::new_err(format!( "Failed to create frame iterator from bundle {}: {e}", @@ -112,17 +126,33 @@ pub(super) fn build_frames_for_subsample( } } -pub(super) fn scan_bundle_samples( - path: &Path, - state: &BundleState, +/// Count the samples in a source by reading it from the start. +pub(super) fn scan_samples( + source: &StreamSource, mode: DecoderMode, + py: Python<'_>, ) -> PyResult { - let reader = open_bundle_stream_reader(path, state)?; - let iter = build_frame_iter_from_reader(reader, mode.wire_format()).map_err(|e| { - PyException::new_err(format!( - "Failed to open bundle stream for sample count: {e}" - )) - })?; - count_samples_from_frame_iter(iter) - .map_err(|e| PyException::new_err(format!("Failed to count samples in bundle: {e}"))) + match source { + StreamSource::Plain { path } => { + let path = path.clone(); + let format = mode.wire_format(); + py.detach(|| count_samples_from_file(&path, format)) + .map_err(|e| { + PyException::new_err(format!("Failed to count samples in {}: {e}", path.display())) + }) + } + StreamSource::Bundle { + path, + stream_offset, + stream_len, + .. + } => { + let reader = open_bundle_stream_reader(path, *stream_offset, *stream_len)?; + let iter = build_frame_iter_from_reader(reader, mode.wire_format()).map_err(|e| { + PyException::new_err(format!("Failed to open bundle stream for sample count: {e}")) + })?; + count_samples_from_frame_iter(iter) + .map_err(|e| PyException::new_err(format!("Failed to count samples in bundle: {e}"))) + } + } } diff --git a/ben-py/src/decode/mod.rs b/ben-py/src/decode/mod.rs index 6ae56c8..33e4c6e 100644 --- a/ben-py/src/decode/mod.rs +++ b/ben-py/src/decode/mod.rs @@ -1,9 +1,12 @@ //! Python bindings for BEN/XBEN decoding and `.bendl` bundle inspection. +mod bundle_decoder; +mod cursor; mod decoder; mod helpers; mod py_funcs; mod types; +pub use bundle_decoder::PyBendlDecoder; pub use decoder::PyBenDecoder; pub use py_funcs::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; diff --git a/ben-py/src/decode/types.rs b/ben-py/src/decode/types.rs index 07c5004..db15cca 100644 --- a/ben-py/src/decode/types.rs +++ b/ben-py/src/decode/types.rs @@ -1,10 +1,9 @@ use binary_ensemble::io::bundle::format::AssignmentFormat; -use binary_ensemble::io::bundle::BendlReader; use binary_ensemble::io::reader::{BenWireFormat, MkvRecord, Selection}; use pyo3::exceptions::PyException; use pyo3::prelude::*; -use std::fs::File; -use std::io::{self, BufReader}; +use std::io; +use std::path::PathBuf; pub(super) type DynIter = Box> + Send>; @@ -47,26 +46,30 @@ impl DecoderMode { } } -/// Cached bundle state for a decoder opened on a `.bendl` file. +/// Where the iterable assignment stream lives. /// -/// Holds a dedicated [`BendlReader`] so the decoder can satisfy TOC inspection and asset-read calls -/// without disturbing the iterator (which reads the stream region through a separate file handle). -pub(super) struct BundleState { - pub reader: BendlReader>, - pub stream_offset: u64, - pub stream_len: u64, -} - -/// What the decoder was actually opened on. -pub(super) enum DecoderBackend { - Plain, - Bundle(BundleState), -} - -impl DecoderBackend { - pub(super) fn is_bundle(&self) -> bool { - matches!(self, DecoderBackend::Bundle(_)) - } +/// A plain `.ben`/`.xben` file is read from the start; a `.bendl` bundle is read through a second +/// file handle bounded to the embedded stream region. Carrying the region offsets (rather than a +/// live [`binary_ensemble::io::bundle::BendlReader`]) keeps the iteration core free of the bundle +/// inspection surface, so [`super::cursor::SampleCursor`] is shared verbatim between the stream and +/// bundle decoders. +#[derive(Clone)] +pub(super) enum StreamSource { + Plain { + path: PathBuf, + }, + Bundle { + path: PathBuf, + stream_offset: u64, + stream_len: u64, + /// Authoritative sample count from a finalized bundle header, or `None` when the bundle is + /// unfinalized (forcing a stream scan). + header_sample_count: Option, + /// `true` for a finalized bundle whose stream region is empty (an assets-only bundle with no + /// BEN banner). Iteration over such a source yields nothing instead of failing on the + /// missing banner. + empty: bool, + }, } /// Stored form of the most recently installed subsampling selection. diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs new file mode 100644 index 0000000..bfb6159 --- /dev/null +++ b/ben-py/src/encode/bundle_encoder.rs @@ -0,0 +1,452 @@ +//! `.bendl` bundle authoring bindings: [`PyBendlEncoder`] and its [`PyBendlStreamSession`]. +//! +//! The encoder threads the bundle through the library's typestate machinery — `BendlWriter` +//! (assets) → `BendlStreamSession` (stream) → `BendlWriter::finish` (finalize) — for the create +//! path, and reopens a `BendlAppender` per asset for post-stream / append-mode adds. The state enum +//! below tracks which phase the encoder is in so a second `stream()` is refused and so `add_*` +//! routes through the writer pre-stream and the appender afterwards. + +use crate::common::{ + graph_node_count, networkx_graph_from_bytes, open_output, parse_graph_input, parse_variant, +}; +use crate::graph::helpers::reorder_graph_to_bytes; +use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; +use binary_ensemble::io::bundle::writer::BendlAppender; +use binary_ensemble::io::bundle::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; +use binary_ensemble::io::writer::BenStreamWriter; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufWriter}; +use std::path::PathBuf; + +fn map_bundle_err(err: BendlWriteError) -> PyErr { + match err { + BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + other => PyException::new_err(format!("{other}")), + } +} + +fn map_io_err(err: io::Error) -> PyErr { + PyIOError::new_err(format!("{err}")) +} + +/// Map a `content_type` string to writer options. `"json"` sets the JSON flag so the decoder can +/// auto-parse; `"text"` stores without it. +fn opts_for(content_type: &str) -> PyResult { + match content_type { + "json" => Ok(AddAssetOptions::defaults().json()), + "text" => Ok(AddAssetOptions::defaults()), + other => Err(PyValueError::new_err(format!( + "content_type must be 'json' or 'text', got {other:?}" + ))), + } +} + +/// Phase of a [`PyBendlEncoder`]. +enum BundleState { + /// Create mode, before the stream: the writer owns the file and accepts asset writes. + PreStream { + writer: Box>>, + /// Node count of a pre-stream graph, used to validate each written assignment. + graph_node_count: Option, + }, + /// A stream session is open; the writer has been moved into the session object. The session + /// signals back via [`PyBendlEncoder::mark_finalized`] / [`PyBendlEncoder::mark_failed`]. + Streaming, + /// The bundle is finalized on disk: post-stream create mode, or append mode. `add_*` reopen a + /// `BendlAppender` on the path and commit immediately. + Appendable, + /// The stream session exited via an exception; the bundle is unfinalized on disk and `close()` + /// must not finalize over the truncated stream. + Failed, + /// The encoder has been closed. + Closed, +} + +/// Writer for a single `.bendl` bundle. +#[pyclass(module = "binary_ensemble", name = "BendlEncoder", unsendable)] +pub struct PyBendlEncoder { + path: PathBuf, + append_mode: bool, + state: BundleState, +} + +#[pymethods] +impl PyBendlEncoder { + /// Open a new bundle writer in create mode. + /// + /// # Arguments + /// + /// * `file_path` - Output path. Must not exist unless `overwrite=True`. + /// * `overwrite` - Replace an existing file at `file_path`. + #[new] + #[pyo3(signature = (file_path, overwrite = false))] + #[pyo3(text_signature = "(file_path, overwrite=False)")] + fn new(file_path: PathBuf, overwrite: bool) -> PyResult { + let buf = open_output(&file_path, overwrite)?; + let writer = BendlWriter::new(buf, AssignmentFormat::Ben).map_err(map_io_err)?; + Ok(Self { + path: file_path, + append_mode: false, + state: BundleState::PreStream { + writer: Box::new(writer), + graph_node_count: None, + }, + }) + } + + /// Open an existing finalized bundle for append. `stream()` is unavailable; `add_*` commit + /// immediately. + #[staticmethod] + #[pyo3(signature = (file_path))] + #[pyo3(text_signature = "(file_path)")] + fn append(file_path: PathBuf) -> PyResult { + // Validate the target is a finalized bundle up front by opening (and discarding) an + // appender. The actual asset adds reopen their own appender per call. + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&file_path) + .map_err(|e| { + PyIOError::new_err(format!( + "Failed to open {} for append: {e}", + file_path.display() + )) + })?; + let appender = BendlAppender::open(file).map_err(map_bundle_err)?; + appender.abort(); + Ok(Self { + path: file_path, + append_mode: true, + state: BundleState::Appendable, + }) + } + + /// Add a custom asset (asset type `CUSTOM`). `content_type` is `"json"` or `"text"`. + #[pyo3(signature = (name, payload, content_type))] + #[pyo3(text_signature = "(self, name, payload, content_type)")] + fn add_asset(&mut self, name: &str, payload: Vec, content_type: &str) -> PyResult<()> { + let opts = opts_for(content_type)?; + if let BundleState::PreStream { writer, .. } = &mut self.state { + return writer + .add_custom_asset(name, &payload, opts) + .map_err(map_bundle_err); + } + if matches!(self.state, BundleState::Appendable) { + return self.append_commit(|a| a.add_custom_asset(name, &payload, opts)); + } + Err(state_error(&self.state, "add_asset")) + } + + /// Add the canonical `metadata.json` known asset. `metadata` accepts the same inputs as a graph + /// (dict/list, bytes, a file-like with `.read()`, or a path). + #[pyo3(signature = (metadata))] + #[pyo3(text_signature = "(self, metadata)")] + fn add_metadata(&mut self, py: Python<'_>, metadata: Bound<'_, PyAny>) -> PyResult<()> { + let bytes = parse_graph_input(py, &metadata)?; + let opts = AddAssetOptions::defaults().json(); + if let BundleState::PreStream { writer, .. } = &mut self.state { + return writer + .add_known_asset(KnownAssetKind::Metadata, &bytes, opts) + .map_err(map_bundle_err); + } + if matches!(self.state, BundleState::Appendable) { + return self + .append_commit(|a| a.add_known_asset(KnownAssetKind::Metadata, &bytes, opts)); + } + Err(state_error(&self.state, "add_metadata")) + } + + /// Add the `graph.json` known asset. + /// + /// When `preprocess_method` is not `None`, the graph is reordered via the chosen method, both + /// `graph.json` and `node_permutation_map.json` are stored, and the reordered graph is returned + /// (as a NetworkX graph, matching `BendlDecoder.read_graph`) so the chain runs on that ordering. + /// Reordering is pre-stream only. When `preprocess_method` is `None`, the graph is stored as-is + /// (no permutation map) and may also be attached post-stream / in append mode. The returned + /// graph's node count is recorded for per-write validation. + #[pyo3(signature = (graph, preprocess_method))] + #[pyo3(text_signature = "(self, graph, preprocess_method)")] + fn add_graph( + &mut self, + py: Python<'_>, + graph: Bound<'_, PyAny>, + preprocess_method: Option, + ) -> PyResult> { + let graph_bytes = parse_graph_input(py, &graph)?; + let opts = AddAssetOptions::defaults().json(); + + if let Some(method) = preprocess_method { + // Reordering rewrites the node ordering the chain must write in, so it is pre-stream + // only. + if !matches!(self.state, BundleState::PreStream { .. }) { + return Err(PyException::new_err( + "a reordering add_graph (preprocess_method != None) is only allowed before \ + stream(); post-stream or append-mode graphs must use preprocess_method=None", + )); + } + let (reordered, map) = reorder_graph_to_bytes(&graph_bytes, &method)?; + let count = graph_node_count(&reordered)?; + if let BundleState::PreStream { + writer, + graph_node_count: gnc, + } = &mut self.state + { + writer + .add_known_asset(KnownAssetKind::Graph, &reordered, opts.clone()) + .map_err(map_bundle_err)?; + writer + .add_known_asset(KnownAssetKind::NodePermutationMap, &map, opts) + .map_err(map_bundle_err)?; + *gnc = Some(count); + } + return networkx_graph_from_bytes(py, &reordered); + } + + // Raw graph: stored as-is, no permutation map. + let count = graph_node_count(&graph_bytes)?; + if let BundleState::PreStream { + writer, + graph_node_count: gnc, + } = &mut self.state + { + writer + .add_known_asset(KnownAssetKind::Graph, &graph_bytes, opts) + .map_err(map_bundle_err)?; + *gnc = Some(count); + return networkx_graph_from_bytes(py, &graph_bytes); + } + if matches!(self.state, BundleState::Appendable) { + self.append_commit(|a| a.add_known_asset(KnownAssetKind::Graph, &graph_bytes, opts))?; + return networkx_graph_from_bytes(py, &graph_bytes); + } + Err(state_error(&self.state, "add_graph")) + } + + /// Open the single-use assignment stream. Only `"ben"` is accepted today; XBEN comes from + /// `bundle.compress_stream`. `variant` selects the BEN variant (default `"mkv_chain"`). + #[pyo3(signature = (format = "ben", variant = None))] + #[pyo3(text_signature = "(self, format='ben', variant=None)")] + fn stream( + slf: Bound<'_, Self>, + format: &str, + variant: Option, + ) -> PyResult { + if format != "ben" { + return Err(PyValueError::new_err(format!( + "stream format must be 'ben' (got {format:?}); produce XBEN via \ + binary_ensemble.bundle.compress_stream" + ))); + } + let ben_var = parse_variant(variant.as_deref())?; + + let encoder_handle: Py = slf.clone().unbind(); + let mut me = slf.borrow_mut(); + + if me.append_mode { + return Err(PyException::new_err( + "stream() is unavailable in append mode; open a fresh BendlEncoder to write a \ + new stream", + )); + } + match &me.state { + BundleState::PreStream { .. } => {} + BundleState::Streaming => { + return Err(PyException::new_err("a stream is already open")) + } + BundleState::Appendable => { + return Err(PyException::new_err( + "a stream has already been written to this bundle", + )) + } + BundleState::Failed => { + return Err(PyException::new_err( + "the previous stream failed; this bundle is unfinalized", + )) + } + BundleState::Closed => return Err(PyException::new_err("encoder is closed")), + } + + let prev = std::mem::replace(&mut me.state, BundleState::Streaming); + let BundleState::PreStream { + writer, + graph_node_count, + } = prev + else { + unreachable!("validated PreStream above") + }; + + let build = (|| { + let session = writer.into_stream_session().map_err(map_bundle_err)?; + let ben_writer = BenStreamWriter::for_ben(session, ben_var).map_err(map_io_err)?; + Ok::<_, PyErr>(ben_writer) + })(); + + match build { + Ok(ben_writer) => Ok(PyBendlStreamSession { + writer: Some(Box::new(ben_writer)), + sample_count: 0, + graph_node_count, + encoder: encoder_handle, + }), + Err(e) => { + me.state = BundleState::Failed; + Err(e) + } + } + } + + /// Finalize the bundle. Idempotent. In create mode a normal close (including before any + /// `stream()`) finalizes the bundle; after a failed stream it does not finalize. In append mode + /// it is a no-op after the already-committed appends. + fn close(&mut self) -> PyResult<()> { + match &self.state { + // The session owns the writer and finalizes on its own close. + BundleState::Streaming => Ok(()), + BundleState::Appendable | BundleState::Failed | BundleState::Closed => { + self.state = BundleState::Closed; + Ok(()) + } + BundleState::PreStream { .. } => { + let prev = std::mem::replace(&mut self.state, BundleState::Closed); + if let BundleState::PreStream { writer, .. } = prev { + writer.finish().map_err(map_bundle_err)?; + } + Ok(()) + } + } + } + + fn __enter__(slf: PyRefMut) -> PyRefMut { + slf + } + + fn __exit__( + &mut self, + _exc_type: Option<&Bound<'_, PyAny>>, + _exc_value: Option<&Bound<'_, PyAny>>, + _traceback: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + self.close()?; + Ok(false) + } +} + +impl PyBendlEncoder { + /// Mark the bundle finalized after a successful stream session close, so subsequent `add_*` go + /// through the appender. + pub(crate) fn mark_finalized(&mut self) { + self.state = BundleState::Appendable; + } + + /// Mark the bundle failed after the stream session exited via an exception. + pub(crate) fn mark_failed(&mut self) { + self.state = BundleState::Failed; + } + + /// Open a fresh appender on the bundle path, run one asset operation, and commit it. + fn append_commit(&self, op: F) -> PyResult<()> + where + F: FnOnce(&mut BendlAppender) -> Result<(), BendlWriteError>, + { + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&self.path) + .map_err(|e| { + PyIOError::new_err(format!( + "Failed to open {} for append: {e}", + self.path.display() + )) + })?; + let mut appender = BendlAppender::open(file).map_err(map_bundle_err)?; + op(&mut appender).map_err(map_bundle_err)?; + appender.commit().map_err(map_bundle_err)?; + Ok(()) + } +} + +/// Build a clear error for an operation attempted in an invalid encoder state. +fn state_error(state: &BundleState, op: &str) -> PyErr { + let reason = match state { + BundleState::Streaming => "the assignment stream is open; close it before adding assets", + BundleState::Failed => "the previous stream failed; this bundle is unfinalized", + BundleState::Closed => "the encoder is closed", + BundleState::PreStream { .. } | BundleState::Appendable => "invalid state", + }; + PyException::new_err(format!("cannot {op}: {reason}")) +} + +/// Single-use context manager over the bundle's assignment stream. +#[pyclass(module = "binary_ensemble", name = "BendlStreamSession", unsendable)] +pub struct PyBendlStreamSession { + writer: Option>>>>, + sample_count: i64, + /// Node count of a pre-stream graph, used to validate each written assignment. + graph_node_count: Option, + encoder: Py, +} + +#[pymethods] +impl PyBendlStreamSession { + /// Encode a single assignment. When the bundle carries a pre-stream graph, the assignment + /// length must equal the graph's node count. + #[pyo3(signature = (assignment))] + #[pyo3(text_signature = "(self, assignment)")] + fn write(&mut self, assignment: Vec) -> PyResult<()> { + if let Some(n) = self.graph_node_count { + if assignment.len() != n { + return Err(PyValueError::new_err(format!( + "assignment length {} does not match graph node count {n}", + assignment.len() + ))); + } + } + let writer = self + .writer + .as_mut() + .ok_or_else(|| PyIOError::new_err("stream session is already closed"))?; + writer + .write_assignment(assignment) + .map_err(map_io_err)?; + self.sample_count += 1; + Ok(()) + } + + /// Finalize the bundle and close the stream. Idempotent. + fn close(&mut self, py: Python<'_>) -> PyResult<()> { + let Some(writer) = self.writer.take() else { + return Ok(()); + }; + let session = writer.finish_into_inner().map_err(map_io_err)?; + let bundle = session.finish_into_writer(self.sample_count); + bundle.finish().map_err(map_bundle_err)?; + self.encoder.borrow_mut(py).mark_finalized(); + Ok(()) + } + + fn __enter__(slf: PyRefMut) -> PyRefMut { + slf + } + + fn __exit__( + &mut self, + py: Python<'_>, + exc_type: Option<&Bound<'_, PyAny>>, + _exc_value: Option<&Bound<'_, PyAny>>, + _traceback: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + if exc_type.is_some() { + // Leave the bundle unfinalized: dropping the writer abandons the session without + // patching `finalized`, so the partial write is recoverable via allow_unfinalized + // rather than being stamped complete over a truncated stream. + self.writer = None; + self.encoder.borrow_mut(py).mark_failed(); + Ok(false) + } else { + self.close(py)?; + Ok(false) + } + } +} diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 037cc6c..e456874 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -1,48 +1,21 @@ -use super::helpers::parse_graph_input; use crate::common::{open_output, parse_variant}; -use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; -use binary_ensemble::io::bundle::{ - AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, -}; use binary_ensemble::io::writer::BenStreamWriter; -use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::exceptions::PyIOError; use pyo3::prelude::*; use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -/// Per-call encoder state. The bundle path threads ownership of the underlying file through -/// `BendlWriter` → `BendlStreamSession` → `BenStreamWriter`, so when `close()` runs we walk the -/// chain back from `BenStreamWriter::finish_into_inner` (returning the session) to -/// `BendlStreamSession::finish_into_writer` (returning the bundle writer) to `BendlWriter::finish` -/// (returning the buffered file). -enum EncoderState { - /// Plain `.ben` file path: writes directly to a buffered file with no bundle framing. - BenOnly(BenStreamWriter>), - /// `.bendl` bundle path: the session owns the buffered file and the `BenStreamWriter` writes - /// through it. `sample_count` is tracked alongside so it can be plumbed into - /// `finish_into_writer` at `close()` time. The writer is boxed because the bundle-streaming - /// `BenStreamWriter` is much larger than the plain-BEN one, which would otherwise bloat every - /// `EncoderState` to the larger variant's size. - BundleStreaming { - writer: Box>>>, - sample_count: i64, - }, -} - -#[pyclass(name = "BenEncoder", unsendable)] +/// Encoder for plain Binary Ensemble (`.ben`) streams. +/// +/// This encoder writes a plain BEN stream with no bundle framing. To produce a `.bendl` bundle +/// (with an embedded graph, metadata, or other assets) use `binary_ensemble.bundle.BendlEncoder`. +#[pyclass(module = "binary_ensemble", name = "BenEncoder", unsendable)] pub struct PyBenEncoder { - state: Option, + writer: Option>>, } impl PyBenEncoder { - fn map_bundle_err(err: BendlWriteError) -> PyErr { - match err { - BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), - other => PyException::new_err(format!("{other}")), - } - } - fn map_io_err(err: io::Error) -> PyErr { PyIOError::new_err(format!("{err}")) } @@ -50,138 +23,56 @@ impl PyBenEncoder { #[pymethods] impl PyBenEncoder { - /// Open a new encoder. The default output is a `.bendl` bundle with an embedded assignment - /// stream and an optional embedded graph; set `ben_file_only=True` to emit a plain `.ben` file - /// instead. + /// Open a new encoder that writes a plain `.ben` stream. /// /// # Arguments /// /// * `file_path` - Output path. Must not exist unless `overwrite=True`. /// * `overwrite` - Replace an existing file at `file_path`. /// * `variant` - BEN variant for the assignment stream (`"standard"`, `"mkv_chain"`, or - /// `"twodelta"`). - /// * `graph` - Optional graph to embed as the `graph.json` asset when writing a bundle. Accepts - /// a `pathlib.Path` / `str` path, a `bytes` object containing UTF-8 JSON, a Python `dict` / - /// `list` that will be serialized with `json.dumps`, or a file-like object with a `.read()` - /// method. Passing a graph alongside `ben_file_only=True` is an error. - /// * `ben_file_only` - If `True`, emit a plain `.ben` file with no bundle framing. Defaults to - /// `False`. + /// `"twodelta"`). Defaults to `"mkv_chain"` when `None`. #[new] - #[pyo3(signature = ( - file_path, - overwrite = false, - variant = None, - graph = None, - ben_file_only = false, - ))] - #[pyo3( - text_signature = "(file_path, overwrite=False, variant=None, graph=None, ben_file_only=False)" - )] - fn new( - py: Python<'_>, - file_path: PathBuf, - overwrite: bool, - variant: Option, - graph: Option>, - ben_file_only: bool, - ) -> PyResult { + #[pyo3(signature = (file_path, overwrite = false, variant = None))] + #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] + fn new(file_path: PathBuf, overwrite: bool, variant: Option) -> PyResult { let ben_var = parse_variant(variant.as_deref())?; - - if ben_file_only && graph.is_some() { - return Err(PyValueError::new_err( - "graph= cannot be combined with ben_file_only=True (the graph \ - would have nowhere to live in a plain .ben file).", - )); - } - let buf = open_output(&file_path, overwrite)?; - - let state = if ben_file_only { - EncoderState::BenOnly(BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?) - } else { - // Bundle path. Add the optional graph asset before opening the stream session — the - // bundle writer auto-compresses graphs (default_compresses_by_type), so we hand it raw - // JSON bytes and let it apply the XZ flag. - let mut writer = - BendlWriter::new(buf, AssignmentFormat::Ben).map_err(Self::map_io_err)?; - if let Some(graph_obj) = graph { - let raw = parse_graph_input(py, &graph_obj)?; - writer - .add_known_asset( - KnownAssetKind::Graph, - &raw, - AddAssetOptions::defaults().json(), - ) - .map_err(Self::map_bundle_err)?; - } - let session = writer.into_stream_session().map_err(Self::map_bundle_err)?; - let writer = BenStreamWriter::for_ben(session, ben_var).map_err(Self::map_io_err)?; - EncoderState::BundleStreaming { - writer: Box::new(writer), - sample_count: 0, - } - }; - - Ok(Self { state: Some(state) }) + let writer = BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?; + Ok(Self { + writer: Some(writer), + }) } /// Encode a single assignment and append it to the output stream. #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { - let state = self - .state + let writer = self + .writer .as_mut() .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; - match state { - EncoderState::BenOnly(w) => { - w.write_assignment(assignment).map_err(Self::map_io_err)?; - } - EncoderState::BundleStreaming { - writer, - sample_count, - } => { - writer - .write_assignment(assignment) - .map_err(Self::map_io_err)?; - *sample_count += 1; - } - } - Ok(()) + writer.write_assignment(assignment).map_err(Self::map_io_err) } - /// Flush the assignment stream and, for bundle output, patch the header and write the trailing - /// directory. Idempotent. + /// Flush the assignment stream and close the underlying file. Idempotent. fn close(&mut self) -> PyResult<()> { - let Some(state) = self.state.take() else { + let Some(writer) = self.writer.take() else { return Ok(()); }; - match state { - EncoderState::BenOnly(writer) => { - let mut buf = writer.finish_into_inner().map_err(Self::map_io_err)?; - buf.flush().map_err(Self::map_io_err)?; - } - EncoderState::BundleStreaming { - writer, - sample_count, - } => { - let session = writer.finish_into_inner().map_err(Self::map_io_err)?; - let bundle = session.finish_into_writer(sample_count); - bundle.finish().map_err(Self::map_bundle_err)?; - } - } + let mut buf = writer.finish_into_inner().map_err(Self::map_io_err)?; + buf.flush().map_err(Self::map_io_err)?; Ok(()) } - fn __enter__(slf: pyo3::PyRefMut) -> pyo3::PyRefMut { + fn __enter__(slf: PyRefMut) -> PyRefMut { slf } fn __exit__( &mut self, - _exc_type: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, - _exc_value: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, - _traceback: Option<&pyo3::Bound<'_, pyo3::types::PyAny>>, + _exc_type: Option<&Bound<'_, PyAny>>, + _exc_value: Option<&Bound<'_, PyAny>>, + _traceback: Option<&Bound<'_, PyAny>>, ) -> PyResult { self.close()?; Ok(false) diff --git a/ben-py/src/encode/helpers.rs b/ben-py/src/encode/helpers.rs deleted file mode 100644 index 99bf379..0000000 --- a/ben-py/src/encode/helpers.rs +++ /dev/null @@ -1,59 +0,0 @@ -use pyo3::exceptions::{PyException, PyIOError, PyValueError}; -use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyDict, PyList}; -use std::path::PathBuf; - -/// Normalize a user-supplied graph argument into raw UTF-8 JSON bytes. -/// -/// Accepted forms: -/// -/// - `dict` / `list`: serialized via `json.dumps`. -/// - `bytes` / `bytearray`: used verbatim. -/// - any object with a `.read()` method (e.g. `io.BytesIO`, open files): `.read()` is called and -/// the result is coerced to bytes. -/// - `pathlib.Path` or `str`: treated as a filesystem path to read. -pub(super) fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { - // Dict / list → json.dumps. - if obj.is_instance_of::() || obj.is_instance_of::() { - let json_mod = py.import("json")?; - let dumped = json_mod.call_method1("dumps", (obj,))?; - let s: String = dumped.extract()?; - return Ok(s.into_bytes()); - } - - // Raw bytes / bytearray. - if let Ok(b) = obj.downcast::() { - return Ok(b.as_bytes().to_vec()); - } - if let Ok(b) = obj.extract::>() { - return Ok(b); - } - - // File-like: must have .read(). Check before str/path, since a plain `str` / `Path` has no - // `.read()` attribute and will fall through. - if obj.hasattr("read")? { - let data = obj.call_method0("read")?; - if let Ok(b) = data.downcast::() { - return Ok(b.as_bytes().to_vec()); - } - if let Ok(b) = data.extract::>() { - return Ok(b); - } - if let Ok(s) = data.extract::() { - return Ok(s.into_bytes()); - } - return Err(PyException::new_err( - "graph .read() must return bytes or str", - )); - } - - // Path / str → read the file at that path. - let path: PathBuf = obj.extract().map_err(|_| { - PyValueError::new_err( - "graph must be a dict/list, bytes, a file-like with .read(), or a path", - ) - })?; - std::fs::read(&path).map_err(|e| { - PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) - }) -} diff --git a/ben-py/src/encode/mod.rs b/ben-py/src/encode/mod.rs index a02afc1..74bef58 100644 --- a/ben-py/src/encode/mod.rs +++ b/ben-py/src/encode/mod.rs @@ -1,8 +1,9 @@ //! Python bindings for BEN/XBEN encoding and `.bendl` bundle authoring. +mod bundle_encoder; mod encoder; -mod helpers; mod py_funcs; +pub use bundle_encoder::{PyBendlEncoder, PyBendlStreamSession}; pub use encoder::PyBenEncoder; pub use py_funcs::{encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben}; diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index 6117c12..cae1f58 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -46,7 +46,7 @@ pub fn encode_ben_to_xben( #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=false, variant='mkv_chain')")] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False, variant='mkv_chain')")] pub fn encode_jsonl_to_ben( in_file: PathBuf, out_file: PathBuf, diff --git a/ben-py/src/graph/helpers.rs b/ben-py/src/graph/helpers.rs new file mode 100644 index 0000000..9d66a8a --- /dev/null +++ b/ben-py/src/graph/helpers.rs @@ -0,0 +1,60 @@ +use binary_ensemble::json::graph::{ + sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, +}; +use pyo3::exceptions::PyException; +use pyo3::prelude::*; +use serde_json::json; +use std::io::Cursor; + +/// How a `preprocess_method` / graph-utility method string maps onto reben's reordering machinery. +enum Reorder { + /// A topology-based ordering algorithm, paired with its canonical kebab-case name. + Ordering(GraphOrderingMethod, &'static str), + /// A node-attribute key sort (e.g. `"geoid"`, or the special `"id"` for the NetworkX node id). + Key(String), +} + +fn classify(method: &str) -> Reorder { + match method { + "mlc" | "multi-level-cluster" => { + Reorder::Ordering(GraphOrderingMethod::MultiLevelCluster, "multi-level-cluster") + } + "rcm" | "reverse-cuthill-mckee" => { + Reorder::Ordering(GraphOrderingMethod::ReverseCuthillMckee, "reverse-cuthill-mckee") + } + other => Reorder::Key(other.to_string()), + } +} + +/// Reorder a NetworkX adjacency-format graph and emit a `node_permutation_map.json` payload. +/// +/// Returns `(reordered_graph_bytes, node_permutation_map_bytes)`. The permutation map is a JSON +/// object carrying the required `node_permutation_old_to_new` field (original zero-based node +/// positions → new positions) plus an optional `key` or `ordering_method` recording how the order +/// was produced. The reben file-path fields (`input_file` / `output_file`) are omitted, since the +/// Python graph utilities have no such paths. +pub fn reorder_graph_to_bytes(graph_bytes: &[u8], method: &str) -> PyResult<(Vec, Vec)> { + let mut reordered = Vec::new(); + let (map, key_field, ordering_field) = match classify(method) { + Reorder::Ordering(ordering, name) => { + let map = sort_json_file_by_ordering(Cursor::new(graph_bytes), &mut reordered, ordering) + .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; + (map, None::, Some(name)) + } + Reorder::Key(key) => { + let map = sort_json_file_by_key(Cursor::new(graph_bytes), &mut reordered, &key) + .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; + (map, Some(key), None) + } + }; + + let map_json = json!({ + "key": key_field, + "ordering_method": ordering_field, + "node_permutation_old_to_new": map, + }); + let map_bytes = serde_json::to_vec(&map_json) + .map_err(|e| PyException::new_err(format!("Failed to serialize permutation map: {e}")))?; + + Ok((reordered, map_bytes)) +} diff --git a/ben-py/src/graph/mod.rs b/ben-py/src/graph/mod.rs new file mode 100644 index 0000000..5360c13 --- /dev/null +++ b/ben-py/src/graph/mod.rs @@ -0,0 +1,6 @@ +//! Python bindings for graph reordering utilities (the reben orderings). + +pub mod helpers; +mod py_funcs; + +pub use py_funcs::graph_reorder; diff --git a/ben-py/src/graph/py_funcs.rs b/ben-py/src/graph/py_funcs.rs new file mode 100644 index 0000000..347c4f8 --- /dev/null +++ b/ben-py/src/graph/py_funcs.rs @@ -0,0 +1,37 @@ +use super::helpers::reorder_graph_to_bytes; +use crate::common::{networkx_graph_from_bytes, parse_graph_input}; +use pyo3::exceptions::PyException; +use pyo3::prelude::*; + +/// Parse JSON bytes into a Python object (used for the permutation map). +fn json_loads(py: Python<'_>, bytes: &[u8]) -> PyResult> { + let json_mod = py.import("json")?; + let text = std::str::from_utf8(bytes) + .map_err(|e| PyException::new_err(format!("reordered output is not valid UTF-8: {e}")))?; + Ok(json_mod.call_method1("loads", (text,))?.into()) +} + +/// Reorder a NetworkX adjacency-format graph and return `(reordered_graph, node_permutation_map)`. +/// +/// `reordered_graph` is a live NetworkX graph (matching `BendlEncoder.add_graph` / +/// `BendlDecoder.read_graph`); `node_permutation_map` is the parsed map JSON. +/// +/// `method` selects the ordering: `"multi-level-cluster"` / `"mlc"`, +/// `"reverse-cuthill-mckee"` / `"rcm"`, or a node-attribute key (e.g. `"geoid"`, or the special +/// `"id"` for the NetworkX node id). The permutation map matches the on-disk +/// `node_permutation_map.json` convention (a `node_permutation_old_to_new` object). +#[pyfunction] +#[pyo3(signature = (graph, method))] +#[pyo3(text_signature = "(graph, method)")] +pub fn graph_reorder<'py>( + py: Python<'py>, + graph: Bound<'py, PyAny>, + method: &str, +) -> PyResult<(Py, Py)> { + let graph_bytes = parse_graph_input(py, &graph)?; + let (reordered_bytes, map_bytes) = reorder_graph_to_bytes(&graph_bytes, method)?; + Ok(( + networkx_graph_from_bytes(py, &reordered_bytes)?, + json_loads(py, &map_bytes)?, + )) +} diff --git a/ben-py/src/lib.rs b/ben-py/src/lib.rs index 5ac526f..a79eb47 100755 --- a/ben-py/src/lib.rs +++ b/ben-py/src/lib.rs @@ -4,17 +4,24 @@ use pyo3::wrap_pyfunction; pub mod common; pub mod decode; pub mod encode; +pub mod graph; +pub mod recompress; #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(crate::decode::decode_ben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::decode::decode_xben_to_ben, m)?)?; m.add_function(wrap_pyfunction!(crate::decode::decode_xben_to_jsonl, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_ben, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_xben, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_ben_to_xben, m)?)?; + m.add_function(wrap_pyfunction!(crate::graph::graph_reorder, m)?)?; + m.add_function(wrap_pyfunction!(crate::recompress::recompress_bundle, m)?)?; Ok(()) } diff --git a/ben-py/src/recompress.rs b/ben-py/src/recompress.rs new file mode 100644 index 0000000..d0514a0 --- /dev/null +++ b/ben-py/src/recompress.rs @@ -0,0 +1,137 @@ +//! Binding for recompressing a `.bendl` bundle's embedded BEN stream to XBEN. +//! +//! This repackages a bundle: it reads back every asset's decoded payload and the BEN assignment +//! stream, re-encodes the stream as XBEN, and writes a fresh `Xben`-format bundle with the same +//! assets (name, type, JSON flag, decoded bytes). Storage compression is normalized to the writer's +//! default policy — the decoded payload bytes are preserved, not the byte-for-byte on-disk form. + +use crate::common::open_output; +use binary_ensemble::codec::encode::encode_ben_to_xben; +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, KnownAssetKind, ASSET_FLAG_JSON, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + ASSET_TYPE_NODE_PERMUTATION_MAP, +}; +use binary_ensemble::io::bundle::{AddAssetOptions, BendlReader, BendlWriteError, BendlWriter}; +use pyo3::exceptions::{PyException, PyIOError}; +use pyo3::prelude::*; +use std::fs::File; +use std::io::{BufReader, Cursor, Read, Write}; +use std::path::PathBuf; + +fn map_bundle_err(err: BendlWriteError) -> PyErr { + match err { + BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + other => PyException::new_err(format!("{other}")), + } +} + +/// A single asset read back from the source bundle, ready to be re-added to the new one. +struct PreservedAsset { + asset_type: u16, + name: String, + is_json: bool, + payload: Vec, +} + +fn known_kind(asset_type: u16) -> Option { + match asset_type { + ASSET_TYPE_METADATA => Some(KnownAssetKind::Metadata), + ASSET_TYPE_GRAPH => Some(KnownAssetKind::Graph), + ASSET_TYPE_NODE_PERMUTATION_MAP => Some(KnownAssetKind::NodePermutationMap), + _ => None, + } +} + +fn add_preserved( + writer: &mut BendlWriter, + asset: &PreservedAsset, +) -> Result<(), BendlWriteError> { + let opts = if asset.is_json { + AddAssetOptions::defaults().json() + } else { + AddAssetOptions::defaults() + }; + match known_kind(asset.asset_type) { + Some(kind) => writer.add_known_asset(kind, &asset.payload, opts), + None => writer.add_custom_asset(&asset.name, &asset.payload, opts), + } +} + +/// Recompress the BEN stream of the bundle at `in_file` to XBEN, writing a new bundle at `out_file`. +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite = false))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] +pub fn recompress_bundle(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> PyResult<()> { + let file = File::open(&in_file) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + in_file.display() + )) + })?; + + if !reader.is_finalized() { + return Err(PyException::new_err( + "compress_stream requires a finalized bundle", + )); + } + let sample_count = reader.header().sample_count; + let stream_len = reader.header().stream_len; + let empty = stream_len == 0 && sample_count == 0; + + // Read every asset's decoded payload up front (each read borrows the reader exclusively). + let entries: Vec<_> = reader.assets().to_vec(); + let mut assets = Vec::with_capacity(entries.len()); + for entry in &entries { + let payload = reader.asset_bytes(entry).map_err(|e| { + PyIOError::new_err(format!("Failed to read asset {:?}: {e}", entry.name)) + })?; + assets.push(PreservedAsset { + asset_type: entry.asset_type, + name: entry.name.clone(), + is_json: entry.asset_flags & ASSET_FLAG_JSON != 0, + payload, + }); + } + + // Recompress the BEN stream to XBEN bytes (skipped for an empty stream — there is no banner). + let xben_bytes = if empty { + Vec::new() + } else { + let mut ben_bytes = Vec::new(); + let mut stream = reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))?; + stream + .read_to_end(&mut ben_bytes) + .map_err(|e| PyIOError::new_err(format!("Failed to read BEN stream: {e}")))?; + let mut out = Vec::new(); + encode_ben_to_xben(Cursor::new(ben_bytes), &mut out, None, None, None, None).map_err( + |e| PyException::new_err(format!("Failed to recompress BEN stream to XBEN: {e}")), + )?; + out + }; + + // Build the new XBEN bundle. + let buf = open_output(&out_file, overwrite)?; + let mut writer = BendlWriter::new(buf, AssignmentFormat::Xben).map_err(|e| { + PyIOError::new_err(format!("Failed to initialize bundle writer: {e}")) + })?; + for asset in &assets { + add_preserved(&mut writer, asset).map_err(map_bundle_err)?; + } + + if empty { + writer.finish().map_err(map_bundle_err)?; + } else { + let mut session = writer.into_stream_session().map_err(map_bundle_err)?; + session + .write_all(&xben_bytes) + .map_err(|e| PyIOError::new_err(format!("Failed to write XBEN stream: {e}")))?; + let writer = session.finish_into_writer(sample_count); + writer.finish().map_err(map_bundle_err)?; + } + + Ok(()) +} diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index cd13bbd..c7fe92f 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -1,19 +1,17 @@ -"""Tests for bundle (.bendl) support in BenDecoder. +"""Byte-level robustness tests for ``BendlDecoder``. -These tests do not rely on the `bendl` CLI binary being built. Instead, they -construct `.bendl` bundles directly in Python from the on-disk format spec -documented in ``ben/src/io/bundle/format.rs``. This keeps the tests -self-contained and lets them stress odd byte layouts that a CLI-based helper -could not produce (truncated files, bad magic, dangling offsets, etc). +These construct ``.bendl`` bundles directly in Python from the on-disk format +spec (``ben/src/io/bundle/format.rs``). This keeps the tests self-contained and +lets them stress odd byte layouts a writer could not produce (truncated files, +bad magic, dangling offsets, etc). Real BEN/XBEN stream payloads are produced via +``BenEncoder`` / ``encode_jsonl_to_xben`` so the stream region always matches +what the main pipeline produces. -Real BEN/XBEN stream payloads are produced via ``BenEncoder`` / -``encode_jsonl_to_xben`` so the stream region always matches what the -main compression pipeline would produce. +Bundle *authoring* (``BendlEncoder``) is covered in ``test_bundle_api.py``. """ from __future__ import annotations -import io import json import lzma import random @@ -23,14 +21,8 @@ import pytest -import binary_ensemble -from binary_ensemble import ( - BenDecoder, - BenEncoder, - encode_jsonl_to_ben, - encode_jsonl_to_xben, -) - +from binary_ensemble import BenDecoder, BenEncoder, encode_jsonl_to_xben +from binary_ensemble.bundle import BendlDecoder # --------------------------------------------------------------------------- # Format constants (mirror ben/src/io/bundle/format.rs) @@ -60,7 +52,6 @@ def _crc32c(data: bytes) -> int: - """Compute CRC32C (Castagnoli), matching the Rust bundle checksum contract.""" crc = 0xFFFFFFFF for byte in data: crc ^= byte @@ -126,7 +117,7 @@ def _pack_directory_entry( asset_type, asset_flags, len(name_bytes), - 0, # reserved + 0, payload_offset, payload_len, len(checksum_bytes), @@ -140,13 +131,10 @@ def _pack_directory(entries: Iterable[bytes]) -> bytes: def _xz(data: bytes) -> bytes: - """Compress ``data`` with the xz container so the Rust xz2 decoder accepts it.""" return lzma.compress(data, format=lzma.FORMAT_XZ, preset=6) class _Asset: - """Helper describing one asset to place in a hand-built bundle.""" - def __init__( self, *, @@ -189,23 +177,10 @@ def build_bundle( major_version: int = BENDL_MAJOR_VERSION, checksums: bool = True, ) -> bytes: - """Construct the bytes of a `.bendl` file from pieces. - - The layout is ``[header][asset payloads][stream][directory]``. This - helper mirrors the writer's finalize path closely enough to produce - bundles that the Rust reader accepts, while also exposing enough knobs - to generate deliberately broken bundles for negative tests. By default - it mirrors the current writer and stores CRC32C checksums for finalized - streams and assets; pass ``checksums=False`` for foreign/no-checksum - fixtures. - """ assets = list(assets) - buf = bytearray() - # Reserve header space. buf.extend(b"\x00" * HEADER_SIZE) - # Write asset payloads and remember (offset, len, encoded_bytes) for each. encoded_assets: List[Tuple[int, int, bytes]] = [] for asset in assets: offset = len(buf) @@ -272,22 +247,15 @@ def _write_jsonl(samples: List[List[int]], path: Path) -> None: f.write("\n") -def _ben_bytes_for( - samples: List[List[int]], tmp: Path, variant: str = "standard" -) -> bytes: - """Produce real BEN bytes for ``samples`` via ``BenEncoder``.""" +def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: ben_path = tmp / "inner.ben" - with BenEncoder( - ben_path, overwrite=True, variant=variant, ben_file_only=True - ) as enc: + with BenEncoder(ben_path, overwrite=True, variant=variant) as enc: for a in samples: enc.write(a) return ben_path.read_bytes() -def _xben_bytes_for( - samples: List[List[int]], tmp: Path, variant: str = "standard" -) -> bytes: +def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: src = tmp / "src.jsonl" _write_jsonl(samples, src) out = tmp / "inner.xben" @@ -303,179 +271,126 @@ def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: # --------------------------------------------------------------------------- -# Baseline happy-path tests +# Happy path # --------------------------------------------------------------------------- -def test_module_exports_decoder_and_encoder() -> None: - assert "BenDecoder" in binary_ensemble.__all__ - assert "BenEncoder" in binary_ensemble.__all__ - assert "PyBundleReader" not in binary_ensemble.__all__ - - -def test_bundle_reader_round_trip_ben_with_assets(tmp_path: Path) -> None: +def test_bundle_round_trip_ben_with_assets(tmp_path: Path) -> None: rng = random.Random(4242) - samples = [ - [rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40) - ] - - graph_json = b'{"nodes":[0,1,2,3],"edges":[[0,1],[1,2],[2,3]]}' + samples = [[rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40)] + # NetworkX adjacency format (what read_graph rebuilds into a live graph). + graph_json = ( + b'{"directed":false,"multigraph":false,"graph":{},' + b'"nodes":[{"id":0},{"id":1},{"id":2},{"id":3}],' + b'"adjacency":[[{"id":1}],[{"id":0},{"id":2}],[{"id":1},{"id":3}],[{"id":2}]]}' + ) metadata_json = b'{"note":"hello bundle","seed":4242}' - relabel_json = b'{"0":"A","1":"B","2":"C","3":"D"}' + perm_json = b'{"node_permutation_old_to_new":{"0":1,"1":0}}' custom_blob = bytes(range(256)) bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples), - assignment_format=ASSIGNMENT_FORMAT_BEN, assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=metadata_json, - is_json=True, - compress=False, - ), - _Asset( - asset_type=ASSET_TYPE_GRAPH, - name="graph.json", - payload=graph_json, - is_json=True, - compress=True, - ), - _Asset( - asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, - name="node_permutation_map.json", - payload=relabel_json, - is_json=True, - compress=False, - ), - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="notes.bin", - payload=custom_blob, - is_json=False, - compress=False, - ), + _Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=metadata_json, is_json=True), + _Asset(asset_type=ASSET_TYPE_GRAPH, name="graph.json", payload=graph_json, is_json=True, compress=True), + _Asset(asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, name="node_permutation_map.json", payload=perm_json, is_json=True), + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="notes.bin", payload=custom_blob), ], ) path = _write_bundle(tmp_path / "out.bendl", bundle) + dec = BendlDecoder(path) - reader = BenDecoder(path) - - assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) - assert reader.is_complete() is True - assert reader.count_samples() == len(samples) - assert reader.assignment_format() == "ben" - - names = reader.asset_names() - assert names == [ + assert dec.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) + assert dec.is_complete() is True + assert dec.count_samples() == len(samples) + assert dec.assignment_format() == "ben" + assert dec.asset_names() == [ "metadata.json", "graph.json", "node_permutation_map.json", "notes.bin", ] - assets = reader.list_assets() - assert [a["name"] for a in assets] == names - by_name = {a["name"]: a for a in assets} + by_name = {a["name"]: a for a in dec.list_assets()} assert by_name["graph.json"]["type"] == ASSET_TYPE_GRAPH assert "xz" in by_name["graph.json"]["flags"] - assert "json" in by_name["graph.json"]["flags"] - assert "xz" not in by_name["metadata.json"]["flags"] - assert "json" in by_name["metadata.json"]["flags"] assert by_name["notes.bin"]["flags"] == ["checksum"] - # payload_offset must sit at or past the end of the header. - for entry in assets: + for entry in dec.list_assets(): assert entry["offset"] >= HEADER_SIZE - assert entry["len"] > 0 - - # Raw byte access (decompresses xz transparently). - assert reader.read_asset_bytes("metadata.json") == metadata_json - assert reader.read_asset_bytes("graph.json") == graph_json - assert reader.read_asset_bytes("node_permutation_map.json") == relabel_json - assert reader.read_asset_bytes("notes.bin") == custom_blob - # Typed JSON helpers. - assert reader.read_metadata() == json.loads(metadata_json) - assert reader.read_graph() == json.loads(graph_json) - assert reader.read_relabel_map() == json.loads(relabel_json) - - # read_json_asset by name. - assert reader.read_json_asset("metadata.json") == json.loads(metadata_json) + assert dec.read_asset_bytes("graph.json") == graph_json + assert dec.read_asset_bytes("notes.bin") == custom_blob + assert dec.read_metadata() == json.loads(metadata_json) + # read_graph() rebuilds a live NetworkX graph; raw JSON stays on read_json_asset. + assert dec.read_json_asset("graph.json") == json.loads(graph_json) + graph_obj = dec.read_graph() + assert sorted(graph_obj.nodes) == [0, 1, 2, 3] + assert {tuple(sorted(e)) for e in graph_obj.edges} == {(0, 1), (1, 2), (2, 3)} + assert dec.read_node_permutation_map() == json.loads(perm_json) + assert dec.read_json_asset("metadata.json") == json.loads(metadata_json) - # extract_stream then decode via BenDecoder. extracted = tmp_path / "stream.ben" - reader.extract_stream(extracted) - got = list(BenDecoder(extracted, mode="ben")) - assert got == samples - - # __repr__ should not crash. - r = repr(reader) - assert r is not None + dec.extract_stream(extracted) + assert list(BenDecoder(extracted, mode="ben")) == samples + assert repr(dec) is not None -def test_bundle_reader_round_trip_xben(tmp_path: Path) -> None: +def test_bundle_round_trip_xben(tmp_path: Path) -> None: samples = [[1, 2, 3], [1, 2, 3], [4, 4, 5], [6, 7, 8]] bundle = build_bundle( stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), sample_count=len(samples), assignment_format=ASSIGNMENT_FORMAT_XBEN, - assets=[], ) path = _write_bundle(tmp_path / "xout.bendl", bundle) - reader = BenDecoder(path) - - assert reader.assignment_format() == "xben" - assert reader.is_complete() - assert reader.count_samples() == len(samples) - assert reader.asset_names() == [] - - # extract_stream → file must round-trip via the xben decoder. + dec = BendlDecoder(path) + assert dec.assignment_format() == "xben" + assert dec.count_samples() == len(samples) + assert dec.asset_names() == [] extracted = tmp_path / "stream.xben" - reader.extract_stream(extracted) + dec.extract_stream(extracted) assert list(BenDecoder(extracted, mode="xben")) == samples -def test_bundle_reader_canonical_helpers_return_none_when_absent( - tmp_path: Path, -) -> None: - samples = [[1, 2, 3]] +def test_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assets=[ - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="only_custom.bin", - payload=b"x", - ), - ], + stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), + sample_count=1, + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="only_custom.bin", payload=b"x")], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) - reader = BenDecoder(path) - assert reader.read_metadata() is None - assert reader.read_graph() is None - assert reader.read_relabel_map() is None + dec = BendlDecoder(path) + assert dec.read_metadata() is None + assert dec.read_graph() is None + assert dec.read_node_permutation_map() is None -def test_bundle_reader_asset_free_empty_stream(tmp_path: Path) -> None: - # A bundle with no assets and an empty stream is legal (spec says so). - bundle = build_bundle(stream_bytes=b"", sample_count=0, assets=[]) +def test_asset_free_empty_stream(tmp_path: Path) -> None: + bundle = build_bundle(stream_bytes=b"", sample_count=0) path = _write_bundle(tmp_path / "empty.bendl", bundle) - reader = BenDecoder(path) - assert reader.is_complete() - assert reader.count_samples() == 0 - assert reader.asset_names() == [] - assert reader.list_assets() == [] - # extract_stream writes a zero-byte file. + dec = BendlDecoder(path) + assert dec.is_complete() + assert dec.count_samples() == 0 + assert len(dec) == 0 + assert list(dec) == [] + assert dec.asset_names() == [] out = tmp_path / "empty.ben" - reader.extract_stream(out) + dec.extract_stream(out) assert out.read_bytes() == b"" +def test_banner_only_zero_frame_stream(tmp_path: Path) -> None: + # A real BEN banner with zero frames iterates to [] and counts 0. + bundle = build_bundle(stream_bytes=_ben_bytes_for([], tmp_path), sample_count=0) + path = _write_bundle(tmp_path / "banner.bendl", bundle) + dec = BendlDecoder(path) + assert len(dec) == 0 + assert list(dec) == [] + + # --------------------------------------------------------------------------- -# Robustness: asset lookup and JSON parsing +# Asset lookup / JSON parsing # --------------------------------------------------------------------------- @@ -483,179 +398,164 @@ def test_read_asset_bytes_raises_keyerror_for_unknown_name(tmp_path: Path) -> No bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name="there.bin", payload=b"x"), - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="there.bin", payload=b"x")], ) path = _write_bundle(tmp_path / "x.bendl", bundle) - reader = BenDecoder(path) + dec = BendlDecoder(path) with pytest.raises(KeyError, match="no asset named"): - reader.read_asset_bytes("missing.bin") + dec.read_asset_bytes("missing.bin") with pytest.raises(KeyError): - reader.read_json_asset("missing.json") + dec.read_json_asset("missing.json") -def test_read_json_asset_rejects_non_utf8_payload(tmp_path: Path) -> None: +def test_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="binary.bin", - payload=b"\xff\xfe\xfd", # not valid UTF-8 - is_json=False, - compress=False, - ) - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="binary.bin", payload=b"\xff\xfe\xfd")], ) path = _write_bundle(tmp_path / "bin.bendl", bundle) - reader = BenDecoder(path) - # Raw bytes come back fine. - assert reader.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" - # But the JSON helper must reject non-UTF8 bytes. + dec = BendlDecoder(path) + assert dec.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" with pytest.raises(Exception, match="not valid UTF-8"): - reader.read_json_asset("binary.bin") + dec.read_json_asset("binary.bin") def test_read_json_asset_rejects_malformed_json(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=b"not a json {{{", - is_json=True, - ) - ], + assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b"not a json {{{", is_json=True)], ) path = _write_bundle(tmp_path / "m.bendl", bundle) - reader = BenDecoder(path) - # Raw bytes: fine. - assert reader.read_asset_bytes("metadata.json") == b"not a json {{{" - # Parsed via python's json module: must raise. + dec = BendlDecoder(path) + assert dec.read_asset_bytes("metadata.json") == b"not a json {{{" with pytest.raises(json.JSONDecodeError): - reader.read_metadata() + dec.read_metadata() def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: - # Directory entries store UTF-8 names; a multi-byte name should work. name = "tëst_ääää_✓.bin" bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name=name, payload=b"payload"), - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name=name, payload=b"payload")], ) path = _write_bundle(tmp_path / "u.bendl", bundle) - reader = BenDecoder(path) - assert reader.asset_names() == [name] - assert reader.read_asset_bytes(name) == b"payload" + dec = BendlDecoder(path) + assert dec.asset_names() == [name] + assert dec.read_asset_bytes(name) == b"payload" def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: - # Stress the directory with a large-ish asset count. payloads = {f"asset_{i:04d}.bin": bytes([i & 0xFF] * (i + 1)) for i in range(200)} - assets = [ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) - for n, p in payloads.items() + assets = [_Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) for n, p in payloads.items()] + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), sample_count=1, assets=assets) + path = _write_bundle(tmp_path / "many.bendl", bundle) + dec = BendlDecoder(path) + assert dec.asset_names() == list(payloads.keys()) + for i in (0, 1, 42, 199): + name = f"asset_{i:04d}.bin" + assert dec.read_asset_bytes(name) == payloads[name] + + +def test_list_assets_flag_fidelity(tmp_path: Path) -> None: + combos: List[Tuple[bool, bool, bool]] = [ + (False, False, False), (True, False, False), (False, True, False), + (False, False, True), (True, True, False), (True, False, True), + (False, True, True), (True, True, True), ] + assets: List[_Asset] = [] + expected: List[List[str]] = [] + for i, (is_json, compress, has_checksum) in enumerate(combos): + payload = f'{{"i":{i}}}'.encode("utf-8") if is_json else bytes([i % 256]) * 32 + checksum = b"\xde\xad\xbe\xef" if has_checksum else None + assets.append(_Asset(asset_type=ASSET_TYPE_CUSTOM, name=f"asset-{i}.bin", payload=payload, is_json=is_json, compress=compress, checksum=checksum)) + want: List[str] = [] + if is_json: + want.append("json") + if compress: + want.append("xz") + if has_checksum: + want.append("checksum") + expected.append(want) + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, assets=assets, checksums=False) + path = _write_bundle(tmp_path / "flags.bendl", bundle) + got = BendlDecoder(path).list_assets() + for entry, want in zip(got, expected): + assert entry["flags"] == want + + +def test_zero_length_custom_payload(tmp_path: Path) -> None: bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), + stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, - assets=assets, + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="empty.bin", payload=b"")], ) - path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = BenDecoder(path) - names = reader.asset_names() - assert names == list(payloads.keys()) - # Spot-check the contents round-trip. - for i in (0, 1, 42, 199): - name = f"asset_{i:04d}.bin" - assert reader.read_asset_bytes(name) == payloads[name] + path = _write_bundle(tmp_path / "zlen.bendl", bundle) + dec = BendlDecoder(path) + assert dec.read_asset_bytes("empty.bin") == b"" + entry = next(a for a in dec.list_assets() if a["name"] == "empty.bin") + assert entry["len"] == 0 # --------------------------------------------------------------------------- -# Robustness: extract_stream overwrite semantics +# extract_stream overwrite semantics # --------------------------------------------------------------------------- def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) path = _write_bundle(tmp_path / "a.bendl", bundle) - reader = BenDecoder(path) + dec = BendlDecoder(path) target = tmp_path / "already.ben" target.write_bytes(b"pre-existing") with pytest.raises(OSError, match="already exists"): - reader.extract_stream(target) - # File must be untouched. + dec.extract_stream(target) assert target.read_bytes() == b"pre-existing" -def test_extract_stream_overwrites_when_requested(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), - sample_count=2, - ) - path = _write_bundle(tmp_path / "b.bendl", bundle) - reader = BenDecoder(path) - target = tmp_path / "out.ben" - target.write_bytes(b"filler") - reader.extract_stream(target, overwrite=True) - # Re-opening the extracted file via BenDecoder confirms it's a valid .ben. - assert list(BenDecoder(target, mode="ben")) == [[1, 2], [3, 4]] +def test_extract_stream_into_missing_parent_dir_raises(tmp_path: Path) -> None: + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) + path = _write_bundle(tmp_path / "mini.bendl", bundle) + dec = BendlDecoder(path) + with pytest.raises(OSError): + dec.extract_stream(tmp_path / "does" / "not" / "exist" / "out.ben") # --------------------------------------------------------------------------- -# Robustness: invalid headers and corrupted bundles +# Invalid headers / corrupted bundles # --------------------------------------------------------------------------- def test_open_rejects_missing_file(tmp_path: Path) -> None: with pytest.raises(OSError, match="Failed to open"): - BenDecoder(tmp_path / "does_not_exist.bendl") + BendlDecoder(tmp_path / "does_not_exist.bendl") -def test_open_rejects_bad_magic(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - magic=b"NOTABEND", - ) - path = _write_bundle(tmp_path / "bad.bendl", bundle) - # Bad magic → detect_is_bundle returns False → treated as plain BEN - # stream → fails because the bytes aren't a valid BEN banner. - with pytest.raises(Exception): - BenDecoder(path) +def test_open_rejects_plain_stream(tmp_path: Path) -> None: + plain = tmp_path / "plain.ben" + with BenEncoder(plain, overwrite=True, variant="standard") as enc: + enc.write([1, 2, 3]) + with pytest.raises(Exception, match="not a .bendl bundle"): + BendlDecoder(plain) def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - major_version=999, - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, major_version=999) path = _write_bundle(tmp_path / "oldfuture.bendl", bundle) with pytest.raises(Exception, match="Failed to parse bundle header"): - BenDecoder(path) + BendlDecoder(path) def test_open_rejects_truncated_header(tmp_path: Path) -> None: path = tmp_path / "short.bendl" - path.write_bytes(b"BENDL\x00\x00\x01\x00") # magic plus 2 bytes — not enough + path.write_bytes(b"BENDL\x00\x00\x01\x00") with pytest.raises(Exception, match="Failed to parse bundle header"): - BenDecoder(path) + BendlDecoder(path) -def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> None: - # Corrupt the directory's leading u32 entry-count so the reader tries - # to decode many more entries than the file actually contains. +def test_open_rejects_inflated_entry_count(tmp_path: Path) -> None: bundle = bytearray( build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), @@ -664,29 +564,26 @@ def test_open_rejects_directory_with_inflated_entry_count(tmp_path: Path) -> Non ) ) directory_offset = struct.unpack_from(" None: +def test_open_rejects_chopped_directory(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="x", payload=b"abc")], ) - # Drop the final two bytes of the directory. path = _write_bundle(tmp_path / "chop.bendl", bundle[:-2]) with pytest.raises(Exception): - BenDecoder(path) + BendlDecoder(path) def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: stream = _ben_bytes_for([[1, 2]], tmp_path) - - duplicate_names = build_bundle( + dup = build_bundle( stream_bytes=stream, sample_count=1, assets=[ @@ -694,30 +591,18 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: _Asset(asset_type=ASSET_TYPE_CUSTOM, name="dup.bin", payload=b"b"), ], ) - path = _write_bundle(tmp_path / "dup.bendl", duplicate_names) with pytest.raises(Exception, match="malformed directory"): - BenDecoder(path) - - wrong_singleton_name = build_bundle( + BendlDecoder(_write_bundle(tmp_path / "dup.bendl", dup)) + wrong = build_bundle( stream_bytes=stream, sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="not_metadata.json", - payload=b"{}", - is_json=True, - ) - ], + assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="not_metadata.json", payload=b"{}", is_json=True)], ) - path = _write_bundle(tmp_path / "singleton.bendl", wrong_singleton_name) with pytest.raises(Exception, match="malformed directory"): - BenDecoder(path) + BendlDecoder(_write_bundle(tmp_path / "singleton.bendl", wrong)) -def test_open_rejects_declared_directory_len_with_trailing_bytes( - tmp_path: Path, -) -> None: +def test_open_rejects_trailing_directory_bytes(tmp_path: Path) -> None: bundle = bytearray( build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), @@ -728,1560 +613,260 @@ def test_open_rejects_declared_directory_len_with_trailing_bytes( directory_len = struct.unpack_from(" None: - # Provisional bundle with complete=0: the decoder falls back to - # scanning the stream region (from stream_offset to EOF) to count - # samples instead of trusting the header. - stream = _ben_bytes_for([[1, 2, 3]], tmp_path) - header = _pack_header( - complete=COMPLETE_NO, - assignment_format=ASSIGNMENT_FORMAT_BEN, - directory_offset=0, - directory_len=0, - stream_offset=HEADER_SIZE, - stream_len=0, - sample_count=-1, - ) - path = _write_bundle(tmp_path / "incomplete.bendl", header + stream) - reader = BenDecoder(path) - assert reader.is_complete() is False - assert reader.count_samples() == 1 - assert reader.asset_names() == [] - # Verified extraction requires a finalized stream checksum. - out = tmp_path / "extracted.ben" - with pytest.raises(Exception, match="unfinalized"): - reader.extract_stream(out) - reader.extract_stream(out, overwrite=True, allow_unfinalized=True) - assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] + BendlDecoder(path) -def test_unknown_assignment_format_byte_rejects_at_construction(tmp_path: Path) -> None: - # Assignment format byte = 99 → unrecognized. BenDecoder must - # reject the bundle at construction time. - bundle = bytearray( - build_bundle( - stream_bytes=b"", - sample_count=0, - assets=[], - ) - ) - # assignment_format byte is at offset 13 in the header. +def test_unknown_assignment_format_rejected(tmp_path: Path) -> None: + bundle = bytearray(build_bundle(stream_bytes=b"", sample_count=0)) bundle[13] = 99 path = _write_bundle(tmp_path / "wtfmt.bendl", bytes(bundle)) with pytest.raises(Exception, match="unrecognized assignment_format"): - BenDecoder(path) + BendlDecoder(path) -def test_corrupted_xz_asset_raises_io_error(tmp_path: Path) -> None: +def test_corrupted_xz_asset_raises(tmp_path: Path) -> None: bundle = bytearray( build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_GRAPH, - name="graph.json", - payload=b'{"nodes":[0,1,2,3,4,5,6,7,8,9]}', - is_json=True, - compress=True, - ) - ], + assets=[_Asset(asset_type=ASSET_TYPE_GRAPH, name="graph.json", payload=b'{"nodes":[0,1,2,3,4,5,6,7,8,9]}', is_json=True, compress=True)], ) ) - - # Hunt for the xz payload bytes and flip one in the middle. - # We know the xz magic is b"\xfd7zXZ". xz_start = bundle.find(b"\xfd7zXZ") - assert xz_start != -1, "expected xz magic in hand-built bundle" - # Flip a byte well past the magic so the decoder reads it and fails. + assert xz_start != -1 bundle[xz_start + 20] ^= 0xFF path = _write_bundle(tmp_path / "badxz.bendl", bytes(bundle)) - reader = BenDecoder(path) - # Opening works — the header/directory are intact. + dec = BendlDecoder(path) with pytest.raises(OSError): - reader.read_asset_bytes("graph.json") - - -def test_directory_entry_with_zero_length_custom_payload(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1]], tmp_path), - sample_count=1, - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name="empty.bin", payload=b""), - ], - ) - path = _write_bundle(tmp_path / "zlen.bendl", bundle) - reader = BenDecoder(path) - assert reader.read_asset_bytes("empty.bin") == b"" - entry = next(a for a in reader.list_assets() if a["name"] == "empty.bin") - assert entry["len"] == 0 - - -def test_repr_on_incomplete_bundle(tmp_path: Path) -> None: - stream = _ben_bytes_for([[1, 2]], tmp_path) - header = _pack_header( - complete=COMPLETE_NO, - assignment_format=ASSIGNMENT_FORMAT_BEN, - directory_offset=0, - directory_len=0, - stream_offset=HEADER_SIZE, - stream_len=0, - sample_count=-1, - ) - path = _write_bundle(tmp_path / "rep.bendl", header + stream) - reader = BenDecoder(path) - # Incomplete bundle should open without error. - assert reader.is_complete() is False - assert reader.asset_names() == [] + dec.read_asset_bytes("graph.json") # --------------------------------------------------------------------------- -# Robustness: interrupted / truncated BEN streams inside a bundle +# Incomplete / truncated streams # --------------------------------------------------------------------------- -def _incomplete_bundle(stream_bytes: bytes) -> bytes: - """Simulate a writer that crashed mid-stream: valid header, partial - stream bytes, and no directory table at all (complete=0).""" +def _incomplete_bundle(stream_bytes: bytes, stream_len: int = 0) -> bytes: header = _pack_header( complete=COMPLETE_NO, assignment_format=ASSIGNMENT_FORMAT_BEN, directory_offset=0, directory_len=0, stream_offset=HEADER_SIZE, - stream_len=0, + stream_len=stream_len, sample_count=-1, ) return header + stream_bytes -def test_interrupted_ben_stream_mid_frame_decodes_valid_prefix(tmp_path: Path) -> None: - # Simulate a writer that was killed after flushing the header and - # part of the BEN stream, but before the stream was finished or the - # directory was written. +def test_incomplete_bundle_scans_for_sample_count(tmp_path: Path) -> None: + stream = _ben_bytes_for([[1, 2, 3]], tmp_path) + path = _write_bundle(tmp_path / "incomplete.bendl", _incomplete_bundle(stream)) + dec = BendlDecoder(path) + assert dec.is_complete() is False + assert dec.count_samples() == 1 + assert dec.asset_names() == [] + out = tmp_path / "extracted.ben" + with pytest.raises(Exception, match="unfinalized"): + dec.extract_stream(out) + dec.extract_stream(out, overwrite=True, allow_unfinalized=True) + assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] + + +def test_interrupted_ben_stream_decodes_valid_prefix(tmp_path: Path) -> None: samples = [[1, 1, 2, 2], [3, 3, 4, 4], [5, 5, 6, 6], [7, 7, 8, 8], [9, 9, 9, 9]] full_ben = _ben_bytes_for(samples, tmp_path) - # Cut the BEN bytes well past the 17-byte banner but before the end - # so the truncation lands mid-frame. - assert len(full_ben) > 25 partial = full_ben[: len(full_ben) - 3] path = _write_bundle(tmp_path / "crashed.bendl", _incomplete_bundle(partial)) - - reader = BenDecoder(path) - assert reader.is_complete() is False - assert reader.assignment_format() == "ben" - # count_samples scans the truncated stream; it may error or return a - # partial count — either is acceptable. - try: - n = reader.count_samples() - assert n < len(samples) - except Exception: - pass - - # Verified extraction refuses unfinalized streams because their checksum is - # not authoritative yet. + dec = BendlDecoder(path) + assert dec.is_complete() is False extracted = tmp_path / "partial.ben" with pytest.raises(Exception, match="unfinalized"): - reader.extract_stream(extracted) - reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) + dec.extract_stream(extracted) + dec.extract_stream(extracted, overwrite=True, allow_unfinalized=True) assert extracted.read_bytes() == partial - # The extracted file opens as a BEN stream (banner is intact). - dec = BenDecoder(extracted, mode="ben") - # Iterating through the truncated stream must either yield a strict - # prefix of the samples and then raise, or raise on the very first - # frame — both are acceptable outcomes. What is NOT acceptable is - # silently producing garbage or decoding past the truncation. - produced: list[list[int]] = [] - with pytest.raises(Exception): - for a in dec: - produced.append(a) - # Whatever came out must be a strict prefix of the original samples. - assert produced == samples[: len(produced)] - assert len(produced) < len(samples) - - -def test_interrupted_ben_stream_inside_banner_fails_to_open_decoder( - tmp_path: Path, -) -> None: - # Truncate the BEN bytes inside the 17-byte banner region. - full_ben = _ben_bytes_for([[1, 2, 3]], tmp_path) - path = _write_bundle(tmp_path / "head_cut.bendl", _incomplete_bundle(full_ben[:8])) - reader = BenDecoder(path) - assert reader.is_complete() is False - - extracted = tmp_path / "head_cut.ben" - with pytest.raises(Exception, match="unfinalized"): - reader.extract_stream(extracted) - reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) - # The decoder must reject a BEN file whose banner is incomplete. - with pytest.raises(Exception, match="Failed to create BenDecoder"): - BenDecoder(extracted, mode="ben") - - -def test_interrupted_ben_stream_zero_bytes_after_header(tmp_path: Path) -> None: - # The worst case: the writer crashed after writing the header and - # before any stream bytes landed. +def test_interrupted_zero_bytes_after_header(tmp_path: Path) -> None: path = _write_bundle(tmp_path / "zero.bendl", _incomplete_bundle(b"")) - - reader = BenDecoder(path) - assert reader.is_complete() is False - assert reader.asset_names() == [] - # Zero stream bytes → scan fails (no BEN banner). + dec = BendlDecoder(path) + assert dec.is_complete() is False + assert dec.asset_names() == [] with pytest.raises(Exception): - reader.count_samples() - + dec.count_samples() extracted = tmp_path / "zero.ben" - with pytest.raises(Exception, match="unfinalized"): - reader.extract_stream(extracted) - reader.extract_stream(extracted, overwrite=True, allow_unfinalized=True) + dec.extract_stream(extracted, overwrite=True, allow_unfinalized=True) assert extracted.read_bytes() == b"" - # A zero-byte .ben has no banner → decoder construction must fail. - with pytest.raises(Exception, match="Failed to create BenDecoder"): - BenDecoder(extracted, mode="ben") - - -def test_finalized_bundle_with_inflated_stream_len_survives_open( - tmp_path: Path, -) -> None: - # Build a valid finalized bundle, then patch stream_len to a value - # larger than the actual stream payload. This simulates the narrow - # window where the writer updated the header but was killed before - # writing the directory — and something (or someone) re-flagged it - # as finalized. + + +def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) -> None: samples = [[1, 2, 3], [4, 5, 6]] - bundle = bytearray( - build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - ) - # stream_len lives at header offset 48..56. + bundle = bytearray(build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples))) old_stream_len = struct.unpack_from(" None: - # Confirm that the same reader can serve asset reads after an - # extract_stream call (i.e. internal seek state doesn't wedge things). - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), - sample_count=2, - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=b'{"x":1}', - is_json=True, - ) - ], - ) - path = _write_bundle(tmp_path / "seq.bendl", bundle) - reader = BenDecoder(path) - reader.extract_stream(tmp_path / "s.ben") - assert reader.read_metadata() == {"x": 1} - reader.extract_stream(tmp_path / "s2.ben", overwrite=True) - assert reader.read_asset_bytes("metadata.json") == b'{"x":1}' + assert len(extracted.read_bytes()) <= old_stream_len + 10_000 # --------------------------------------------------------------------------- -# Stress / fuzz +# Interleaving / idempotence # --------------------------------------------------------------------------- -def test_long_asset_name_near_u16_max(tmp_path: Path) -> None: - # name_len in the directory entry is u16, so ~65500 is near the top. - # Anything above u16::MAX should be rejected by a real writer — we only - # stress the reader here, so we stay safely under 65535. - long_name = "x" * 65500 + ".bin" - assert len(long_name.encode("utf-8")) < 65536 - payload = b"payload-for-absurdly-long-name" +def test_read_after_extract_still_works(tmp_path: Path) -> None: bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1]], tmp_path), - sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name=long_name, payload=payload)], + stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), + sample_count=2, + assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b'{"x":1}', is_json=True)], ) - path = _write_bundle(tmp_path / "long.bendl", bundle) - reader = BenDecoder(path) - assert reader.asset_names() == [long_name] - assert reader.read_asset_bytes(long_name) == payload - + path = _write_bundle(tmp_path / "seq.bendl", bundle) + dec = BendlDecoder(path) + dec.extract_stream(tmp_path / "s.ben") + assert dec.read_metadata() == {"x": 1} + dec.extract_stream(tmp_path / "s2.ben", overwrite=True) + assert dec.read_asset_bytes("metadata.json") == b'{"x":1}' -def test_list_assets_flag_fidelity(tmp_path: Path) -> None: - # Every combination of (json, xz, checksum) should round-trip verbatim - # through list_assets()["flags"]. - combos: List[Tuple[bool, bool, bool]] = [ - (False, False, False), - (True, False, False), - (False, True, False), - (False, False, True), - (True, True, False), - (True, False, True), - (False, True, True), - (True, True, True), - ] - assets: List[_Asset] = [] - expected: List[List[str]] = [] - for i, (is_json, compress, has_checksum) in enumerate(combos): - payload = f'{{"i":{i}}}'.encode("utf-8") if is_json else bytes([i % 256]) * 32 - checksum = b"\xde\xad\xbe\xef" if has_checksum else None - assets.append( - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name=f"asset-{i}.bin", - payload=payload, - is_json=is_json, - compress=compress, - checksum=checksum, - ) - ) - want: List[str] = [] - if is_json: - want.append("json") - if compress: - want.append("xz") - if has_checksum: - want.append("checksum") - expected.append(want) +def test_toc_interleaved_with_iteration(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - assets=assets, - checksums=False, + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=len(samples), + assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b'{"tag":42}', is_json=True)], ) - path = _write_bundle(tmp_path / "flags.bendl", bundle) - reader = BenDecoder(path) - got = reader.list_assets() - assert len(got) == len(combos) - for entry, want in zip(got, expected): - assert entry["flags"] == want + path = _write_bundle(tmp_path / "interleave.bendl", bundle) + dec = BendlDecoder(path) + it = iter(dec) + assert next(it) == samples[0] + assert dec.read_metadata() == {"tag": 42} + assert next(it) == samples[1] + assert dec.read_asset_bytes("metadata.json") == b'{"tag":42}' + assert next(it) == samples[2] + with pytest.raises(StopIteration): + next(it) -def test_read_asset_bytes_is_idempotent(tmp_path: Path) -> None: - # Reading the same asset twice (with an xz round-trip in between) must - # return byte-identical content, proving no internal state gets mutated. +def test_read_asset_bytes_idempotent(tmp_path: Path) -> None: payload = b"repeat-me " * 100 bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, assets=[ - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="raw.bin", - payload=payload, - ), - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="compressed.bin", - payload=payload, - compress=True, - ), + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="raw.bin", payload=payload), + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="compressed.bin", payload=payload, compress=True), ], ) path = _write_bundle(tmp_path / "idem.bendl", bundle) - reader = BenDecoder(path) + dec = BendlDecoder(path) for _ in range(5): - assert reader.read_asset_bytes("raw.bin") == payload - assert reader.read_asset_bytes("compressed.bin") == payload - + assert dec.read_asset_bytes("raw.bin") == payload + assert dec.read_asset_bytes("compressed.bin") == payload -def test_stress_many_heterogeneous_assets_round_trip(tmp_path: Path) -> None: - # A full directory with rotating flags. This exercises directory - # scaling, offset bookkeeping, and name lookup on a non-trivial directory. - N = 256 - assets: List[_Asset] = [] - expected: List[Tuple[str, bytes]] = [] - rng = random.Random(0xBEEF) - for i in range(N): - payload = rng.randbytes(rng.randint(1, 200)) - compress = i % 3 == 0 - is_json = i % 5 == 0 - # When is_json is set we need valid UTF-8; use a safe synthetic blob. - if is_json: - payload = f'{{"i":{i},"n":{rng.randint(0, 1000)}}}'.encode("utf-8") - assets.append( - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name=f"asset-{i:04d}.bin", - payload=payload, - is_json=is_json, - compress=compress, - ) - ) - expected.append((f"asset-{i:04d}.bin", payload)) - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2, 3], [4, 5, 6]], tmp_path), - sample_count=2, - assets=assets, - ) - path = _write_bundle(tmp_path / "many.bendl", bundle) - reader = BenDecoder(path) - - assert reader.asset_names() == [name for name, _ in expected] - # Sample every 37th asset and verify the payload decodes correctly - # (xz pass-through on ~a third of them). - for idx in range(0, N, 37): - name, want = expected[idx] - assert reader.read_asset_bytes(name) == want - # Spot-check a JSON asset that was flagged json+compressed? Only json alone. - json_idxs = [i for i in range(N) if i % 5 == 0 and i % 3 != 0] - assert json_idxs # sanity - sample = json_idxs[len(json_idxs) // 2] - name, want = expected[sample] - assert reader.read_json_asset(name) == json.loads(want) +# --------------------------------------------------------------------------- +# Iteration restart and subsampling +# --------------------------------------------------------------------------- -def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: - # Build 20 deliberately-different bundles from a seeded PRNG. Each one - # mixes random asset sizes, random flags, random samples, and is then - # fully round-tripped through BenDecoder on a .bendl bundle. - rng = random.Random(0xFEED_FACE) - for trial in range(20): - n_assets = rng.randint(0, 12) - assets: List[_Asset] = [] - truth: List[Tuple[str, bytes]] = [] - for i in range(n_assets): - size = rng.choice([0, 1, 7, 64, 500, 4096]) - payload = rng.randbytes(size) - compress = rng.random() < 0.4 - assets.append( - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name=f"t{trial}-a{i}.bin", - payload=payload, - compress=compress, - ) - ) - truth.append((f"t{trial}-a{i}.bin", payload)) - - n_samples = rng.randint(1, 25) - samples = [ - [rng.randint(1, 8) for _ in range(rng.randint(1, 40))] - for _ in range(n_samples) - ] - - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=n_samples, - assets=assets, - ) - path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) - - reader = BenDecoder(path) - assert reader.is_complete() - assert reader.count_samples() == n_samples - assert reader.asset_names() == [name for name, _ in truth] - for name, want in truth: - assert reader.read_asset_bytes(name) == want - - extracted = tmp_path / f"fuzz-{trial}.ben" - reader.extract_stream(extracted) - assert list(BenDecoder(extracted, mode="ben")) == samples - - -def test_interleaved_asset_and_stream_operations(tmp_path: Path) -> None: - # Interleave every user-facing method to prove the reader does not - # wedge its internal seek state when operations are reordered. - samples = [[1, 2], [3, 4], [5, 6], [7, 8]] - metadata = b'{"hello":"world"}' - graph = b'{"nodes":[0,1,2],"edges":[[0,1],[1,2]]}' - custom = b"\x00\x01\x02\x03" * 64 - - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=metadata, - is_json=True, - ), - _Asset( - asset_type=ASSET_TYPE_GRAPH, - name="graph.json", - payload=graph, - is_json=True, - compress=True, - ), - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="blob.bin", - payload=custom, - ), - ], - ) - path = _write_bundle(tmp_path / "interleave.bendl", bundle) - reader = BenDecoder(path) - - # Strongly non-sequential access pattern. - assert reader.read_asset_bytes("blob.bin") == custom - assert reader.read_metadata() == {"hello": "world"} - reader.extract_stream(tmp_path / "a.ben") - assert reader.read_graph() == json.loads(graph) - reader.extract_stream(tmp_path / "b.ben", overwrite=True) - assert reader.read_asset_bytes("metadata.json") == metadata - assert reader.read_asset_bytes("blob.bin") == custom - assert reader.read_asset_bytes("graph.json") == graph - reader.extract_stream(tmp_path / "c.ben", overwrite=True) - - # Every extracted stream must be byte-identical. - a = (tmp_path / "a.ben").read_bytes() - b = (tmp_path / "b.ben").read_bytes() - c = (tmp_path / "c.ben").read_bytes() - assert a == b == c - assert list(BenDecoder(tmp_path / "a.ben", mode="ben")) == samples - - -def test_extract_stream_into_missing_parent_dir_raises_ioerror(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - ) - path = _write_bundle(tmp_path / "mini.bendl", bundle) - reader = BenDecoder(path) - missing = tmp_path / "does" / "not" / "exist" / "out.ben" - with pytest.raises(OSError): - reader.extract_stream(missing) - - -# --------------------------------------------------------------------------- -# BenEncoder bundle-output tests -# --------------------------------------------------------------------------- - - -SAMPLE_GRAPH = { - "directed": False, - "multigraph": False, - "graph": {}, - "nodes": [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}], - "adjacency": [ - [{"id": 1}], - [{"id": 0}, {"id": 2}], - [{"id": 1}, {"id": 3}], - [{"id": 2}], - ], -} - - -def test_benencoder_default_emits_bundle_without_graph(tmp_path: Path) -> None: - out = tmp_path / "stream.bendl" - samples = [[1, 1, 2, 2], [3, 3, 2, 2], [3, 3, 3, 3]] - with BenEncoder(out, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) - assert reader.is_complete() - assert reader.count_samples() == len(samples) - assert reader.assignment_format() == "ben" - # No graph because none was provided. - assert reader.asset_names() == [] - assert reader.read_graph() is None - - extracted = tmp_path / "extracted.ben" - reader.extract_stream(extracted) - assert list(BenDecoder(extracted, mode="ben")) == samples - - -def test_benencoder_bundle_embeds_graph_from_dict(tmp_path: Path) -> None: - out = tmp_path / "with_graph.bendl" - samples = [[1, 1, 2, 2], [1, 1, 3, 3]] - with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.is_complete() - assert reader.count_samples() == len(samples) - assert reader.asset_names() == ["graph.json"] - - assets = reader.list_assets() - assert len(assets) == 1 - graph_entry = assets[0] - assert graph_entry["name"] == "graph.json" - # Default bundle policy xz-compresses graph.json. - assert "xz" in graph_entry["flags"] - assert "json" in graph_entry["flags"] - - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_embeds_graph_from_path(tmp_path: Path) -> None: - graph_path = tmp_path / "graph.json" - graph_path.write_text(json.dumps(SAMPLE_GRAPH)) - - out = tmp_path / "with_graph_path.bendl" - samples = [[0, 0, 1, 1]] - with BenEncoder(out, overwrite=True, variant="standard", graph=graph_path) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.asset_names() == ["graph.json"] - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_embeds_graph_from_str_path(tmp_path: Path) -> None: - # String paths must be accepted verbatim (same coercion Path arguments - # go through elsewhere in the API). - graph_path = tmp_path / "graph-str.json" - graph_path.write_text(json.dumps(SAMPLE_GRAPH)) - - out = tmp_path / "via-str.bendl" - samples = [[0, 1, 0, 1]] - with BenEncoder( - out, overwrite=True, variant="standard", graph=str(graph_path) - ) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_embeds_graph_from_bytes(tmp_path: Path) -> None: - raw = json.dumps(SAMPLE_GRAPH).encode("utf-8") - out = tmp_path / "via-bytes.bendl" - samples = [[2, 2, 2, 2]] - with BenEncoder(out, overwrite=True, variant="standard", graph=raw) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_embeds_graph_from_bytesio(tmp_path: Path) -> None: - buf = io.BytesIO(json.dumps(SAMPLE_GRAPH).encode("utf-8")) - out = tmp_path / "via-bytesio.bendl" - samples = [[1, 2, 1, 2]] - with BenEncoder(out, overwrite=True, variant="standard", graph=buf) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_embeds_graph_from_stringio(tmp_path: Path) -> None: - buf = io.StringIO(json.dumps(SAMPLE_GRAPH)) - out = tmp_path / "via-stringio.bendl" - samples = [[3, 3, 3, 3]] - with BenEncoder(out, overwrite=True, variant="standard", graph=buf) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_bundle_round_trip_via_extract_stream(tmp_path: Path) -> None: - out = tmp_path / "full.bendl" - rng = random.Random(0xCAFE) - samples = [[rng.randint(1, 8) for _ in range(12)] for _ in range(15)] - with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: - for a in samples: - enc.write(a) - - reader = BenDecoder(out) - assert reader.count_samples() == len(samples) - extracted = tmp_path / "full.ben" - reader.extract_stream(extracted) - assert list(BenDecoder(extracted, mode="ben")) == samples - # And the graph still round-trips from the same reader. - assert reader.read_graph() == SAMPLE_GRAPH - - -def test_benencoder_ben_file_only_rejects_graph(tmp_path: Path) -> None: - out = tmp_path / "ben-with-graph.ben" - with pytest.raises(ValueError, match="ben_file_only"): - BenEncoder( - out, - overwrite=True, - variant="standard", - graph=SAMPLE_GRAPH, - ben_file_only=True, - ) - - -def test_benencoder_ben_file_only_matches_old_format(tmp_path: Path) -> None: - # A ben_file_only=True output should be byte-identical to the legacy - # plain-BEN path, so the header has no BENDL magic. - out = tmp_path / "legacy.ben" - with BenEncoder(out, overwrite=True, variant="standard", ben_file_only=True) as enc: - enc.write([1, 2, 3]) - blob = out.read_bytes() - assert not blob.startswith(BENDL_MAGIC) - # BenDecoder should still read it in ben mode. - assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] - - -def test_benencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: - out = tmp_path / "idem.bendl" - enc = BenEncoder(out, overwrite=True, variant="standard") - enc.write([1, 1, 2]) - enc.close() - enc.close() # second close must be a no-op - with pytest.raises(OSError, match="already been closed"): - enc.write([1, 2, 3]) - - reader = BenDecoder(out) - assert reader.is_complete() - assert reader.count_samples() == 1 - - -def test_benencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: - out = tmp_path / "bad.bendl" - with pytest.raises(ValueError, match="graph must be"): - BenEncoder(out, overwrite=True, variant="standard", graph=12345) - - -# --------------------------------------------------------------------------- -# BenDecoder opened directly on a .bendl bundle. -# -# The decoder auto-detects the BENDL magic and, when present, iterates only -# the embedded stream region while exposing TOC / asset helpers on the side. -# When opened on a plain .ben/.xben stream, iteration still works but the -# bundle methods must raise a clear error. -# --------------------------------------------------------------------------- - - -def test_bendecoder_auto_detects_ben_bundle(tmp_path: Path) -> None: - samples = [[1, 2, 3], [1, 2, 3], [4, 4, 5]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assignment_format=ASSIGNMENT_FORMAT_BEN, - ) - path = _write_bundle(tmp_path / "stream.bendl", bundle) - - dec = BenDecoder(path) - assert dec.is_bundle() is True - assert dec.assignment_format() == "ben" - assert dec.is_complete() is True - assert dec.version() == (BENDL_MAJOR_VERSION, BENDL_MINOR_VERSION) - assert len(dec) == len(samples) - assert list(dec) == samples - - -def test_bendecoder_auto_detects_xben_bundle(tmp_path: Path) -> None: - samples = [[1, 1, 2, 2], [3, 3, 4, 4]] - bundle = build_bundle( - stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), - sample_count=len(samples), - assignment_format=ASSIGNMENT_FORMAT_XBEN, - ) - path = _write_bundle(tmp_path / "stream.bendl", bundle) - - dec = BenDecoder(path) - assert dec.is_bundle() is True - assert dec.assignment_format() == "xben" - assert len(dec) == len(samples) - assert list(dec) == samples - - -def test_bendecoder_bundle_toc_and_assets(tmp_path: Path) -> None: - samples = [[1, 2, 3]] - graph_json = b'{"nodes":[0,1],"edges":[[0,1]]}' - metadata_json = b'{"note":"hello"}' - relabel_json = b'{"0":"A","1":"B"}' - - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=metadata_json, - is_json=True, - ), - _Asset( - asset_type=ASSET_TYPE_GRAPH, - name="graph.json", - payload=graph_json, - is_json=True, - compress=True, - ), - _Asset( - asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, - name="node_permutation_map.json", - payload=relabel_json, - is_json=True, - ), - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="notes.bin", - payload=b"\x00\x01\x02", - ), - ], - ) - path = _write_bundle(tmp_path / "rich.bendl", bundle) - - dec = BenDecoder(path) - - # TOC surface - assert dec.asset_names() == [ - "metadata.json", - "graph.json", - "node_permutation_map.json", - "notes.bin", - ] - assets = dec.list_assets() - assert [a["name"] for a in assets] == dec.asset_names() - by_name = {a["name"]: a for a in assets} - assert "xz" in by_name["graph.json"]["flags"] - assert "json" in by_name["graph.json"]["flags"] - assert by_name["notes.bin"]["flags"] == ["checksum"] - - # Raw and JSON asset access - assert dec.read_asset_bytes("metadata.json") == metadata_json - assert dec.read_asset_bytes("graph.json") == graph_json - assert dec.read_metadata() == json.loads(metadata_json) - assert dec.read_graph() == json.loads(graph_json) - assert dec.read_relabel_map() == json.loads(relabel_json) - assert dec.read_json_asset("metadata.json") == json.loads(metadata_json) - - # Unknown asset by name raises KeyError. - with pytest.raises(KeyError, match="no asset named"): - dec.read_asset_bytes("missing.bin") - - # Iteration still works after the TOC surface has been used. - assert list(dec) == samples - - -def test_bendecoder_bundle_canonical_helpers_return_none_when_absent( - tmp_path: Path, -) -> None: - samples = [[1, 2]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="custom.bin", payload=b"x")], - ) - path = _write_bundle(tmp_path / "sparse.bendl", bundle) - dec = BenDecoder(path) - assert dec.read_graph() is None - assert dec.read_metadata() is None - assert dec.read_relabel_map() is None - - -def test_bendecoder_bundle_subsample_range(tmp_path: Path) -> None: - samples = [[i, i + 1] for i in range(1, 11)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "range.bendl", bundle) - - dec = BenDecoder(path) - dec.subsample_range(3, 6) - assert list(dec) == samples[2:6] - - -def test_bendecoder_bundle_subsample_indices(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 9)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "idx.bendl", bundle) - - dec = BenDecoder(path) - dec.subsample_indices([1, 4, 8]) - assert list(dec) == [samples[0], samples[3], samples[7]] - - -def test_bendecoder_bundle_subsample_every(tmp_path: Path) -> None: - samples = [[i, i] for i in range(1, 11)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "every.bendl", bundle) - - dec = BenDecoder(path) - dec.subsample_every(3, 2) - assert list(dec) == [samples[1], samples[4], samples[7]] - - -def test_bendecoder_bundle_mode_arg_is_ignored(tmp_path: Path) -> None: - # For bundles, the header decides the format — a caller-supplied - # `mode="xben"` on a BEN bundle must not confuse the reader. - samples = [[1, 2, 3]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assignment_format=ASSIGNMENT_FORMAT_BEN, - ) - path = _write_bundle(tmp_path / "ignore_mode.bendl", bundle) - - dec = BenDecoder(path, mode="xben") - assert dec.assignment_format() == "ben" - assert list(dec) == samples - - -def test_bendecoder_on_plain_stream_supports_iteration(tmp_path: Path) -> None: - # Opening a plain .ben file must still iterate unchanged; the new - # bundle surface is simply unavailable. - samples = [[1, 2, 3], [4, 5, 6]] - ben_path = tmp_path / "plain.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(ben_path) - assert dec.is_bundle() is False - assert dec.assignment_format() == "ben" +def test_iteration_can_restart(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "twice.bendl", bundle) + dec = BendlDecoder(path) assert list(dec) == samples - - -@pytest.mark.parametrize( - "method_call", - [ - lambda d: d.version(), - lambda d: d.is_complete(), - lambda d: d.asset_names(), - lambda d: d.list_assets(), - lambda d: d.read_asset_bytes("metadata.json"), - lambda d: d.read_json_asset("metadata.json"), - lambda d: d.read_graph(), - lambda d: d.read_metadata(), - lambda d: d.read_relabel_map(), - ], -) -def test_bendecoder_plain_stream_rejects_bundle_methods( - tmp_path: Path, method_call -) -> None: - ben_path = tmp_path / "plain.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - enc.write([1, 2, 3]) - - dec = BenDecoder(ben_path) - with pytest.raises(Exception, match="only available on .bendl bundles"): - method_call(dec) - - -def test_bendecoder_plain_stream_error_mentions_ben_file_only( - tmp_path: Path, -) -> None: - ben_path = tmp_path / "plain.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - enc.write([1]) - - dec = BenDecoder(ben_path) - with pytest.raises(Exception, match="ben_file_only=False"): - dec.read_graph() - - -def test_bendecoder_opens_bundle_produced_by_benencoder(tmp_path: Path) -> None: - # End-to-end: a bundle written by BenEncoder (with a graph asset) - # must round-trip through a single BenDecoder call — no need to - # extract the stream first. - out = tmp_path / "e2e.bendl" - with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: - for a in [[1, 2, 3], [2, 3, 4]]: - enc.write(a) - - dec = BenDecoder(out) - assert dec.is_bundle() is True - assert dec.is_complete() is True - assert dec.assignment_format() == "ben" - assert dec.read_graph() == SAMPLE_GRAPH - assert list(dec) == [[1, 2, 3], [2, 3, 4]] - - -def test_bendecoder_incomplete_bundle_counts_via_scan(tmp_path: Path) -> None: - # An incomplete bundle has complete=0 and no directory — its header - # carries no authoritative sample_count, so __len__ must fall back - # to scanning the stream region. This exercises the - # `scan_bundle_samples` path in the decoder. - samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - stream = _ben_bytes_for(samples, tmp_path) - header = _pack_header( - complete=COMPLETE_NO, - assignment_format=ASSIGNMENT_FORMAT_BEN, - directory_offset=0, - directory_len=0, - stream_offset=HEADER_SIZE, - stream_len=len(stream), - sample_count=-1, - ) - path = tmp_path / "incomplete.bendl" - path.write_bytes(header + stream) - - dec = BenDecoder(path) - assert dec.is_bundle() is True - assert dec.is_complete() is False - # len() forces the fallback scan, which must agree with the data. - assert len(dec) == len(samples) - # A second call uses the cached value and still returns the same. - assert len(dec) == len(samples) - # The iterator itself still works. assert list(dec) == samples -def test_bendecoder_incomplete_bundle_count_samples_matches_len( - tmp_path: Path, -) -> None: - # Explicit count_samples() also flows through scan_bundle_samples - # for incomplete bundles. - samples = [[i, i + 1] for i in range(1, 6)] - stream = _ben_bytes_for(samples, tmp_path) - header = _pack_header( - complete=COMPLETE_NO, - assignment_format=ASSIGNMENT_FORMAT_BEN, - directory_offset=0, - directory_len=0, - stream_offset=HEADER_SIZE, - stream_len=len(stream), - sample_count=-1, - ) - path = tmp_path / "incomplete_count.bendl" - path.write_bytes(header + stream) - - dec = BenDecoder(path) - assert dec.count_samples() == len(samples) - assert len(dec) == len(samples) - - -def test_bendecoder_rejects_unknown_assignment_format(tmp_path: Path) -> None: - # A finalized bundle whose assignment_format byte is neither BEN - # nor XBEN must surface a clear error at decoder construction, not - # silently fall through. - samples = [[1, 2, 3]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assignment_format=99, - ) - path = _write_bundle(tmp_path / "weird_fmt.bendl", bundle) - with pytest.raises(Exception, match="unrecognized assignment_format"): - BenDecoder(path) - - -def test_bendecoder_empty_stream_bundle(tmp_path: Path) -> None: - # A bundle containing a valid BEN banner but zero frames must be - # openable and produce an empty iterator / zero-length decoder. - bundle = build_bundle(stream_bytes=_ben_bytes_for([], tmp_path), sample_count=0) - path = _write_bundle(tmp_path / "empty.bendl", bundle) - - dec = BenDecoder(path) - assert dec.is_bundle() is True - assert len(dec) == 0 - assert dec.count_samples() == 0 - assert list(dec) == [] - assert dec.asset_names() == [] - assert dec.list_assets() == [] - - -def test_bendecoder_bundle_toc_interleaved_with_iteration(tmp_path: Path) -> None: - # Calling TOC / asset methods in between __next__ calls must not - # break the iterator — the TOC access uses a separate BendlReader, - # not the file handle backing the iterator. - samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=b'{"tag":42}', - is_json=True, - ) - ], - ) - path = _write_bundle(tmp_path / "interleave.bendl", bundle) - - dec = BenDecoder(path) +def test_partial_iteration_then_restart(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6], [7, 8]] + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "partial.bendl", bundle) + dec = BendlDecoder(path) it = iter(dec) - assert next(it) == samples[0] - # TOC read between samples - assert dec.read_metadata() == {"tag": 42} - assert dec.asset_names() == ["metadata.json"] assert next(it) == samples[1] - # And another TOC read - assert dec.read_asset_bytes("metadata.json") == b'{"tag":42}' - assert next(it) == samples[2] - with pytest.raises(StopIteration): - next(it) - - -def test_bendecoder_bundle_subsample_range_rejects_out_of_bounds( - tmp_path: Path, -) -> None: - samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "range_bad.bendl", bundle) - dec = BenDecoder(path) - with pytest.raises(Exception, match="end must be <= number of samples"): - dec.subsample_range(1, 99) - with pytest.raises(Exception, match="1-based"): - dec.subsample_range(0, 1) - - -def test_bendecoder_bundle_subsample_indices_rejects_out_of_bounds( - tmp_path: Path, -) -> None: - samples = [[1, 2], [3, 4]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "idx_bad.bendl", bundle) - dec = BenDecoder(path) - with pytest.raises(Exception, match="number of samples"): - dec.subsample_indices([1, 42]) - # Empty index list is also rejected. - dec2 = BenDecoder(path) - with pytest.raises(Exception, match="must not be empty"): - dec2.subsample_indices([]) - - -def test_bendecoder_bundle_subsample_every_rejects_bad_args(tmp_path: Path) -> None: - samples = [[1], [2], [3]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "every_bad.bendl", bundle) - dec = BenDecoder(path) - with pytest.raises(Exception, match="offset must be <= number of samples"): - dec.subsample_every(1, 99) - dec2 = BenDecoder(path) - with pytest.raises(Exception, match="step and offset must be >= 1"): - dec2.subsample_every(0, 1) - - -def test_bendecoder_plain_stream_len_is_cached(tmp_path: Path) -> None: - # __len__ caches the scan result; calling it twice must not re-scan - # but must return the same answer. - samples = [[1, 2], [3, 4], [5, 6]] - ben_path = tmp_path / "cached.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - dec = BenDecoder(ben_path) - assert len(dec) == len(samples) - assert len(dec) == len(samples) - # Explicit count_samples must also agree. - assert dec.count_samples() == len(samples) - - -def test_bendecoder_detects_very_short_file_as_plain(tmp_path: Path) -> None: - # A 4-byte file cannot start with the BENDL magic; detect_is_bundle - # must return false on UnexpectedEof, after which plain-stream - # decoding fails with a banner error. - path = tmp_path / "tiny.ben" - path.write_bytes(b"abcd") - with pytest.raises(Exception): - BenDecoder(path) - - -def test_bendecoder_empty_file_is_treated_as_plain(tmp_path: Path) -> None: - path = tmp_path / "empty.ben" - path.write_bytes(b"") - with pytest.raises(Exception): - BenDecoder(path) - - -def test_bendecoder_bundle_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: - # read_json_asset on the decoder should reject non-UTF-8 the same as - # error behavior when an asset isn't valid UTF-8. - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1]], tmp_path), - sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_CUSTOM, - name="binary.bin", - payload=b"\xff\xfe\xfd", - ) - ], - ) - path = _write_bundle(tmp_path / "bad_utf8.bendl", bundle) - dec = BenDecoder(path) - # Raw bytes are fine. - assert dec.read_asset_bytes("binary.bin") == b"\xff\xfe\xfd" - with pytest.raises(Exception, match="not valid UTF-8"): - dec.read_json_asset("binary.bin") - - -def test_bendecoder_bundle_read_json_asset_rejects_bad_json(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1]], tmp_path), - sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=b"not json {", - is_json=True, - ) - ], - ) - path = _write_bundle(tmp_path / "bad_json.bendl", bundle) - dec = BenDecoder(path) - with pytest.raises(json.JSONDecodeError): - dec.read_metadata() - - -def test_bendecoder_bundle_graph_asset_is_xz_transparent(tmp_path: Path) -> None: - # A bundle built with BenEncoder compresses the graph asset as xz; - # read_graph() on BenDecoder must still return the decoded JSON. - out = tmp_path / "xz_graph.bendl" - with BenEncoder(out, overwrite=True, variant="standard", graph=SAMPLE_GRAPH) as enc: - enc.write([1, 2, 3]) - dec = BenDecoder(out) - # Spot-check that graph.json was actually stored compressed. - by_name = {a["name"]: a for a in dec.list_assets()} - assert "xz" in by_name["graph.json"]["flags"] - assert dec.read_graph() == SAMPLE_GRAPH - - -def test_bendecoder_bundle_xben_with_assets(tmp_path: Path) -> None: - # XBEN bundles with TOC entries were not previously covered — only - # the plain XBEN-bundle auto-detect case. Verify iteration AND TOC - # access both work on an XBEN bundle. - samples = [[1, 1, 2, 2], [2, 2, 1, 1], [3, 3, 3, 3]] - meta = b'{"variant":"mkv_chain"}' - bundle = build_bundle( - stream_bytes=_xben_bytes_for(samples, tmp_path, variant="mkv_chain"), - sample_count=len(samples), - assignment_format=ASSIGNMENT_FORMAT_XBEN, - assets=[ - _Asset( - asset_type=ASSET_TYPE_METADATA, - name="metadata.json", - payload=meta, - is_json=True, - ) - ], - ) - path = _write_bundle(tmp_path / "xben_assets.bendl", bundle) - - dec = BenDecoder(path) - assert dec.assignment_format() == "xben" - assert dec.asset_names() == ["metadata.json"] - assert dec.read_metadata() == {"variant": "mkv_chain"} assert list(dec) == samples -def test_bendecoder_bundle_subsample_indices_unsorted_warns(tmp_path: Path) -> None: - # The subsample_indices path that sorts+dedupes unsorted input also - # has to work for bundles. Mixing in duplicates should still yield - # the deduplicated selection. - samples = [[i] for i in range(1, 6)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "unsorted.bendl", bundle) - dec = BenDecoder(path) - with pytest.warns(UserWarning, match="sorted and unique"): - dec.subsample_indices([4, 1, 4, 1]) - assert list(dec) == [[1], [4]] - - -def test_bendecoder_plain_xben_assignment_format(tmp_path: Path) -> None: - # `assignment_format()` must report "xben" when opened on a plain - # XBEN stream as well, not only on bundles. - samples = [[1, 1, 2, 2], [2, 2, 1, 1]] - src = tmp_path / "src.jsonl" - _write_jsonl(samples, src) - xben_path = tmp_path / "plain.xben" - encode_jsonl_to_xben( - src, - xben_path, - overwrite=True, - variant="standard", - n_threads=1, - compression_level=1, - ) - with pytest.warns(UserWarning): - dec = BenDecoder(xben_path, mode="xben") - assert dec.is_bundle() is False - assert dec.assignment_format() == "xben" - assert list(dec) == samples - - -def test_bendecoder_incomplete_bundle_rejects_toc_methods_that_need_directory( - tmp_path: Path, -) -> None: - # An incomplete bundle has no directory, so there are no assets to - # list — asset-free surface still returns empty structures, which is - # the contract for finalized asset-free bundles too. Just verify it - # doesn't crash. - samples = [[1, 2]] - stream = _ben_bytes_for(samples, tmp_path) - header = _pack_header( - complete=COMPLETE_NO, - assignment_format=ASSIGNMENT_FORMAT_BEN, - directory_offset=0, - directory_len=0, - stream_offset=HEADER_SIZE, - stream_len=len(stream), - sample_count=-1, - ) - path = tmp_path / "incomplete_toc.bendl" - path.write_bytes(header + stream) - - dec = BenDecoder(path) - assert dec.is_bundle() is True - assert dec.is_complete() is False - assert dec.asset_names() == [] - assert dec.list_assets() == [] - assert dec.read_graph() is None - assert dec.read_metadata() is None - assert dec.read_relabel_map() is None - - -def test_bendecoder_bundle_iteration_can_restart(tmp_path: Path) -> None: - # `__iter__` rebuilds the underlying frame walker so `for x in dec:` - # can be used more than once against a bundle. - samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "twice.bendl", bundle) - dec = BenDecoder(path) - assert list(dec) == samples - # A second pass reopens the stream region from the start. - assert list(dec) == samples - - -def test_bendecoder_plain_stream_iteration_can_restart(tmp_path: Path) -> None: - samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - ben_path = tmp_path / "twice.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - dec = BenDecoder(ben_path) - assert list(dec) == samples - assert list(dec) == samples - - -def test_bendecoder_subsample_range_survives_reiteration(tmp_path: Path) -> None: - # Subsample selections must persist across `__iter__` calls, so - # iterating the same (subsampled) decoder twice gives the same - # filtered window each time. - samples = [[i, i + 1] for i in range(1, 11)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "range_twice.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_range(3, 6) - expected = samples[2:6] - assert list(dec) == expected - assert list(dec) == expected - +def test_subsample_modes(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 11)] + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "sub.bendl", bundle) -def test_bendecoder_subsample_indices_survives_reiteration(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 8)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "idx_twice.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_indices([2, 5, 7]) - expected = [samples[1], samples[4], samples[6]] - assert list(dec) == expected - assert list(dec) == expected + dec = BendlDecoder(path).subsample_range(3, 6) + assert list(dec) == samples[2:6] + assert list(dec) == samples[2:6] # survives reiteration + dec2 = BendlDecoder(path).subsample_indices([1, 4, 8]) + assert list(dec2) == [samples[0], samples[3], samples[7]] -def test_bendecoder_subsample_every_survives_reiteration(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 11)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "every_twice.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_every(3, 2) - expected = [samples[1], samples[4], samples[7]] - assert list(dec) == expected - assert list(dec) == expected - - -def test_bendecoder_resubsample_replaces_previous_selection(tmp_path: Path) -> None: - # Calling subsample_* a second time must replace the first selection - # AND survive reiteration with the new selection. - samples = [[i] for i in range(1, 8)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "reselect.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_range(1, 3) - assert list(dec) == samples[:3] - dec.subsample_indices([4, 7]) - expected = [samples[3], samples[6]] - assert list(dec) == expected - assert list(dec) == expected - - -def test_bendecoder_partial_iteration_then_restart(tmp_path: Path) -> None: - # Consuming part of the iterator and then calling `iter()` / `list()` - # again must restart cleanly from the first sample, not resume - # mid-stream. - samples = [[1, 2], [3, 4], [5, 6], [7, 8]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "partial.bendl", bundle) - dec = BenDecoder(path) - it = iter(dec) - assert next(it) == samples[0] - assert next(it) == samples[1] - # Any new pass (list / for / iter) rebuilds and starts over. - assert list(dec) == samples + dec3 = BendlDecoder(path).subsample_every(3, 2) + assert list(dec3) == [samples[1], samples[4], samples[7]] -def test_bendecoder_count_samples_after_subsample_preserves_len( - tmp_path: Path, -) -> None: - # After `subsample_*`, `len(dec)` must reflect the filtered count. - # Calling `count_samples()` reports the base (unfiltered) count but - # must not clobber the filtered `len(dec)` value. +def test_subsample_count_preserves_filtered_len(tmp_path: Path) -> None: samples = [[i] for i in range(1, 9)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "count_after_sub.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_range(2, 5) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "cnt.bendl", bundle) + dec = BendlDecoder(path).subsample_range(2, 5) assert len(dec) == 4 assert dec.count_samples() == len(samples) - # The filtered length contract must survive a count_samples() call. assert len(dec) == 4 assert list(dec) == samples[1:5] -def test_bendecoder_count_samples_plain_after_subsample_preserves_len( - tmp_path: Path, -) -> None: - # Same contract as above, but on a plain .ben stream to cover the - # non-bundle branch of `ensure_base_len`. - samples = [[i] for i in range(1, 11)] - ben_path = tmp_path / "plain_count.ben" - with BenEncoder( - ben_path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - dec = BenDecoder(ben_path) - dec.subsample_every(3, 1) - expected = samples[::3] - assert len(dec) == len(expected) - assert dec.count_samples() == len(samples) - assert len(dec) == len(expected) - assert list(dec) == expected - - -def test_bendecoder_subsample_then_count_samples_then_reiterate( - tmp_path: Path, -) -> None: - # Composing subsample → count_samples → restart iteration must keep - # the filtered view intact across the restart. - samples = [[i, i + 1] for i in range(1, 9)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "sub_count_restart.bendl", bundle) - dec = BenDecoder(path) - dec.subsample_indices([1, 4, 8]) - assert dec.count_samples() == len(samples) - expected = [samples[0], samples[3], samples[7]] - assert list(dec) == expected - assert list(dec) == expected +def test_subsample_out_of_bounds(tmp_path: Path) -> None: + samples = [[1, 2], [3, 4], [5, 6]] + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "oob.bendl", bundle) + with pytest.raises(Exception, match="end must be <= number of samples"): + BendlDecoder(path).subsample_range(1, 99) + with pytest.raises(Exception, match="number of samples"): + BendlDecoder(path).subsample_indices([1, 42]) + with pytest.warns(UserWarning, match="sorted and unique"): + dec = BendlDecoder(path).subsample_indices([3, 1, 3, 1]) + assert list(dec) == [samples[0], samples[2]] -def test_bendecoder_bundle_read_json_asset_missing_name_raises_keyerror( - tmp_path: Path, -) -> None: - # `read_json_asset` on a valid bundle that does not carry the named - # asset must surface a KeyError, matching `read_asset_bytes`. - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), - sample_count=1, - ) - path = _write_bundle(tmp_path / "missing_json.bendl", bundle) - dec = BenDecoder(path) - with pytest.raises(KeyError, match="nope.json"): - dec.read_json_asset("nope.json") - - -def test_bendecoder_bundle_len_uses_header_fast_path(tmp_path: Path) -> None: - # For a finalized bundle, `len(dec)` should use the O(1) header - # sample_count fast path rather than scanning the stream. We can't - # observe the scan directly, but we can verify the result matches - # the count declared in the header even when the stream is a real - # BEN payload. +def test_len_uses_header_fast_path(tmp_path: Path) -> None: samples = [[i] for i in range(1, 6)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), - sample_count=len(samples), - ) - path = _write_bundle(tmp_path / "fast_len.bendl", bundle) - dec = BenDecoder(path) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + path = _write_bundle(tmp_path / "fast.bendl", bundle) + dec = BendlDecoder(path) assert len(dec) == len(samples) - # A second call returns the cached value and must agree. assert len(dec) == len(samples) assert dec.count_samples() == len(samples) + + +def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: + rng = random.Random(0xFEED_FACE) + for trial in range(15): + n_assets = rng.randint(0, 10) + assets: List[_Asset] = [] + truth: List[Tuple[str, bytes]] = [] + for i in range(n_assets): + payload = rng.randbytes(rng.choice([0, 1, 7, 64, 500])) + assets.append(_Asset(asset_type=ASSET_TYPE_CUSTOM, name=f"t{trial}-a{i}.bin", payload=payload, compress=rng.random() < 0.4)) + truth.append((f"t{trial}-a{i}.bin", payload)) + n_samples = rng.randint(1, 25) + samples = [[rng.randint(1, 8) for _ in range(rng.randint(1, 40))] for _ in range(n_samples)] + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=n_samples, assets=assets) + path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) + dec = BendlDecoder(path) + assert dec.count_samples() == n_samples + assert dec.asset_names() == [name for name, _ in truth] + for name, want in truth: + assert dec.read_asset_bytes(name) == want + extracted = tmp_path / f"fuzz-{trial}.ben" + dec.extract_stream(extracted) + assert list(BenDecoder(extracted, mode="ben")) == samples diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py new file mode 100644 index 0000000..b415dc3 --- /dev/null +++ b/ben-py/tests/test_bundle_api.py @@ -0,0 +1,317 @@ +"""Lifecycle tests for the ``BendlEncoder`` authoring facade. + +Covers create vs append mode, the single-use stream session, asset/graph/metadata +adds before and after the stream, content-type validation, graph↔chain node-count +validation, the assets-only (stream-less) bundle, and the unfinalized-on-exception +recovery path. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from binary_ensemble.bundle import BendlDecoder, BendlEncoder + +EXAMPLE_GRAPH = ( + Path(__file__).resolve().parent.parent + / "docs" + / "user" + / "example_data" + / "gerrymandria.json" +) + + +def _graph(): + return json.loads(EXAMPLE_GRAPH.read_text()) + + +def _n(): + return len(_graph()["nodes"]) + + +# --------------------------------------------------------------------------- +# Round trips +# --------------------------------------------------------------------------- + + +def test_create_round_trip_all_asset_kinds(tmp_path: Path) -> None: + n = _n() + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(6)] + path = tmp_path / "full.bendl" + with BendlEncoder(path, overwrite=True) as enc: + returned = enc.add_graph(_graph(), preprocess_method=None) + enc.add_metadata({"seed": 1234}) + with enc.stream("ben") as stream: + for a in samples: + stream.write(a) + enc.add_asset("notes.txt", "hello world", content_type="text") + enc.add_asset("post.json", json.dumps({"k": [1, 2, 3]}), content_type="json") + + # add_graph hands back a live NetworkX graph. + assert returned.number_of_nodes() == n + dec = BendlDecoder(path) + assert dec.is_complete() + assert dec.count_samples() == len(samples) + assert dec.assignment_format() == "ben" + assert dec.asset_names() == ["graph.json", "metadata.json", "notes.txt", "post.json"] + assert dec.read_metadata() == {"seed": 1234} + assert dec.read_asset_bytes("notes.txt") == b"hello world" + assert dec.read_json_asset("post.json") == {"k": [1, 2, 3]} + assert dec.read_node_permutation_map() is None # raw graph => no perm map + assert list(dec) == samples + + +def test_post_stream_add_commits_immediately(tmp_path: Path) -> None: + path = tmp_path / "commit.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream("ben") as s: + s.write([1, 2]) + enc.add_asset("a.txt", "one", content_type="text") + # A successful post-stream add is durable on disk before close(). + assert BendlDecoder(path).read_asset_bytes("a.txt") == b"one" + enc.add_asset("b.txt", "two", content_type="text") + enc.close() + assert BendlDecoder(path).asset_names() == ["a.txt", "b.txt"] + + +def test_context_manager_and_idempotent_close(tmp_path: Path) -> None: + path = tmp_path / "ctx.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream("ben") as s: + s.write([1, 2, 3]) + enc.close() + enc.close() # idempotent + assert list(BendlDecoder(path)) == [[1, 2, 3]] + + +def test_overwrite_required_for_existing_path(tmp_path: Path) -> None: + path = tmp_path / "exists.bendl" + path.write_bytes(b"existing") + with pytest.raises(OSError, match="already exists"): + BendlEncoder(path, overwrite=False) + enc = BendlEncoder(path, overwrite=True) + enc.close() + assert BendlDecoder(path).is_complete() + + +# --------------------------------------------------------------------------- +# Assets-only bundle (stream-less close) +# --------------------------------------------------------------------------- + + +def test_stream_less_close_yields_assets_only_bundle(tmp_path: Path) -> None: + path = tmp_path / "assets_only.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_metadata({"only": "assets"}) + enc.close() + dec = BendlDecoder(path) + assert dec.is_complete() + assert dec.count_samples() == 0 + assert len(dec) == 0 + assert list(dec) == [] + assert dec.read_metadata() == {"only": "assets"} + + +def test_empty_close_with_no_assets(tmp_path: Path) -> None: + path = tmp_path / "truly_empty.bendl" + with BendlEncoder(path, overwrite=True): + pass + dec = BendlDecoder(path) + assert dec.is_complete() + assert dec.asset_names() == [] + assert list(dec) == [] + + +# --------------------------------------------------------------------------- +# Exception inside stream context +# --------------------------------------------------------------------------- + + +def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: + path = tmp_path / "fail.bendl" + with pytest.raises(RuntimeError, match="boom"): + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1, 2, 3]) + raise RuntimeError("boom") + dec = BendlDecoder(path) + assert dec.is_complete() is False + # Verified extraction refuses an unfinalized stream... + with pytest.raises(Exception, match="unfinalized"): + dec.extract_stream(tmp_path / "recovered.ben") + # ...but the partial write is recoverable. + dec.extract_stream(tmp_path / "recovered.ben", overwrite=True, allow_unfinalized=True) + assert (tmp_path / "recovered.ben").stat().st_size > 0 + + +# --------------------------------------------------------------------------- +# content_type validation +# --------------------------------------------------------------------------- + + +def test_add_asset_content_type_validation(tmp_path: Path) -> None: + enc = BendlEncoder(tmp_path / "v.bendl", overwrite=True) + with pytest.raises(ValueError, match="must be 'json' or 'text'"): + enc.add_asset("x", b"data", content_type="binary") + with pytest.raises(ValueError, match="valid UTF-8 JSON"): + enc.add_asset("bad.json", "not json", content_type="json") + with pytest.raises(ValueError, match="valid UTF-8"): + enc.add_asset("bad.txt", b"\xff\xfe", content_type="text") + # Valid forms succeed. + enc.add_asset("ok.json", '{"a":1}', content_type="json") + enc.add_asset("ok.txt", "fine", content_type="text") + enc.close() + dec = BendlDecoder(tmp_path / "v.bendl") + assert dec.read_json_asset("ok.json") == {"a": 1} + flags = {a["name"]: a["flags"] for a in dec.list_assets()} + assert "json" in flags["ok.json"] + assert "json" not in flags["ok.txt"] + + +# --------------------------------------------------------------------------- +# add_graph reorder / raw / validation +# --------------------------------------------------------------------------- + + +def test_add_graph_reorder_emits_graph_and_permutation_map(tmp_path: Path) -> None: + n = _n() + path = tmp_path / "reord.bendl" + enc = BendlEncoder(path, overwrite=True) + reordered = enc.add_graph(_graph(), preprocess_method="rcm") + with enc.stream("ben") as s: + s.write([1] * n) + enc.close() + + dec = BendlDecoder(path) + assert dec.asset_names() == ["graph.json", "node_permutation_map.json"] + # add_graph and read_graph both hand back live NetworkX graphs with matching + # nodes in the same (reordered) order. + assert list(reordered.nodes) == list(dec.read_graph().nodes) + assert reordered.number_of_nodes() == n + pmap = dec.read_node_permutation_map() + mapping = pmap["node_permutation_old_to_new"] + assert len(mapping) == n + # old->new is a bijection over [0, n). + assert sorted(int(k) for k in mapping) == list(range(n)) + assert sorted(mapping.values()) == list(range(n)) + + +def test_add_graph_none_stores_raw_without_permutation_map(tmp_path: Path) -> None: + path = tmp_path / "raw.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_graph(_graph(), preprocess_method=None) + enc.close() + dec = BendlDecoder(path) + assert dec.asset_names() == ["graph.json"] + assert dec.read_node_permutation_map() is None + + +def test_add_graph_node_count_mismatch_raises(tmp_path: Path) -> None: + n = _n() + enc = BendlEncoder(tmp_path / "nc.bendl", overwrite=True) + enc.add_graph(_graph(), preprocess_method=None) + with enc.stream("ben") as s: + s.write([1] * n) # correct + with pytest.raises(ValueError, match="does not match graph node count"): + s.write([1] * (n - 1)) + + +def test_reorder_add_graph_after_stream_raises_but_raw_succeeds(tmp_path: Path) -> None: + n = _n() + path = tmp_path / "after.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream("ben") as s: + s.write([1] * n) + with pytest.raises(Exception, match="only allowed before"): + enc.add_graph(_graph(), preprocess_method="rcm") + # A raw graph attaches fine post-stream. + enc.add_graph(_graph(), preprocess_method=None) + enc.close() + assert BendlDecoder(path).asset_names() == ["graph.json"] + + +def test_duplicate_graph_raises(tmp_path: Path) -> None: + enc = BendlEncoder(tmp_path / "dup.bendl", overwrite=True) + enc.add_graph(_graph(), preprocess_method=None) + with pytest.raises(Exception, match="duplicate singleton"): + enc.add_graph(_graph(), preprocess_method=None) + + +# --------------------------------------------------------------------------- +# Stream-format and second-stream guards +# --------------------------------------------------------------------------- + + +def test_stream_rejects_non_ben_format(tmp_path: Path) -> None: + enc = BendlEncoder(tmp_path / "fmt.bendl", overwrite=True) + with pytest.raises(ValueError, match="must be 'ben'"): + enc.stream("xben") + + +def test_second_stream_refused(tmp_path: Path) -> None: + path = tmp_path / "two.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream("ben") as s: + s.write([1, 2]) + with pytest.raises(Exception, match="already been written"): + enc.stream("ben") + + +# --------------------------------------------------------------------------- +# Append mode +# --------------------------------------------------------------------------- + + +def test_append_mode_adds_assets(tmp_path: Path) -> None: + path = tmp_path / "app.bendl" + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1, 2, 3]) + + ap = BendlEncoder.append(path) + ap.add_metadata({"appended": True}) + ap.add_asset("late.txt", "added later", content_type="text") + ap.close() + + dec = BendlDecoder(path) + assert dec.read_metadata() == {"appended": True} + assert dec.read_asset_bytes("late.txt") == b"added later" + assert list(dec) == [[1, 2, 3]] + + +def test_append_mode_disallows_stream(tmp_path: Path) -> None: + path = tmp_path / "app2.bendl" + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1]) + ap = BendlEncoder.append(path) + with pytest.raises(Exception, match="append mode"): + ap.stream("ben") + + +def test_append_mode_reorder_graph_raises(tmp_path: Path) -> None: + path = tmp_path / "app3.bendl" + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1] * _n()) + ap = BendlEncoder.append(path) + with pytest.raises(Exception, match="only allowed before"): + ap.add_graph(_graph(), preprocess_method="rcm") + # Raw graph append works. + ap.add_graph(_graph(), preprocess_method=None) + ap.close() + assert "graph.json" in BendlDecoder(path).asset_names() + + +def test_append_on_unfinalized_bundle_raises(tmp_path: Path) -> None: + path = tmp_path / "unfin.bendl" + with pytest.raises(RuntimeError): + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1, 2]) + raise RuntimeError("stop") + with pytest.raises(Exception): + BendlEncoder.append(path) diff --git a/ben-py/tests/test_graph.py b/ben-py/tests/test_graph.py new file mode 100644 index 0000000..58c111d --- /dev/null +++ b/ben-py/tests/test_graph.py @@ -0,0 +1,83 @@ +"""Tests for the standalone ``binary_ensemble.graph`` reordering utilities.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from binary_ensemble import graph as g + +EXAMPLE_GRAPH = ( + Path(__file__).resolve().parent.parent + / "docs" + / "user" + / "example_data" + / "gerrymandria.json" +) + + +def _graph(): + return json.loads(EXAMPLE_GRAPH.read_text()) + + +def _n(): + return len(_graph()["nodes"]) + + +def _check_consistent(reordered, pmap, n): + # reordered is a live NetworkX graph. + assert reordered.number_of_nodes() == n + mapping = pmap["node_permutation_old_to_new"] + assert len(mapping) == n + # old->new is a bijection over [0, n). + assert sorted(int(k) for k in mapping) == list(range(n)) + assert sorted(mapping.values()) == list(range(n)) + + +def test_reorder_rcm() -> None: + n = _n() + reordered, pmap = g.reorder(_graph(), "rcm") + _check_consistent(reordered, pmap, n) + assert pmap["ordering_method"] == "reverse-cuthill-mckee" + assert pmap["key"] is None + + +def test_reorder_mlc() -> None: + n = _n() + reordered, pmap = g.reorder_multi_level_cluster(_graph()) + _check_consistent(reordered, pmap, n) + assert pmap["ordering_method"] == "multi-level-cluster" + + +def test_reorder_reverse_cuthill_mckee_helper() -> None: + n = _n() + reordered, pmap = g.reorder_reverse_cuthill_mckee(_graph()) + _check_consistent(reordered, pmap, n) + assert pmap["ordering_method"] == "reverse-cuthill-mckee" + + +def test_reorder_by_key_id() -> None: + n = _n() + reordered, pmap = g.reorder_by_key(_graph(), "id") + _check_consistent(reordered, pmap, n) + assert pmap["key"] == "id" + assert pmap["ordering_method"] is None + + +def test_reorder_accepts_bytes_and_path() -> None: + n = _n() + raw = EXAMPLE_GRAPH.read_bytes() + r1, p1 = g.reorder(raw, "rcm") + r2, p2 = g.reorder(str(EXAMPLE_GRAPH), "rcm") + _check_consistent(r1, p1, n) + # path and bytes inputs agree (NetworkX graphs compare by identity, so check + # node order and the permutation map instead). + assert list(r1.nodes) == list(r2.nodes) + assert p1 == p2 + + +def test_reorder_rejects_unparseable_graph() -> None: + with pytest.raises(Exception, match="Failed to reorder graph"): + g.reorder(b"not valid json at all", "rcm") diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index 38b060c..5a68aa3 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -1,4 +1,10 @@ -import io +"""Pipeline, plain-stream encoder/decoder, and subsampling tests. + +Covers the whole-file ``codec`` transforms and the stream-only ``BenEncoder`` / +``BenDecoder``. Bundle (``.bendl``) behavior lives in ``test_bundle.py`` and +``test_bundle_api.py``. +""" + import json import random from pathlib import Path @@ -6,23 +12,21 @@ import pytest -import binary_ensemble from binary_ensemble import ( BenDecoder, BenEncoder, - encode_ben_to_xben, - encode_jsonl_to_ben, - encode_jsonl_to_xben, decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, + encode_ben_to_xben, + encode_jsonl_to_ben, + encode_jsonl_to_xben, ) # ---------- Helpers ---------- def expand_rle(rle: Iterable[tuple[int, int]], cap: int) -> list[int]: - """Expand RLE pairs into a flat assignment vector, capped at cap.""" out: List[int] = [] for val, length in rle: take = min(length, max(0, cap - len(out))) @@ -35,16 +39,13 @@ def expand_rle(rle: Iterable[tuple[int, int]], cap: int) -> list[int]: def gen_assignment( rng: random.Random, max_val: int, max_run: int, max_len: int ) -> list[int]: - """Generate one assignment by RLE with bounded length.""" rle = [] - # Keep it small/fast: up to ~50 runs n_runs = rng.randint(10, 50) for _ in range(n_runs): val = rng.randint(1, max_val) length = rng.randint(1, max_run) rle.append((val, length)) v = expand_rle(rle, max_len) - # Ensure non-empty return v or [1] @@ -57,14 +58,9 @@ def gen_sequence_standard( def gen_sequence_mkv( rng: random.Random, n_samples: int, *, max_val=50, max_run=300, max_len=1500 ) -> list[list[int]]: - """ - Like Rust test: inject duplicate exact assignments periodically to - exercise MKV grouping. Ensures total length n_samples. - """ seq: list[list[int]] = [] while len(seq) < n_samples: base = gen_assignment(rng, max_val, max_run, max_len) - # repeat this assignment 1..10 times (but don’t exceed n_samples) reps = min(rng.randint(1, 10), n_samples - len(seq)) seq.extend([base] * reps) return seq @@ -88,497 +84,234 @@ def read_jsonl_assignments(path: Path) -> list[list[int]]: return out -# ---------- Tests mirroring Rust ---------- +# ---------- Codec pipelines ---------- def test_ben_pipeline(tmp_path: Path) -> None: rng = random.Random(129530786) - n_samples = 100 - seq = gen_sequence_standard(rng, n_samples) - + seq = gen_sequence_standard(rng, 100) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - ben = tmp_path / "out.ben" out_jsonl = tmp_path / "round.jsonl" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") decode_ben_to_jsonl(ben, out_jsonl, overwrite=True) - assert src.read_bytes() == out_jsonl.read_bytes() def test_mkvben_pipeline(tmp_path: Path) -> None: rng = random.Random(129530786) - n_samples = 100 - seq = gen_sequence_mkv(rng, n_samples) - + seq = gen_sequence_mkv(rng, 100) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - ben = tmp_path / "out_mkv.ben" out_jsonl = tmp_path / "round_mkv.jsonl" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") decode_ben_to_jsonl(ben, out_jsonl, overwrite=True) - assert src.read_bytes() == out_jsonl.read_bytes() def test_xben_pipeline(tmp_path: Path) -> None: rng = random.Random(129530786) - n_samples = 50 - seq = gen_sequence_standard(rng, n_samples) - + seq = gen_sequence_standard(rng, 50) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - xben = tmp_path / "out.xben" ben = tmp_path / "out.ben" round_jsonl = tmp_path / "round.jsonl" - encode_jsonl_to_xben( src, xben, overwrite=True, variant="standard", n_threads=1, compression_level=1 ) decode_xben_to_ben(xben, ben, overwrite=True) decode_ben_to_jsonl(ben, round_jsonl, overwrite=True) - assert src.read_bytes() == round_jsonl.read_bytes() -def test_xmkvben_pipeline(tmp_path: Path) -> None: - rng = random.Random(129530786) - n_samples = 50 - seq = gen_sequence_mkv(rng, n_samples) - +def test_ben_to_xben_and_back(tmp_path: Path) -> None: + rng = random.Random(314159) + seq = gen_sequence_mkv(rng, 80) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - - xben = tmp_path / "out_mkv.xben" - ben = tmp_path / "out_mkv.ben" - round_jsonl = tmp_path / "round_mkv.jsonl" - - encode_jsonl_to_xben( - src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 - ) - decode_xben_to_ben(xben, ben, overwrite=True) - decode_ben_to_jsonl(ben, round_jsonl, overwrite=True) - - assert src.read_bytes() == round_jsonl.read_bytes() + ben = tmp_path / "in.ben" + xben = tmp_path / "roundtrip.xben" + ben2 = tmp_path / "out.ben" + out_jsonl = tmp_path / "out.jsonl" + encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") + encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) + decode_xben_to_ben(xben, ben2, overwrite=True) + decode_ben_to_jsonl(ben2, out_jsonl, overwrite=True) + assert src.read_bytes() == out_jsonl.read_bytes() -# ---------- Iterator/decoder parity with JSONL ---------- +# ---------- Iterator parity ---------- def test_decoder_iterator_matches_jsonl_ben(tmp_path: Path) -> None: rng = random.Random(129530786) - n_samples = 120 - seq = gen_sequence_standard(rng, n_samples) - + seq = gen_sequence_standard(rng, 120) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - ben = tmp_path / "out.ben" encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - # Baseline: assignments from JSONL baseline = read_jsonl_assignments(src) - - # BenDecoder over BEN - got: list[list[int]] = [] - dec = BenDecoder(ben, mode="ben") - for a in dec: - got.append(a) - - assert got == baseline + assert list(BenDecoder(ben, mode="ben")) == baseline def test_decoder_iterator_matches_jsonl_xben(tmp_path: Path) -> None: rng = random.Random(129530786) - n_samples = 120 - seq = gen_sequence_mkv(rng, n_samples) - + seq = gen_sequence_mkv(rng, 120) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - xben = tmp_path / "out.xben" encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) - - # Baseline via full decompression roundtrip = tmp_path / "direct.jsonl" decode_xben_to_jsonl(xben, roundtrip, overwrite=True) baseline = read_jsonl_assignments(roundtrip) + assert list(BenDecoder(xben, mode="xben")) == baseline - # Iterator directly over XBEN - got: list[list[int]] = [] - dec = BenDecoder(xben, mode="xben") - for a in dec: - got.append(a) - assert got == baseline - - -# ---------- Subsampling tests ---------- +# ---------- Subsampling on plain streams ---------- def test_subsample_indices(tmp_path: Path) -> None: rng = random.Random(2_022_11_11) - n_samples = 200 - seq = gen_sequence_mkv(rng, n_samples) - + seq = gen_sequence_mkv(rng, 200) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - xben = tmp_path / "out.xben" encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) - - # choose indices: 1,4,7,… - want = list(range(1, n_samples + 1, 3)) + want = list(range(1, 201, 3)) baseline = [seq[i - 1] for i in want] - - got: list[list[int]] = [] - dec = BenDecoder(xben, mode="xben").subsample_indices(want) - for a in dec: - got.append(a) - - assert got == baseline + assert list(BenDecoder(xben, mode="xben").subsample_indices(want)) == baseline def test_subsample_range(tmp_path: Path) -> None: rng = random.Random(42) - n_samples = 150 - seq = gen_sequence_mkv(rng, n_samples) - + seq = gen_sequence_mkv(rng, 150) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - ben = tmp_path / "out.ben" encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - - start, end = 11, 77 - baseline = seq[start - 1 : end] - - got: list[list[int]] = [] - dec = BenDecoder(ben, mode="ben").subsample_range(start, end) - for a in dec: - got.append(a) - - assert got == baseline + assert list(BenDecoder(ben, mode="ben").subsample_range(11, 77)) == seq[10:77] def test_subsample_every(tmp_path: Path) -> None: rng = random.Random(1337) - n_samples = 180 - seq = gen_sequence_mkv(rng, n_samples) - + seq = gen_sequence_mkv(rng, 180) src = tmp_path / "src.jsonl" write_jsonl(seq, src) - xben = tmp_path / "out.xben" encode_jsonl_to_xben( src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 ) + baseline = [seq[i - 1] for i in range(2, 181, 5)] + assert list(BenDecoder(xben, mode="xben").subsample_every(5, 2)) == baseline - step, offset = 5, 2 # keep 2,7,12,… - baseline = [seq[i - 1] for i in range(offset, n_samples + 1, step)] - got: list[list[int]] = [] - dec = BenDecoder(xben, mode="xben").subsample_every(step, offset) - for a in dec: - got.append(a) +def test_plain_stream_iteration_restart(tmp_path: Path) -> None: + samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + path = tmp_path / "twice.ben" + with BenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + dec = BenDecoder(path) + assert list(dec) == samples + assert list(dec) == samples + - assert got == baseline +def test_plain_stream_subsample_survives_reiteration(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 8)] + path = tmp_path / "re.ben" + with BenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + dec = BenDecoder(path).subsample_every(2, offset=1) + expected = [[1], [3], [5], [7]] + assert list(dec) == expected + assert list(dec) == expected -# ---------- Encoder surface (context manager & write) ---------- +# ---------- Encoder surface ---------- def test_benencoder_roundtrip(tmp_path: Path) -> None: rng = random.Random(777) - n_samples = 60 - seq = gen_sequence_standard(rng, n_samples) - + seq = gen_sequence_standard(rng, 60) ben = tmp_path / "out.ben" - with BenEncoder(ben, overwrite=True, variant="standard", ben_file_only=True) as enc: + with BenEncoder(ben, overwrite=True, variant="standard") as enc: for a in seq: enc.write(a) + assert list(BenDecoder(ben, mode="ben")) == seq - # Use decoder to read back - got = list(BenDecoder(ben, mode="ben")) - assert got == seq - - -# ---------- BEN -> XBEN convenience conversion ---------- - - -def test_ben_to_xben_and_back(tmp_path: Path) -> None: - rng = random.Random(314159) - n_samples = 80 - seq = gen_sequence_mkv(rng, n_samples) - - src = tmp_path / "src.jsonl" - write_jsonl(seq, src) - - ben = tmp_path / "in.ben" - xben = tmp_path / "roundtrip.xben" - ben2 = tmp_path / "out.ben" - out_jsonl = tmp_path / "out.jsonl" - - encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) - decode_xben_to_ben(xben, ben2, overwrite=True) - decode_ben_to_jsonl(ben2, out_jsonl, overwrite=True) - - assert src.read_bytes() == out_jsonl.read_bytes() - - -def test_decoder_subsample_indices_rejects_empty_input(tmp_path: Path) -> None: - rng = random.Random(123) - seq = gen_sequence_standard(rng, 10) - - src = tmp_path / "src.jsonl" - write_jsonl(seq, src) - - ben = tmp_path / "out.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - dec = BenDecoder(ben, mode="ben") - with pytest.raises(Exception, match="indices must not be empty"): - dec.subsample_indices([]) - - -def test_decoder_subsample_every_rejects_offset_past_end(tmp_path: Path) -> None: - rng = random.Random(456) - seq = gen_sequence_standard(rng, 10) - - src = tmp_path / "src.jsonl" - write_jsonl(seq, src) - - ben = tmp_path / "out.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - dec = BenDecoder(ben, mode="ben") - with pytest.raises(Exception, match="offset must be <="): - dec.subsample_every(2, 99) - - -def test_compress_helpers_reject_unknown_variants(tmp_path: Path) -> None: - rng = random.Random(789) - seq = gen_sequence_standard(rng, 5) - - src = tmp_path / "src.jsonl" - write_jsonl(seq, src) - - with pytest.raises(ValueError, match="Unknown variant"): - encode_jsonl_to_ben(src, tmp_path / "out.ben", overwrite=True, variant="weird") - with pytest.raises(ValueError, match="Unknown variant"): - encode_jsonl_to_xben( - src, tmp_path / "out.xben", overwrite=True, variant="weird" - ) - - -def test_module_exports_are_exposed() -> None: - expected = { - "BenDecoder", - "BenEncoder", - "encode_jsonl_to_ben", - "encode_ben_to_xben", - "encode_jsonl_to_xben", - "decode_ben_to_jsonl", - "decode_xben_to_jsonl", - "decode_xben_to_ben", - } - assert expected.issubset(set(binary_ensemble.__all__)) - for name in expected: - assert hasattr(binary_ensemble, name) - assert hasattr(binary_ensemble, "_core") - - -def test_benencoder_defaults_and_markov_alias_work(tmp_path: Path) -> None: +def test_benencoder_default_and_markov_alias(tmp_path: Path) -> None: samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3]] - default_ben = tmp_path / "default.ben" - with BenEncoder(default_ben, overwrite=True, ben_file_only=True) as enc: - for sample in samples: - enc.write(sample) + with BenEncoder(default_ben, overwrite=True) as enc: + for s in samples: + enc.write(s) assert list(BenDecoder(default_ben, mode="ben")) == samples src = tmp_path / "src.jsonl" write_jsonl(samples, src) - alias_ben = tmp_path / "alias.ben" - alias_xben = tmp_path / "alias.xben" encode_jsonl_to_ben(src, alias_ben, overwrite=True, variant="markov") - encode_jsonl_to_xben( - src, - alias_xben, - overwrite=True, - variant="markov", - n_threads=1, - compression_level=1, - ) assert list(BenDecoder(alias_ben, mode="ben")) == samples - assert list(BenDecoder(alias_xben, mode="xben")) == samples + + +def test_benencoder_produces_plain_stream_not_bundle(tmp_path: Path) -> None: + # BenEncoder must never emit BENDL framing. + out = tmp_path / "plain.ben" + with BenEncoder(out, overwrite=True, variant="standard") as enc: + enc.write([1, 2, 3]) + assert not out.read_bytes().startswith(b"BENDL") + assert list(BenDecoder(out, mode="ben")) == [[1, 2, 3]] def test_benencoder_close_and_write_error_paths(tmp_path: Path) -> None: out = tmp_path / "out.ben" - enc = BenEncoder(out, overwrite=True, variant="standard", ben_file_only=True) + enc = BenEncoder(out, overwrite=True, variant="standard") enc.write([1, 2, 3]) enc.close() - enc.close() + enc.close() # idempotent with pytest.raises(OSError, match="already been closed"): enc.write([1, 2, 3]) - ctx_path = tmp_path / "ctx.ben" - with BenEncoder( - ctx_path, overwrite=True, variant="standard", ben_file_only=True - ) as ctx_enc: - ctx_enc.write([4, 5, 6]) - assert list(BenDecoder(ctx_path, mode="ben")) == [[4, 5, 6]] - - invalid_path = tmp_path / "invalid_assignment.ben" - with BenEncoder( - invalid_path, overwrite=True, variant="standard", ben_file_only=True - ) as invalid_enc: + invalid_path = tmp_path / "invalid.ben" + with BenEncoder(invalid_path, overwrite=True, variant="standard") as inv: with pytest.raises(Exception): - invalid_enc.write([-1]) + inv.write([-1]) with pytest.raises(Exception): - invalid_enc.write([65536]) + inv.write([65536]) def test_benencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> None: out = tmp_path / "out.ben" out.write_bytes(b"existing") - with pytest.raises(ValueError, match="Unknown variant"): BenEncoder(tmp_path / "bad.ben", overwrite=False, variant="weird") - with pytest.raises(OSError, match="already exists"): BenEncoder(out, overwrite=False, variant="standard") - with pytest.raises(OSError, match="Failed to create"): - BenEncoder( - tmp_path / "missing-dir" / "out.ben", - overwrite=False, - variant="standard", - ) - - -def test_compress_helpers_reject_same_path_missing_input_and_bad_json( - tmp_path: Path, -) -> None: - src = tmp_path / "src.jsonl" - write_jsonl([[1, 1, 2]], src) - - with pytest.raises(OSError, match="must differ"): - encode_jsonl_to_ben(src, src, overwrite=True, variant="standard") - - with pytest.raises(OSError, match="does not exist"): - encode_jsonl_to_ben( - tmp_path / "missing.jsonl", - tmp_path / "out.ben", - overwrite=True, - variant="standard", - ) - - bad_json = tmp_path / "bad.jsonl" - bad_json.write_text("not json\n", encoding="utf-8") - with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): - encode_jsonl_to_ben( - bad_json, - tmp_path / "bad.ben", - overwrite=True, - variant="standard", - ) - - bad_assign = tmp_path / "bad_assign.jsonl" - bad_assign.write_text('{"assignment":"bad","sample":1}\n', encoding="utf-8") - with pytest.raises(OSError, match="Failed to convert JSONL to XBEN"): - encode_jsonl_to_xben( - bad_assign, - tmp_path / "bad.xben", - overwrite=True, - variant="standard", - n_threads=1, - compression_level=1, - ) - - with pytest.raises(OSError, match="Failed to create"): - encode_jsonl_to_ben( - src, - tmp_path / "missing-dir" / "out.ben", - overwrite=True, - variant="standard", - ) - + BenEncoder(tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard") -def test_encode_ben_to_xben_rejects_same_path_missing_input_invalid_header_and_existing_output( - tmp_path: Path, -) -> None: - with pytest.raises(OSError, match="does not exist"): - encode_ben_to_xben( - tmp_path / "missing.ben", - tmp_path / "out.xben", - overwrite=True, - n_threads=1, - compression_level=1, - ) - - bad_ben = tmp_path / "bad.ben" - bad_ben.write_bytes(b"garbage") - - with pytest.raises(OSError, match="must differ"): - encode_ben_to_xben( - bad_ben, - bad_ben, - overwrite=True, - n_threads=1, - compression_level=1, - ) - - with pytest.raises(OSError, match="Failed to convert BEN to XBEN"): - encode_ben_to_xben( - bad_ben, - tmp_path / "out.xben", - overwrite=True, - n_threads=1, - compression_level=1, - ) - src = tmp_path / "src.jsonl" - write_jsonl([[1, 2, 3]], src) - ben = tmp_path / "good.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - out = tmp_path / "exists.xben" - out.write_bytes(b"exists") - with pytest.raises(OSError, match="already exists"): - encode_ben_to_xben( - ben, - out, - overwrite=False, - n_threads=1, - compression_level=1, - ) +# ---------- Decoder error / laziness paths ---------- def test_decoder_constructor_and_mode_errors(tmp_path: Path) -> None: with pytest.raises(Exception, match="Unknown mode"): BenDecoder(tmp_path / "missing.ben", mode="weird") - with pytest.raises(OSError, match="Failed to open"): BenDecoder(tmp_path / "missing.ben", mode="ben") - bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(Exception, match="Failed to create BenDecoder"): BenDecoder(bad_ben, mode="ben") - bad_xben = tmp_path / "bad.xben" bad_xben.write_bytes(b"garbage") with pytest.warns(UserWarning, match="XBEN may take a second"): @@ -586,103 +319,61 @@ def test_decoder_constructor_and_mode_errors(tmp_path: Path) -> None: BenDecoder(bad_xben, mode="xben") -def test_decoder_len_and_count_samples_are_lazy_and_cached(tmp_path: Path) -> None: +def test_decoder_len_and_count_are_lazy_and_cached(tmp_path: Path) -> None: samples = [[1, 1, 2], [1, 1, 2], [2, 3, 3], [4]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) - ben = tmp_path / "out.ben" encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - dec = BenDecoder(ben, mode="ben") assert len(dec) == len(samples) assert dec.count_samples() == len(samples) assert list(dec) == samples - gone = BenDecoder(ben, mode="ben") - assert len(gone) == len(samples) - ben.unlink() - with pytest.raises(Exception, match="Failed to create frame iterator"): - gone.subsample_range(1, 2) - - -def test_decoder_xben_len_count_and_warning(tmp_path: Path) -> None: - samples = [[1, 1], [1, 1], [2, 2], [3, 3]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - xben = tmp_path / "out.xben" - encode_jsonl_to_xben( - src, xben, overwrite=True, variant="mkv_chain", n_threads=1, compression_level=1 - ) - with pytest.warns(UserWarning, match="XBEN may take a second"): - dec = BenDecoder(xben, mode="xben") - assert len(dec) == len(samples) +def test_decoder_count_after_subsample_preserves_len(tmp_path: Path) -> None: + samples = [[i] for i in range(1, 11)] + path = tmp_path / "plain.ben" + with BenEncoder(path, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + dec = BenDecoder(path).subsample_every(3, 1) + expected = samples[::3] + assert len(dec) == len(expected) assert dec.count_samples() == len(samples) - assert list(dec) == samples + assert len(dec) == len(expected) + assert list(dec) == expected -def test_decoder_subsample_validations_and_warning_paths(tmp_path: Path) -> None: +def test_decoder_subsample_validations(tmp_path: Path) -> None: samples = [[1], [2], [3], [4], [5]] src = tmp_path / "src.jsonl" write_jsonl(samples, src) - ben = tmp_path / "out.ben" encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") with pytest.warns(UserWarning, match="sorted and unique"): got = list(BenDecoder(ben, mode="ben").subsample_indices([5, 1, 1, 3])) assert got == [samples[0], samples[2], samples[4]] - with pytest.raises(Exception, match="indices must be 1-based"): BenDecoder(ben, mode="ben").subsample_indices([0, 1]) - - with pytest.raises(Exception): - BenDecoder(ben, mode="ben").subsample_indices([-1]) - + with pytest.raises(Exception, match="indices must not be empty"): + BenDecoder(ben, mode="ben").subsample_indices([]) with pytest.raises(Exception, match="indices must be <="): BenDecoder(ben, mode="ben").subsample_indices([6]) - with pytest.raises(Exception, match="range must be 1-based"): BenDecoder(ben, mode="ben").subsample_range(0, 2) - - with pytest.raises(Exception): - BenDecoder(ben, mode="ben").subsample_range(-1, 2) - with pytest.raises(Exception, match="end must be <="): BenDecoder(ben, mode="ben").subsample_range(1, 99) - with pytest.raises(Exception, match="step and offset must be >= 1"): BenDecoder(ben, mode="ben").subsample_every(0, 1) - with pytest.raises(Exception, match="offset must be <="): BenDecoder(ben, mode="ben").subsample_every(2, 99) - assert list(BenDecoder(ben, mode="ben").subsample_range(2, 4)) == samples[1:4] - assert list(BenDecoder(ben, mode="ben").subsample_every(2, 2)) == samples[1::2] - - -def test_decoder_count_and_subsample_fail_cleanly_if_source_disappears( - tmp_path: Path, -) -> None: - src = tmp_path / "src.jsonl" - write_jsonl([[1], [2], [3]], src) - - ben = tmp_path / "out.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - dec = BenDecoder(ben, mode="ben") - ben.unlink() - - with pytest.raises(Exception, match="Failed to count samples"): - dec.count_samples() - def test_decoder_reports_zero_count_and_bad_frame_errors(tmp_path: Path) -> None: src = tmp_path / "src.jsonl" write_jsonl([[1, 1, 2]], src) - mkv_ben = tmp_path / "mkv.ben" encode_jsonl_to_ben(src, mkv_ben, overwrite=True, variant="mkv_chain") data = bytearray(mkv_ben.read_bytes()) @@ -696,846 +387,64 @@ def test_decoder_reports_zero_count_and_bad_frame_errors(tmp_path: Path) -> None truncated = standard_ben.read_bytes()[:-1] bad_ben = tmp_path / "truncated.ben" bad_ben.write_bytes(truncated) - dec = BenDecoder(bad_ben, mode="ben") with pytest.raises(Exception, match="Error decoding next item"): - next(iter(dec)) + next(iter(BenDecoder(bad_ben, mode="ben"))) + + +# ---------- Codec error paths ---------- + + +def test_codec_helpers_reject_unknown_variants(tmp_path: Path) -> None: + src = tmp_path / "src.jsonl" + write_jsonl([[1, 1, 2]], src) + with pytest.raises(ValueError, match="Unknown variant"): + encode_jsonl_to_ben(src, tmp_path / "o.ben", overwrite=True, variant="weird") + with pytest.raises(ValueError, match="Unknown variant"): + encode_jsonl_to_xben(src, tmp_path / "o.xben", overwrite=True, variant="weird") -def test_decode_helpers_reject_same_paths_missing_inputs_existing_output_and_invalid_headers( - tmp_path: Path, -) -> None: +def test_codec_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: Path) -> None: + src = tmp_path / "src.jsonl" + write_jsonl([[1, 1, 2]], src) + with pytest.raises(OSError, match="must differ"): + encode_jsonl_to_ben(src, src, overwrite=True, variant="standard") with pytest.raises(OSError, match="does not exist"): - decode_ben_to_jsonl( - tmp_path / "missing.ben", - tmp_path / "out.jsonl", - overwrite=True, - ) + encode_jsonl_to_ben(tmp_path / "missing.jsonl", tmp_path / "o.ben", overwrite=True) + bad_json = tmp_path / "bad.jsonl" + bad_json.write_text("not json\n", encoding="utf-8") + with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): + encode_jsonl_to_ben(bad_json, tmp_path / "bad.ben", overwrite=True) + +def test_encode_ben_to_xben_error_paths(tmp_path: Path) -> None: + with pytest.raises(OSError, match="does not exist"): + encode_ben_to_xben(tmp_path / "missing.ben", tmp_path / "o.xben", overwrite=True) bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") - with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): - decode_ben_to_jsonl( - bad_ben, - tmp_path / "out.jsonl", - overwrite=True, - ) + with pytest.raises(OSError, match="must differ"): + encode_ben_to_xben(bad_ben, bad_ben, overwrite=True) + with pytest.raises(OSError, match="Failed to convert BEN to XBEN"): + encode_ben_to_xben(bad_ben, tmp_path / "o.xben", overwrite=True) + +def test_decode_helpers_error_paths(tmp_path: Path) -> None: + with pytest.raises(OSError, match="does not exist"): + decode_ben_to_jsonl(tmp_path / "missing.ben", tmp_path / "o.jsonl", overwrite=True) + bad_ben = tmp_path / "bad.ben" + bad_ben.write_bytes(b"garbage") + with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): + decode_ben_to_jsonl(bad_ben, tmp_path / "o.jsonl", overwrite=True) bad_xben = tmp_path / "bad.xben" bad_xben.write_bytes(b"garbage") with pytest.raises(OSError, match="Failed to convert XBEN to BEN"): - decode_xben_to_ben( - bad_xben, - tmp_path / "out.ben", - overwrite=True, - ) - + decode_xben_to_ben(bad_xben, tmp_path / "o.ben", overwrite=True) with pytest.raises(OSError, match="must differ"): - decode_xben_to_jsonl( - bad_xben, - bad_xben, - overwrite=True, - ) - + decode_xben_to_jsonl(bad_xben, bad_xben, overwrite=True) src = tmp_path / "src.jsonl" write_jsonl([[1, 2, 3]], src) ben = tmp_path / "good.ben" - xben = tmp_path / "good.xben" encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) - out = tmp_path / "exists.jsonl" out.write_text("exists\n", encoding="utf-8") with pytest.raises(OSError, match="already exists"): decode_ben_to_jsonl(ben, out, overwrite=False) - - -# --------------------------------------------------------------------------- -# Bundle inspection via BenDecoder -# --------------------------------------------------------------------------- - - -def test_decoder_bundle_round_trip_all_methods(tmp_path: Path) -> None: - samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - graph = {"nodes": [{"id": 0}, {"id": 1}], "links": [{"source": 0, "target": 1}]} - path = tmp_path / "full.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert dec.is_bundle() - assert dec.is_complete() - assert dec.count_samples() == len(samples) - assert dec.assignment_format() == "ben" - v = dec.version() - assert isinstance(v, tuple) and len(v) == 2 - - names = dec.asset_names() - assert "graph.json" in names - - assets = dec.list_assets() - assert len(assets) >= 1 - for entry in assets: - assert "name" in entry - assert "type" in entry - assert "flags" in entry - - raw = dec.read_asset_bytes("graph.json") - assert isinstance(raw, bytes) - - parsed = dec.read_json_asset("graph.json") - assert parsed["nodes"] == graph["nodes"] - - g = dec.read_graph() - assert g is not None - assert g["nodes"] == graph["nodes"] - - assert dec.read_metadata() is None - assert dec.read_relabel_map() is None - - assert list(dec) == samples - - -def test_decoder_bundle_extract_stream_and_decode(tmp_path: Path) -> None: - samples = [[10, 20], [30, 40]] - path = tmp_path / "extract.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - out = tmp_path / "extracted.ben" - dec.extract_stream(out) - assert list(BenDecoder(out, mode="ben")) == samples - - -def test_decoder_bundle_extract_stream_overwrite_and_refuse(tmp_path: Path) -> None: - samples = [[1]] - path = tmp_path / "ow.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - enc.write(samples[0]) - - dec = BenDecoder(path) - out = tmp_path / "out.ben" - dec.extract_stream(out) - with pytest.raises(OSError, match="already exists"): - dec.extract_stream(out, overwrite=False) - dec.extract_stream(out, overwrite=True) - assert list(BenDecoder(out, mode="ben")) == samples - - -def test_decoder_bundle_missing_asset_raises_keyerror(tmp_path: Path) -> None: - path = tmp_path / "no_asset.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - enc.write([1, 2]) - - dec = BenDecoder(path) - with pytest.raises(KeyError, match="nope"): - dec.read_asset_bytes("nope") - with pytest.raises(KeyError, match="nope"): - dec.read_json_asset("nope") - - -# --------------------------------------------------------------------------- -# BenEncoder bundle-mode coverage -# --------------------------------------------------------------------------- - - -def test_benencoder_bundle_without_graph(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4]] - path = tmp_path / "no_graph.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert dec.is_bundle() - assert dec.assignment_format() == "ben" - assert dec.read_graph() is None - assert list(dec) == samples - - -def test_benencoder_bundle_graph_from_dict(tmp_path: Path) -> None: - graph = {"test": True} - path = tmp_path / "dict_graph.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: - enc.write([1]) - dec = BenDecoder(path) - assert dec.read_graph() == graph - - -def test_benencoder_bundle_graph_from_bytes(tmp_path: Path) -> None: - graph = {"test": "bytes"} - path = tmp_path / "bytes_graph.bendl" - with BenEncoder( - path, overwrite=True, variant="standard", graph=json.dumps(graph).encode() - ) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_graph_from_bytearray(tmp_path: Path) -> None: - graph = {"test": "bytearray"} - path = tmp_path / "ba_graph.bendl" - with BenEncoder( - path, - overwrite=True, - variant="standard", - graph=bytearray(json.dumps(graph).encode()), - ) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_graph_from_file_path(tmp_path: Path) -> None: - graph = {"test": "path"} - gpath = tmp_path / "g.json" - gpath.write_text(json.dumps(graph), encoding="utf-8") - path = tmp_path / "path_graph.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=gpath) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_graph_from_str_path(tmp_path: Path) -> None: - graph = {"test": "str_path"} - gpath = tmp_path / "g2.json" - gpath.write_text(json.dumps(graph), encoding="utf-8") - path = tmp_path / "str_path_graph.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=str(gpath)) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_graph_from_bytesio(tmp_path: Path) -> None: - graph = {"test": "bytesio"} - path = tmp_path / "bio_graph.bendl" - with BenEncoder( - path, - overwrite=True, - variant="standard", - graph=io.BytesIO(json.dumps(graph).encode()), - ) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_graph_from_stringio(tmp_path: Path) -> None: - graph = {"test": "stringio"} - path = tmp_path / "sio_graph.bendl" - with BenEncoder( - path, - overwrite=True, - variant="standard", - graph=io.StringIO(json.dumps(graph)), - ) as enc: - enc.write([1]) - assert BenDecoder(path).read_graph() == graph - - -def test_benencoder_bundle_rejects_graph_with_ben_file_only(tmp_path: Path) -> None: - with pytest.raises(ValueError, match="graph.*cannot be combined"): - BenEncoder( - tmp_path / "bad.ben", - overwrite=True, - variant="standard", - graph={"a": 1}, - ben_file_only=True, - ) - - -def test_benencoder_bundle_rejects_invalid_graph_type(tmp_path: Path) -> None: - with pytest.raises(ValueError, match="graph must be"): - BenEncoder( - tmp_path / "bad.bendl", - overwrite=True, - variant="standard", - graph=42, - ) - - -def test_benencoder_bundle_close_is_idempotent(tmp_path: Path) -> None: - path = tmp_path / "idempotent.bendl" - enc = BenEncoder(path, overwrite=True, variant="standard") - enc.write([1, 2]) - enc.close() - enc.close() - assert list(BenDecoder(path)) == [[1, 2]] - - -def test_benencoder_bundle_write_after_close_raises(tmp_path: Path) -> None: - path = tmp_path / "closed.bendl" - enc = BenEncoder(path, overwrite=True, variant="standard") - enc.write([1]) - enc.close() - with pytest.raises(OSError, match="already been closed"): - enc.write([2]) - - -# --------------------------------------------------------------------------- -# BenDecoder bundle-path coverage -# --------------------------------------------------------------------------- - - -def test_bendecoder_bundle_auto_detect_and_iterate(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4], [5, 6]] - path = tmp_path / "auto.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - dec = BenDecoder(path) - assert dec.is_bundle() - assert list(dec) == samples - - -def test_bendecoder_bundle_toc_methods(tmp_path: Path) -> None: - graph = {"g": 1} - path = tmp_path / "toc.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: - enc.write([1, 2, 3]) - - dec = BenDecoder(path) - assert dec.is_bundle() - assert dec.assignment_format() == "ben" - v = dec.version() - assert isinstance(v, tuple) and len(v) == 2 - assert dec.is_complete() - - names = dec.asset_names() - assert "graph.json" in names - - assets = dec.list_assets() - assert len(assets) >= 1 - for entry in assets: - assert "name" in entry - assert "type" in entry - assert "flags" in entry - - raw = dec.read_asset_bytes("graph.json") - assert isinstance(raw, bytes) - - parsed = dec.read_json_asset("graph.json") - assert parsed == graph - - assert dec.read_graph() == graph - assert dec.read_metadata() is None - assert dec.read_relabel_map() is None - - -def test_bendecoder_bundle_subsample_all_modes(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 11)] - path = tmp_path / "subsample.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_range(2, 5) - assert list(dec) == samples[1:5] - - dec2 = BenDecoder(path) - dec2.subsample_indices([1, 3, 10]) - assert list(dec2) == [samples[0], samples[2], samples[9]] - - dec3 = BenDecoder(path) - dec3.subsample_every(3, 2) - assert list(dec3) == [samples[1], samples[4], samples[7]] - - -def test_bendecoder_bundle_len_and_count(tmp_path: Path) -> None: - samples = [[1], [2], [3], [4], [5]] - path = tmp_path / "len.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert len(dec) == len(samples) - assert dec.count_samples() == len(samples) - assert list(dec) == samples - - -def test_bendecoder_bundle_iteration_restart(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4]] - path = tmp_path / "restart.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert list(dec) == samples - assert list(dec) == samples - - -def test_bendecoder_bundle_subsample_survives_reiteration(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 8)] - path = tmp_path / "re_sub.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_range(2, 5) - expected = samples[1:5] - assert list(dec) == expected - assert list(dec) == expected - - -def test_bendecoder_plain_rejects_bundle_methods(tmp_path: Path) -> None: - path = tmp_path / "plain.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - enc.write([1, 2]) - - dec = BenDecoder(path) - assert not dec.is_bundle() - assert dec.assignment_format() == "ben" - - for method, args in [ - ("version", ()), - ("is_complete", ()), - ("asset_names", ()), - ("list_assets", ()), - ("read_asset_bytes", ("x",)), - ("read_json_asset", ("x",)), - ("read_graph", ()), - ("read_metadata", ()), - ("read_relabel_map", ()), - ]: - with pytest.raises(Exception, match="only available on .bendl"): - getattr(dec, method)(*args) - - -def test_bendecoder_bundle_count_samples_preserves_subsample_len( - tmp_path: Path, -) -> None: - samples = [[i] for i in range(1, 9)] - path = tmp_path / "count_sub.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_range(2, 5) - assert len(dec) == 4 - assert dec.count_samples() == len(samples) - assert len(dec) == 4 - - -# --------------------------------------------------------------------------- -# BenDecoder XBEN bundle coverage -# --------------------------------------------------------------------------- - - -def test_bendecoder_xben_bundle_roundtrip(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4], [5, 6]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - xben_path = tmp_path / "samples.xben" - encode_jsonl_to_xben( - src, - xben_path, - overwrite=True, - variant="standard", - n_threads=1, - compression_level=1, - ) - - bendl_path = tmp_path / "xben_bundle.bendl" - with BenEncoder(bendl_path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(bendl_path) - assert dec.is_bundle() - assert list(dec) == samples - - -def test_bendecoder_xben_plain_stream(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - xben_path = tmp_path / "plain.xben" - encode_jsonl_to_xben( - src, - xben_path, - overwrite=True, - variant="standard", - n_threads=1, - compression_level=1, - ) - - dec = BenDecoder(xben_path, mode="xben") - assert not dec.is_bundle() - assert dec.assignment_format() == "xben" - assert list(dec) == samples - - -# --------------------------------------------------------------------------- -# BenDecoder subsample validation errors -# --------------------------------------------------------------------------- - - -def test_bendecoder_subsample_indices_empty_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "empty_idx.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_indices([]) - - -def test_bendecoder_subsample_indices_zero_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "zero_idx.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_indices([0, 1, 2]) - - -def test_bendecoder_subsample_range_zero_start_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "zero_start.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_range(0, 2) - - -def test_bendecoder_subsample_range_end_lt_start_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "bad_range.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_range(5, 2) - - -def test_bendecoder_subsample_every_zero_step_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "zero_step.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_every(0) - - -def test_bendecoder_subsample_every_zero_offset_raises(tmp_path: Path) -> None: - samples = [[1], [2]] - path = tmp_path / "zero_off.bendl" - with BenEncoder(path, overwrite=True, variant="standard") as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - with pytest.raises(Exception): - dec.subsample_every(1, offset=0) - - -# --------------------------------------------------------------------------- -# BenDecoder subsample on plain streams -# --------------------------------------------------------------------------- - - -def test_bendecoder_plain_subsample_indices(tmp_path: Path) -> None: - samples = [[1], [2], [3], [4], [5]] - path = tmp_path / "plain_sub.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_indices([1, 3, 5]) - assert list(dec) == [[1], [3], [5]] - - -def test_bendecoder_plain_subsample_range(tmp_path: Path) -> None: - samples = [[1], [2], [3], [4], [5]] - path = tmp_path / "plain_range.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_range(2, 4) - assert list(dec) == [[2], [3], [4]] - - -def test_bendecoder_plain_subsample_every(tmp_path: Path) -> None: - samples = [[1], [2], [3], [4], [5], [6]] - path = tmp_path / "plain_every.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_every(2, offset=1) - assert list(dec) == [[1], [3], [5]] - - -# --------------------------------------------------------------------------- -# BenDecoder len/count on plain streams -# --------------------------------------------------------------------------- - - -def test_bendecoder_plain_len_and_count(tmp_path: Path) -> None: - samples = [[1], [2], [3]] - path = tmp_path / "plain_len.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert dec.count_samples() == 3 - assert len(dec) == 3 - - -def test_bendecoder_plain_len_after_subsample(tmp_path: Path) -> None: - samples = [[1], [2], [3], [4], [5]] - path = tmp_path / "plain_sub_len.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_range(2, 4) - assert len(dec) == 3 - assert dec.count_samples() == 5 - assert len(dec) == 3 - - -# --------------------------------------------------------------------------- -# BenDecoder multiple iteration passes -# --------------------------------------------------------------------------- - - -def test_bendecoder_plain_multiple_iterations(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4]] - path = tmp_path / "multi_iter.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - assert list(dec) == samples - assert list(dec) == samples - assert list(dec) == samples - - -def test_bendecoder_plain_subsample_survives_reiteration(tmp_path: Path) -> None: - samples = [[i] for i in range(1, 8)] - path = tmp_path / "plain_re_sub.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path) - dec.subsample_every(2, offset=1) - expected = [[1], [3], [5], [7]] - assert list(dec) == expected - assert list(dec) == expected - - -# --------------------------------------------------------------------------- -# BenEncoder ben_file_only mode coverage -# --------------------------------------------------------------------------- - - -def test_benencoder_ben_file_only_roundtrip(tmp_path: Path) -> None: - samples = [[10, 20, 30], [40, 50, 60]] - path = tmp_path / "ben_only.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path, mode="ben") - assert not dec.is_bundle() - assert list(dec) == samples - - -def test_benencoder_ben_file_only_mkv(tmp_path: Path) -> None: - samples = [[1, 2], [1, 2], [3, 4]] - path = tmp_path / "ben_mkv.ben" - with BenEncoder( - path, overwrite=True, variant="mkv_chain", ben_file_only=True - ) as enc: - for a in samples: - enc.write(a) - - dec = BenDecoder(path, mode="ben") - assert list(dec) == samples - - -def test_benencoder_ben_file_only_close_and_reopen(tmp_path: Path) -> None: - samples = [[5, 6]] - path = tmp_path / "close_reopen.ben" - enc = BenEncoder(path, overwrite=True, variant="standard", ben_file_only=True) - enc.write(samples[0]) - enc.close() - - dec = BenDecoder(path, mode="ben") - assert list(dec) == samples - - -# --------------------------------------------------------------------------- -# BenEncoder bundle with metadata -# --------------------------------------------------------------------------- - - -def test_benencoder_bundle_with_metadata(tmp_path: Path) -> None: - samples = [[1, 2]] - graph = {"nodes": [{"id": 0}], "adjacency": [[]]} - path = tmp_path / "with_meta.bendl" - with BenEncoder(path, overwrite=True, variant="standard", graph=graph) as enc: - enc.write(samples[0]) - - dec = BenDecoder(path) - assert dec.read_graph() == graph - assert list(dec) == samples - - -# --------------------------------------------------------------------------- -# BenDecoder extract_stream on plain stream raises -# --------------------------------------------------------------------------- - - -def test_bendecoder_extract_stream_on_plain_raises(tmp_path: Path) -> None: - path = tmp_path / "plain_extract.ben" - with BenEncoder( - path, overwrite=True, variant="standard", ben_file_only=True - ) as enc: - enc.write([1, 2]) - - dec = BenDecoder(path, mode="ben") - with pytest.raises(Exception, match="only available on .bendl"): - dec.extract_stream(tmp_path / "out.ben") - - -# --------------------------------------------------------------------------- -# decode_ben_to_jsonl and decode_xben_to_jsonl coverage -# --------------------------------------------------------------------------- - - -def test_decode_ben_to_jsonl_roundtrip(tmp_path: Path) -> None: - samples = [[1, 2, 3], [4, 5, 6]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - ben = tmp_path / "out.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - out = tmp_path / "round.jsonl" - decode_ben_to_jsonl(ben, out, overwrite=True) - - restored = read_jsonl_assignments(out) - assert restored == samples - - -def test_decode_xben_to_jsonl_roundtrip(tmp_path: Path) -> None: - samples = [[1, 2, 3], [4, 5, 6]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - xben = tmp_path / "out.xben" - encode_jsonl_to_xben( - src, - xben, - overwrite=True, - variant="standard", - n_threads=1, - compression_level=1, - ) - - out = tmp_path / "round.jsonl" - decode_xben_to_jsonl(xben, out, overwrite=True) - - restored = read_jsonl_assignments(out) - assert restored == samples - - -# --------------------------------------------------------------------------- -# encode_ben_to_xben coverage -# --------------------------------------------------------------------------- - - -def test_encode_ben_to_xben_roundtrip(tmp_path: Path) -> None: - samples = [[1, 2], [3, 4], [5, 6]] - src = tmp_path / "src.jsonl" - write_jsonl(samples, src) - - ben = tmp_path / "out.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") - - xben = tmp_path / "from_ben.xben" - encode_ben_to_xben(ben, xben, overwrite=True, n_threads=1, compression_level=1) - - out = tmp_path / "round.jsonl" - decode_xben_to_jsonl(xben, out, overwrite=True) - - restored = read_jsonl_assignments(out) - assert restored == samples - - -# --------------------------------------------------------------------------- -# BenDecoder unknown mode error -# --------------------------------------------------------------------------- - - -def test_bendecoder_unknown_mode_raises(tmp_path: Path) -> None: - path = tmp_path / "dummy.ben" - path.write_bytes(b"\x00" * 100) - with pytest.raises(Exception): - BenDecoder(path, mode="bogus") - - -# --------------------------------------------------------------------------- -# BenDecoder MkvChain plain stream -# --------------------------------------------------------------------------- - - -def test_bendecoder_mkv_plain_stream(tmp_path: Path) -> None: - samples = [[1, 2], [1, 2], [3, 4]] - src = tmp_path / "mkv_src.jsonl" - write_jsonl(samples, src) - - ben = tmp_path / "mkv.ben" - encode_jsonl_to_ben(src, ben, overwrite=True, variant="mkv_chain") - - dec = BenDecoder(ben, mode="ben") - assert list(dec) == samples - assert dec.count_samples() == 3 diff --git a/ben-py/tests/test_recompress.py b/ben-py/tests/test_recompress.py new file mode 100644 index 0000000..e2c2de7 --- /dev/null +++ b/ben-py/tests/test_recompress.py @@ -0,0 +1,113 @@ +"""Tests for ``binary_ensemble.bundle.compress_stream`` (BEN bundle → XBEN bundle).""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from binary_ensemble.bundle import BendlDecoder, BendlEncoder, compress_stream + +EXAMPLE_GRAPH = ( + Path(__file__).resolve().parent.parent + / "docs" + / "user" + / "example_data" + / "gerrymandria.json" +) + + +def _graph(): + return json.loads(EXAMPLE_GRAPH.read_text()) + + +def _build_ben_bundle(path: Path): + n = len(_graph()["nodes"]) + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(8)] + with BendlEncoder(path, overwrite=True) as enc: + enc.add_graph(_graph(), preprocess_method="rcm") + enc.add_metadata({"seed": 99}) + with enc.stream("ben") as s: + for a in samples: + s.write(a) + enc.add_asset("notes.txt", "hi", content_type="text") + return samples + + +def _assert_preserved(src_dec, out_dec): + assert out_dec.assignment_format() == "xben" + assert out_dec.asset_names() == src_dec.asset_names() + # Decoded payloads + JSON flag preserved semantically. + src_flags = {a["name"]: ("json" in a["flags"]) for a in src_dec.list_assets()} + out_flags = {a["name"]: ("json" in a["flags"]) for a in out_dec.list_assets()} + assert src_flags == out_flags + for name in src_dec.asset_names(): + assert out_dec.read_asset_bytes(name) == src_dec.read_asset_bytes(name) + + +def test_compress_stream_explicit_out_path(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + out = tmp_path / "out.bendl" + samples = _build_ben_bundle(src) + src_dec = BendlDecoder(src) + + compress_stream(src, out_file=out) + + out_dec = BendlDecoder(out) + _assert_preserved(src_dec, out_dec) + assert list(out_dec) == samples + assert out_dec.read_metadata() == {"seed": 99} + assert out_dec.read_node_permutation_map() is not None + # Source bundle is untouched and still BEN. + assert BendlDecoder(src).assignment_format() == "ben" + + +def test_compress_stream_in_place(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + samples = _build_ben_bundle(src) + before = BendlDecoder(src) + before_assets = {n: before.read_asset_bytes(n) for n in before.asset_names()} + + compress_stream(src, in_place=True) + + after = BendlDecoder(src) + assert after.assignment_format() == "xben" + assert list(after) == samples + for name, payload in before_assets.items(): + assert after.read_asset_bytes(name) == payload + + +def test_compress_stream_arg_validation(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + _build_ben_bundle(src) + with pytest.raises(ValueError, match="either in_place=True or out_file"): + compress_stream(src) + with pytest.raises(ValueError, match="not both"): + compress_stream(src, out_file=tmp_path / "o.bendl", in_place=True) + + +def test_compress_stream_assets_only_bundle(tmp_path: Path) -> None: + src = tmp_path / "assets.bendl" + enc = BendlEncoder(src, overwrite=True) + enc.add_metadata({"only": "assets"}) + enc.close() + + out = tmp_path / "assets.xben.bendl" + compress_stream(src, out_file=out) + + dec = BendlDecoder(out) + assert dec.assignment_format() == "xben" + assert dec.is_complete() + assert dec.count_samples() == 0 + assert list(dec) == [] + assert dec.read_metadata() == {"only": "assets"} + + +def test_compress_stream_out_file_refuses_existing(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + _build_ben_bundle(src) + out = tmp_path / "exists.bendl" + out.write_bytes(b"existing") + with pytest.raises(OSError, match="already exists"): + compress_stream(src, out_file=out) diff --git a/ben-py/tests/test_surface.py b/ben-py/tests/test_surface.py new file mode 100644 index 0000000..46609d7 --- /dev/null +++ b/ben-py/tests/test_surface.py @@ -0,0 +1,266 @@ +"""Public-surface and type-stub drift tests. + +These guard the packaging contract: that every documented import resolves, that +the ``_core.pyi`` stub and the facade stubs match the runtime signatures (the +check that would have caught the historical missing ``graph`` / ``ben_file_only`` +/ ``allow_unfinalized`` drift), and that the retired ``PyBen*`` / ``compress_*`` +names are not re-exported. +""" + +from __future__ import annotations + +import ast +import inspect +from pathlib import Path + +import pytest + +import binary_ensemble +from binary_ensemble import _core, bundle, codec, graph, stream + +PKG_DIR = Path(binary_ensemble.__file__).parent + + +# --------------------------------------------------------------------------- +# Import surface +# --------------------------------------------------------------------------- + + +def test_top_level_exports() -> None: + expected = { + "stream", + "bundle", + "codec", + "graph", + "BendlEncoder", + "BendlDecoder", + "compress_stream", + "BenEncoder", + "BenDecoder", + "encode_jsonl_to_ben", + "encode_jsonl_to_xben", + "encode_ben_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", + } + assert expected.issubset(set(binary_ensemble.__all__)) + for name in expected: + assert hasattr(binary_ensemble, name) + + +def test_stream_module_exports() -> None: + assert set(stream.__all__) == {"BenEncoder", "BenDecoder"} + assert stream.BenEncoder is _core.BenEncoder + assert stream.BenDecoder is _core.BenDecoder + + +def test_bundle_module_exports() -> None: + assert set(bundle.__all__) == {"BendlEncoder", "BendlDecoder", "compress_stream"} + assert bundle.BendlDecoder is _core.BendlDecoder + + +def test_codec_module_exports() -> None: + assert set(codec.__all__) == { + "encode_jsonl_to_ben", + "encode_jsonl_to_xben", + "encode_ben_to_xben", + "decode_ben_to_jsonl", + "decode_xben_to_jsonl", + "decode_xben_to_ben", + } + for name in codec.__all__: + assert getattr(codec, name) is getattr(_core, name) + + +def test_graph_module_exports() -> None: + assert set(graph.__all__) == { + "reorder", + "reorder_multi_level_cluster", + "reorder_reverse_cuthill_mckee", + "reorder_by_key", + } + + +def test_core_submodule_is_accessible() -> None: + # _core stays importable for power users, but is not advertised in __all__. + assert hasattr(binary_ensemble, "_core") + assert "_core" not in binary_ensemble.__all__ + + +# --------------------------------------------------------------------------- +# Negative imports: retired names must not be re-exported. +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "name", + [ + "PyBenEncoder", + "PyBenDecoder", + "PyBendlEncoder", + "PyBendlDecoder", + "compress_jsonl_to_ben", + "compress_jsonl_to_xben", + "compress_ben_to_xben", + "decompress_ben_to_jsonl", + "decompress_xben_to_jsonl", + "decompress_xben_to_ben", + ], +) +def test_retired_names_not_exported(name: str) -> None: + assert not hasattr(binary_ensemble, name) + assert name not in binary_ensemble.__all__ + + +# --------------------------------------------------------------------------- +# Signature-drift machinery. +# --------------------------------------------------------------------------- + +_SKIP = {"self", "cls", "$self", "$cls"} + + +def _params_from_text_sig(text_sig): + """Normalize a PyO3 ``__text_signature__`` into ``[(name, has_default), ...]``.""" + if not text_sig: + return None + inner = text_sig.strip() + inner = inner[inner.index("(") + 1 : inner.rindex(")")] + out = [] + for part in inner.split(","): + part = part.strip() + if not part or part in ("/", "*"): + continue + if part.startswith("*"): + continue # *args / **kwargs + name = part.split("=")[0].split(":")[0].strip() + if name in _SKIP: + continue + out.append((name, "=" in part)) + return out + + +def _params_from_ast(func: ast.FunctionDef): + a = func.args + positional = list(a.posonlyargs) + list(a.args) + n_def = len(a.defaults) + has_default = [False] * (len(positional) - n_def) + [True] * n_def + out = [] + for arg, hd in zip(positional, has_default): + if arg.arg in _SKIP: + continue + out.append((arg.arg, hd)) + for arg, default in zip(a.kwonlyargs, a.kw_defaults): + out.append((arg.arg, default is not None)) + return out + + +def _parse_stub(path: Path): + """Parse a ``.pyi`` into ``{name: ('func', params) | ('class', {method: params})}``.""" + tree = ast.parse(path.read_text()) + symbols = {} + for node in tree.body: + if isinstance(node, ast.FunctionDef): + symbols[node.name] = ("func", _params_from_ast(node)) + elif isinstance(node, ast.ClassDef): + methods = {} + for item in node.body: + if isinstance(item, ast.FunctionDef): + methods[item.name] = _params_from_ast(item) + symbols[node.name] = ("class", methods) + return symbols + + +def _runtime_public_names(obj): + return {n for n in dir(obj) if not n.startswith("_")} + + +# --------------------------------------------------------------------------- +# _core stub drift (catches missing / extra / changed parameters). +# --------------------------------------------------------------------------- + + +def test_core_stub_covers_runtime_and_matches_signatures() -> None: + stub = _parse_stub(PKG_DIR / "_core.pyi") + + runtime_names = _runtime_public_names(_core) + # Every runtime public symbol must be documented in the stub. + for name in runtime_names: + assert name in stub, f"_core.{name} is missing from _core.pyi" + # Every stubbed symbol must exist at runtime. + for name in stub: + assert hasattr(_core, name), f"_core.pyi declares {name} but runtime lacks it" + + for name, (kind, payload) in stub.items(): + obj = getattr(_core, name) + if kind == "func": + runtime = _params_from_text_sig(obj.__text_signature__) + assert runtime == payload, f"signature drift on _core.{name}" + else: + # __init__ is described by the class-level text signature. + if "__init__" in payload: + runtime_init = _params_from_text_sig(obj.__text_signature__) + assert runtime_init == payload["__init__"], ( + f"__init__ signature drift on _core.{name}" + ) + # Public non-dunder methods declared in the stub must match. + stub_methods = {m for m in payload if not m.startswith("__")} + runtime_methods = _runtime_public_names(obj) + assert stub_methods == runtime_methods, ( + f"method set drift on _core.{name}: " + f"stub={stub_methods} runtime={runtime_methods}" + ) + for method in stub_methods: + runtime = _params_from_text_sig(getattr(obj, method).__text_signature__) + if runtime is None: + continue + assert runtime == payload[method], ( + f"signature drift on _core.{name}.{method}" + ) + + +# --------------------------------------------------------------------------- +# Facade stub drift (pure-Python objects, via inspect). +# --------------------------------------------------------------------------- + + +def _params_from_inspect(func, *, drop_self: bool): + out = [] + params = list(inspect.signature(func).parameters.values()) + for i, p in enumerate(params): + if drop_self and i == 0 and p.name in ("self", "cls"): + continue + if p.kind in (p.VAR_POSITIONAL, p.VAR_KEYWORD): + continue + out.append((p.name, p.default is not inspect.Parameter.empty)) + return out + + +def test_bundle_facade_matches_stub() -> None: + stub = _parse_stub(PKG_DIR / "bundle.pyi") + + # compress_stream (module-level function). + assert _params_from_inspect(bundle.compress_stream, drop_self=False) == stub[ + "compress_stream" + ][1] + + # BendlEncoder methods. + enc_methods = stub["BendlEncoder"][1] + for method, expected in enc_methods.items(): + if method.startswith("__"): + continue + runtime = getattr(bundle.BendlEncoder, method) + assert _params_from_inspect(runtime, drop_self=True) == expected, ( + f"signature drift on bundle.BendlEncoder.{method}" + ) + + +def test_graph_facade_matches_stub() -> None: + stub = _parse_stub(PKG_DIR / "graph.pyi") + for name, (kind, params) in stub.items(): + if kind != "func": + continue + runtime = getattr(graph, name) + assert _params_from_inspect(runtime, drop_self=False) == params, ( + f"signature drift on graph.{name}" + ) diff --git a/ben-py/uv.lock b/ben-py/uv.lock index 173008d..aa1c51f 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -91,6 +91,9 @@ wheels = [ [[package]] name = "binary-ensemble" source = { editable = "." } +dependencies = [ + { name = "networkx" }, +] [package.optional-dependencies] docs = [ @@ -120,6 +123,7 @@ requires-dist = [ { name = "myst-nb", marker = "extra == 'docs'", specifier = ">=1.3.0" }, { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=4.0.1" }, { name = "nbconvert", marker = "extra == 'docs'", specifier = ">=7.16.6" }, + { name = "networkx", specifier = ">=3.0" }, { name = "recommonmark", marker = "extra == 'docs'", specifier = ">=0.7.1" }, { name = "sphinx", marker = "extra == 'docs'", specifier = ">=8.2.3" }, { name = "sphinx-autoapi", marker = "extra == 'docs'", specifier = ">=3.6.1" }, From 2d327127509f1e37d32acb89eac19ed5061348ea Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:11:36 -0600 Subject: [PATCH 135/221] add relabel bundle helper and reformat --- ben-py/binary_ensemble/__init__.py | 8 +- ben-py/binary_ensemble/_core.pyi | 14 +- ben-py/binary_ensemble/bundle.py | 117 ++++++++++---- ben-py/binary_ensemble/bundle.pyi | 12 +- ben-py/src/common.rs | 4 +- ben-py/src/decode/bundle_decoder.rs | 21 ++- ben-py/src/decode/cursor.rs | 10 +- ben-py/src/decode/helpers.rs | 14 +- ben-py/src/decode/types.rs | 6 +- ben-py/src/encode/bundle_encoder.rs | 29 ++-- ben-py/src/encode/encoder.rs | 6 +- ben-py/src/encode/py_funcs.rs | 8 +- ben-py/src/graph/helpers.rs | 19 ++- ben-py/src/lib.rs | 2 + ben-py/src/recompress.rs | 8 +- ben-py/src/relabel.rs | 198 ++++++++++++++++++++++++ ben-py/tests/test_bundle.py | 205 ++++++++++++++++++++----- ben-py/tests/test_bundle_api.py | 23 ++- ben-py/tests/test_python_pipelines.py | 20 ++- ben-py/tests/test_relabel.py | 127 +++++++++++++++ ben-py/tests/test_surface.py | 21 ++- ben/src/codec/decode/tests/twodelta.rs | 37 +++-- ben/src/io/reader/stream_reader/ben.rs | 4 +- ben/src/io/reader/tests.rs | 30 +++- ben/src/io/writer/tests.rs | 10 +- ben/src/io/writer/twodelta.rs | 5 +- ben/tests/test_format_stability.rs | 17 +- 27 files changed, 810 insertions(+), 165 deletions(-) create mode 100644 ben-py/src/relabel.rs create mode 100644 ben-py/tests/test_relabel.py diff --git a/ben-py/binary_ensemble/__init__.py b/ben-py/binary_ensemble/__init__.py index 2b837b8..36caafd 100644 --- a/ben-py/binary_ensemble/__init__.py +++ b/ben-py/binary_ensemble/__init__.py @@ -16,7 +16,12 @@ """ from binary_ensemble import bundle, codec, graph, stream -from binary_ensemble.bundle import BendlDecoder, BendlEncoder, compress_stream +from binary_ensemble.bundle import ( + BendlDecoder, + BendlEncoder, + compress_stream, + relabel_bundle, +) from binary_ensemble.codec import ( decode_ben_to_jsonl, decode_xben_to_ben, @@ -37,6 +42,7 @@ "BendlEncoder", "BendlDecoder", "compress_stream", + "relabel_bundle", # Stream "BenEncoder", "BenDecoder", diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index 5254823..44b33f0 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -110,8 +110,8 @@ class BendlEncoder: ) -> None: ... def add_metadata(self, metadata: Any) -> None: ... # Returns the (possibly reordered) graph as a NetworkX graph, matching - # BendlDecoder.read_graph. - def add_graph(self, graph: Any, preprocess_method: str | None) -> Any: ... + # BendlDecoder.read_graph. preprocess_method defaults to "mlc"; pass None for raw. + def add_graph(self, graph: Any, preprocess_method: str | None = "mlc") -> Any: ... def stream( self, format: Literal["ben"] = "ben", @@ -129,13 +129,13 @@ def encode_jsonl_to_ben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] = "mkv_chain", + variant: Literal["standard", "mkv_chain", "twodelta"] = "twodelta", ) -> None: ... def encode_jsonl_to_xben( in_file: str | Path, out_file: str | Path, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] = "mkv_chain", + variant: Literal["standard", "mkv_chain", "twodelta"] = "twodelta", n_threads: int | None = None, compression_level: int | None = None, xz_block_size: int | None = None, @@ -166,3 +166,9 @@ def graph_reorder(graph: Any, method: str) -> tuple[Any, Any]: ... def recompress_bundle( in_file: str | Path, out_file: str | Path, overwrite: bool = False ) -> None: ... +def relabel_bundle( + in_file: str | Path, + out_file: str | Path, + method: str = "mlc", + overwrite: bool = False, +) -> None: ... diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index edfee8c..737bfab 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -32,8 +32,35 @@ from binary_ensemble._core import BendlDecoder from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder from binary_ensemble._core import recompress_bundle as _recompress_bundle +from binary_ensemble._core import relabel_bundle as _relabel_bundle -__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream"] +__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream", "relabel_bundle"] + + +def _atomic_or_out(transform, path, out_file, in_place, suffix=".bendl"): + """Shared in_place-swap / out_file dispatch for whole-bundle transforms. + + ``transform(src, dst, overwrite)`` writes the result. Exactly one of + ``in_place`` / ``out_file`` must be given. + """ + if in_place and out_file is not None: + raise ValueError("pass either in_place=True or out_file, not both") + if not in_place and out_file is None: + raise ValueError("pass either in_place=True or out_file") + + if in_place: + directory = os.path.dirname(os.path.abspath(os.fspath(path))) + fd, tmp = tempfile.mkstemp(suffix=suffix, dir=directory) + os.close(fd) + try: + transform(path, tmp, True) + os.replace(tmp, path) + except BaseException: + if os.path.exists(tmp): + os.remove(tmp) + raise + else: + transform(path, out_file, False) def _coerce_bytes(payload: Union[bytes, bytearray, memoryview, str]) -> bytes: @@ -42,18 +69,29 @@ def _coerce_bytes(payload: Union[bytes, bytearray, memoryview, str]) -> bytes: return payload.encode("utf-8") if isinstance(payload, (bytes, bytearray, memoryview)): return bytes(payload) - raise TypeError( - f"asset payload must be bytes or str, got {type(payload).__name__}" - ) + raise TypeError(f"asset payload must be bytes or str, got {type(payload).__name__}") class BendlEncoder: """Writer for a ``.bendl`` bundle (create mode) or an asset appender (append mode). In create mode (the constructor), assets may be added before or after a - single-use ``stream()``; closing finalizes the bundle. In append mode - (:meth:`append`), an existing finalized bundle is grown with new assets and - ``stream()`` is unavailable. + single-use ``stream()``. You do **not** need to use ``BendlEncoder`` itself as + a context manager: closing the ``stream()`` context finalizes the bundle, so + the common pattern is:: + + enc = BendlEncoder(path, overwrite=True) + graph = enc.add_graph(my_graph) # MLC-reordered by default + with enc.stream("ben") as stream: # only the stream needs ``with`` + for assignment in chain: + stream.write(assignment) + # bundle is finalized here + + The encoder is still usable as a context manager if you prefer, and that is + the easy way to finalize an *assets-only* bundle (one written with no + ``stream()``): either ``with BendlEncoder(...) as enc: ...`` or an explicit + :meth:`close`. In append mode (:meth:`append`), an existing finalized bundle + is grown with new assets and ``stream()`` is unavailable. """ def __init__(self, file_path, overwrite: bool = False) -> None: @@ -70,14 +108,16 @@ def append(cls, file_path) -> "BendlEncoder": self._enc = _CoreBendlEncoder.append(file_path) return self - def add_graph(self, graph: Any, preprocess_method: Optional[str]) -> Any: + def add_graph(self, graph: Any, preprocess_method: Optional[str] = "mlc") -> Any: """Embed the dual ``graph.json`` and return the (possibly reordered) graph. - When ``preprocess_method`` is not ``None`` the graph is reordered (e.g. - ``"rcm"``, ``"mlc"``, or a node-attribute key) and both ``graph.json`` - and ``node_permutation_map.json`` are stored; the reordered graph is - returned so the chain runs on that ordering. Reordering is pre-stream - only. ``None`` stores the graph as-is with no permutation map. + ``preprocess_method`` defaults to ``"mlc"`` (multi-level clustering), so + by default the graph is reordered for better compression. When it is not + ``None`` the graph is reordered (``"rcm"``, ``"mlc"``, or a node-attribute + key) and both ``graph.json`` and ``node_permutation_map.json`` are stored; + the reordered graph is returned so the chain runs on that ordering. + Reordering is pre-stream only. Pass ``preprocess_method=None`` to store + the graph as-is with no permutation map. The graph is returned as a NetworkX graph (matching :meth:`BendlDecoder.read_graph`), so its node order is the order the @@ -126,7 +166,7 @@ def stream(self, format: str = "ben", variant: Optional[str] = None): Only ``"ben"`` is accepted; produce XBEN bundles via :func:`compress_stream`. ``variant`` selects the BEN variant - (default ``"mkv_chain"``). + (default ``"twodelta"``). """ return self._enc.stream(format, variant) @@ -158,21 +198,36 @@ def compress_stream( is normalized to the writer's default policy. An assets-only bundle (empty stream) recompresses to an empty XBEN bundle. """ - if in_place and out_file is not None: - raise ValueError("pass either in_place=True or out_file, not both") - if not in_place and out_file is None: - raise ValueError("pass either in_place=True or out_file") + _atomic_or_out( + lambda src, dst, overwrite: _recompress_bundle(src, dst, overwrite=overwrite), + path, + out_file, + in_place, + ) - if in_place: - directory = os.path.dirname(os.path.abspath(os.fspath(path))) - fd, tmp = tempfile.mkstemp(suffix=".bendl", dir=directory) - os.close(fd) - try: - _recompress_bundle(path, tmp, overwrite=True) - os.replace(tmp, path) - except BaseException: - if os.path.exists(tmp): - os.remove(tmp) - raise - else: - _recompress_bundle(path, out_file, overwrite=False) + +def relabel_bundle( + path, + out_file=None, + method: str = "mlc", + in_place: bool = False, +) -> None: + """Reorder a BEN bundle's graph by ``method`` and relabel its stream to match. + + Reorders the embedded ``graph.json`` (``"mlc"`` by default; also ``"rcm"`` or + a node-attribute key), rewrites every assignment into the new node order, and + writes a fresh bundle storing the reordered graph and a + ``node_permutation_map.json`` (so the reordering is reversible). Metadata and + custom assets are preserved. This is the bundle-level form of the CLI's + ``reben`` ordering flow — typically run to shrink a bundle before an XBEN + recompress. + + Provide exactly one of ``in_place=True`` or ``out_file``. Only BEN bundles are + supported (relabel before compressing to XBEN); the source must carry a graph. + """ + _atomic_or_out( + lambda src, dst, overwrite: _relabel_bundle(src, dst, method, overwrite), + path, + out_file, + in_place, + ) diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi index 8b39104..813efb6 100644 --- a/ben-py/binary_ensemble/bundle.pyi +++ b/ben-py/binary_ensemble/bundle.pyi @@ -3,13 +3,15 @@ from typing import Any, Optional, Union from binary_ensemble._core import BendlDecoder as BendlDecoder from binary_ensemble._core import BendlStreamSession -__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream"] +__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream", "relabel_bundle"] class BendlEncoder: def __init__(self, file_path, overwrite: bool = False) -> None: ... @classmethod def append(cls, file_path) -> "BendlEncoder": ... - def add_graph(self, graph: Any, preprocess_method: Optional[str]) -> Any: ... + def add_graph( + self, graph: Any, preprocess_method: Optional[str] = "mlc" + ) -> Any: ... def add_metadata(self, metadata: Any) -> None: ... def add_asset( self, @@ -29,3 +31,9 @@ def compress_stream( out_file=None, in_place: bool = False, ) -> None: ... +def relabel_bundle( + path, + out_file=None, + method: str = "mlc", + in_place: bool = False, +) -> None: ... diff --git a/ben-py/src/common.rs b/ben-py/src/common.rs index f222875..94eb2b1 100644 --- a/ben-py/src/common.rs +++ b/ben-py/src/common.rs @@ -9,8 +9,8 @@ use std::path::PathBuf; pub fn parse_variant(variant: Option<&str>) -> PyResult { match variant { Some("standard") => Ok(BenVariant::Standard), - Some("mkv_chain") | Some("markov") | None => Ok(BenVariant::MkvChain), - Some("twodelta") | Some("two_delta") => Ok(BenVariant::TwoDelta), + Some("mkv_chain") | Some("markov") => Ok(BenVariant::MkvChain), + Some("twodelta") | Some("two_delta") | None => Ok(BenVariant::TwoDelta), Some(other) => Err(PyValueError::new_err(format!( "Unknown variant: {other}. Supported variants are 'standard', 'mkv_chain', and 'twodelta'." ))), diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index 2e55863..f54fed8 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -17,8 +17,8 @@ use std::path::PathBuf; /// /// This decoder is bundle-only: opening it on a plain `.ben`/`.xben` stream raises and points the /// caller at `BenDecoder`. It exposes the bundle inspection surface (`version`, `is_complete`, -/// `asset_names`, `list_assets`, canonical and generic asset getters, `extract_stream`) and iterates -/// the embedded assignment stream. +/// `asset_names`, `list_assets`, canonical and generic asset getters, `extract_stream`) and +/// iterates the embedded assignment stream. #[pyclass(module = "binary_ensemble", name = "BendlDecoder", unsendable)] pub struct PyBendlDecoder { path: PathBuf, @@ -169,7 +169,11 @@ impl PyBendlDecoder { /// Names of every entry in the bundle's directory, in directory order. #[pyo3(text_signature = "(self)")] fn asset_names(&self) -> Vec { - self.reader.assets().iter().map(|e| e.name.clone()).collect() + self.reader + .assets() + .iter() + .map(|e| e.name.clone()) + .collect() } /// Return the full bundle directory as a list of dicts with keys `name`, `type`, `offset`, @@ -246,7 +250,11 @@ impl PyBendlDecoder { /// Read the bundle's `node_permutation_map.json` asset as parsed JSON, or `None` if absent. #[pyo3(text_signature = "(self)")] fn read_node_permutation_map<'py>(&mut self, py: Python<'py>) -> PyResult>> { - self.read_known_json(py, ASSET_TYPE_NODE_PERMUTATION_MAP, "node_permutation_map.json") + self.read_known_json( + py, + ASSET_TYPE_NODE_PERMUTATION_MAP, + "node_permutation_map.json", + ) } /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be @@ -282,7 +290,10 @@ impl PyBendlDecoder { .truncate(true) .open(&out_path) } else { - OpenOptions::new().write(true).create_new(true).open(&out_path) + OpenOptions::new() + .write(true) + .create_new(true) + .open(&out_path) } .map_err(|e| PyIOError::new_err(format!("Failed to create {}: {e}", out_path.display())))?; let mut out = BufWriter::new(out); diff --git a/ben-py/src/decode/cursor.rs b/ben-py/src/decode/cursor.rs index f71bea8..9357d6f 100644 --- a/ben-py/src/decode/cursor.rs +++ b/ben-py/src/decode/cursor.rs @@ -17,9 +17,9 @@ use pyo3::types::PyDict; pub(super) struct SampleCursor { source: StreamSource, mode: DecoderMode, - /// Lazily-constructed frame iterator. Construction is deferred so opening a bundle with an empty - /// or truncated stream still succeeds — only methods that actually walk the stream need a live - /// iterator. + /// Lazily-constructed frame iterator. Construction is deferred so opening a bundle with an + /// empty or truncated stream still succeeds — only methods that actually walk the stream + /// need a live iterator. iter: Option, current_assignment: Option>, remaining_count: u16, @@ -100,7 +100,9 @@ impl SampleCursor { self.remaining_count = count - 1; Ok(Some(assignment)) } - Some(Err(e)) => Err(PyException::new_err(format!("Error decoding next item: {e}"))), + Some(Err(e)) => Err(PyException::new_err(format!( + "Error decoding next item: {e}" + ))), None => Ok(None), } } diff --git a/ben-py/src/decode/helpers.rs b/ben-py/src/decode/helpers.rs index a4459f5..6c9cb00 100644 --- a/ben-py/src/decode/helpers.rs +++ b/ben-py/src/decode/helpers.rs @@ -138,7 +138,10 @@ pub(super) fn scan_samples( let format = mode.wire_format(); py.detach(|| count_samples_from_file(&path, format)) .map_err(|e| { - PyException::new_err(format!("Failed to count samples in {}: {e}", path.display())) + PyException::new_err(format!( + "Failed to count samples in {}: {e}", + path.display() + )) }) } StreamSource::Bundle { @@ -149,10 +152,13 @@ pub(super) fn scan_samples( } => { let reader = open_bundle_stream_reader(path, *stream_offset, *stream_len)?; let iter = build_frame_iter_from_reader(reader, mode.wire_format()).map_err(|e| { - PyException::new_err(format!("Failed to open bundle stream for sample count: {e}")) + PyException::new_err(format!( + "Failed to open bundle stream for sample count: {e}" + )) })?; - count_samples_from_frame_iter(iter) - .map_err(|e| PyException::new_err(format!("Failed to count samples in bundle: {e}"))) + count_samples_from_frame_iter(iter).map_err(|e| { + PyException::new_err(format!("Failed to count samples in bundle: {e}")) + }) } } } diff --git a/ben-py/src/decode/types.rs b/ben-py/src/decode/types.rs index db15cca..a7ec859 100644 --- a/ben-py/src/decode/types.rs +++ b/ben-py/src/decode/types.rs @@ -65,9 +65,9 @@ pub(super) enum StreamSource { /// Authoritative sample count from a finalized bundle header, or `None` when the bundle is /// unfinalized (forcing a stream scan). header_sample_count: Option, - /// `true` for a finalized bundle whose stream region is empty (an assets-only bundle with no - /// BEN banner). Iteration over such a source yields nothing instead of failing on the - /// missing banner. + /// `true` for a finalized bundle whose stream region is empty (an assets-only bundle with + /// no BEN banner). Iteration over such a source yields nothing instead of failing + /// on the missing banner. empty: bool, }, } diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index bfb6159..1f3de68 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -12,7 +12,9 @@ use crate::common::{ use crate::graph::helpers::reorder_graph_to_bytes; use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; use binary_ensemble::io::bundle::writer::BendlAppender; -use binary_ensemble::io::bundle::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; +use binary_ensemble::io::bundle::{ + AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, +}; use binary_ensemble::io::writer::BenStreamWriter; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; @@ -160,14 +162,15 @@ impl PyBendlEncoder { /// Add the `graph.json` known asset. /// - /// When `preprocess_method` is not `None`, the graph is reordered via the chosen method, both + /// `preprocess_method` defaults to `"mlc"`, so by default the graph is reordered for better + /// compression. When it is not `None`, the graph is reordered via the chosen method, both /// `graph.json` and `node_permutation_map.json` are stored, and the reordered graph is returned - /// (as a NetworkX graph, matching `BendlDecoder.read_graph`) so the chain runs on that ordering. - /// Reordering is pre-stream only. When `preprocess_method` is `None`, the graph is stored as-is - /// (no permutation map) and may also be attached post-stream / in append mode. The returned - /// graph's node count is recorded for per-write validation. - #[pyo3(signature = (graph, preprocess_method))] - #[pyo3(text_signature = "(self, graph, preprocess_method)")] + /// (as a NetworkX graph, matching `BendlDecoder.read_graph`) so the chain runs on that + /// ordering. Reordering is pre-stream only. Pass `preprocess_method=None` to store the + /// graph as-is (no permutation map); a raw graph may also be attached post-stream / in + /// append mode. The returned graph's node count is recorded for per-write validation. + #[pyo3(signature = (graph, preprocess_method = Some("mlc".to_string())))] + #[pyo3(text_signature = "(self, graph, preprocess_method='mlc')")] fn add_graph( &mut self, py: Python<'_>, @@ -225,7 +228,7 @@ impl PyBendlEncoder { } /// Open the single-use assignment stream. Only `"ben"` is accepted today; XBEN comes from - /// `bundle.compress_stream`. `variant` selects the BEN variant (default `"mkv_chain"`). + /// `bundle.compress_stream`. `variant` selects the BEN variant (default `"twodelta"`). #[pyo3(signature = (format = "ben", variant = None))] #[pyo3(text_signature = "(self, format='ben', variant=None)")] fn stream( @@ -252,9 +255,7 @@ impl PyBendlEncoder { } match &me.state { BundleState::PreStream { .. } => {} - BundleState::Streaming => { - return Err(PyException::new_err("a stream is already open")) - } + BundleState::Streaming => return Err(PyException::new_err("a stream is already open")), BundleState::Appendable => { return Err(PyException::new_err( "a stream has already been written to this bundle", @@ -407,9 +408,7 @@ impl PyBendlStreamSession { .writer .as_mut() .ok_or_else(|| PyIOError::new_err("stream session is already closed"))?; - writer - .write_assignment(assignment) - .map_err(map_io_err)?; + writer.write_assignment(assignment).map_err(map_io_err)?; self.sample_count += 1; Ok(()) } diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index e456874..1c8d534 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -30,7 +30,7 @@ impl PyBenEncoder { /// * `file_path` - Output path. Must not exist unless `overwrite=True`. /// * `overwrite` - Replace an existing file at `file_path`. /// * `variant` - BEN variant for the assignment stream (`"standard"`, `"mkv_chain"`, or - /// `"twodelta"`). Defaults to `"mkv_chain"` when `None`. + /// `"twodelta"`). Defaults to `"twodelta"` when `None`. #[new] #[pyo3(signature = (file_path, overwrite = false, variant = None))] #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] @@ -51,7 +51,9 @@ impl PyBenEncoder { .writer .as_mut() .ok_or_else(|| PyIOError::new_err("Encoder has already been closed."))?; - writer.write_assignment(assignment).map_err(Self::map_io_err) + writer + .write_assignment(assignment) + .map_err(Self::map_io_err) } /// Flush the assignment stream and close the underlying file. Idempotent. diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index cae1f58..6234b7a 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -45,8 +45,8 @@ pub fn encode_ben_to_xben( } #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain"))] -#[pyo3(text_signature = "(in_file, out_file, overwrite=False, variant='mkv_chain')")] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="twodelta"))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False, variant='twodelta')")] pub fn encode_jsonl_to_ben( in_file: PathBuf, out_file: PathBuf, @@ -69,9 +69,9 @@ pub fn encode_jsonl_to_ben( } #[pyfunction] -#[pyo3(signature = (in_file, out_file, overwrite=false, variant="mkv_chain", n_threads=None, compression_level=None, xz_block_size=None))] +#[pyo3(signature = (in_file, out_file, overwrite=false, variant="twodelta", n_threads=None, compression_level=None, xz_block_size=None))] #[pyo3( - text_signature = "(in_file, out_file, overwrite=False, variant='mkv_chain', n_threads=None, compression_level=None, xz_block_size=None)" + text_signature = "(in_file, out_file, overwrite=False, variant='twodelta', n_threads=None, compression_level=None, xz_block_size=None)" )] pub fn encode_jsonl_to_xben( in_file: PathBuf, diff --git a/ben-py/src/graph/helpers.rs b/ben-py/src/graph/helpers.rs index 9d66a8a..7565ce1 100644 --- a/ben-py/src/graph/helpers.rs +++ b/ben-py/src/graph/helpers.rs @@ -16,12 +16,14 @@ enum Reorder { fn classify(method: &str) -> Reorder { match method { - "mlc" | "multi-level-cluster" => { - Reorder::Ordering(GraphOrderingMethod::MultiLevelCluster, "multi-level-cluster") - } - "rcm" | "reverse-cuthill-mckee" => { - Reorder::Ordering(GraphOrderingMethod::ReverseCuthillMckee, "reverse-cuthill-mckee") - } + "mlc" | "multi-level-cluster" => Reorder::Ordering( + GraphOrderingMethod::MultiLevelCluster, + "multi-level-cluster", + ), + "rcm" | "reverse-cuthill-mckee" => Reorder::Ordering( + GraphOrderingMethod::ReverseCuthillMckee, + "reverse-cuthill-mckee", + ), other => Reorder::Key(other.to_string()), } } @@ -37,8 +39,9 @@ pub fn reorder_graph_to_bytes(graph_bytes: &[u8], method: &str) -> PyResult<(Vec let mut reordered = Vec::new(); let (map, key_field, ordering_field) = match classify(method) { Reorder::Ordering(ordering, name) => { - let map = sort_json_file_by_ordering(Cursor::new(graph_bytes), &mut reordered, ordering) - .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; + let map = + sort_json_file_by_ordering(Cursor::new(graph_bytes), &mut reordered, ordering) + .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; (map, None::, Some(name)) } Reorder::Key(key) => { diff --git a/ben-py/src/lib.rs b/ben-py/src/lib.rs index a79eb47..89c31ee 100755 --- a/ben-py/src/lib.rs +++ b/ben-py/src/lib.rs @@ -6,6 +6,7 @@ pub mod decode; pub mod encode; pub mod graph; pub mod recompress; +pub mod relabel; #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { @@ -22,6 +23,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(crate::encode::encode_ben_to_xben, m)?)?; m.add_function(wrap_pyfunction!(crate::graph::graph_reorder, m)?)?; m.add_function(wrap_pyfunction!(crate::recompress::recompress_bundle, m)?)?; + m.add_function(wrap_pyfunction!(crate::relabel::relabel_bundle, m)?)?; Ok(()) } diff --git a/ben-py/src/recompress.rs b/ben-py/src/recompress.rs index d0514a0..a948eae 100644 --- a/ben-py/src/recompress.rs +++ b/ben-py/src/recompress.rs @@ -57,7 +57,8 @@ fn add_preserved( } } -/// Recompress the BEN stream of the bundle at `in_file` to XBEN, writing a new bundle at `out_file`. +/// Recompress the BEN stream of the bundle at `in_file` to XBEN, writing a new bundle at +/// `out_file`. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite = false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] @@ -115,9 +116,8 @@ pub fn recompress_bundle(in_file: PathBuf, out_file: PathBuf, overwrite: bool) - // Build the new XBEN bundle. let buf = open_output(&out_file, overwrite)?; - let mut writer = BendlWriter::new(buf, AssignmentFormat::Xben).map_err(|e| { - PyIOError::new_err(format!("Failed to initialize bundle writer: {e}")) - })?; + let mut writer = BendlWriter::new(buf, AssignmentFormat::Xben) + .map_err(|e| PyIOError::new_err(format!("Failed to initialize bundle writer: {e}")))?; for asset in &assets { add_preserved(&mut writer, asset).map_err(map_bundle_err)?; } diff --git a/ben-py/src/relabel.rs b/ben-py/src/relabel.rs new file mode 100644 index 0000000..21db9fd --- /dev/null +++ b/ben-py/src/relabel.rs @@ -0,0 +1,198 @@ +//! Binding for relabeling a `.bendl` bundle: reorder its dual graph and rewrite the embedded BEN +//! assignment stream into the new node order, producing a fresh bundle. +//! +//! This is the bundle-level form of the CLI's `reben` ordering flow. The reordered `graph.json` and +//! a `node_permutation_map.json` are stored as canonical assets so the reordering stays reversible; +//! every other asset (metadata, custom blobs) is carried over by decoded payload, name, type, and +//! JSON flag. + +use crate::common::open_output; +use crate::graph::helpers::reorder_graph_to_bytes; +use binary_ensemble::io::bundle::format::{ + AssignmentFormat, KnownAssetKind, ASSET_FLAG_JSON, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, + ASSET_TYPE_NODE_PERMUTATION_MAP, +}; +use binary_ensemble::io::bundle::{AddAssetOptions, BendlReader, BendlWriteError, BendlWriter}; +use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; +use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::prelude::*; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufReader, Cursor, Read, Write}; +use std::path::PathBuf; + +fn map_bundle_err(err: BendlWriteError) -> PyErr { + match err { + BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + other => PyException::new_err(format!("{other}")), + } +} + +/// A metadata/custom asset carried over unchanged from the source bundle. +struct PreservedAsset { + asset_type: u16, + name: String, + is_json: bool, + payload: Vec, +} + +fn add_preserved( + writer: &mut BendlWriter, + asset: &PreservedAsset, +) -> Result<(), BendlWriteError> { + let opts = if asset.is_json { + AddAssetOptions::defaults().json() + } else { + AddAssetOptions::defaults() + }; + // Keep canonical known assets (e.g. metadata.json) canonical; everything else is custom. + match asset.asset_type { + ASSET_TYPE_METADATA => { + writer.add_known_asset(KnownAssetKind::Metadata, &asset.payload, opts) + } + _ => writer.add_custom_asset(&asset.name, &asset.payload, opts), + } +} + +/// Invert a stored `node_permutation_old_to_new` object into the dense `new -> old` map that +/// `relabel_ben_file` consumes. +fn new_to_old_from_map_bytes(map_bytes: &[u8]) -> PyResult> { + let value: serde_json::Value = serde_json::from_slice(map_bytes) + .map_err(|e| PyException::new_err(format!("permutation map is not valid JSON: {e}")))?; + let obj = value + .get("node_permutation_old_to_new") + .and_then(|v| v.as_object()) + .ok_or_else(|| { + PyException::new_err("permutation map missing node_permutation_old_to_new") + })?; + let mut new_to_old = HashMap::with_capacity(obj.len()); + for (old_text, new_val) in obj { + let old = old_text + .parse::() + .map_err(|e| PyException::new_err(format!("invalid node index {old_text:?}: {e}")))?; + let new = new_val + .as_u64() + .ok_or_else(|| PyException::new_err("permutation map value is not an integer"))? + as usize; + new_to_old.insert(new, old); + } + Ok(new_to_old) +} + +/// Relabel the bundle at `in_file` by reordering its graph via `method`, writing a fresh BEN bundle +/// at `out_file`. +#[pyfunction] +#[pyo3(signature = (in_file, out_file, method = "mlc".to_string(), overwrite = false))] +#[pyo3(text_signature = "(in_file, out_file, method='mlc', overwrite=False)")] +pub fn relabel_bundle( + in_file: PathBuf, + out_file: PathBuf, + method: String, + overwrite: bool, +) -> PyResult<()> { + let file = File::open(&in_file) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + in_file.display() + )) + })?; + + if !reader.is_finalized() { + return Err(PyException::new_err( + "relabel_bundle requires a finalized bundle", + )); + } + if !matches!(reader.assignment_format(), Some(AssignmentFormat::Ben)) { + return Err(PyValueError::new_err( + "relabel_bundle only supports BEN bundles; relabel before compressing to XBEN", + )); + } + let sample_count = reader.header().sample_count; + if reader.header().stream_len == 0 && sample_count == 0 { + return Err(PyValueError::new_err( + "relabel_bundle requires a non-empty assignment stream", + )); + } + + // The graph asset is required: it defines the ordering we permute into. + let graph_entry = reader + .find_asset_by_type(ASSET_TYPE_GRAPH) + .cloned() + .ok_or_else(|| PyValueError::new_err("bundle has no graph.json to reorder"))?; + let graph_bytes = reader + .asset_bytes(&graph_entry) + .map_err(|e| PyIOError::new_err(format!("Failed to read graph asset: {e}")))?; + + // Reorder the graph and derive the new->old permutation for the stream. + let (reordered_graph, map_bytes) = reorder_graph_to_bytes(&graph_bytes, &method)?; + let new_to_old = new_to_old_from_map_bytes(&map_bytes)?; + + // Carry over every other asset (skip the old graph and any old permutation map; we rewrite + // those as canonical assets below). + let entries: Vec<_> = reader.assets().to_vec(); + let mut preserved = Vec::new(); + for entry in &entries { + if entry.asset_type == ASSET_TYPE_GRAPH + || entry.asset_type == ASSET_TYPE_NODE_PERMUTATION_MAP + { + continue; + } + let payload = reader.asset_bytes(entry).map_err(|e| { + PyIOError::new_err(format!("Failed to read asset {:?}: {e}", entry.name)) + })?; + preserved.push(PreservedAsset { + asset_type: entry.asset_type, + name: entry.name.clone(), + is_json: entry.asset_flags & ASSET_FLAG_JSON != 0, + payload, + }); + } + + // Read the BEN stream and relabel it into the new node order. + let mut ben_bytes = Vec::new(); + reader + .assignment_stream_reader() + .map_err(|e| PyException::new_err(format!("Failed to open stream region: {e}")))? + .read_to_end(&mut ben_bytes) + .map_err(|e| PyIOError::new_err(format!("Failed to read BEN stream: {e}")))?; + let mut relabeled = Vec::new(); + relabel_ben_file( + Cursor::new(ben_bytes), + &mut relabeled, + RelabelOptions::node_permutation(new_to_old), + ) + .map_err(|e| PyException::new_err(format!("Failed to relabel BEN stream: {e}")))?; + + // Write the new bundle: reordered graph + permutation map (canonical), then the rest. + let buf = open_output(&out_file, overwrite)?; + let mut writer = BendlWriter::new(buf, AssignmentFormat::Ben) + .map_err(|e| PyIOError::new_err(format!("Failed to initialize bundle writer: {e}")))?; + writer + .add_known_asset( + KnownAssetKind::Graph, + &reordered_graph, + AddAssetOptions::defaults().json(), + ) + .map_err(map_bundle_err)?; + writer + .add_known_asset( + KnownAssetKind::NodePermutationMap, + &map_bytes, + AddAssetOptions::defaults().json(), + ) + .map_err(map_bundle_err)?; + for asset in &preserved { + add_preserved(&mut writer, asset).map_err(map_bundle_err)?; + } + + let mut session = writer.into_stream_session().map_err(map_bundle_err)?; + session + .write_all(&relabeled) + .map_err(|e| PyIOError::new_err(format!("Failed to write relabeled stream: {e}")))?; + let writer = session.finish_into_writer(sample_count); + writer.finish().map_err(map_bundle_err)?; + + Ok(()) +} diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index c7fe92f..42d1286 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -247,7 +247,9 @@ def _write_jsonl(samples: List[List[int]], path: Path) -> None: f.write("\n") -def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: +def _ben_bytes_for( + samples: List[List[int]], tmp: Path, variant: str = "standard" +) -> bytes: ben_path = tmp / "inner.ben" with BenEncoder(ben_path, overwrite=True, variant=variant) as enc: for a in samples: @@ -255,7 +257,9 @@ def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard return ben_path.read_bytes() -def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: +def _xben_bytes_for( + samples: List[List[int]], tmp: Path, variant: str = "standard" +) -> bytes: src = tmp / "src.jsonl" _write_jsonl(samples, src) out = tmp / "inner.xben" @@ -277,7 +281,9 @@ def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: def test_bundle_round_trip_ben_with_assets(tmp_path: Path) -> None: rng = random.Random(4242) - samples = [[rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40)] + samples = [ + [rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40) + ] # NetworkX adjacency format (what read_graph rebuilds into a live graph). graph_json = ( b'{"directed":false,"multigraph":false,"graph":{},' @@ -292,9 +298,25 @@ def test_bundle_round_trip_ben_with_assets(tmp_path: Path) -> None: stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples), assets=[ - _Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=metadata_json, is_json=True), - _Asset(asset_type=ASSET_TYPE_GRAPH, name="graph.json", payload=graph_json, is_json=True, compress=True), - _Asset(asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, name="node_permutation_map.json", payload=perm_json, is_json=True), + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=metadata_json, + is_json=True, + ), + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=graph_json, + is_json=True, + compress=True, + ), + _Asset( + asset_type=ASSET_TYPE_NODE_PERMUTATION_MAP, + name="node_permutation_map.json", + payload=perm_json, + is_json=True, + ), _Asset(asset_type=ASSET_TYPE_CUSTOM, name="notes.bin", payload=custom_blob), ], ) @@ -357,7 +379,9 @@ def test_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="only_custom.bin", payload=b"x")], + assets=[ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name="only_custom.bin", payload=b"x") + ], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) dec = BendlDecoder(path) @@ -412,7 +436,11 @@ def test_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="binary.bin", payload=b"\xff\xfe\xfd")], + assets=[ + _Asset( + asset_type=ASSET_TYPE_CUSTOM, name="binary.bin", payload=b"\xff\xfe\xfd" + ) + ], ) path = _write_bundle(tmp_path / "bin.bendl", bundle) dec = BendlDecoder(path) @@ -425,7 +453,14 @@ def test_read_json_asset_rejects_malformed_json(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1]], tmp_path), sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b"not a json {{{", is_json=True)], + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b"not a json {{{", + is_json=True, + ) + ], ) path = _write_bundle(tmp_path / "m.bendl", bundle) dec = BendlDecoder(path) @@ -449,8 +484,15 @@ def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: payloads = {f"asset_{i:04d}.bin": bytes([i & 0xFF] * (i + 1)) for i in range(200)} - assets = [_Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) for n, p in payloads.items()] - bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), sample_count=1, assets=assets) + assets = [ + _Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) + for n, p in payloads.items() + ] + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), + sample_count=1, + assets=assets, + ) path = _write_bundle(tmp_path / "many.bendl", bundle) dec = BendlDecoder(path) assert dec.asset_names() == list(payloads.keys()) @@ -461,16 +503,30 @@ def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: def test_list_assets_flag_fidelity(tmp_path: Path) -> None: combos: List[Tuple[bool, bool, bool]] = [ - (False, False, False), (True, False, False), (False, True, False), - (False, False, True), (True, True, False), (True, False, True), - (False, True, True), (True, True, True), + (False, False, False), + (True, False, False), + (False, True, False), + (False, False, True), + (True, True, False), + (True, False, True), + (False, True, True), + (True, True, True), ] assets: List[_Asset] = [] expected: List[List[str]] = [] for i, (is_json, compress, has_checksum) in enumerate(combos): payload = f'{{"i":{i}}}'.encode("utf-8") if is_json else bytes([i % 256]) * 32 checksum = b"\xde\xad\xbe\xef" if has_checksum else None - assets.append(_Asset(asset_type=ASSET_TYPE_CUSTOM, name=f"asset-{i}.bin", payload=payload, is_json=is_json, compress=compress, checksum=checksum)) + assets.append( + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name=f"asset-{i}.bin", + payload=payload, + is_json=is_json, + compress=compress, + checksum=checksum, + ) + ) want: List[str] = [] if is_json: want.append("json") @@ -479,7 +535,12 @@ def test_list_assets_flag_fidelity(tmp_path: Path) -> None: if has_checksum: want.append("checksum") expected.append(want) - bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, assets=assets, checksums=False) + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + assets=assets, + checksums=False, + ) path = _write_bundle(tmp_path / "flags.bendl", bundle) got = BendlDecoder(path).list_assets() for entry, want in zip(got, expected): @@ -505,7 +566,9 @@ def test_zero_length_custom_payload(tmp_path: Path) -> None: def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) -> None: - bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1 + ) path = _write_bundle(tmp_path / "a.bendl", bundle) dec = BendlDecoder(path) target = tmp_path / "already.ben" @@ -516,7 +579,9 @@ def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) def test_extract_stream_into_missing_parent_dir_raises(tmp_path: Path) -> None: - bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1 + ) path = _write_bundle(tmp_path / "mini.bendl", bundle) dec = BendlDecoder(path) with pytest.raises(OSError): @@ -542,7 +607,11 @@ def test_open_rejects_plain_stream(tmp_path: Path) -> None: def test_open_rejects_unsupported_major_version(tmp_path: Path) -> None: - bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, major_version=999) + bundle = build_bundle( + stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), + sample_count=1, + major_version=999, + ) path = _write_bundle(tmp_path / "oldfuture.bendl", bundle) with pytest.raises(Exception, match="Failed to parse bundle header"): BendlDecoder(path) @@ -596,7 +665,14 @@ def test_open_rejects_malformed_directory_invariants(tmp_path: Path) -> None: wrong = build_bundle( stream_bytes=stream, sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="not_metadata.json", payload=b"{}", is_json=True)], + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="not_metadata.json", + payload=b"{}", + is_json=True, + ) + ], ) with pytest.raises(Exception, match="malformed directory"): BendlDecoder(_write_bundle(tmp_path / "singleton.bendl", wrong)) @@ -631,7 +707,15 @@ def test_corrupted_xz_asset_raises(tmp_path: Path) -> None: build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[_Asset(asset_type=ASSET_TYPE_GRAPH, name="graph.json", payload=b'{"nodes":[0,1,2,3,4,5,6,7,8,9]}', is_json=True, compress=True)], + assets=[ + _Asset( + asset_type=ASSET_TYPE_GRAPH, + name="graph.json", + payload=b'{"nodes":[0,1,2,3,4,5,6,7,8,9]}', + is_json=True, + compress=True, + ) + ], ) ) xz_start = bundle.find(b"\xfd7zXZ") @@ -701,9 +785,15 @@ def test_interrupted_zero_bytes_after_header(tmp_path: Path) -> None: assert extracted.read_bytes() == b"" -def test_finalized_bundle_with_inflated_stream_len_survives_open(tmp_path: Path) -> None: +def test_finalized_bundle_with_inflated_stream_len_survives_open( + tmp_path: Path, +) -> None: samples = [[1, 2, 3], [4, 5, 6]] - bundle = bytearray(build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples))) + bundle = bytearray( + build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) + ) old_stream_len = struct.unpack_from(" None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2], [3, 4]], tmp_path), sample_count=2, - assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b'{"x":1}', is_json=True)], + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b'{"x":1}', + is_json=True, + ) + ], ) path = _write_bundle(tmp_path / "seq.bendl", bundle) dec = BendlDecoder(path) @@ -742,7 +839,14 @@ def test_toc_interleaved_with_iteration(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples), - assets=[_Asset(asset_type=ASSET_TYPE_METADATA, name="metadata.json", payload=b'{"tag":42}', is_json=True)], + assets=[ + _Asset( + asset_type=ASSET_TYPE_METADATA, + name="metadata.json", + payload=b'{"tag":42}', + is_json=True, + ) + ], ) path = _write_bundle(tmp_path / "interleave.bendl", bundle) dec = BendlDecoder(path) @@ -763,7 +867,12 @@ def test_read_asset_bytes_idempotent(tmp_path: Path) -> None: sample_count=1, assets=[ _Asset(asset_type=ASSET_TYPE_CUSTOM, name="raw.bin", payload=payload), - _Asset(asset_type=ASSET_TYPE_CUSTOM, name="compressed.bin", payload=payload, compress=True), + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name="compressed.bin", + payload=payload, + compress=True, + ), ], ) path = _write_bundle(tmp_path / "idem.bendl", bundle) @@ -780,7 +889,9 @@ def test_read_asset_bytes_idempotent(tmp_path: Path) -> None: def test_iteration_can_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "twice.bendl", bundle) dec = BendlDecoder(path) assert list(dec) == samples @@ -789,7 +900,9 @@ def test_iteration_can_restart(tmp_path: Path) -> None: def test_partial_iteration_then_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6], [7, 8]] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "partial.bendl", bundle) dec = BendlDecoder(path) it = iter(dec) @@ -800,7 +913,9 @@ def test_partial_iteration_then_restart(tmp_path: Path) -> None: def test_subsample_modes(tmp_path: Path) -> None: samples = [[i] for i in range(1, 11)] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "sub.bendl", bundle) dec = BendlDecoder(path).subsample_range(3, 6) @@ -816,7 +931,9 @@ def test_subsample_modes(tmp_path: Path) -> None: def test_subsample_count_preserves_filtered_len(tmp_path: Path) -> None: samples = [[i] for i in range(1, 9)] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "cnt.bendl", bundle) dec = BendlDecoder(path).subsample_range(2, 5) assert len(dec) == 4 @@ -827,7 +944,9 @@ def test_subsample_count_preserves_filtered_len(tmp_path: Path) -> None: def test_subsample_out_of_bounds(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "oob.bendl", bundle) with pytest.raises(Exception, match="end must be <= number of samples"): BendlDecoder(path).subsample_range(1, 99) @@ -840,7 +959,9 @@ def test_subsample_out_of_bounds(tmp_path: Path) -> None: def test_len_uses_header_fast_path(tmp_path: Path) -> None: samples = [[i] for i in range(1, 6)] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) + ) path = _write_bundle(tmp_path / "fast.bendl", bundle) dec = BendlDecoder(path) assert len(dec) == len(samples) @@ -856,11 +977,25 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: truth: List[Tuple[str, bytes]] = [] for i in range(n_assets): payload = rng.randbytes(rng.choice([0, 1, 7, 64, 500])) - assets.append(_Asset(asset_type=ASSET_TYPE_CUSTOM, name=f"t{trial}-a{i}.bin", payload=payload, compress=rng.random() < 0.4)) + assets.append( + _Asset( + asset_type=ASSET_TYPE_CUSTOM, + name=f"t{trial}-a{i}.bin", + payload=payload, + compress=rng.random() < 0.4, + ) + ) truth.append((f"t{trial}-a{i}.bin", payload)) n_samples = rng.randint(1, 25) - samples = [[rng.randint(1, 8) for _ in range(rng.randint(1, 40))] for _ in range(n_samples)] - bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=n_samples, assets=assets) + samples = [ + [rng.randint(1, 8) for _ in range(rng.randint(1, 40))] + for _ in range(n_samples) + ] + bundle = build_bundle( + stream_bytes=_ben_bytes_for(samples, tmp_path), + sample_count=n_samples, + assets=assets, + ) path = _write_bundle(tmp_path / f"fuzz-{trial}.bendl", bundle) dec = BendlDecoder(path) assert dec.count_samples() == n_samples diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index b415dc3..63a4811 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -56,7 +56,12 @@ def test_create_round_trip_all_asset_kinds(tmp_path: Path) -> None: assert dec.is_complete() assert dec.count_samples() == len(samples) assert dec.assignment_format() == "ben" - assert dec.asset_names() == ["graph.json", "metadata.json", "notes.txt", "post.json"] + assert dec.asset_names() == [ + "graph.json", + "metadata.json", + "notes.txt", + "post.json", + ] assert dec.read_metadata() == {"seed": 1234} assert dec.read_asset_bytes("notes.txt") == b"hello world" assert dec.read_json_asset("post.json") == {"k": [1, 2, 3]} @@ -143,7 +148,9 @@ def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: with pytest.raises(Exception, match="unfinalized"): dec.extract_stream(tmp_path / "recovered.ben") # ...but the partial write is recoverable. - dec.extract_stream(tmp_path / "recovered.ben", overwrite=True, allow_unfinalized=True) + dec.extract_stream( + tmp_path / "recovered.ben", overwrite=True, allow_unfinalized=True + ) assert (tmp_path / "recovered.ben").stat().st_size > 0 @@ -209,6 +216,18 @@ def test_add_graph_none_stores_raw_without_permutation_map(tmp_path: Path) -> No assert dec.read_node_permutation_map() is None +def test_add_graph_defaults_to_mlc_reorder(tmp_path: Path) -> None: + # With no preprocess_method, add_graph reorders via MLC and stores a map. + path = tmp_path / "default.bendl" + enc = BendlEncoder(path, overwrite=True) + returned = enc.add_graph(_graph()) + enc.close() + assert returned.number_of_nodes() == _n() + dec = BendlDecoder(path) + assert dec.asset_names() == ["graph.json", "node_permutation_map.json"] + assert dec.read_node_permutation_map()["ordering_method"] == "multi-level-cluster" + + def test_add_graph_node_count_mismatch_raises(tmp_path: Path) -> None: n = _n() enc = BendlEncoder(tmp_path / "nc.bendl", overwrite=True) diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index 5a68aa3..f81216f 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -297,7 +297,9 @@ def test_benencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> Non with pytest.raises(OSError, match="already exists"): BenEncoder(out, overwrite=False, variant="standard") with pytest.raises(OSError, match="Failed to create"): - BenEncoder(tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard") + BenEncoder( + tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard" + ) # ---------- Decoder error / laziness paths ---------- @@ -403,13 +405,17 @@ def test_codec_helpers_reject_unknown_variants(tmp_path: Path) -> None: encode_jsonl_to_xben(src, tmp_path / "o.xben", overwrite=True, variant="weird") -def test_codec_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: Path) -> None: +def test_codec_helpers_reject_same_path_missing_input_and_bad_json( + tmp_path: Path, +) -> None: src = tmp_path / "src.jsonl" write_jsonl([[1, 1, 2]], src) with pytest.raises(OSError, match="must differ"): encode_jsonl_to_ben(src, src, overwrite=True, variant="standard") with pytest.raises(OSError, match="does not exist"): - encode_jsonl_to_ben(tmp_path / "missing.jsonl", tmp_path / "o.ben", overwrite=True) + encode_jsonl_to_ben( + tmp_path / "missing.jsonl", tmp_path / "o.ben", overwrite=True + ) bad_json = tmp_path / "bad.jsonl" bad_json.write_text("not json\n", encoding="utf-8") with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): @@ -418,7 +424,9 @@ def test_codec_helpers_reject_same_path_missing_input_and_bad_json(tmp_path: Pat def test_encode_ben_to_xben_error_paths(tmp_path: Path) -> None: with pytest.raises(OSError, match="does not exist"): - encode_ben_to_xben(tmp_path / "missing.ben", tmp_path / "o.xben", overwrite=True) + encode_ben_to_xben( + tmp_path / "missing.ben", tmp_path / "o.xben", overwrite=True + ) bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="must differ"): @@ -429,7 +437,9 @@ def test_encode_ben_to_xben_error_paths(tmp_path: Path) -> None: def test_decode_helpers_error_paths(tmp_path: Path) -> None: with pytest.raises(OSError, match="does not exist"): - decode_ben_to_jsonl(tmp_path / "missing.ben", tmp_path / "o.jsonl", overwrite=True) + decode_ben_to_jsonl( + tmp_path / "missing.ben", tmp_path / "o.jsonl", overwrite=True + ) bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): diff --git a/ben-py/tests/test_relabel.py b/ben-py/tests/test_relabel.py new file mode 100644 index 0000000..24ad5a4 --- /dev/null +++ b/ben-py/tests/test_relabel.py @@ -0,0 +1,127 @@ +"""Tests for ``binary_ensemble.bundle.relabel_bundle`` (reorder graph + relabel stream).""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from binary_ensemble.bundle import ( + BendlDecoder, + BendlEncoder, + compress_stream, + relabel_bundle, +) + +EXAMPLE_GRAPH = ( + Path(__file__).resolve().parent.parent + / "docs" + / "user" + / "example_data" + / "gerrymandria.json" +) + + +def _graph(): + return json.loads(EXAMPLE_GRAPH.read_text()) + + +def _n(): + return len(_graph()["nodes"]) + + +def _build_ben_bundle(path: Path, with_graph: bool = True): + n = _n() + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(8)] + enc = BendlEncoder(path, overwrite=True) + if with_graph: + enc.add_graph(_graph(), preprocess_method=None) # store in raw order + enc.add_metadata({"seed": 99}) + with enc.stream("ben") as s: + for a in samples: + s.write(a) + enc.add_asset("notes.txt", "hi", content_type="text") + return samples + + +def _depermute(dst_plan, old_to_new): + """Map an MLC-ordered plan back to the source node order.""" + return [dst_plan[old_to_new[i]] for i in range(len(dst_plan))] + + +def test_relabel_out_file_is_lossless_and_preserves_assets(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + out = tmp_path / "out.bendl" + samples = _build_ben_bundle(src) + + relabel_bundle(src, out_file=out, method="mlc") + + dec = BendlDecoder(out) + # Stays BEN, same sample count, canonical graph + permutation map present. + assert dec.assignment_format() == "ben" + assert len(dec) == len(samples) + assert dec.asset_names() == [ + "graph.json", + "node_permutation_map.json", + "metadata.json", + "notes.txt", + ] + # Metadata + custom assets carried over. + assert dec.read_metadata() == {"seed": 99} + assert dec.read_asset_bytes("notes.txt") == b"hi" + + pmap = dec.read_node_permutation_map() + old_to_new = {int(k): v for k, v in pmap["node_permutation_old_to_new"].items()} + assert sorted(old_to_new) == list(range(_n())) + assert sorted(old_to_new.values()) == list(range(_n())) + + # Relabeling is lossless: de-permuting reproduces the source plans exactly. + relabeled = list(dec) + assert [_depermute(p, old_to_new) for p in relabeled] == samples + # Source bundle is untouched. + assert list(BendlDecoder(src)) == samples + + +def test_relabel_in_place(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + samples = _build_ben_bundle(src) + + relabel_bundle(src, in_place=True, method="rcm") + + dec = BendlDecoder(src) + assert dec.assignment_format() == "ben" + assert len(dec) == len(samples) + assert dec.read_node_permutation_map()["ordering_method"] == "reverse-cuthill-mckee" + old_to_new = { + int(k): v + for k, v in dec.read_node_permutation_map()[ + "node_permutation_old_to_new" + ].items() + } + assert [_depermute(p, old_to_new) for p in dec] == samples + + +def test_relabel_arg_validation(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + _build_ben_bundle(src) + with pytest.raises(ValueError, match="either in_place=True or out_file"): + relabel_bundle(src) + with pytest.raises(ValueError, match="not both"): + relabel_bundle(src, out_file=tmp_path / "o.bendl", in_place=True) + + +def test_relabel_requires_graph(tmp_path: Path) -> None: + src = tmp_path / "nograph.bendl" + _build_ben_bundle(src, with_graph=False) + with pytest.raises(ValueError, match="no graph.json"): + relabel_bundle(src, out_file=tmp_path / "o.bendl") + + +def test_relabel_rejects_xben_bundle(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + xben = tmp_path / "in.xben.bendl" + _build_ben_bundle(src) + compress_stream(src, out_file=xben) + with pytest.raises(ValueError, match="only supports BEN"): + relabel_bundle(xben, out_file=tmp_path / "o.bendl") diff --git a/ben-py/tests/test_surface.py b/ben-py/tests/test_surface.py index 46609d7..e0a0dc5 100644 --- a/ben-py/tests/test_surface.py +++ b/ben-py/tests/test_surface.py @@ -35,6 +35,7 @@ def test_top_level_exports() -> None: "BendlEncoder", "BendlDecoder", "compress_stream", + "relabel_bundle", "BenEncoder", "BenDecoder", "encode_jsonl_to_ben", @@ -56,7 +57,12 @@ def test_stream_module_exports() -> None: def test_bundle_module_exports() -> None: - assert set(bundle.__all__) == {"BendlEncoder", "BendlDecoder", "compress_stream"} + assert set(bundle.__all__) == { + "BendlEncoder", + "BendlDecoder", + "compress_stream", + "relabel_bundle", + } assert bundle.BendlDecoder is _core.BendlDecoder @@ -239,10 +245,15 @@ def _params_from_inspect(func, *, drop_self: bool): def test_bundle_facade_matches_stub() -> None: stub = _parse_stub(PKG_DIR / "bundle.pyi") - # compress_stream (module-level function). - assert _params_from_inspect(bundle.compress_stream, drop_self=False) == stub[ - "compress_stream" - ][1] + # Module-level functions. + assert ( + _params_from_inspect(bundle.compress_stream, drop_self=False) + == stub["compress_stream"][1] + ) + assert ( + _params_from_inspect(bundle.relabel_bundle, drop_self=False) + == stub["relabel_bundle"][1] + ) # BendlEncoder methods. enc_methods = stub["BendlEncoder"][1] diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index ed89806..9c6921d 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -616,7 +616,10 @@ fn decode_twodelta_ben_to_assignments(ben: &[u8]) -> Vec> { #[test] fn twodelta_ben_first_frame_carries_snapshot_tag() { let ben = make_twodelta_ben(&[vec![1u16, 1, 2, 2], vec![1u16, 2, 1, 2]]); - assert_eq!(collect_twodelta_tags(&ben).first().copied(), Some(BEN_TWODELTA_SNAPSHOT_TAG)); + assert_eq!( + collect_twodelta_tags(&ben).first().copied(), + Some(BEN_TWODELTA_SNAPSHOT_TAG) + ); } #[test] @@ -687,11 +690,11 @@ fn twodelta_ben_count_samples_over_mixed_stream() { let anchor = vec![1u16, 1, 2, 2]; let assignments = vec![ anchor.clone(), - anchor.clone(), // repeat of anchor - vec![1u16, 2, 1, 2], // delta - vec![3u16, 3, 1, 2], // snapshot - vec![3u16, 3, 1, 2], // repeat of snapshot - vec![3u16, 3, 2, 1], // delta + anchor.clone(), // repeat of anchor + vec![1u16, 2, 1, 2], // delta + vec![3u16, 3, 1, 2], // snapshot + vec![3u16, 3, 1, 2], // repeat of snapshot + vec![3u16, 3, 2, 1], // delta ]; let ben = make_twodelta_ben(&assignments); let reader = crate::io::reader::BenStreamReader::from_ben(ben.as_slice()).unwrap(); @@ -728,8 +731,15 @@ fn decode_xben_to_jsonl_twodelta_mixed_via_translate_roundtrip() { let ben = make_twodelta_ben(&assignments); let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None) - .unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let mut jsonl = Vec::new(); decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); @@ -752,8 +762,15 @@ fn decode_xben_to_jsonl_twodelta_delta_snapshot_repeat_delta_via_translate() { let ben = make_twodelta_ben(&assignments); let mut xben = Vec::new(); - encode_ben_to_xben(BufReader::new(ben.as_slice()), &mut xben, Some(1), Some(0), None, None) - .unwrap(); + encode_ben_to_xben( + BufReader::new(ben.as_slice()), + &mut xben, + Some(1), + Some(0), + None, + None, + ) + .unwrap(); let mut jsonl = Vec::new(); decode_xben_to_jsonl(BufReader::new(xben.as_slice()), &mut jsonl).unwrap(); diff --git a/ben/src/io/reader/stream_reader/ben.rs b/ben/src/io/reader/stream_reader/ben.rs index 41769e5..038ec04 100644 --- a/ben/src/io/reader/stream_reader/ben.rs +++ b/ben/src/io/reader/stream_reader/ben.rs @@ -14,8 +14,8 @@ use crate::BenVariant; /// Read the next frame from the underlying BEN stream. /// /// Every frame of a `TwoDelta` stream is prefixed with a 1-byte tag selecting its body layout: a -/// `BEN_TWODELTA_SNAPSHOT_TAG` frame is `MkvChain`-formatted and a `BEN_TWODELTA_DELTA_TAG` frame is -/// a delta. The tag is consumed here so the frame module stays variant-clean. Non-`TwoDelta` +/// `BEN_TWODELTA_SNAPSHOT_TAG` frame is `MkvChain`-formatted and a `BEN_TWODELTA_DELTA_TAG` frame +/// is a delta. The tag is consumed here so the frame module stays variant-clean. Non-`TwoDelta` /// streams carry no tag and read their fixed body directly. pub(super) fn pop_frame_from_reader( reader: &mut R, diff --git a/ben/src/io/reader/tests.rs b/ben/src/io/reader/tests.rs index e84b83a..46b6f23 100644 --- a/ben/src/io/reader/tests.rs +++ b/ben/src/io/reader/tests.rs @@ -255,7 +255,9 @@ fn raw_frame_surface_roundtrips_mixed_twodelta_ben() { // self-contained Standard frame. A mixed snapshot/delta stream must round-trip across it. let assignments = mixed_twodelta_assignments(); let ben = make_ben_from_assignments(&assignments, BenVariant::TwoDelta); - let frames = BenStreamReader::from_ben(Cursor::new(ben)).unwrap().into_frames(); + let frames = BenStreamReader::from_ben(Cursor::new(ben)) + .unwrap() + .into_frames(); assert_eq!(expand_raw_frames(frames), assignments); } @@ -263,7 +265,9 @@ fn raw_frame_surface_roundtrips_mixed_twodelta_ben() { fn raw_frame_surface_roundtrips_mixed_twodelta_xben() { let assignments = mixed_twodelta_assignments(); let xben = make_xben_from_assignments(&assignments, BenVariant::TwoDelta); - let frames = BenStreamReader::from_xben(Cursor::new(xben)).unwrap().into_frames(); + let frames = BenStreamReader::from_xben(Cursor::new(xben)) + .unwrap() + .into_frames(); assert_eq!(expand_raw_frames(frames), assignments); } @@ -278,7 +282,14 @@ fn subsample_mixed_twodelta_ben_selects_correct_samples() { .into_subsample_by_indices(vec![1, 3, 5]) .map(|r| r.unwrap().0) .collect(); - assert_eq!(results, vec![assignments[0].clone(), assignments[2].clone(), assignments[4].clone()]); + assert_eq!( + results, + vec![ + assignments[0].clone(), + assignments[2].clone(), + assignments[4].clone() + ] + ); } #[test] @@ -290,7 +301,14 @@ fn subsample_mixed_twodelta_xben_selects_correct_samples() { .into_subsample_by_indices(vec![1, 3, 5]) .map(|r| r.unwrap().0) .collect(); - assert_eq!(results, vec![assignments[0].clone(), assignments[2].clone(), assignments[4].clone()]); + assert_eq!( + results, + vec![ + assignments[0].clone(), + assignments[2].clone(), + assignments[4].clone() + ] + ); } #[test] @@ -1709,8 +1727,8 @@ fn raw_frame_iter_propagates_twodelta_decode_error() { u32::from_be_bytes(ben[anchor_start + 2..anchor_start + 6].try_into().unwrap()) as usize; let anchor_end = anchor_start + 6 + n_bytes + 2; - // The delta frame: delta_tag(1) + pair_a(2) + pair_b(2) + max_len_bits(1) + ... Set max_len_bits - // to 0, which triggers InvalidData during decoding. + // The delta frame: delta_tag(1) + pair_a(2) + pair_b(2) + max_len_bits(1) + ... Set + // max_len_bits to 0, which triggers InvalidData during decoding. ben[anchor_end + 5] = 0; let reader = BenStreamReader::from_ben(Cursor::new(ben)).unwrap(); diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index f72744e..9a35baa 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -648,7 +648,10 @@ fn xz_writer_twodelta_mixed_snapshot_delta_direct_roundtrip() { vec![3u16, 3, 1, 2], // 3 ids → mid-stream full vec![3u16, 3, 2, 1], // delta from the snapshot ]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::TwoDelta), + assignments + ); } #[test] @@ -661,7 +664,10 @@ fn xz_writer_twodelta_new_district_falls_back_to_snapshot_direct() { vec![1u16, 1, 2, 2], // introduces district 2 → snapshot vec![1u16, 2, 1, 2], // delta (both ids present) ]; - assert_eq!(roundtrip_xben(&assignments, BenVariant::TwoDelta), assignments); + assert_eq!( + roundtrip_xben(&assignments, BenVariant::TwoDelta), + assignments + ); } #[test] diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 5173910..172ada1 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -81,10 +81,7 @@ pub(crate) enum TransitionKind { /// /// `zip` would silently truncate to the shorter vector, so the length is checked explicitly, /// preserving the validation the strict single-frame encoder performs. -pub(crate) fn classify_transition( - previous: &[u16], - current: &[u16], -) -> io::Result { +pub(crate) fn classify_transition(previous: &[u16], current: &[u16]) -> io::Result { if previous.len() != current.len() { return Err(io::Error::new( io::ErrorKind::InvalidData, diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs index 083dbc9..a33ee28 100644 --- a/ben/tests/test_format_stability.rs +++ b/ben/tests/test_format_stability.rs @@ -395,8 +395,14 @@ fn flip_unknown_flag_bits(mut bytes: Vec) -> Vec { #[test] #[ignore = "regenerates committed v1.0.0 fixtures; never run as part of normal CI"] fn generate_format_stability_fixtures() { - write_fixture("standard.ben", &mint_ben(BenVariant::Standard, CANONICAL_JSONL)); - write_fixture("mkvchain.ben", &mint_ben(BenVariant::MkvChain, CANONICAL_JSONL)); + write_fixture( + "standard.ben", + &mint_ben(BenVariant::Standard, CANONICAL_JSONL), + ); + write_fixture( + "mkvchain.ben", + &mint_ben(BenVariant::MkvChain, CANONICAL_JSONL), + ); write_fixture( "twodelta.ben", &mint_ben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), @@ -435,9 +441,10 @@ fn generate_format_stability_fixtures() { #[ignore = "regenerates only the (unreleased) TwoDelta fixtures; never run as part of normal CI"] fn regenerate_twodelta_fixtures() { // `TwoDelta` is unreleased, so its wire format may change and its fixtures may be re-minted in - // place (the "committed before any release shipped" escape hatch in `docs/format-stability.md`). - // This regenerator touches *only* the TwoDelta fixtures and their source, leaving every released - // Standard/MkvChain/BENDL fixture byte-for-byte untouched. + // place (the "committed before any release shipped" escape hatch in + // `docs/format-stability.md`). This regenerator touches *only* the TwoDelta fixtures and + // their source, leaving every released Standard/MkvChain/BENDL fixture byte-for-byte + // untouched. write_fixture( "twodelta.ben", &mint_ben(BenVariant::TwoDelta, TWODELTA_CANONICAL_JSONL), From 3ca70d0f434400a7db43d8c178e981131e729ea2 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:33:40 -0600 Subject: [PATCH 136/221] api change to add_graph on the python side --- ben-py/binary_ensemble/_core.pyi | 14 ++-- ben-py/binary_ensemble/bundle.py | 49 ++++++++------ ben-py/binary_ensemble/bundle.pyi | 5 +- ben-py/binary_ensemble/graph.py | 27 ++++---- ben-py/binary_ensemble/graph.pyi | 6 +- ben-py/src/encode/bundle_encoder.rs | 33 +++++----- ben-py/src/graph/helpers.rs | 99 ++++++++++++++++++++++------- ben-py/src/graph/py_funcs.rs | 18 +++--- ben-py/src/relabel.rs | 16 +++-- ben-py/tests/test_bundle_api.py | 22 +++---- ben-py/tests/test_graph.py | 28 ++++++++ ben-py/tests/test_recompress.py | 2 +- ben-py/tests/test_relabel.py | 23 ++++++- 13 files changed, 234 insertions(+), 108 deletions(-) diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index 44b33f0..7e57e30 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -110,8 +110,11 @@ class BendlEncoder: ) -> None: ... def add_metadata(self, metadata: Any) -> None: ... # Returns the (possibly reordered) graph as a NetworkX graph, matching - # BendlDecoder.read_graph. preprocess_method defaults to "mlc"; pass None for raw. - def add_graph(self, graph: Any, preprocess_method: str | None = "mlc") -> Any: ... + # BendlDecoder.read_graph. sort defaults to "mlc"; sort="key" sorts by `key`; + # sort=None stores raw. + def add_graph( + self, graph: Any, sort: str | None = "mlc", key: str | None = None + ) -> Any: ... def stream( self, format: Literal["ben"] = "ben", @@ -162,13 +165,16 @@ def decode_xben_to_ben( # Graph reordering and bundle recompression # --------------------------------------------------------------------------- -def graph_reorder(graph: Any, method: str) -> tuple[Any, Any]: ... +def graph_reorder( + graph: Any, sort: str | None = "mlc", key: str | None = None +) -> tuple[Any, Any]: ... def recompress_bundle( in_file: str | Path, out_file: str | Path, overwrite: bool = False ) -> None: ... def relabel_bundle( in_file: str | Path, out_file: str | Path, - method: str = "mlc", + sort: str | None = "mlc", + key: str | None = None, overwrite: bool = False, ) -> None: ... diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 737bfab..933b8ce 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -108,22 +108,31 @@ def append(cls, file_path) -> "BendlEncoder": self._enc = _CoreBendlEncoder.append(file_path) return self - def add_graph(self, graph: Any, preprocess_method: Optional[str] = "mlc") -> Any: + def add_graph( + self, graph: Any, sort: Optional[str] = "mlc", key: Optional[str] = None + ) -> Any: """Embed the dual ``graph.json`` and return the (possibly reordered) graph. - ``preprocess_method`` defaults to ``"mlc"`` (multi-level clustering), so - by default the graph is reordered for better compression. When it is not - ``None`` the graph is reordered (``"rcm"``, ``"mlc"``, or a node-attribute - key) and both ``graph.json`` and ``node_permutation_map.json`` are stored; - the reordered graph is returned so the chain runs on that ordering. - Reordering is pre-stream only. Pass ``preprocess_method=None`` to store - the graph as-is with no permutation map. + ``sort`` selects how nodes are ordered and defaults to ``"mlc"`` (so the + graph is reordered for better compression): + + - ``"mlc"`` — multi-level clustering, + - ``"rcm"`` — reverse Cuthill-McKee, + - ``"key"`` — sort by the node attribute named in ``key`` (e.g. + ``sort="key", key="GEOID"``; ``key="id"`` sorts by the NetworkX node id), + - ``None`` — store the graph as-is, with no permutation map. + + When reordering, both ``graph.json`` and ``node_permutation_map.json`` are + stored and the reordered graph is returned so the chain runs on that + ordering. Reordering is pre-stream only; a raw graph (``sort=None``) may + also be attached post-stream / in append mode. ``key`` is only valid with + ``sort="key"``. The graph is returned as a NetworkX graph (matching :meth:`BendlDecoder.read_graph`), so its node order is the order the chain should write assignments in. """ - return self._enc.add_graph(graph, preprocess_method) + return self._enc.add_graph(graph, sort, key) def add_metadata(self, metadata: Any) -> None: """Embed the canonical ``metadata.json`` asset (a dict/list, bytes, or path).""" @@ -209,24 +218,26 @@ def compress_stream( def relabel_bundle( path, out_file=None, - method: str = "mlc", + sort: str = "mlc", + key: Optional[str] = None, in_place: bool = False, ) -> None: - """Reorder a BEN bundle's graph by ``method`` and relabel its stream to match. + """Reorder a BEN bundle's graph and relabel its stream to match. - Reorders the embedded ``graph.json`` (``"mlc"`` by default; also ``"rcm"`` or - a node-attribute key), rewrites every assignment into the new node order, and - writes a fresh bundle storing the reordered graph and a - ``node_permutation_map.json`` (so the reordering is reversible). Metadata and - custom assets are preserved. This is the bundle-level form of the CLI's - ``reben`` ordering flow — typically run to shrink a bundle before an XBEN - recompress. + ``sort`` selects the ordering — ``"mlc"`` (default), ``"rcm"``, or ``"key"`` + to sort by the node attribute named in ``key`` (e.g. ``sort="key", + key="GEOID"``). It reorders the embedded ``graph.json``, rewrites every + assignment into the new node order, and writes a fresh bundle storing the + reordered graph and a ``node_permutation_map.json`` (so the reordering is + reversible). Metadata and custom assets are preserved. This is the + bundle-level form of the CLI's ``reben`` ordering flow — typically run to + shrink a bundle before an XBEN recompress. Provide exactly one of ``in_place=True`` or ``out_file``. Only BEN bundles are supported (relabel before compressing to XBEN); the source must carry a graph. """ _atomic_or_out( - lambda src, dst, overwrite: _relabel_bundle(src, dst, method, overwrite), + lambda src, dst, overwrite: _relabel_bundle(src, dst, sort, key, overwrite), path, out_file, in_place, diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi index 813efb6..3cdaeed 100644 --- a/ben-py/binary_ensemble/bundle.pyi +++ b/ben-py/binary_ensemble/bundle.pyi @@ -10,7 +10,7 @@ class BendlEncoder: @classmethod def append(cls, file_path) -> "BendlEncoder": ... def add_graph( - self, graph: Any, preprocess_method: Optional[str] = "mlc" + self, graph: Any, sort: Optional[str] = "mlc", key: Optional[str] = None ) -> Any: ... def add_metadata(self, metadata: Any) -> None: ... def add_asset( @@ -34,6 +34,7 @@ def compress_stream( def relabel_bundle( path, out_file=None, - method: str = "mlc", + sort: str = "mlc", + key: Optional[str] = None, in_place: bool = False, ) -> None: ... diff --git a/ben-py/binary_ensemble/graph.py b/ben-py/binary_ensemble/graph.py index 6a263fd..cf62009 100644 --- a/ben-py/binary_ensemble/graph.py +++ b/ben-py/binary_ensemble/graph.py @@ -12,13 +12,13 @@ an object with a ``node_permutation_old_to_new`` field mapping original zero-based node positions to their new positions. -To reorder *and* embed the result in a bundle in one step, pass -``preprocess_method`` to :meth:`binary_ensemble.bundle.BendlEncoder.add_graph`. +To reorder *and* embed the result in a bundle in one step, pass ``sort`` / ``key`` +to :meth:`binary_ensemble.bundle.BendlEncoder.add_graph`. """ from __future__ import annotations -from typing import Any, Tuple +from typing import Any, Optional, Tuple from binary_ensemble._core import graph_reorder @@ -30,26 +30,29 @@ ] -def reorder(graph: Any, method: str) -> Tuple[Any, Any]: - """Reorder ``graph`` by ``method`` and return ``(reordered_graph, node_permutation_map)``. +def reorder( + graph: Any, sort: str = "mlc", key: Optional[str] = None +) -> Tuple[Any, Any]: + """Reorder ``graph`` and return ``(reordered_graph, node_permutation_map)``. - ``method`` is one of ``"multi-level-cluster"`` / ``"mlc"``, - ``"reverse-cuthill-mckee"`` / ``"rcm"``, or a node-attribute key (e.g. - ``"geoid"``, or the special ``"id"`` for the NetworkX node id). + ``sort`` is ``"mlc"`` (multi-level clustering, the default), ``"rcm"`` + (reverse Cuthill-McKee), or ``"key"`` to sort by the node attribute named in + ``key`` (e.g. ``sort="key", key="GEOID"``; ``key="id"`` sorts by the NetworkX + node id). ``key`` is only valid with ``sort="key"``. """ - return graph_reorder(graph, method) + return graph_reorder(graph, sort, key) def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: """Reorder ``graph`` using recursive multi-level clustering.""" - return graph_reorder(graph, "multi-level-cluster") + return graph_reorder(graph, "mlc") def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: """Reorder ``graph`` using Reverse Cuthill-McKee.""" - return graph_reorder(graph, "reverse-cuthill-mckee") + return graph_reorder(graph, "rcm") def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: """Reorder ``graph`` by sorting on a node-attribute ``key`` (use ``"id"`` for node id).""" - return graph_reorder(graph, key) + return graph_reorder(graph, "key", key) diff --git a/ben-py/binary_ensemble/graph.pyi b/ben-py/binary_ensemble/graph.pyi index 0317d93..0c0e70b 100644 --- a/ben-py/binary_ensemble/graph.pyi +++ b/ben-py/binary_ensemble/graph.pyi @@ -1,4 +1,4 @@ -from typing import Any, Tuple +from typing import Any, Optional, Tuple __all__ = [ "reorder", @@ -9,7 +9,9 @@ __all__ = [ # Each helper returns (reordered_graph, node_permutation_map): the graph is a live # NetworkX graph, the map is the parsed node_permutation_map.json dict. -def reorder(graph: Any, method: str) -> Tuple[Any, Any]: ... +def reorder( + graph: Any, sort: str = "mlc", key: Optional[str] = None +) -> Tuple[Any, Any]: ... def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: ... def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: ... def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: ... diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index 1f3de68..db777ce 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -9,7 +9,7 @@ use crate::common::{ graph_node_count, networkx_graph_from_bytes, open_output, parse_graph_input, parse_variant, }; -use crate::graph::helpers::reorder_graph_to_bytes; +use crate::graph::helpers::{reorder_graph_to_bytes, resolve_reorder}; use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; use binary_ensemble::io::bundle::writer::BendlAppender; use binary_ensemble::io::bundle::{ @@ -162,34 +162,37 @@ impl PyBendlEncoder { /// Add the `graph.json` known asset. /// - /// `preprocess_method` defaults to `"mlc"`, so by default the graph is reordered for better - /// compression. When it is not `None`, the graph is reordered via the chosen method, both - /// `graph.json` and `node_permutation_map.json` are stored, and the reordered graph is returned - /// (as a NetworkX graph, matching `BendlDecoder.read_graph`) so the chain runs on that - /// ordering. Reordering is pre-stream only. Pass `preprocess_method=None` to store the - /// graph as-is (no permutation map); a raw graph may also be attached post-stream / in - /// append mode. The returned graph's node count is recorded for per-write validation. - #[pyo3(signature = (graph, preprocess_method = Some("mlc".to_string())))] - #[pyo3(text_signature = "(self, graph, preprocess_method='mlc')")] + /// `sort` defaults to `"mlc"`, so by default the graph is reordered for better compression. + /// `sort` is `"mlc"` (multi-level clustering), `"rcm"` (reverse Cuthill-McKee), `"key"` to sort + /// by a node attribute named via `key` (e.g. `key="GEOID"`), or `None` to store the graph + /// as-is. When reordering, both `graph.json` and `node_permutation_map.json` are stored, + /// and the reordered graph is returned (as a NetworkX graph, matching + /// `BendlDecoder.read_graph`) so the chain runs on that ordering. Reordering is pre-stream + /// only; a raw graph (`sort=None`) may also be attached post-stream / in append mode. The + /// returned graph's node count is recorded for per-write validation. + #[pyo3(signature = (graph, sort = Some("mlc".to_string()), key = None))] + #[pyo3(text_signature = "(self, graph, sort='mlc', key=None)")] fn add_graph( &mut self, py: Python<'_>, graph: Bound<'_, PyAny>, - preprocess_method: Option, + sort: Option, + key: Option, ) -> PyResult> { + let plan = resolve_reorder(sort.as_deref(), key.as_deref())?; let graph_bytes = parse_graph_input(py, &graph)?; let opts = AddAssetOptions::defaults().json(); - if let Some(method) = preprocess_method { + if let Some(plan) = plan { // Reordering rewrites the node ordering the chain must write in, so it is pre-stream // only. if !matches!(self.state, BundleState::PreStream { .. }) { return Err(PyException::new_err( - "a reordering add_graph (preprocess_method != None) is only allowed before \ - stream(); post-stream or append-mode graphs must use preprocess_method=None", + "a reordering add_graph (sort != None) is only allowed before stream(); \ + post-stream or append-mode graphs must use sort=None", )); } - let (reordered, map) = reorder_graph_to_bytes(&graph_bytes, &method)?; + let (reordered, map) = reorder_graph_to_bytes(&graph_bytes, &plan)?; let count = graph_node_count(&reordered)?; if let BundleState::PreStream { writer, diff --git a/ben-py/src/graph/helpers.rs b/ben-py/src/graph/helpers.rs index 7565ce1..c013bb7 100644 --- a/ben-py/src/graph/helpers.rs +++ b/ben-py/src/graph/helpers.rs @@ -1,53 +1,104 @@ use binary_ensemble::json::graph::{ sort_json_file_by_key, sort_json_file_by_ordering, GraphOrderingMethod, }; -use pyo3::exceptions::PyException; +use pyo3::exceptions::{PyException, PyValueError}; use pyo3::prelude::*; use serde_json::json; use std::io::Cursor; -/// How a `preprocess_method` / graph-utility method string maps onto reben's reordering machinery. -enum Reorder { +/// A resolved reordering, derived from the `sort` / `key` arguments. +pub enum ReorderPlan { /// A topology-based ordering algorithm, paired with its canonical kebab-case name. Ordering(GraphOrderingMethod, &'static str), - /// A node-attribute key sort (e.g. `"geoid"`, or the special `"id"` for the NetworkX node id). + /// A node-attribute key sort (e.g. `"GEOID"`, or the special `"id"` for the NetworkX node id). Key(String), } -fn classify(method: &str) -> Reorder { - match method { - "mlc" | "multi-level-cluster" => Reorder::Ordering( - GraphOrderingMethod::MultiLevelCluster, - "multi-level-cluster", - ), - "rcm" | "reverse-cuthill-mckee" => Reorder::Ordering( - GraphOrderingMethod::ReverseCuthillMckee, - "reverse-cuthill-mckee", - ), - other => Reorder::Key(other.to_string()), +/// Resolve the `(sort, key)` argument pair into a reordering plan, or `None` to store the graph +/// as-is (`sort=None`). +/// +/// `sort` selects the method: `"mlc"` / `"rcm"` reorder by graph topology, `"key"` sorts by a node +/// attribute (which must be named via `key`), and `None` means no reordering. `key` is only valid +/// with `sort="key"`. +pub fn resolve_reorder(sort: Option<&str>, key: Option<&str>) -> PyResult> { + match sort { + None => { + if key.is_some() { + return Err(PyValueError::new_err( + "key=... requires sort='key'; pass sort='key' to sort by a node attribute", + )); + } + Ok(None) + } + Some("mlc") | Some("multi-level-cluster") => { + reject_key(key, "mlc")?; + Ok(Some(ReorderPlan::Ordering( + GraphOrderingMethod::MultiLevelCluster, + "multi-level-cluster", + ))) + } + Some("rcm") | Some("reverse-cuthill-mckee") => { + reject_key(key, "rcm")?; + Ok(Some(ReorderPlan::Ordering( + GraphOrderingMethod::ReverseCuthillMckee, + "reverse-cuthill-mckee", + ))) + } + Some("key") => { + let key = key.ok_or_else(|| { + PyValueError::new_err( + "sort='key' requires key=... (the node attribute to sort by, e.g. 'GEOID')", + ) + })?; + Ok(Some(ReorderPlan::Key(key.to_string()))) + } + Some(other) => Err(PyValueError::new_err(format!( + "unknown sort {other:?}; use 'mlc', 'rcm', 'key', or None" + ))), } } -/// Reorder a NetworkX adjacency-format graph and emit a `node_permutation_map.json` payload. +fn reject_key(key: Option<&str>, sort: &str) -> PyResult<()> { + if key.is_some() { + return Err(PyValueError::new_err(format!( + "key=... is only valid with sort='key', not sort='{sort}'" + ))); + } + Ok(()) +} + +/// Resolve `(sort, key)` and require an actual reordering (used by callers that have no "store raw" +/// path, e.g. the standalone reorder utility and `relabel_bundle`). +pub fn require_reorder(sort: Option<&str>, key: Option<&str>) -> PyResult { + resolve_reorder(sort, key)?.ok_or_else(|| { + PyValueError::new_err("sort=None has nothing to reorder; pass sort='mlc', 'rcm', or 'key'") + }) +} + +/// Reorder a NetworkX adjacency-format graph per `plan` and emit a `node_permutation_map.json` +/// payload. /// /// Returns `(reordered_graph_bytes, node_permutation_map_bytes)`. The permutation map is a JSON /// object carrying the required `node_permutation_old_to_new` field (original zero-based node /// positions → new positions) plus an optional `key` or `ordering_method` recording how the order /// was produced. The reben file-path fields (`input_file` / `output_file`) are omitted, since the /// Python graph utilities have no such paths. -pub fn reorder_graph_to_bytes(graph_bytes: &[u8], method: &str) -> PyResult<(Vec, Vec)> { +pub fn reorder_graph_to_bytes( + graph_bytes: &[u8], + plan: &ReorderPlan, +) -> PyResult<(Vec, Vec)> { let mut reordered = Vec::new(); - let (map, key_field, ordering_field) = match classify(method) { - Reorder::Ordering(ordering, name) => { + let (map, key_field, ordering_field) = match plan { + ReorderPlan::Ordering(ordering, name) => { let map = - sort_json_file_by_ordering(Cursor::new(graph_bytes), &mut reordered, ordering) + sort_json_file_by_ordering(Cursor::new(graph_bytes), &mut reordered, *ordering) .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; - (map, None::, Some(name)) + (map, None::, Some(*name)) } - Reorder::Key(key) => { - let map = sort_json_file_by_key(Cursor::new(graph_bytes), &mut reordered, &key) + ReorderPlan::Key(key) => { + let map = sort_json_file_by_key(Cursor::new(graph_bytes), &mut reordered, key) .map_err(|e| PyException::new_err(format!("Failed to reorder graph: {e}")))?; - (map, Some(key), None) + (map, Some(key.clone()), None) } }; diff --git a/ben-py/src/graph/py_funcs.rs b/ben-py/src/graph/py_funcs.rs index 347c4f8..636c911 100644 --- a/ben-py/src/graph/py_funcs.rs +++ b/ben-py/src/graph/py_funcs.rs @@ -1,4 +1,4 @@ -use super::helpers::reorder_graph_to_bytes; +use super::helpers::{reorder_graph_to_bytes, require_reorder}; use crate::common::{networkx_graph_from_bytes, parse_graph_input}; use pyo3::exceptions::PyException; use pyo3::prelude::*; @@ -16,20 +16,22 @@ fn json_loads(py: Python<'_>, bytes: &[u8]) -> PyResult> { /// `reordered_graph` is a live NetworkX graph (matching `BendlEncoder.add_graph` / /// `BendlDecoder.read_graph`); `node_permutation_map` is the parsed map JSON. /// -/// `method` selects the ordering: `"multi-level-cluster"` / `"mlc"`, -/// `"reverse-cuthill-mckee"` / `"rcm"`, or a node-attribute key (e.g. `"geoid"`, or the special -/// `"id"` for the NetworkX node id). The permutation map matches the on-disk +/// `sort` selects the ordering: `"mlc"` (multi-level clustering), `"rcm"` (reverse Cuthill-McKee), +/// or `"key"` to sort by a node attribute named via `key` (e.g. `key="GEOID"`, or the special +/// `key="id"` for the NetworkX node id). The permutation map matches the on-disk /// `node_permutation_map.json` convention (a `node_permutation_old_to_new` object). #[pyfunction] -#[pyo3(signature = (graph, method))] -#[pyo3(text_signature = "(graph, method)")] +#[pyo3(signature = (graph, sort = Some("mlc".to_string()), key = None))] +#[pyo3(text_signature = "(graph, sort='mlc', key=None)")] pub fn graph_reorder<'py>( py: Python<'py>, graph: Bound<'py, PyAny>, - method: &str, + sort: Option, + key: Option, ) -> PyResult<(Py, Py)> { + let plan = require_reorder(sort.as_deref(), key.as_deref())?; let graph_bytes = parse_graph_input(py, &graph)?; - let (reordered_bytes, map_bytes) = reorder_graph_to_bytes(&graph_bytes, method)?; + let (reordered_bytes, map_bytes) = reorder_graph_to_bytes(&graph_bytes, &plan)?; Ok(( networkx_graph_from_bytes(py, &reordered_bytes)?, json_loads(py, &map_bytes)?, diff --git a/ben-py/src/relabel.rs b/ben-py/src/relabel.rs index 21db9fd..29bef5a 100644 --- a/ben-py/src/relabel.rs +++ b/ben-py/src/relabel.rs @@ -7,7 +7,7 @@ //! JSON flag. use crate::common::open_output; -use crate::graph::helpers::reorder_graph_to_bytes; +use crate::graph::helpers::{reorder_graph_to_bytes, require_reorder}; use binary_ensemble::io::bundle::format::{ AssignmentFormat, KnownAssetKind, ASSET_FLAG_JSON, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, @@ -79,17 +79,19 @@ fn new_to_old_from_map_bytes(map_bytes: &[u8]) -> PyResult Ok(new_to_old) } -/// Relabel the bundle at `in_file` by reordering its graph via `method`, writing a fresh BEN bundle -/// at `out_file`. +/// Relabel the bundle at `in_file` by reordering its graph (via `sort` / `key`), writing a fresh +/// BEN bundle at `out_file`. #[pyfunction] -#[pyo3(signature = (in_file, out_file, method = "mlc".to_string(), overwrite = false))] -#[pyo3(text_signature = "(in_file, out_file, method='mlc', overwrite=False)")] +#[pyo3(signature = (in_file, out_file, sort = Some("mlc".to_string()), key = None, overwrite = false))] +#[pyo3(text_signature = "(in_file, out_file, sort='mlc', key=None, overwrite=False)")] pub fn relabel_bundle( in_file: PathBuf, out_file: PathBuf, - method: String, + sort: Option, + key: Option, overwrite: bool, ) -> PyResult<()> { + let plan = require_reorder(sort.as_deref(), key.as_deref())?; let file = File::open(&in_file) .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { @@ -126,7 +128,7 @@ pub fn relabel_bundle( .map_err(|e| PyIOError::new_err(format!("Failed to read graph asset: {e}")))?; // Reorder the graph and derive the new->old permutation for the stream. - let (reordered_graph, map_bytes) = reorder_graph_to_bytes(&graph_bytes, &method)?; + let (reordered_graph, map_bytes) = reorder_graph_to_bytes(&graph_bytes, &plan)?; let new_to_old = new_to_old_from_map_bytes(&map_bytes)?; // Carry over every other asset (skip the old graph and any old permutation map; we rewrite diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index 63a4811..a44a48e 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -42,7 +42,7 @@ def test_create_round_trip_all_asset_kinds(tmp_path: Path) -> None: samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(6)] path = tmp_path / "full.bendl" with BendlEncoder(path, overwrite=True) as enc: - returned = enc.add_graph(_graph(), preprocess_method=None) + returned = enc.add_graph(_graph(), sort=None) enc.add_metadata({"seed": 1234}) with enc.stream("ben") as stream: for a in samples: @@ -187,7 +187,7 @@ def test_add_graph_reorder_emits_graph_and_permutation_map(tmp_path: Path) -> No n = _n() path = tmp_path / "reord.bendl" enc = BendlEncoder(path, overwrite=True) - reordered = enc.add_graph(_graph(), preprocess_method="rcm") + reordered = enc.add_graph(_graph(), sort="rcm") with enc.stream("ben") as s: s.write([1] * n) enc.close() @@ -209,7 +209,7 @@ def test_add_graph_reorder_emits_graph_and_permutation_map(tmp_path: Path) -> No def test_add_graph_none_stores_raw_without_permutation_map(tmp_path: Path) -> None: path = tmp_path / "raw.bendl" enc = BendlEncoder(path, overwrite=True) - enc.add_graph(_graph(), preprocess_method=None) + enc.add_graph(_graph(), sort=None) enc.close() dec = BendlDecoder(path) assert dec.asset_names() == ["graph.json"] @@ -217,7 +217,7 @@ def test_add_graph_none_stores_raw_without_permutation_map(tmp_path: Path) -> No def test_add_graph_defaults_to_mlc_reorder(tmp_path: Path) -> None: - # With no preprocess_method, add_graph reorders via MLC and stores a map. + # With no sort given, add_graph reorders via MLC (the default) and stores a map. path = tmp_path / "default.bendl" enc = BendlEncoder(path, overwrite=True) returned = enc.add_graph(_graph()) @@ -231,7 +231,7 @@ def test_add_graph_defaults_to_mlc_reorder(tmp_path: Path) -> None: def test_add_graph_node_count_mismatch_raises(tmp_path: Path) -> None: n = _n() enc = BendlEncoder(tmp_path / "nc.bendl", overwrite=True) - enc.add_graph(_graph(), preprocess_method=None) + enc.add_graph(_graph(), sort=None) with enc.stream("ben") as s: s.write([1] * n) # correct with pytest.raises(ValueError, match="does not match graph node count"): @@ -245,18 +245,18 @@ def test_reorder_add_graph_after_stream_raises_but_raw_succeeds(tmp_path: Path) with enc.stream("ben") as s: s.write([1] * n) with pytest.raises(Exception, match="only allowed before"): - enc.add_graph(_graph(), preprocess_method="rcm") + enc.add_graph(_graph(), sort="rcm") # A raw graph attaches fine post-stream. - enc.add_graph(_graph(), preprocess_method=None) + enc.add_graph(_graph(), sort=None) enc.close() assert BendlDecoder(path).asset_names() == ["graph.json"] def test_duplicate_graph_raises(tmp_path: Path) -> None: enc = BendlEncoder(tmp_path / "dup.bendl", overwrite=True) - enc.add_graph(_graph(), preprocess_method=None) + enc.add_graph(_graph(), sort=None) with pytest.raises(Exception, match="duplicate singleton"): - enc.add_graph(_graph(), preprocess_method=None) + enc.add_graph(_graph(), sort=None) # --------------------------------------------------------------------------- @@ -318,9 +318,9 @@ def test_append_mode_reorder_graph_raises(tmp_path: Path) -> None: s.write([1] * _n()) ap = BendlEncoder.append(path) with pytest.raises(Exception, match="only allowed before"): - ap.add_graph(_graph(), preprocess_method="rcm") + ap.add_graph(_graph(), sort="rcm") # Raw graph append works. - ap.add_graph(_graph(), preprocess_method=None) + ap.add_graph(_graph(), sort=None) ap.close() assert "graph.json" in BendlDecoder(path).asset_names() diff --git a/ben-py/tests/test_graph.py b/ben-py/tests/test_graph.py index 58c111d..bbe3559 100644 --- a/ben-py/tests/test_graph.py +++ b/ben-py/tests/test_graph.py @@ -66,6 +66,34 @@ def test_reorder_by_key_id() -> None: assert pmap["ordering_method"] is None +def test_reorder_sort_key_with_attribute() -> None: + n = _n() + reordered, pmap = g.reorder(_graph(), sort="key", key="county") + _check_consistent(reordered, pmap, n) + assert pmap["key"] == "county" + assert pmap["ordering_method"] is None + + +def test_reorder_sort_key_requires_key() -> None: + with pytest.raises(ValueError, match="sort='key' requires key"): + g.reorder(_graph(), sort="key") + + +def test_reorder_key_without_sort_key_raises() -> None: + with pytest.raises(ValueError, match="only valid with sort='key'"): + g.reorder(_graph(), sort="mlc", key="county") + + +def test_reorder_unknown_sort_raises() -> None: + with pytest.raises(ValueError, match="unknown sort"): + g.reorder(_graph(), sort="county") # a key must go through sort="key" + + +def test_reorder_none_sort_raises() -> None: + with pytest.raises(ValueError, match="nothing to reorder"): + g.reorder(_graph(), sort=None) + + def test_reorder_accepts_bytes_and_path() -> None: n = _n() raw = EXAMPLE_GRAPH.read_bytes() diff --git a/ben-py/tests/test_recompress.py b/ben-py/tests/test_recompress.py index e2c2de7..3da797d 100644 --- a/ben-py/tests/test_recompress.py +++ b/ben-py/tests/test_recompress.py @@ -26,7 +26,7 @@ def _build_ben_bundle(path: Path): n = len(_graph()["nodes"]) samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(8)] with BendlEncoder(path, overwrite=True) as enc: - enc.add_graph(_graph(), preprocess_method="rcm") + enc.add_graph(_graph(), sort="rcm") enc.add_metadata({"seed": 99}) with enc.stream("ben") as s: for a in samples: diff --git a/ben-py/tests/test_relabel.py b/ben-py/tests/test_relabel.py index 24ad5a4..7754bcb 100644 --- a/ben-py/tests/test_relabel.py +++ b/ben-py/tests/test_relabel.py @@ -36,7 +36,7 @@ def _build_ben_bundle(path: Path, with_graph: bool = True): samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(8)] enc = BendlEncoder(path, overwrite=True) if with_graph: - enc.add_graph(_graph(), preprocess_method=None) # store in raw order + enc.add_graph(_graph(), sort=None) # store in raw order enc.add_metadata({"seed": 99}) with enc.stream("ben") as s: for a in samples: @@ -55,7 +55,7 @@ def test_relabel_out_file_is_lossless_and_preserves_assets(tmp_path: Path) -> No out = tmp_path / "out.bendl" samples = _build_ben_bundle(src) - relabel_bundle(src, out_file=out, method="mlc") + relabel_bundle(src, out_file=out, sort="mlc") dec = BendlDecoder(out) # Stays BEN, same sample count, canonical graph + permutation map present. @@ -87,7 +87,7 @@ def test_relabel_in_place(tmp_path: Path) -> None: src = tmp_path / "in.bendl" samples = _build_ben_bundle(src) - relabel_bundle(src, in_place=True, method="rcm") + relabel_bundle(src, in_place=True, sort="rcm") dec = BendlDecoder(src) assert dec.assignment_format() == "ben" @@ -102,6 +102,21 @@ def test_relabel_in_place(tmp_path: Path) -> None: assert [_depermute(p, old_to_new) for p in dec] == samples +def test_relabel_by_key(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + out = tmp_path / "out.bendl" + samples = _build_ben_bundle(src) + + relabel_bundle(src, out_file=out, sort="key", key="county") + + dec = BendlDecoder(out) + pmap = dec.read_node_permutation_map() + assert pmap["key"] == "county" + assert pmap["ordering_method"] is None + old_to_new = {int(k): v for k, v in pmap["node_permutation_old_to_new"].items()} + assert [_depermute(p, old_to_new) for p in dec] == samples + + def test_relabel_arg_validation(tmp_path: Path) -> None: src = tmp_path / "in.bendl" _build_ben_bundle(src) @@ -109,6 +124,8 @@ def test_relabel_arg_validation(tmp_path: Path) -> None: relabel_bundle(src) with pytest.raises(ValueError, match="not both"): relabel_bundle(src, out_file=tmp_path / "o.bendl", in_place=True) + with pytest.raises(ValueError, match="sort='key' requires key"): + relabel_bundle(src, out_file=tmp_path / "o.bendl", sort="key") def test_relabel_requires_graph(tmp_path: Path) -> None: From 70caba8b7241491deec83a887360c97409e883ef Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:12:58 -0600 Subject: [PATCH 137/221] update cli to use dualgraph rather than shapefile --- ben/src/cli/reben/args.rs | 11 ++++++----- ben/src/cli/reben/ben_mode.rs | 20 +++++++++---------- ben/src/cli/reben/tests.rs | 36 ++++++++++++++++++++++++++--------- ben/tests/test_cli.rs | 28 +++++++++++++-------------- 4 files changed, 57 insertions(+), 38 deletions(-) diff --git a/ben/src/cli/reben/args.rs b/ben/src/cli/reben/args.rs index 8c8021b..7043a96 100644 --- a/ben/src/cli/reben/args.rs +++ b/ben/src/cli/reben/args.rs @@ -39,7 +39,6 @@ pub(super) enum BenCliVariant { version )] /// Defines the command line arguments accepted by the program. -// TODO: Change the name of shape_file to dual_graph_file. pub(super) struct Args { /// Input file to read from. #[arg()] @@ -53,10 +52,12 @@ pub(super) struct Args { /// Topology-based ordering method to use instead of a key sort. #[arg(long, value_enum)] pub ordering: Option, - /// Shape file to use for sorting the BEN file. Only needed in BEN mode when a map is not - /// provided. - #[arg(short, long)] - pub shape_file: Option, + /// Dual-graph (JSON) file to use for sorting the BEN file. Only needed in BEN mode when a map + /// is not provided. + // `shape-file` is the former name of this flag, kept as a hidden alias for backward + // compatibility. + #[arg(short = 'd', long = "dualgraph", alias = "shape-file")] + pub dual_graph: Option, /// Map file to use for relabeling the BEN file. #[arg(short = 'p', long)] pub map_file: Option, diff --git a/ben/src/cli/reben/ben_mode.rs b/ben/src/cli/reben/ben_mode.rs index aa5eb50..7c6ef5a 100644 --- a/ben/src/cli/reben/ben_mode.rs +++ b/ben/src/cli/reben/ben_mode.rs @@ -75,34 +75,34 @@ pub(super) fn run_ben_mode(args: Args) -> Result<(), String> { let mut map_file_name = String::new(); if args.key.is_some() || args.ordering.is_some() { - let shape = args.shape_file.as_ref().ok_or_else(|| { - "No shape file provided to go with the requested ordering.".to_string() + let dual_graph = args.dual_graph.as_ref().ok_or_else(|| { + "No dual-graph file provided to go with the requested ordering.".to_string() })?; let label = relabeling_label(args.key.as_deref(), args.ordering.as_ref())?; tracing::trace!("Creating map file for ordering: {}", label); - let output_file_name = shape.trim_end_matches(".json").to_owned() + let output_file_name = dual_graph.trim_end_matches(".json").to_owned() + format!("_sorted_by_{}.json", label).as_str(); let output_file = File::create(&output_file_name) .map_err(|e| format!("Could not create output file {output_file_name:?}: {e}"))?; let writer = BufWriter::new(output_file); - let shape_file = - File::open(shape).map_err(|e| format!("Could not open shape file {shape:?}: {e}"))?; - let shape_reader = BufReader::new(shape_file); + let dual_graph_file = File::open(dual_graph) + .map_err(|e| format!("Could not open dual-graph file {dual_graph:?}: {e}"))?; + let dual_graph_reader = BufReader::new(dual_graph_file); let map = if let Some(key) = args.key.as_ref() { - sort_json_file_by_key(shape_reader, writer, key) + sort_json_file_by_key(dual_graph_reader, writer, key) } else { let ordering = args .ordering .as_ref() .ok_or_else(|| "Provide either --key or --ordering.".to_string())?; - sort_json_file_by_ordering(shape_reader, writer, to_graph_ordering(ordering)) + sort_json_file_by_ordering(dual_graph_reader, writer, to_graph_ordering(ordering)) } - .map_err(|e| format!("Could not sort shape file: {e}"))?; + .map_err(|e| format!("Could not sort dual-graph file: {e}"))?; - map_file_name = shape.trim_end_matches(".json").to_owned() + map_file_name = dual_graph.trim_end_matches(".json").to_owned() + format!("_sorted_by_{}", label).as_str() + "_map.json"; let map_file = File::create(&map_file_name) diff --git a/ben/src/cli/reben/tests.rs b/ben/src/cli/reben/tests.rs index 026e7fb..b294c83 100644 --- a/ben/src/cli/reben/tests.rs +++ b/ben/src/cli/reben/tests.rs @@ -26,10 +26,28 @@ fn clap_metadata_uses_package_version() { assert_eq!(command.get_version(), Some(env!("CARGO_PKG_VERSION"))); assert!(help.contains("Relabeling Binary Ensemble CLI Tool")); - assert!(help.contains("--shape-file")); + assert!(help.contains("--dualgraph")); + // `--shape-file` is a hidden alias: it works but does not appear in help. + assert!(!help.contains("--shape-file")); assert!(help.contains("canonicalize")); } +#[test] +fn shape_file_is_accepted_as_hidden_alias_for_dualgraph() { + let args = Args::try_parse_from([ + "reben", + "input.jsonl.ben", + "--mode", + "ben", + "--key", + "GEOID20", + "--shape-file", + "graph.json", + ]) + .unwrap(); + assert_eq!(args.dual_graph.as_deref(), Some("graph.json")); +} + #[test] fn parse_json_mode_args() { let args = Args::try_parse_from([ @@ -317,7 +335,7 @@ fn run_ben_mode_with_output_variant_and_n_items() { #[test] fn run_ben_mode_with_shape_file_and_ordering() { - // Covers the shape_file + ordering path. Creates a map from the shape file ordering, then + // Covers the dual-graph + ordering path. Creates a map from the dual-graph ordering, then // relabels the BEN. let input = write_temp_ben("shape_order_input.jsonl.ben"); let shape = unique_path("shape_order_shape.json"); @@ -332,7 +350,7 @@ fn run_ben_mode_with_shape_file_and_ordering() { input.to_str().unwrap(), "--mode", "ben", - "--shape-file", + "--dualgraph", shape.to_str().unwrap(), "--ordering", "reverse-cuthill-mckee", @@ -605,7 +623,7 @@ fn run_json_mode_with_key_happy_path() { #[test] fn run_ben_mode_with_key_and_shape_happy_path() { - // Exercise the --key + --shape-file branch of run_ben_mode (lines 76-123 of ben_mode.rs): + // Exercise the --key + --dualgraph branch of run_ben_mode (lines 76-123 of ben_mode.rs): // sort by key, generate a map file, then permute the BEN stream by that map. The existing // tests cover the no-map/no-key path and the --map-file path; this is the gap. let input = write_temp_ben("ben_mode_key_input.jsonl.ben"); @@ -619,7 +637,7 @@ fn run_ben_mode_with_key_and_shape_happy_path() { "ben", "--key", "GEOID20", - "--shape-file", + "--dualgraph", shape.to_str().unwrap(), "--output-file", out.to_str().unwrap(), @@ -651,7 +669,7 @@ fn run_ben_mode_with_ordering_and_shape_happy_path() { "ben", "--ordering", "reverse-cuthill-mckee", - "--shape-file", + "--dualgraph", shape.to_str().unwrap(), "--output-file", out.to_str().unwrap(), @@ -693,8 +711,8 @@ fn run_ben_mode_rejects_map_file_combined_with_key() { } #[test] -fn run_ben_mode_rejects_key_without_shape_file() { - // The shape-file presence guard (ben_mode.rs line 78-80). +fn run_ben_mode_rejects_key_without_dual_graph() { + // The dual-graph presence guard (ben_mode.rs line 78-80). let input = write_temp_ben("key_no_shape_input.jsonl.ben"); let args = Args::try_parse_from([ "reben", @@ -707,5 +725,5 @@ fn run_ben_mode_rejects_key_without_shape_file() { .unwrap(); let err = run_ben_mode(args).unwrap_err(); let _ = fs::remove_file(&input); - assert!(err.contains("shape file"), "got: {err}"); + assert!(err.contains("dual-graph file"), "got: {err}"); } diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 31f75ec..01e4313 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1358,9 +1358,9 @@ fn reben_cli_can_canonicalize_into_a_different_ben_variant() { } #[test] -fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations() { +fn reben_cli_generates_map_from_dual_graph_and_reports_invalid_flag_combinations() { let temp = TempDir::new("reben-more"); - let graph_path = temp.path().join("shape.json"); + let graph_path = temp.path().join("dualgraph.json"); let ben_path = temp.path().join("samples.jsonl.ben"); let relabeled_path = temp.path().join("rekeyed.ben"); @@ -1382,7 +1382,7 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations "ben", "--key", "GEOID20", - "--shape-file", + "--dualgraph", graph_path.to_str().unwrap(), "--output-file", relabeled_path.to_str().unwrap(), @@ -1392,11 +1392,11 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations assert_success(&relabel); assert!(temp .path() - .join("shape_sorted_by_GEOID20_map.json") + .join("dualgraph_sorted_by_GEOID20_map.json") .exists()); - let generated_graph = temp.path().join("shape_sorted_by_GEOID20.json"); - let generated_map = temp.path().join("shape_sorted_by_GEOID20_map.json"); + let generated_graph = temp.path().join("dualgraph_sorted_by_GEOID20.json"); + let generated_map = temp.path().join("dualgraph_sorted_by_GEOID20_map.json"); let both = run( "reben", &[ @@ -1405,7 +1405,7 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations "ben", "--key", "GEOID20", - "--shape-file", + "--dualgraph", graph_path.to_str().unwrap(), "--map-file", generated_map.to_str().unwrap(), @@ -1416,7 +1416,7 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations assert!(String::from_utf8_lossy(&both.stderr) .contains("Cannot provide both a map file and a sorting option")); - let missing_shape = run( + let missing_dual_graph = run( "reben", &[ ben_path.to_str().unwrap(), @@ -1427,8 +1427,8 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations ], temp.path(), ); - assert_failure(&missing_shape); - assert!(String::from_utf8_lossy(&missing_shape.stderr).contains("No shape file provided")); + assert_failure(&missing_dual_graph); + assert!(String::from_utf8_lossy(&missing_dual_graph.stderr).contains("No dual-graph file provided")); let sorted_json: Value = serde_json::from_str(&fs::read_to_string(generated_graph).unwrap()).unwrap(); @@ -1438,7 +1438,7 @@ fn reben_cli_generates_map_from_shape_file_and_reports_invalid_flag_combinations #[test] fn reben_cli_supports_rcm_ordering() { let temp = TempDir::new("reben-orderings"); - let graph_path = temp.path().join("shape.json"); + let graph_path = temp.path().join("dualgraph.json"); let rcm_path = temp.path().join("rcm.json"); fs::write(&graph_path, sample_graph()).unwrap(); @@ -1459,7 +1459,7 @@ fn reben_cli_supports_rcm_ordering() { assert_success(&rcm); assert!(temp .path() - .join("shape_sorted_by_reverse-cuthill-mckee_map.json") + .join("dualgraph_sorted_by_reverse-cuthill-mckee_map.json") .exists()); let rcm_json: Value = serde_json::from_str(&fs::read_to_string(&rcm_path).unwrap()).unwrap(); @@ -1469,7 +1469,7 @@ fn reben_cli_supports_rcm_ordering() { #[test] fn reben_cli_supports_multi_level_cluster_ordering() { let temp = TempDir::new("reben-mlc"); - let graph_path = temp.path().join("shape.json"); + let graph_path = temp.path().join("dualgraph.json"); let mlc_path = temp.path().join("mlc.json"); fs::write(&graph_path, sample_graph()).unwrap(); @@ -1490,7 +1490,7 @@ fn reben_cli_supports_multi_level_cluster_ordering() { assert_success(&mlc); assert!(temp .path() - .join("shape_sorted_by_multi-level-cluster_map.json") + .join("dualgraph_sorted_by_multi-level-cluster_map.json") .exists()); let mlc_json: Value = serde_json::from_str(&fs::read_to_string(&mlc_path).unwrap()).unwrap(); From 1712c3b2ffcf45725bb91173bb3a61736d95c23a Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Sat, 6 Jun 2026 09:53:11 -0600 Subject: [PATCH 138/221] major docs overhaul --- .github/workflows/docs.yml | 46 + .readthedocs.yaml | 3 + Taskfile.yml | 40 +- ben-py/README.md | 88 +- ben-py/binary_ensemble/bundle.py | 2 +- ben-py/docs/.gitignore | 7 + ben-py/docs/_static/css/custom.css | 123 +- ben-py/docs/_static/js/palette-switcher.js | 224 +++ ben-py/docs/api/bundle.md | 34 + ben-py/docs/api/codec.md | 25 + ben-py/docs/api/graph.md | 17 + ben-py/docs/api/index.md | 56 + ben-py/docs/api/stream.md | 19 + ben-py/docs/concepts/api-map.md | 48 + ben-py/docs/concepts/compression.md | 150 ++ ben-py/docs/concepts/formats.md | 69 + ben-py/docs/concepts/overview.md | 67 + ben-py/docs/concepts/variants.md | 57 + ben-py/docs/concepts/vocabulary.md | 74 + ben-py/docs/conf.py | 490 ++++- ben-py/docs/getting-started/installation.md | 60 + ben-py/docs/getting-started/quickstart.md | 103 ++ ben-py/docs/how-to/compress-gerrychain-run.md | 81 + ben-py/docs/how-to/convert-formats.md | 74 + .../docs/how-to/custom-assets-and-append.md | 63 + ben-py/docs/how-to/index.md | 102 + ben-py/docs/how-to/read-and-iterate.md | 72 + ben-py/docs/how-to/shrink-for-sharing.md | 59 + ben-py/docs/how-to/subsample.md | 56 + ben-py/docs/index.md | 158 ++ ben-py/docs/index.rst | 35 - ben-py/docs/user/.gitignore | 3 +- ben-py/docs/user/using_ben_py.ipynb | 789 ++++---- ben-py/docs/user/using_bendl.ipynb | 1276 +++++++++++++ ben-py/pyproject.toml | 22 +- ben-py/src/decode/bundle_decoder.rs | 112 +- ben-py/src/decode/decoder.rs | 91 +- ben-py/src/decode/py_funcs.rs | 36 + ben-py/src/encode/bundle_encoder.rs | 28 +- ben-py/src/encode/encoder.rs | 37 +- ben-py/src/encode/py_funcs.rs | 52 + ben-py/tests/data/gerrymandria.json | 1641 +++++++++++++++++ ben-py/tests/test_bundle_api.py | 8 +- ben-py/tests/test_docs_snippets.py | 90 + ben-py/tests/test_graph.py | 8 +- ben-py/tests/test_recompress.py | 8 +- ben-py/tests/test_relabel.py | 8 +- ben-py/uv.lock | 269 +-- ben/tests/test_cli.rs | 4 +- docs/ben-format-spec.md | 256 +++ docs/bendl-format-spec.md | 277 +-- docs/bendl-implementation-plan.md | 261 --- docs/bendl-roadmap.md | 175 -- docs/coding-standards.md | 273 +++ docs/format-stability.md | 94 + docs/glossary.md | 362 ++++ docs/twodelta-format-spec.md | 272 +++ 57 files changed, 7602 insertions(+), 1352 deletions(-) create mode 100644 .github/workflows/docs.yml create mode 100644 ben-py/docs/.gitignore create mode 100644 ben-py/docs/_static/js/palette-switcher.js create mode 100644 ben-py/docs/api/bundle.md create mode 100644 ben-py/docs/api/codec.md create mode 100644 ben-py/docs/api/graph.md create mode 100644 ben-py/docs/api/index.md create mode 100644 ben-py/docs/api/stream.md create mode 100644 ben-py/docs/concepts/api-map.md create mode 100644 ben-py/docs/concepts/compression.md create mode 100644 ben-py/docs/concepts/formats.md create mode 100644 ben-py/docs/concepts/overview.md create mode 100644 ben-py/docs/concepts/variants.md create mode 100644 ben-py/docs/concepts/vocabulary.md create mode 100644 ben-py/docs/getting-started/installation.md create mode 100644 ben-py/docs/getting-started/quickstart.md create mode 100644 ben-py/docs/how-to/compress-gerrychain-run.md create mode 100644 ben-py/docs/how-to/convert-formats.md create mode 100644 ben-py/docs/how-to/custom-assets-and-append.md create mode 100644 ben-py/docs/how-to/index.md create mode 100644 ben-py/docs/how-to/read-and-iterate.md create mode 100644 ben-py/docs/how-to/shrink-for-sharing.md create mode 100644 ben-py/docs/how-to/subsample.md create mode 100644 ben-py/docs/index.md delete mode 100644 ben-py/docs/index.rst create mode 100644 ben-py/docs/user/using_bendl.ipynb create mode 100644 ben-py/tests/data/gerrymandria.json create mode 100644 ben-py/tests/test_docs_snippets.py create mode 100644 docs/ben-format-spec.md delete mode 100644 docs/bendl-implementation-plan.md delete mode 100644 docs/bendl-roadmap.md create mode 100644 docs/coding-standards.md create mode 100644 docs/format-stability.md create mode 100644 docs/glossary.md create mode 100644 docs/twodelta-format-spec.md diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..3836188 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,46 @@ +name: Docs + +on: + push: + branches: [main] + pull_request: + workflow_dispatch: + +jobs: + build-docs: + name: Build & execute docs + runs-on: ubuntu-latest + defaults: + run: + working-directory: ben-py + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Set up Python + run: uv python install 3.12 + + # Builds the PyO3 extension (so autodoc reads live docstrings) and installs the + # render + execution doc dependencies. + - name: Install docs dependencies + run: uv sync --no-dev --extra docs --extra docs-exec + + # NB_EXECUTION_MODE=cache executes every tutorial notebook end to end; -W turns + # any Sphinx warning (including a notebook cell error) into a build failure, so + # the docs cannot drift from the live API without CI catching it. + - name: Build docs (execute notebooks, warnings as errors) + env: + NB_EXECUTION_MODE: cache + run: uv run sphinx-build -W -b dirhtml docs docs/_build + + # Run every Python code block in the Markdown docs against the live API. + - name: Test documentation code snippets + run: uv run --with pytest pytest tests/test_docs_snippets.py -q + + - name: Upload built site + uses: actions/upload-artifact@v4 + with: + name: docs-html + path: ben-py/docs/_build diff --git a/.readthedocs.yaml b/.readthedocs.yaml index fb83822..cddb360 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -16,3 +16,6 @@ python: sphinx: builder: dirhtml configuration: ben-py/docs/conf.py + # Notebook execution stays off here (NB_EXECUTION_MODE defaults to "off"), so the + # hosted build renders the committed notebook outputs. CI executes the notebooks. + fail_on_warning: true diff --git a/Taskfile.yml b/Taskfile.yml index 31f5584..fb0892e 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -324,6 +324,38 @@ tasks: fi; ' + docs: + desc: Build the Python docs site (renders committed notebook outputs) + dir: ben-py + cmds: + # -E -a forces a full rebuild (re-read all sources, rewrite every page). Needed + # because the palette/switcher state and asset cache-busters are injected by conf.py + # and don't invalidate Sphinx's incremental cache — an incremental build would serve + # stale inline JS / cached CSS (e.g. the switcher reappearing after DOCS_SWITCHER changes). + - uv run --extra docs sphinx-build -E -a -W -b dirhtml docs docs/_build + - 'echo "Docs built -> ben-py/docs/_build/index.html"' + + docs-exec: + desc: Build the docs and execute every tutorial notebook (CI-equivalent) + dir: ben-py + env: + NB_EXECUTION_MODE: cache + cmds: + - uv run --extra docs --extra docs-exec sphinx-build -W -b dirhtml docs docs/_build + + docs-serve: + desc: Build the docs and serve them at http://localhost:8000 + dir: ben-py + cmds: + - task: docs + - uv run python -m http.server 8000 --directory docs/_build + + docs-test: + desc: Execute every Python code block in the Markdown docs + dir: ben-py + cmds: + - uv run pytest tests/test_docs_snippets.py + clean-linux: &clean-unix desc: Clean build artifacts internal: true @@ -334,7 +366,7 @@ tasks: - rm -rf ben-py/binary_ensemble/*abi3.so ben-py/binary_ensemble/*.pyd - rm -rf ben-py/.venv - find . -type d -name "__pycache__" -exec rm -rf {} + - - rm -rf docs/_build docs/user/example_data + - rm -rf ben-py/docs/_build ben-py/docs/jupyter_execute ben-py/docs/_generated ben-py/docs/user/example_data clean-darwin: *clean-unix @@ -349,8 +381,10 @@ tasks: - cmd /c "if exist ben-py\\binary_ensemble.egg-info rmdir /s /q ben-py\\binary_ensemble.egg-info" - cmd /c "if exist ben-py\\.venv rmdir /s /q ben-py\\.venv" - powershell -NoProfile -Command "Get-ChildItem -Path . -Directory -Filter __pycache__ -Recurse | Remove-Item -Recurse -Force" - - cmd /c "if exist docs\\_build rmdir /s /q docs\\_build" - - cmd /c "if exist docs\\user\\example_data rmdir /s /q docs\\user\\example_data" + - cmd /c "if exist ben-py\\docs\\_build rmdir /s /q ben-py\\docs\\_build" + - cmd /c "if exist ben-py\\docs\\jupyter_execute rmdir /s /q ben-py\\docs\\jupyter_execute" + - cmd /c "if exist ben-py\\docs\\_generated rmdir /s /q ben-py\\docs\\_generated" + - cmd /c "if exist ben-py\\docs\\user\\example_data rmdir /s /q ben-py\\docs\\user\\example_data" clean: desc: Clean build artifacts diff --git a/ben-py/README.md b/ben-py/README.md index 935cdb5..6848796 100755 --- a/ben-py/README.md +++ b/ben-py/README.md @@ -1,11 +1,81 @@ -# Py-BEN +# binary-ensemble -BEN (short for Binary-Ensemble) is a compression algorithm designed for efficient storage and access -of ensembles of districting plans, and was designed to work primarily as a companion to the -GerrySuite collection of packages (GerryChain, GerryTools, FRCW) and to also be compatible with -other ensemble generators (e.g. ForestRecom, Sequential Monte Carlo [SMC]). +[![PyPI](https://img.shields.io/pypi/v/binary-ensemble.svg)](https://pypi.org/project/binary-ensemble/) +[![Python versions](https://img.shields.io/pypi/pyversions/binary-ensemble.svg)](https://pypi.org/project/binary-ensemble/) +[![Documentation](https://img.shields.io/readthedocs/binary-ensemble.svg)](https://binary-ensemble.readthedocs.io/) +[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE) -This is a package containing some Python bindings for the for the -[Binary-Ensemble](https://crates.io/crates/binary-ensemble) Rust library. In particular, this -package provides some easy tools for compressing and decompressing ensembles of districting plans, -as well as some utilities for working with ensembles stored in the BEN and XBEN formats. +**Compress, store, and stream massive ensembles of districting plans.** + +Redistricting samplers like [GerryChain](https://gerrychain.readthedocs.io)'s ReCom, +ForestReCom, and Sequential Monte Carlo emit millions of plans. Stored as JSONL, a single +ensemble can run to *tens of gigabytes* — most of it redundant. **BEN** (Binary-Ensemble) is +a compression format and toolkit built for exactly this data: it turns those JSONL mountains +into compact binary files you can store, share, and stream sample-by-sample without unpacking +the whole thing. + +`binary-ensemble` is the Python interface to the +[binary-ensemble](https://crates.io/crates/binary-ensemble) Rust library. + +> A real 100k-plan ensemble on Colorado's ~140k census blocks is **27 GB** as JSONL. +> Reordered by `GEOID20` it compresses to a **~550 MB** BEN stream, and then to a **~6 MB** +> XBEN file — over a **4500× reduction**, fully lossless. + +## Install + +```bash +pip install binary-ensemble +``` + +Requires Python 3.11+. Pre-built wheels are available for Linux, macOS, and Windows. + +## Quick example + +Write an ensemble into one self-describing `.bendl` bundle, then read it back: + +```python +from binary_ensemble import BendlEncoder, BendlDecoder + +plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] + +# The stream context finalizes the bundle when it closes. +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_metadata({"sampler": "demo", "seed": 1234}) +with encoder.stream("ben") as stream: + for assignment in plans: + stream.write(assignment) + +# Iterate the assignments straight back out, one at a time. +for assignment in BendlDecoder("ensemble.bendl"): + print(assignment) +``` + +Already have JSONL files? Convert whole files in one call: + +```python +from binary_ensemble import encode_jsonl_to_ben, encode_ben_to_xben + +encode_jsonl_to_ben("plans.jsonl", "plans.ben") # fast working format +encode_ben_to_xben("plans.ben", "plans.xben") # smallest, for storage +``` + +## Documentation + +Full docs are at **[binary-ensemble.readthedocs.io](https://binary-ensemble.readthedocs.io/)**: + +- [Quickstart](https://binary-ensemble.readthedocs.io/getting-started/quickstart/) — your first ensemble in a few lines. +- [Concepts](https://binary-ensemble.readthedocs.io/concepts/overview/) — dual graphs, the BEN/XBEN/BENDL formats, encoding variants, and the compression levers. +- [How-to guides](https://binary-ensemble.readthedocs.io/how-to/) — compress a GerryChain run, subsample, convert formats, shrink a bundle for sharing. +- [API reference](https://binary-ensemble.readthedocs.io/api/) — every public class and function. + +## Command-line tools + +The same engine ships as the `ben`, `reben`, `bendl`, and `pcben` CLI tools via Cargo: + +```bash +cargo install binary-ensemble +``` + +## License + +MIT — see [LICENSE](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE). diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 933b8ce..a59a41d 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -8,7 +8,7 @@ Typical write:: with BendlEncoder(path, overwrite=True) as enc: - enc.add_graph(graph, preprocess_method="rcm") # None => store raw + enc.add_graph(graph, sort="rcm") # sort=None => store raw enc.add_metadata({"seed": 1234}) with enc.stream("ben") as stream: for assignment in chain: diff --git a/ben-py/docs/.gitignore b/ben-py/docs/.gitignore new file mode 100644 index 0000000..c914451 --- /dev/null +++ b/ben-py/docs/.gitignore @@ -0,0 +1,7 @@ +# Sphinx build output and MyST-NB execution artifacts. +_build/ +jupyter_execute/ +.jupyter_cache/ + +# Generated by conf.py: per-theme dark Pygments stylesheet (see CODE_THEMES). +_generated/ diff --git a/ben-py/docs/_static/css/custom.css b/ben-py/docs/_static/css/custom.css index c231962..3579259 100644 --- a/ben-py/docs/_static/css/custom.css +++ b/ben-py/docs/_static/css/custom.css @@ -1,33 +1,100 @@ +/* Project-specific tweaks layered on top of the Furo theme. */ + +/* Call-to-action button used on the landing page (e.g. "Get started"). */ .download-badge { - display: inline-block; - padding: 8px 15px; - margin: 10px; - background-color: #0099CD; - color: white; - text-decoration: none; - border-radius: 5px; - font-size: 16px; - border: none; - cursor: pointer; - outline: none; -} -.download-badge:hover, .download-badge:focus { - background-color: #2980B9; - outline: none; + display: inline-block; + padding: 8px 15px; + margin: 10px 10px 10px 0; + background-color: var(--color-brand-primary); + color: #ffffff !important; + text-decoration: none; + border-radius: 6px; + font-weight: 600; + border: none; + cursor: pointer; } - -.center-container { - text-align: center; - width: 100%; +.download-badge:hover, +.download-badge:focus { + filter: brightness(1.1); + outline: none; } +/* Navigation cards (sphinx-design grid-item-cards, e.g. the "Where to next" grid). Furo + leaves them near-flat — a transparent background in light mode and a 5%-opacity shadow + — so they blend into the page. Give them a real surface, a visible shadow, and a clear + hover lift. Colors come from Furo variables, so they track the active palette and + light/dark mode. The shadow utility class uses !important, so these must too. */ +.sd-card { + background-color: var(--color-background-secondary) !important; + border: 1px solid var(--color-background-border, rgba(128, 128, 128, 0.3)) !important; + border-radius: 10px !important; + box-shadow: 0 4px 14px rgba(0, 0, 0, 0.2) !important; + transition: + transform 0.15s ease, + box-shadow 0.15s ease, + border-color 0.15s ease !important; +} +.sd-card:hover { + transform: translateY(-3px) !important; + border-color: var(--color-brand-primary) !important; + box-shadow: 0 8px 20px rgba(0, 0, 0, 0.22) !important; +} -table { - width: 100%; - border-collapse: collapse; -} -th, td { - border: 1px solid #DDDDDD; - padding: 8px; - text-align: center; -} \ No newline at end of file +/* Live palette + code-theme switcher (added by js/palette-switcher.js). */ +.palette-switcher-bar { + position: fixed; + bottom: 1rem; + right: 1rem; + z-index: 30; + display: flex; + flex-direction: column; + align-items: flex-end; + gap: 0.4rem; +} +.palette-switcher { + display: flex; + align-items: center; + gap: 0.4rem; + padding: 0.3rem 0.55rem; + border-radius: 8px; + background: var(--color-background-secondary); + border: 1px solid var(--color-background-border, rgba(128, 128, 128, 0.3)); + box-shadow: 0 1px 6px rgba(0, 0, 0, 0.15); + font-size: 0.8rem; +} +.palette-switcher__icon { + line-height: 1; +} +.palette-switcher select { + border: none; + /* The populated by `fill(select)`. + function makeControl(icon, ariaLabel, fill, onChange) { + var wrap = document.createElement("div"); + wrap.className = "palette-switcher"; + + var label = document.createElement("span"); + label.className = "palette-switcher__icon"; + label.textContent = icon; + label.setAttribute("aria-hidden", "true"); + + var select = document.createElement("select"); + select.setAttribute("aria-label", ariaLabel); + fill(select); + select.addEventListener("change", function () { + onChange(select.value); + }); + + wrap.appendChild(label); + wrap.appendChild(select); + return wrap; + } + + function buildControls(paletteName) { + var bar = document.createElement("div"); + bar.className = "palette-switcher-bar"; + + if (paletteNames.length >= 2) { + bar.appendChild( + makeControl( + "🎨", + "Color palette", + function (select) { + paletteNames.forEach(function (n) { + select.appendChild(option(n, n, paletteName)); + }); + }, + function (value) { + applyPalette(value); + write(PALETTE_KEY, value); + } + ) + ); + } + + if (codeMenu.length >= 1) { + var selectedCode = storedCode(); + bar.appendChild( + makeControl( + "", + "Code theme", + function (select) { + select.appendChild(option("", "Auto", selectedCode)); // "" = follow palette + Object.keys(CODE_THEMES).forEach(function (group) { + var og = document.createElement("optgroup"); + og.label = group; + CODE_THEMES[group].forEach(function (style) { + og.appendChild(option(style, style, selectedCode)); + }); + select.appendChild(og); + }); + }, + function (value) { + write(CODE_KEY, value); + applyCode(currentPalette()); + } + ) + ); + } + + document.body.appendChild(bar); + } + + // ---- init ---- + + var name = currentPalette(); + applyPalette(name); // always paint the palette + default code themes; early to limit flash + + if (!SHOW_SWITCHER) return; // published build: locked to the active palette, no controls + + if (document.readyState === "loading") { + document.addEventListener("DOMContentLoaded", function () { + buildControls(name); + }); + } else { + buildControls(name); + } +})(); diff --git a/ben-py/docs/api/bundle.md b/ben-py/docs/api/bundle.md new file mode 100644 index 0000000..73181be --- /dev/null +++ b/ben-py/docs/api/bundle.md @@ -0,0 +1,34 @@ +# `binary_ensemble.bundle` + +```{eval-rst} +.. automodule:: binary_ensemble.bundle +``` + +## Encoder + +```{eval-rst} +.. autoclass:: binary_ensemble.bundle.BendlEncoder + :members: +``` + +## The stream session + +```{eval-rst} +.. autoclass:: binary_ensemble._core.BendlStreamSession + :members: +``` + +## Decoder + +```{eval-rst} +.. autoclass:: binary_ensemble.bundle.BendlDecoder + :members: +``` + +## Whole-bundle transforms + +```{eval-rst} +.. autofunction:: binary_ensemble.bundle.compress_stream + +.. autofunction:: binary_ensemble.bundle.relabel_bundle +``` diff --git a/ben-py/docs/api/codec.md b/ben-py/docs/api/codec.md new file mode 100644 index 0000000..f26718c --- /dev/null +++ b/ben-py/docs/api/codec.md @@ -0,0 +1,25 @@ +# `binary_ensemble.codec` + +```{eval-rst} +.. automodule:: binary_ensemble.codec +``` + +## Encoders + +```{eval-rst} +.. autofunction:: binary_ensemble.codec.encode_jsonl_to_ben + +.. autofunction:: binary_ensemble.codec.encode_jsonl_to_xben + +.. autofunction:: binary_ensemble.codec.encode_ben_to_xben +``` + +## Decoders + +```{eval-rst} +.. autofunction:: binary_ensemble.codec.decode_ben_to_jsonl + +.. autofunction:: binary_ensemble.codec.decode_xben_to_jsonl + +.. autofunction:: binary_ensemble.codec.decode_xben_to_ben +``` diff --git a/ben-py/docs/api/graph.md b/ben-py/docs/api/graph.md new file mode 100644 index 0000000..822a066 --- /dev/null +++ b/ben-py/docs/api/graph.md @@ -0,0 +1,17 @@ +# `binary_ensemble.graph` + +```{eval-rst} +.. automodule:: binary_ensemble.graph +``` + +## Reordering functions + +```{eval-rst} +.. autofunction:: binary_ensemble.graph.reorder + +.. autofunction:: binary_ensemble.graph.reorder_multi_level_cluster + +.. autofunction:: binary_ensemble.graph.reorder_reverse_cuthill_mckee + +.. autofunction:: binary_ensemble.graph.reorder_by_key +``` diff --git a/ben-py/docs/api/index.md b/ben-py/docs/api/index.md new file mode 100644 index 0000000..63a0814 --- /dev/null +++ b/ben-py/docs/api/index.md @@ -0,0 +1,56 @@ +# API reference + +The public API is split into four modules that mirror the project's CLI tools. Everything +listed here is also re-exported from the top-level `binary_ensemble` namespace, so +`from binary_ensemble import BendlEncoder` and +`from binary_ensemble.bundle import BendlEncoder` are equivalent. + +```{tip} +New here? Reach for **{mod}`binary_ensemble.bundle`** first. A `.bendl` bundle keeps the +assignment stream and its dual graph together in one self-describing file, which is what you +want the vast majority of the time. The other modules are for plain streams, whole-file +conversions, and graph preprocessing. +``` + +::::{grid} 1 1 2 2 +:gutter: 3 + +:::{grid-item-card} {octicon}`package` bundle +:link: bundle +:link-type: doc + +`BendlEncoder`, `BendlDecoder`, `compress_stream`, `relabel_bundle` — the recommended +single-file `.bendl` format. +::: + +:::{grid-item-card} {octicon}`list-unordered` stream +:link: stream +:link-type: doc + +`BenEncoder`, `BenDecoder` — plain `.ben`/`.xben` streams when you don't need a bundle. +::: + +:::{grid-item-card} {octicon}`arrow-switch` codec +:link: codec +:link-type: doc + +Whole-file `encode_*` / `decode_*` transforms between JSONL, BEN, and XBEN. +::: + +:::{grid-item-card} {octicon}`sort-desc` graph +:link: graph +:link-type: doc + +Reorder a dual graph (MLC, RCM, or by key) before encoding to shrink the result. +::: + +:::: + +```{toctree} +:hidden: + +bundle +stream +codec +graph +``` diff --git a/ben-py/docs/api/stream.md b/ben-py/docs/api/stream.md new file mode 100644 index 0000000..ae63f50 --- /dev/null +++ b/ben-py/docs/api/stream.md @@ -0,0 +1,19 @@ +# `binary_ensemble.stream` + +```{eval-rst} +.. automodule:: binary_ensemble.stream +``` + +## Encoder + +```{eval-rst} +.. autoclass:: binary_ensemble.stream.BenEncoder + :members: +``` + +## Decoder + +```{eval-rst} +.. autoclass:: binary_ensemble.stream.BenDecoder + :members: +``` diff --git a/ben-py/docs/concepts/api-map.md b/ben-py/docs/concepts/api-map.md new file mode 100644 index 0000000..df0a662 --- /dev/null +++ b/ben-py/docs/concepts/api-map.md @@ -0,0 +1,48 @@ +# The API map + +The Python API is deliberately split into four modules that mirror the project's CLI tools. +Knowing which module owns which job makes the whole surface easy to navigate. + +| Module | Mirrors CLI | Owns | +|---|---|---| +| {mod}`binary_ensemble.bundle` | `bendl` | Creating, reading, and transforming `.bendl` bundles | +| {mod}`binary_ensemble.stream` | `ben` | Reading and writing plain `.ben`/`.xben` streams | +| {mod}`binary_ensemble.codec` | `ben` (encode/decode modes) | Whole-file conversions between JSONL, BEN, and XBEN | +| {mod}`binary_ensemble.graph` | `reben` (orderings) | Reordering a dual graph before encoding | + +Everything is also re-exported at the top level, so `from binary_ensemble import BendlEncoder` +works. + +## Lead with `bundle` + +For most work, reach for {mod}`binary_ensemble.bundle` first: + +- **`BendlEncoder`** — write a bundle: attach a graph and metadata, then stream assignments. +- **`BendlDecoder`** — read a bundle: iterate assignments, recover the graph and metadata. +- **`compress_stream`** — recompress a bundle's BEN stream to XBEN, keeping every asset. +- **`relabel_bundle`** — reorder a bundle's graph and rewrite its stream to match. + +A bundle keeps the assignment stream and its dual graph together, which is what you want the +vast majority of the time. + +## When to use the others + +**`stream`** — when you specifically *don't* want a bundle: a raw `.ben`/`.xben` stream with +no embedded graph. `BenEncoder` writes one; `BenDecoder` reads and +[subsamples](../how-to/subsample.md) one. Note that the bundle decoder supports the same +subsampling methods, so you rarely need to drop down to the stream classes just for that. + +**`codec`** — when you have whole files to convert and don't need sample-by-sample access: +`encode_jsonl_to_ben`, `encode_ben_to_xben`, `decode_ben_to_jsonl`, and friends transform an +entire file in one call. See [Convert between formats](../how-to/convert-formats.md). + +**`graph`** — when you want to reorder a dual graph yourself (to inspect the permutation, or +to reorder before running a sampler) rather than letting `BendlEncoder.add_graph` do it +inline. See [Why reordering shrinks files](compression.md). + +```{admonition} `_core` is an implementation detail +:class: note +You may notice a `binary_ensemble._core` module — the compiled extension. Always import from +the public modules above (or the top level); `_core` is internal and unsupported for direct +use. +``` diff --git a/ben-py/docs/concepts/compression.md b/ben-py/docs/concepts/compression.md new file mode 100644 index 0000000..905fcc2 --- /dev/null +++ b/ben-py/docs/concepts/compression.md @@ -0,0 +1,150 @@ +# Why reordering shrinks files + +BEN's base compression is run-length encoding plus bit-packing. RLE turns an assignment into +`(value, length)` pairs: + +``` +[1, 1, 1, 2, 2, 2, 2, 3] -> [(1, 3), (2, 4), (3, 1)] +``` + +The fewer, longer runs an assignment has, the smaller it gets. So **anything that produces +longer runs makes the files dramatically smaller**. There are two levers for that, and they +are where almost all the savings come from. + +## Lever 1: node reordering (the big one) + +Nearby geographic units tend to land in the same district. If the dual graph's **node +ordering** keeps neighbors adjacent, assignments collapse into a handful of long runs instead +of many short ones. + +### Why a good node order creates long runs + +RLE only shrinks an assignment when *consecutive* nodes share a district id. And districts +aren't random scatterings of nodes — they're **contiguous, densely-connected regions** of the +dual graph. So a run appears wherever the node order happens to place nodes from the same +district side by side, and the longer those stretches, the fewer runs the assignment needs. + +That reframes the goal: order the nodes so that **nodes likely to share a district sit next to +each other**. Every assignment in the ensemble is read in that *one* order, so a good order +pays off across the entire ensemble at once — and because the runs become longer and more +regular, the byte patterns *across* plans get more repetitive too, which feeds the XBEN +(LZMA2) stage on top (Lever 2). + +The three orderings below are different heuristics for that same goal. All of them are +**lossless permutations**: reordering records a node permutation map, so the original order is +always recoverable, and the values inside each assignment are untouched — only their positions +move. + +### Sort by a key, e.g. `GEOID` (`sort="key"`) + +A Census `GEOID` is a *hierarchical* identifier — state, then county, then tract, then block — +so sorting nodes lexicographically by `GEOID` lays the map out in nested geographic order: the +blocks within a tract end up adjacent, the tracts within a county end up adjacent, and so on. +Because districts are assembled from geographically-contiguous pieces, units that are close in +that hierarchy usually fall in the same district, which produces long runs. + +When you *have* a meaningful geographic key this is often the single most effective ordering, +and it's the cheapest — it's just a sort. Use any node attribute via +`sort="key", key="GEOID20"`. + +### Reverse Cuthill–McKee (`sort="rcm"`) + +RCM comes from sparse linear algebra, where it reorders a matrix to pull all the non-zeros +close to the diagonal (it minimizes *bandwidth*). On a graph that amounts to: walk it +breadth-first from a peripheral node, number nodes as you reach them, then reverse the result. +The effect is that **graph-adjacent nodes get nearby indices**. Since the edges *inside* a +district far outnumber the edges that cross a district boundary, neighbors usually share a +district — so nearby indices usually share a district, and the runs grow. + +RCM uses only the graph's topology — no attributes required — so it's a solid default when you +don't have a geographic key to sort on. + +### Multi-level clustering (`sort="mlc"`) + +MLC reorders by **community structure**: it recursively groups the graph into tightly-connected +clusters and lays the nodes out cluster by cluster (each connected component handled on its +own). A district is, almost by definition, a tightly-connected cluster of units — so ordering +by clusters tends to line up with district boundaries even more closely than RCM does. That is +why it is the **default** for `add_graph`. Like RCM, it needs only the topology. + +### In Python + +```python +import networkx as nx + +from binary_ensemble import graph + +# A dual graph in NetworkX adjacency form, with a GEOID20 attribute to sort on. +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(4, 4)) +for node in dual_graph.nodes: + dual_graph.nodes[node]["GEOID20"] = f"{node:04d}" +adjacency = nx.adjacency_data(dual_graph) + +reordered, permutation_map = graph.reorder(adjacency, sort="key", key="GEOID20") +# or graph.reorder(adjacency, sort="mlc") / sort="rcm" +``` + +Reordering returns a **node permutation map** so the change is fully reversible — you can +always recover the original node order. When you embed a graph in a bundle with +`BendlEncoder.add_graph(..., sort="mlc")`, the permutation map is stored for you. + +### Which ordering should I use? + +- **Have a geographic key** like `GEOID`? Start with `sort="key"` — usually the strongest, and + the cheapest. +- **No useful key?** Use `sort="mlc"` (the default) or `sort="rcm"`; both work from the graph's + topology alone. + +These are all heuristics, so the exact win depends on your dual graph. Reordering is cheap and +reversible, so it rarely hurts — though on a *tiny* ensemble the extra permutation map can +occasionally make the file net-larger. It pays off most right before an expensive XBEN +recompress, where every byte saved in BEN is amplified. + +```{admonition} This is where the headline number comes from +:class: tip +The Colorado example: 100k plans on ~140k census blocks is **27 GB** of JSONL. Reordered by +`GEOID20`, the BEN stream is **~550 MB**; compressed to XBEN it's **~6 MB** — over **4500×** +smaller. Without the reorder, the same XBEN is far larger. +``` + +## Lever 2: district relabeling + +LZMA2 (the compressor behind XBEN) spots repeated *byte sequences* across plans. Two plans +can be structurally identical yet use different district numbers: + +``` +[2, 2, 3, 3, 1, 1] +[1, 1, 2, 2, 3, 3] # same partition, different labels +``` + +To a human these are "the same map"; to LZMA2 they look different. **First-seen relabeling** +fixes this by renumbering district ids in order of first appearance, starting at 1, so +equivalent plans encode identically and compress better. Run it before encoding to XBEN. + +## Putting it together + +The recommended pipeline for a small, shareable archive is: + +1. Build a BEN bundle while sampling (ideally on an already-reordered graph). +2. **Relabel and reorder** the bundle to maximize run length and cross-plan repetition. +3. **Recompress** the bundle's stream to XBEN. + +`relabel_bundle` does step 2 in one call — it reorders the embedded graph, rewrites every +assignment into the new order, and stores the permutation map: + +```python +from binary_ensemble import relabel_bundle, compress_stream + +relabel_bundle("ensemble.bendl", out_file="ensemble.relabeled.bendl", sort="key", key="GEOID20") +compress_stream("ensemble.relabeled.bendl", out_file="ensemble.xben.bendl") +``` + +See [Shrink a bundle for sharing](../how-to/shrink-for-sharing.md) for the full recipe. + +## A note on resolution + +BEN excels on **census-block** ensembles, where assignments are long (hundreds of thousands +of nodes) and runs are long. For coarser units like VTDs or tracts, assignments are short +(10–20k nodes) and the compression ratios are more modest — still useful, just less dramatic. +For very small files from MCMC on coarse units, the byte-level delta encoding of +[PCompress](https://github.com/mggg/pcompress) can be a good alternative. diff --git a/ben-py/docs/concepts/formats.md b/ben-py/docs/concepts/formats.md new file mode 100644 index 0000000..e58abc6 --- /dev/null +++ b/ben-py/docs/concepts/formats.md @@ -0,0 +1,69 @@ +# Formats: BEN vs XBEN vs BENDL + +`binary-ensemble` has three on-disk **containers**. They share the same underlying encoding; +they differ in how much extra compression and packaging they add. + +## `.ben` — the working format + +A plain BEN **stream**: a one-line banner followed by the bit-packed, run-length-encoded +frames. This is the format you *work* with — it supports reading any sample, replaying an +ensemble, and [subsampling](../how-to/subsample.md) without decompressing everything. + +- **Fast** to write and read. +- Already much smaller than JSONL (the Colorado example: 27 GB → ~550 MB). +- The format the `BenEncoder` / `BenDecoder` stream classes produce and consume. + +## `.xben` — the storage format + +A BEN stream wrapped in [LZMA2](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm). +LZMA2 exploits the repetition *across* plans that bit-packing alone can't reach, taking the +Colorado example from ~550 MB down to ~6 MB. + +```{admonition} XBEN is for storage and transfer, not active work +:class: important +Decompression is fast (a large file extracts in a few minutes), but **compression is slow** — +high-ratio XBEN encoding of a block-level ensemble can take an hour or more. Encode to XBEN +once for archival or sharing; do your day-to-day reading against a BEN stream. +``` + +## `.bendl` — the bundle (recommended) + +A **bundle** packages a BEN or XBEN assignment stream together with its assets in a single +self-describing file: + +- the **dual graph** (`graph.json`), so the node order travels with the data; +- a **node permutation map** (`node_permutation_map.json`), if the graph was reordered; +- **metadata** (`metadata.json`) — seeds, sampler settings, anything you want; +- arbitrary **custom assets** you attach. + +Because the graph is embedded, a collaborator can open a `.bendl` and immediately reconstruct +plans — no separate graph file to track down, no chance of pairing the wrong one. This is why +the bundle is the recommended default. + +A bundle can wrap *either* a BEN stream (the working form) or an XBEN stream (the compressed +form). You typically build a BEN bundle while sampling, then +[recompress it to XBEN](../how-to/shrink-for-sharing.md) for distribution. + +## Choosing a format + +| If you want to… | Use | +|---|---| +| Hand an ensemble to a collaborator as one file | `.bendl` (XBEN inside) | +| Keep building / reading an ensemble locally | `.bendl` (BEN inside) or `.ben` | +| Archive an ensemble as small as possible | `.xben`, or a `.bendl` recompressed to XBEN | +| Interoperate with the JSONL world | convert with the [codec helpers](../how-to/convert-formats.md) | + +```{tip} +When in doubt, use a `.bendl` bundle. You only need the plain `.ben`/`.xben` stream classes +when you specifically don't want the bundle packaging — for example, feeding a raw stream to +another tool that expects it. +``` + +## Going deeper + +The exact byte layouts are documented in the format specifications, for readers building +interoperating tools: + +- [BEN / XBEN stream format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/ben-format-spec.md) +- [TwoDelta variant format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/twodelta-format-spec.md) +- [BENDL bundle format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/bendl-format-spec.md) diff --git a/ben-py/docs/concepts/overview.md b/ben-py/docs/concepts/overview.md new file mode 100644 index 0000000..f578251 --- /dev/null +++ b/ben-py/docs/concepts/overview.md @@ -0,0 +1,67 @@ +# Overview + +## The problem + +A redistricting **sampler** — GerryChain's ReCom, ForestReCom, a Sequential Monte Carlo +routine — explores the space of legal districting plans by emitting a long sequence of +plans. Serious analyses want *many* plans: tens of thousands to millions. + +The natural way to store them is [JSONL](https://jsonlines.org) (JSON Lines), one plan per +line: + +``` +{"assignment": [1, 1, 2, 2, 3, 3, ...], "sample": 1} +{"assignment": [1, 1, 2, 2, 3, 1, ...], "sample": 2} +... +``` + +This is simple and portable, but it does not scale. A 100,000-plan ensemble on Colorado's +~140,000 census blocks is **27 GB** of JSONL. Most of that is redundancy: each assignment +is mostly long runs of the same district id, and consecutive plans differ only slightly. + +## What BEN does + +**BEN** (Binary-Ensemble) is a binary format that wrings out that redundancy. The core +compression is deliberately simple and works in two stages: + +1. **Run-length encoding (RLE)** — `[1, 1, 1, 2, 2, 2, 2, 3]` becomes + `[(1, 3), (2, 4), (3, 1)]`. Districting plans are mostly long runs, so this is a big win, + especially when nearby geographic units sit next to each other in the node ordering. +2. **Bit-packing** — each run's value and length are stored in the minimum number of bits, + not padded out to whole bytes. + +On top of that, the **XBEN** format adds LZMA2 compression to exploit the repetition *across* +plans, and several **encoding variants** specialize for how a particular sampler produces its +plans. + +```{admonition} The headline result +:class: tip +That 27 GB Colorado JSONL ensemble, reordered by `GEOID20`, becomes a **~550 MB** BEN stream, +and then a **~6 MB** XBEN file — a **>4500×** reduction, completely lossless. The biggest +single lever is *node reordering*; see [Why reordering shrinks files](compression.md). +``` + +## The format family + +BEN comes as three on-disk **containers**, each suited to a different job: + +| Container | What it is | Use it for | +|-----------|-----------|------------| +| `.ben` | A plain BEN **stream** | Working with an ensemble: reading, replaying, subsampling | +| `.xben` | A BEN stream wrapped in LZMA2 | Long-term storage and transferring ensembles | +| `.bendl` | A **bundle**: a BEN/XBEN stream plus the dual graph and metadata | The recommended default — one self-describing file | + +[Formats: BEN vs XBEN vs BENDL](formats.md) covers the trade-offs in detail. + +## How the Python API is organized + +The Python package mirrors the project's CLI tools: + +- **{mod}`binary_ensemble.bundle`** — read and write `.bendl` bundles (start here). +- **{mod}`binary_ensemble.stream`** — read and write plain `.ben`/`.xben` streams. +- **{mod}`binary_ensemble.codec`** — convert whole files between JSONL, BEN, and XBEN. +- **{mod}`binary_ensemble.graph`** — reorder a dual graph before encoding. + +See [The API map](api-map.md) for when to reach for each, and the +[Vocabulary](vocabulary.md) page for the precise meaning of *plan*, *assignment*, +*sample*, and *ensemble*. diff --git a/ben-py/docs/concepts/variants.md b/ben-py/docs/concepts/variants.md new file mode 100644 index 0000000..ff73875 --- /dev/null +++ b/ben-py/docs/concepts/variants.md @@ -0,0 +1,57 @@ +# Encoding variants + +A BEN stream is encoded with one of three **variants**. The variant controls how individual +plans (frames) are stored relative to each other; it's fixed for the whole stream when you +encode, and **decoding auto-detects it**, so you never pass a variant when reading a file +back. + +You choose a variant with the `variant=` argument on the encoders and the +`encode_jsonl_to_*` helpers. + +## `standard` + +Each plan is stored independently — RLE + bit-packing, nothing more. It's the simplest +encoding and the baseline. For ensembles with no repetition, its output is very slightly +smaller than `mkv_chain`; for chains with repeats, the other variants win comfortably. + +- **Good for:** any ensemble; a safe baseline. + +## `mkv_chain` + +Like `standard`, but identical consecutive plans are collapsed into a single frame carrying a +repetition count. This is built for **MCMC chains logged in full** — including self-loops, +where a proposal was rejected and the same plan repeats (as in +[Reversible ReCom](https://mggg.org/rrc)). + +- **Good for:** full-chain MCMC ensembles where rejections produce repeated plans. + +## `twodelta` + +The **default**, and usually the best general-purpose choice. It delta-encodes **pairwise +ReCom steps**: when two consecutive plans differ by exactly one recombination move (two +districts swap some nodes, nothing else changes), only the difference is stored. Any other +transition — a multi-district move, independent/random sampling, a newly created district — +is stored as a full snapshot frame instead, and identical consecutive plans are handled with +repetition counts. + +Because it falls back to snapshots, `twodelta` is **compatible with every sampler**; non-ReCom +ensembles just produce more snapshot frames and less delta savings. Its best-case compression +comes from a full-chain *pairwise* ReCom ensemble, where nearly every accepted move changes +exactly two districts. + +- **Good for:** ReCom chains (best case) and as a robust default for anything else. + +## Choosing a variant + +| Sampler / data shape | Recommended variant | +|---|---| +| Pairwise ReCom chain | `twodelta` (default) | +| Full MCMC chain with many rejections/repeats | `mkv_chain` | +| Independent / random sampling, ForestReCom, mixed | `twodelta` or `standard` | +| Not sure | `twodelta` (the default) | + +```{admonition} You don't decode by variant +:class: note +The variant is recorded in the stream's banner, so readers detect it automatically. The only +place you specify a variant is when **encoding**. +``` diff --git a/ben-py/docs/concepts/vocabulary.md b/ben-py/docs/concepts/vocabulary.md new file mode 100644 index 0000000..763d481 --- /dev/null +++ b/ben-py/docs/concepts/vocabulary.md @@ -0,0 +1,74 @@ +# Vocabulary + +These are the core terms used throughout the docs and the API. They come from the project's +[glossary](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/glossary.md), which +is the source of truth for the whole workspace. + +## Dual graph + +The geographic adjacency graph that gives meaning to an assignment. Nodes are geographic +**units** (census blocks, VTDs, tracts, precincts); edges connect units that are adjacent. +The dual graph fixes a **node ordering** — which unit is index 0, which is index 1, and so on. + +In Python, dual graphs are read and written in **NetworkX adjacency format** (a JSON shape). +`BendlDecoder.read_graph()` hands one back to you as a live `networkx.Graph`. + +## Plan + +The mathematical object: a partition of the dual graph's nodes into districts. A plan is +*label-free up to relabeling* — renumbering the districts gives the same plan. + +## Assignment + +The concrete vector encoding of a plan: a list of integers of length *N* (the number of +nodes), where index *i* holds the **district id** of node *i*, in dual-graph node order. + +```python +assignment = [1, 1, 2, 2, 3, 3] # node 0 -> district 1, node 2 -> district 2, ... +``` + +An assignment uniquely determines a plan, but a single plan has many valid assignments (one +per node ordering and per district relabeling). This freedom is exactly what the +[compression levers](compression.md) exploit. + +```{admonition} Node order is load-bearing +:class: warning +An assignment only means something *with respect to a particular dual graph's node order*. +If you write assignments in one order and read them against a graph in another, you get +silent nonsense. This is why bundles embed the graph — so the order travels with the data. +``` + +## District id + +The integer values inside an assignment. The maximum supported district id is **65535** +(it must fit in 16 bits), which is far beyond any real statewide map. + +## Sample + +One entry in an ensemble: the pair `(sample_number, assignment)`. The `sample_number` is +**1-indexed** — decoded ensembles always start at sample 1. + +## Ensemble + +An ordered stream of samples produced by a single sampler run. Conceptually it's a +probabilistic draw from the space of plans. Every `.ben`, `.xben`, and `.bendl` file wraps +exactly one ensemble. + +## Sample count + +The number of draws an ensemble represents, always counted in *expanded* terms. When a +variant collapses five identical consecutive samples into one frame, the sample count still +goes up by five, not one. `len(decoder)` reports this expanded count. + +## Variant + +How a stream encodes its frames internally — one of `standard`, `mkv_chain`, or `twodelta`. +A variant is fixed for an entire stream when you encode, and decoding **auto-detects** it, so +you never specify a variant when reading. See [Encoding variants](variants.md). + +## Sampler vs chain + +- **Sampler** — any algorithm that produces an ensemble (covers both MCMC and SMC). +- **Chain** — specifically an MCMC method, where the Markov property matters. + +Use *sampler* unless you specifically mean a Markov chain. diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index ae821e1..0c96861 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -1,27 +1,16 @@ -# -*- coding: utf-8 -*- -# # Configuration file for the Sphinx documentation builder. # -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config +# Full option reference: https://www.sphinx-doc.org/en/master/usage/configuration.html +import json import os import sys +from importlib import metadata - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - -# autodoc needs to find our code. -sys.path.insert(0, os.path.abspath("../src")) +# Make the source package importable for autodoc even when it is not pip-installed +# (autodoc imports the built module at runtime to read live docstrings — including +# the ones that live in the compiled ``_core`` extension). +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- @@ -29,62 +18,447 @@ copyright = "2025, Peter Rock" author = "Peter Rock" -# The short X.Y version -version = "" -# The full version, including alpha/beta/rc tags -release = "" +try: + release = metadata.version("binary-ensemble") +except metadata.PackageNotFoundError: + release = "" +version = ".".join(release.split(".")[:2]) # -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ - "autoapi.extension", - "sphinx.ext.coverage", + "sphinx.ext.autodoc", "sphinx.ext.napoleon", - "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinx.ext.intersphinx", + "sphinx.ext.mathjax", "sphinx_copybutton", + "sphinx_design", + "sphinxext.opengraph", "myst_nb", ] -nb_execution_mode = "off" # render outputs already in the .ipynb; no execution -templates_path = ["_templates"] -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -autoapi_type = "python" -autoapi_dirs = ["../binary_ensemble"] -autoapi_clean = True -autoapi_keep_files = False -autoapi_ignore = [ - "../docs/**", - "**/_build/**", - "**/.venv/**", - "**/tests/**", - "**/examples/**", - "**/notebooks/**", + +exclude_patterns = [ + "_build", + "jupyter_execute", + ".jupyter_cache", + "Thumbs.db", + ".DS_Store", + "**/example_data/**", ] -autoapi_options = [ - "members", - "undoc-members", - "show-inheritance", - "show-module-summary", - "imported-members", + +# -- MyST (markdown) --------------------------------------------------------- + +myst_enable_extensions = [ + "colon_fence", + "deflist", + "dollarmath", + "linkify", + "substitution", + "tasklist", ] +myst_heading_anchors = 3 -# -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output +# -- MyST-NB (executable notebooks) ------------------------------------------ +# +# Execution is env-driven. The hosted build (ReadTheDocs) leaves it "off" so it +# renders the committed notebook outputs and stays fast and reliable; CI and local +# verification set ``NB_EXECUTION_MODE=cache`` to actually run every notebook and +# fail the build on any drift between the docs and the live API. +nb_execution_mode = os.environ.get("NB_EXECUTION_MODE", "off") +nb_execution_timeout = 1800 +nb_execution_raise_on_error = True +nb_merge_streams = True -html_theme = "sphinx_rtd_theme" -html_theme_options = {"style_nav_header_background": "#0099cd"} -html_static_path = ["_static"] +# -- autodoc / napoleon ------------------------------------------------------ + +# Members are listed by the explicit ``autoclass``/``autofunction`` directives in the +# API pages, so ``automodule`` is left to render only the module docstring (no members) — +# documenting each object twice produces "duplicate object description" warnings. +autodoc_default_options = { + "show-inheritance": True, + "member-order": "bysource", +} +autodoc_typehints = "description" +autodoc_inherit_docstrings = False +add_module_names = False + +napoleon_google_docstring = True +napoleon_numpy_docstring = True +napoleon_use_rtype = False + +# -- intersphinx ------------------------------------------------------------- + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "networkx": ("https://networkx.org/documentation/stable/", None), +} -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" +# -- HTML output ------------------------------------------------------------- +html_theme = "furo" +html_title = "binary-ensemble" +html_static_path = ["_static"] html_css_files = ["css/custom.css"] -# -- Extension configuration ------------------------------------------------- +pygments_style = "friendly" +pygments_dark_style = "github-dark" -# Prepend the module name of classes. -add_module_names = True -autodoc_inherit_docstrings = False +# -- Color palettes ---------------------------------------------------------- +# +# Each palette gives furo a light-mode and a dark-mode brand color as +# ``(primary, content)`` — ``primary`` tints sidebar accents/headings, ``content`` +# is the in-text link color (kept darker in light mode / lighter in dark mode for +# legible contrast). Swap the whole site palette by changing ACTIVE_PALETTE below, +# or without editing this file via the DOCS_PALETTE env var, e.g.: +# +# DOCS_PALETTE=forest task docs-serve +# +# Add your own entries freely. +# Each entry maps a mode ("light"/"dark") to a dict of Furo CSS variables. Most +# palettes set only the two brand colors — "color-brand-primary" (sidebar/heading +# accents) and "color-brand-content" (in-text links) — via the _brand() helper, but a +# palette may set *any* Furo variable, e.g. backgrounds/foregrounds (see "aurora"). +# +# A palette may also carry "dark_pygments" / "light_pygments": the names of the Pygments +# styles used for code blocks in dark / light mode when that palette is active (the +# switcher's "Auto" code theme). Each must be a style listed in CODE_THEMES below; a +# palette without one falls back to the global pygments_dark_style / pygments_style. +def _brand(primary, content): + return {"color-brand-primary": primary, "color-brand-content": content} + + +PALETTES = { + "ocean": {"light": _brand("#0099cd", "#0066a0"), "dark": _brand("#36c5f0", "#5cc8f5")}, + "indigo": {"light": _brand("#4f46e5", "#4338ca"), "dark": _brand("#818cf8", "#a5b4fc")}, + "forest": {"light": _brand("#047857", "#065f46"), "dark": _brand("#34d399", "#6ee7b7")}, + "sunset": {"light": _brand("#ea580c", "#c2410c"), "dark": _brand("#fb923c", "#fdba74")}, + "plum": {"light": _brand("#7c3aed", "#6d28d9"), "dark": _brand("#a78bfa", "#c4b5fd")}, + "slate": {"light": _brand("#334155", "#1e293b"), "dark": _brand("#94a3b8", "#cbd5e1")}, + # From a Huemint palette: a charcoal dark mode with neon-teal accents, and a + # matching light mode that carries the teal as a darker, legible shade on white. + "aurora": { + "dark_pygments": "github-dark", + "light": { + "color-background-primary": "#feffff", + "color-foreground-primary": "#242827", + "color-brand-primary": "#0d9488", + "color-brand-content": "#0f766e", + }, + "dark": { + "color-background-primary": "#242827", + "color-background-secondary": "#000200", + "color-foreground-primary": "#feffff", + "color-brand-primary": "#36e8c8", + "color-brand-content": "#36e8c8", + }, + }, + # From a Huemint palette: warm ember accents (peach headings, teal links) + # over a near-black dark mode; light mode carries the amber as a legible + # darker shade on white. + "ember": { + "dark_pygments": "gruvbox-dark", + "light": { + "color-background-primary": "#fdf7f1", + "color-background-secondary": "#f7ede3", + "color-foreground-primary": "#0c0706", + "color-brand-primary": "#d97906", + "color-brand-content": "#a8560a", + }, + "dark": { + "color-background-primary": "#0c0706", + "color-background-secondary": "#181206", + "color-foreground-primary": "#e3ecf6", + "color-brand-primary": "#fc9d66", + "color-brand-content": "#45c9cb", + }, + }, + # From a Huemint light palette (navy + blue with an orange pop); the dark + # mode is derived: a deep-navy canvas with the true navy as the secondary + # surface, a lightened blue for headings, and a warmed orange for links. + "harbor": { + "dark_pygments": "one-dark", + "light": { + "color-background-primary": "#f4f7fc", + "color-background-secondary": "#e8eef7", + "color-foreground-primary": "#1f2c5b", + "color-brand-primary": "#2965ad", + "color-brand-content": "#1f4a8a", + }, + "dark": { + "color-background-primary": "#131a36", + "color-background-secondary": "#1f2c5b", + "color-foreground-primary": "#fffdfe", + "color-brand-primary": "#6ea8e0", + "color-brand-content": "#ff7a45", + }, + }, + # From a Huemint palette: a deep-indigo dark mode with a neon cyan/hot-pink + # accent pair (synthwave); light mode carries them as a deep rose + dark teal + # that stay legible on cream. + "nebula": { + "dark_pygments": "dracula", + "light": { + "color-background-primary": "#fbfaf2", + "color-background-secondary": "#f1f0e6", + "color-foreground-primary": "#17143b", + "color-brand-primary": "#c8155a", + "color-brand-content": "#0e7490", + }, + "dark": { + "color-background-primary": "#17143b", + "color-background-secondary": "#211f1f", + "color-foreground-primary": "#fbfaf2", + "color-brand-primary": "#ea0758", + "color-brand-content": "#2cdbde", + }, + }, + # From a Huemint palette: a warm near-black dark mode with a bright orange / + # cerulean (complementary) accent pair; light mode darkens both for white. + "tangerine": { + "dark_pygments": "fruity", + "light_pygments": "warm-light", + "light": { + "color-background-primary": "#fbfaf2", + "color-background-secondary": "#f1f0e6", + "color-foreground-primary": "#140f0c", + "color-brand-primary": "#c2410c", + "color-brand-content": "#004483", + }, + "dark": { + "color-background-primary": "#1c1917", + "color-background-secondary": "#292524", + "color-foreground-primary": "#fcffff", + "color-brand-primary": "#ff750f", + "color-brand-content": "#0097d4", + }, + }, +} +ACTIVE_PALETTE = os.environ.get("DOCS_PALETTE", "tangerine") +_palette = PALETTES[ACTIVE_PALETTE] + +# Whether to render the in-browser palette/code-theme dropdowns. Off by default so the +# published site ships locked to the active palette and its default code themes; set +# DOCS_SWITCHER=1 while developing to expose the controls and experiment live. +SHOW_SWITCHER = os.environ.get("DOCS_SWITCHER", "").lower() not in ("", "0", "false", "no") + +html_theme_options = { + "source_repository": "https://github.com/peterrrock2/binary-ensemble/", + "source_branch": "main", + "source_directory": "ben-py/docs/", + # Bake only the brand colors; the switcher script paints the full active palette + # (including any background/foreground overrides) on load, so it stays the sole + # owner of those and switching palettes in the browser reverts cleanly. + "light_css_variables": { + k: v for k, v in _palette["light"].items() if k.startswith("color-brand-") + }, + "dark_css_variables": { + k: v for k, v in _palette["dark"].items() if k.startswith("color-brand-") + }, + "footer_icons": [ + { + "name": "GitHub", + "url": "https://github.com/peterrrock2/binary-ensemble", + "html": "", + "class": "fa-brands fa-github", + }, + ], +} + +# -- OpenGraph (social cards) ------------------------------------------------ + +ogp_site_url = "https://binary-ensemble.readthedocs.io/" +ogp_description_length = 200 +ogp_enable_meta_description = True +# Emit OpenGraph meta tags but skip the matplotlib-rendered preview images (their default +# font lacks some glyphs we use, e.g. the "↔" arrow). +ogp_social_cards = {"enable": False} + + +# -- Swappable code (Pygments) themes ---------------------------------------- +# +# Furo bakes one light + one dark Pygments theme (pygments_style / pygments_dark_style) +# into pygments.css. To make code themes swappable — per palette and live in the +# browser — we render each style below and key it off a attribute the switcher +# sets. Pygments' own `.highlight { background }` line rides along, so every theme +# brings its matching code-block surface. +# +# Two attributes, two behaviors: +# * data-code-theme — an explicit pick from the dropdown. Scoped `html body[…]` so it +# applies in BOTH light and dark mode and out-specifies Furo's +# own rules regardless of stylesheet order. +# * data-code-auto — the active palette's "dark_pygments" default (the "Auto" entry), +# scoped to dark mode only so light mode keeps the global light +# style. The auto-mode (`prefers-color-scheme`) variant mirrors +# Furo's `:not([data-theme="light"])` selector for system readers. +# +# CODE_THEMES is the menu the switcher offers, grouped into the s shown in the +# dropdown. Add or remove any valid Pygments style name (`python -m pygments -L styles`); +# the "Dark"/"Light" labels are just hints about which mode a style suits. +CODE_THEMES = { + "Dark": [ + "github-dark", + "gruvbox-dark", + "one-dark", + "dracula", + "nord", + "monokai", + "material", + "zenburn", + "native", + "solarized-dark", + "paraiso-dark", + "stata-dark", + "fruity", + "coffee", + ], + "Light": [ + "warm-light", + "github-light", + "gruvbox-light", + "solarized-light", + "friendly", + "tango", + "xcode", + "lovelace", + "manni", + "paraiso-light", + "arduino", + "vs", + ], +} + + +# Custom (non-builtin) Pygments styles, keyed by the name used in CODE_THEMES and the +# data-code-theme attribute. _pygments_theme_css resolves these to the Style class +# instead of a builtin style name (HtmlFormatter accepts either). +# +# "warm-light" is built to fit the warm palettes (tangerine/nebula/ember): a cream +# background with tokens drawn from the brand accent family — orange and amber warms +# against cerulean and teal cools — rather than a stock theme's unrelated hues. Every +# token color is chosen to clear ~4.5:1 contrast on the cream background. +def _warm_light(): + from pygments.style import Style + from pygments.token import ( + Comment, Error, Generic, Keyword, Name, Number, Operator, String, Token, + ) + + return type( + "WarmLightStyle", + (Style,), + { + "name": "warm-light", + "background_color": "#f6f1e7", + "highlight_color": "#e7dcc4", + "styles": { + Token: "#20180f", + Comment: "italic #857762", + Comment.Preproc: "noitalic #b0420a", + Keyword: "bold #b0420a", + Keyword.Type: "nobold #9a5b00", + Keyword.Constant: "nobold #9a5b00", + Operator: "#6a4a2a", + Operator.Word: "bold #b0420a", + Name.Builtin: "#9a5b00", + Name.Function: "#0a5a86", + Name.Class: "bold #0a5a86", + Name.Namespace: "bold #0a5a86", + Name.Exception: "bold #b3261e", + Name.Variable: "#20180f", + Name.Constant: "#9a5b00", + Name.Decorator: "#b0420a", + Name.Attribute: "#0a5a86", + Name.Tag: "bold #0e6a60", + String: "#0e6a60", + String.Doc: "italic #857762", + String.Escape: "bold #b0420a", + Number: "#8a5a00", + Generic.Heading: "bold #20180f", + Generic.Subheading: "bold #0a5a86", + Generic.Deleted: "#b3261e", + Generic.Inserted: "#0e6a60", + Generic.Error: "#b3261e", + Generic.Emph: "italic", + Generic.Strong: "bold", + Generic.Prompt: "bold #857762", + Error: "border:#b3261e", + }, + }, + ) + + +CUSTOM_STYLES = {"warm-light": _warm_light()} + + +def _pygments_theme_css(): + from pygments.formatters import HtmlFormatter + + menu = [s for group in CODE_THEMES.values() for s in group] + dark_defaults = [p["dark_pygments"] for p in PALETTES.values() if p.get("dark_pygments")] + light_defaults = [p["light_pygments"] for p in PALETTES.values() if p.get("light_pygments")] + + # A style name may resolve to a builtin (the string) or a registered custom class. + def make_formatter(style): + return HtmlFormatter(style=CUSTOM_STYLES.get(style, style)) + + def rules(formatter, prefix): + # get_style_defs prefixes the token rules (and the `.highlight {background}` line) + # with `prefix`; keep only those, dropping Pygments' un-prefixed globals + # (pre{}, td.linenos{}) so nothing leaks outside code blocks. + return "\n".join( + line + for line in formatter.get_style_defs(f"{prefix} .highlight").splitlines() + if line.startswith(f"{prefix} .highlight") + ) + + blocks = [] + # Explicit picks (and any palette default, so it resolves even if absent from the + # menu) apply in any mode via the order-independent `html body` prefix. + for style in dict.fromkeys(menu + dark_defaults + light_defaults): + blocks.append(rules(make_formatter(style), f'html body[data-code-theme="{style}"]')) + # "Auto" applies a palette's dark/light default, each scoped to its own mode so the + # other mode keeps the global Pygments style. The auto-mode (`prefers-color-scheme`) + # variants mirror Furo's `:not([data-theme=…])` selectors for system readers. + for style in dict.fromkeys(dark_defaults): + fmt = make_formatter(style) + blocks.append(rules(fmt, f'body[data-theme="dark"][data-code-auto="{style}"]')) + auto = rules(fmt, f'body:not([data-theme="light"])[data-code-auto="{style}"]') + blocks.append("@media (prefers-color-scheme: dark){\n" + auto + "\n}") + for style in dict.fromkeys(light_defaults): + fmt = make_formatter(style) + blocks.append(rules(fmt, f'body[data-theme="light"][data-code-auto-light="{style}"]')) + auto = rules(fmt, f'body:not([data-theme="dark"])[data-code-auto-light="{style}"]') + blocks.append("@media (prefers-color-scheme: light){\n" + auto + "\n}") + return "\n".join(blocks) + + +# The rendered themes are large, so write them to one linked stylesheet (the browser +# caches it once) instead of inlining them into every page. The file lives in a +# build-only, git-ignored "_generated" static dir that html_static_path picks up. +_generated = os.path.join(os.path.dirname(__file__), "_generated", "css") +os.makedirs(_generated, exist_ok=True) +with open(os.path.join(_generated, "pygments-themes.css"), "w", encoding="utf-8") as _f: + _f.write(_pygments_theme_css()) +html_static_path.append("_generated") +html_css_files.append("css/pygments-themes.css") + + +# -- In-browser palette + code-theme switcher -------------------------------- +# +# Expose the registries to the page (single source of truth) and add the switcher script. +# It always paints the active palette and its default code themes on load (the full +# palette isn't baked into the theme — only its brand colors are), and additionally +# renders the palette/code dropdowns when DOCS_SHOW_SWITCHER is true. Choices recolor the +# live site and persist in localStorage; delete this setup() and js/palette-switcher.js +# to remove it. +def setup(app): + app.add_js_file( + None, + body=( + f"window.DOCS_PALETTES = {json.dumps(PALETTES)};\n" + f"window.DOCS_PALETTE_DEFAULT = {json.dumps(ACTIVE_PALETTE)};\n" + f"window.DOCS_CODE_THEMES = {json.dumps(CODE_THEMES)};\n" + f"window.DOCS_SHOW_SWITCHER = {json.dumps(SHOW_SWITCHER)};" + ), + ) + app.add_js_file("js/palette-switcher.js") diff --git a/ben-py/docs/getting-started/installation.md b/ben-py/docs/getting-started/installation.md new file mode 100644 index 0000000..7a93043 --- /dev/null +++ b/ben-py/docs/getting-started/installation.md @@ -0,0 +1,60 @@ +# Installation + +`binary-ensemble` ships as a pre-built wheel for Linux, macOS, and Windows, so the usual +one-liner is all you need: + +```bash +pip install binary-ensemble +``` + +The package requires **Python 3.11 or newer**. Its only runtime dependency is +[NetworkX](https://networkx.org) (used to hand dual graphs back to you as graph objects). + +## Optional: GerryChain + +The how-to guides and tutorials that build ensembles with +[GerryChain](https://gerrychain.readthedocs.io) need it installed alongside +`binary-ensemble`: + +```bash +pip install binary-ensemble gerrychain +``` + +`binary-ensemble` itself does **not** depend on GerryChain — it accepts plain Python lists +of integers, so it works with any sampler (ForestReCom, SMC, your own code) or with +pre-existing JSONL files. + +## Verify the install + +```python +import binary_ensemble + +print(binary_ensemble.__all__) +``` + +You should see the public surface: the `BendlEncoder`/`BendlDecoder` bundle classes, the +`BenEncoder`/`BenDecoder` stream classes, the `encode_*`/`decode_*` codec helpers, and the +`bundle`, `stream`, `codec`, and `graph` submodules. + +## Building from source + +`binary-ensemble` is a [PyO3](https://pyo3.rs) extension built with +[maturin](https://www.maturin.rs). To build it from a checkout you need a Rust toolchain: + +```bash +git clone https://github.com/peterrrock2/binary-ensemble +cd binary-ensemble/ben-py +pip install maturin +maturin develop --release # builds the extension and installs it editable +``` + +## Command-line tools + +This Python package wraps the same engine as the project's CLI tools (`ben`, `reben`, +`bendl`, `pcben`), which are distributed through Cargo: + +```bash +cargo install binary-ensemble +``` + +The Python API mirrors the CLI's structure — see [The API map](../concepts/api-map.md). diff --git a/ben-py/docs/getting-started/quickstart.md b/ben-py/docs/getting-started/quickstart.md new file mode 100644 index 0000000..c731de1 --- /dev/null +++ b/ben-py/docs/getting-started/quickstart.md @@ -0,0 +1,103 @@ +# Quickstart + +This page takes you from zero to a compressed, self-describing ensemble in a few minutes. +If a term is unfamiliar, the [Concepts](../concepts/overview.md) section explains the model +behind the API. + +## The one thing to know + +A districting plan is represented as an **assignment**: a flat list of integers, one per +node of a dual graph, giving the district id of each node. + +```python +assignment = [1, 1, 2, 2] # nodes 0 and 1 are in district 1; nodes 2 and 3 in district 2 +``` + +An **ensemble** is just a sequence of these. `binary-ensemble` compresses that sequence. + +## Write an ensemble + +The recommended container is a **`.bendl` bundle** — a single self-describing file. Open a +`BendlEncoder`, attach any metadata, then write assignments through a `stream` context that +finalizes the bundle when it closes: + +```python +from binary_ensemble import BendlEncoder + +plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] + +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_metadata({"sampler": "demo", "seed": 1234}) +with encoder.stream("ben") as stream: + for assignment in plans: + stream.write(assignment) +# bundle is finalized here +``` + +## Read it back + +Open a `BendlDecoder` and iterate. The bundle knows how many samples it holds and what it +carries: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(len(decoder)) # 3 +print(decoder.asset_names()) # ['metadata.json'] +print(decoder.read_metadata()) # {'sampler': 'demo', 'seed': 1234} + +for assignment in decoder: + print(assignment) +``` + +## Make it self-describing + +The real value of a bundle is embedding the **dual graph** so a collaborator can open the +file without hunting down the matching graph JSON. `add_graph` accepts a graph in NetworkX +*adjacency* form (a `dict`) or a path to a graph JSON file: + +```python +import networkx as nx +from binary_ensemble import BendlEncoder, BendlDecoder + +graph = nx.grid_2d_graph(2, 2) +graph = nx.convert_node_labels_to_integers(graph) +adjacency = nx.adjacency_data(graph) # the dict shape add_graph expects + +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_graph(adjacency, sort=None) # store as-is; see below for reordering +with encoder.stream("ben") as stream: + for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: + stream.write(assignment) + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() # back as a live NetworkX graph +print(graph.number_of_nodes(), "nodes") +``` + +```{tip} +Passing `sort="rcm"` or `sort="mlc"` instead of `sort=None` reorders the graph's nodes for +much better compression and records a reversible permutation map. See +[Why reordering shrinks files](../concepts/compression.md). +``` + +## Already have JSONL files? + +If your sampler already wrote a JSONL ensemble, the `codec` helpers convert whole files in +one call — no iteration required: + +```python +from binary_ensemble import encode_jsonl_to_ben, encode_ben_to_xben, decode_ben_to_jsonl + +encode_jsonl_to_ben("plans.jsonl", "plans.ben") # JSONL -> BEN (fast, working format) +encode_ben_to_xben("plans.ben", "plans.xben") # BEN -> XBEN (smallest, for storage) +decode_ben_to_jsonl("plans.ben", "plans_again.jsonl") # round-trip back to JSONL +``` + +## Next steps + +- [Compress a GerryChain run](../how-to/compress-gerrychain-run.md) — the most common workflow. +- [Subsample a large ensemble](../how-to/subsample.md) without decoding the whole thing. +- [Concepts](../concepts/overview.md) — formats, encoding variants, and how the compression works. diff --git a/ben-py/docs/how-to/compress-gerrychain-run.md b/ben-py/docs/how-to/compress-gerrychain-run.md new file mode 100644 index 0000000..a68568e --- /dev/null +++ b/ben-py/docs/how-to/compress-gerrychain-run.md @@ -0,0 +1,81 @@ +# Compress a GerryChain run + +The most common workflow: run a [GerryChain](https://gerrychain.readthedocs.io) ReCom chain +and stream every plan straight into a single self-describing `.bendl` bundle, so you never +materialize a giant JSONL file. + +```{note} +This recipe needs GerryChain installed: `pip install gerrychain`. `binary-ensemble` itself +only ever sees plain lists of integers, so the same pattern works with any sampler. +``` + +## Set up the chain + +```python +from functools import partial + +from gerrychain import Partition, Graph, MarkovChain, updaters, accept +from gerrychain.proposals import recom +from gerrychain.constraints import contiguous + +graph = Graph.from_json("gerrymandria.json") + +initial_partition = Partition( + graph, + assignment="district", + updaters={"population": updaters.Tally("TOTPOP")}, +) + +ideal_population = sum(initial_partition["population"].values()) / len(initial_partition) + +proposal = partial( + recom, pop_col="TOTPOP", pop_target=ideal_population, epsilon=0.01, node_repeats=2 +) + +chain = MarkovChain( + proposal=proposal, + constraints=[contiguous], + accept=accept.always_accept, + initial_state=initial_partition, + total_steps=1000, +) +``` + +## Stream the chain into a bundle + +The one thing to get right is **node order**: an assignment vector is only meaningful in the +dual graph's node order, so reorder each plan to match the order you embed. + +```python +from binary_ensemble import BendlEncoder + +# The order assignments must be written in. +node_order = list(graph.nodes) + +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_graph("gerrymandria.json", sort=None) # embed the dual graph as-is +encoder.add_metadata({"sampler": "ReCom", "epsilon": 0.01, "steps": 1000}) + +with encoder.stream("ben", variant="twodelta") as stream: # twodelta suits ReCom chains + for partition in chain: + series = partition.assignment.to_series() + assignment = series.loc[node_order].astype(int).tolist() + stream.write(assignment) +# the bundle is finalized when the stream context closes +``` + +That's it — `ensemble.bendl` now holds all 1,000 plans plus the graph and metadata in one +file. To read it back, see [Read and iterate an ensemble](read-and-iterate.md). + +## Make it smaller + +The bundle above stores the graph in its original node order. For a much smaller file, reorder +the graph (so assignments form long runs) and recompress to XBEN — see +[Shrink a bundle for sharing](shrink-for-sharing.md). You can do this after the fact, so it +never complicates the sampling loop. + +```{tip} +Encoding `twodelta` (the default) delta-compresses pairwise ReCom moves. If you log a full +MCMC chain *including rejections*, `variant="mkv_chain"` collapses the repeated plans +instead. See [Encoding variants](../concepts/variants.md). +``` diff --git a/ben-py/docs/how-to/convert-formats.md b/ben-py/docs/how-to/convert-formats.md new file mode 100644 index 0000000..c4dd799 --- /dev/null +++ b/ben-py/docs/how-to/convert-formats.md @@ -0,0 +1,74 @@ +# Convert between formats + +The {mod}`binary_ensemble.codec` helpers transform whole files in a single call — no +iteration, no decoder objects. Use them when you have a complete file to convert and don't +need sample-by-sample access. + +The expected JSONL shape is one plan per line: + +``` +{"assignment": [1, 1, 2, 2, ...], "sample": 1} +{"assignment": [1, 2, 2, 2, ...], "sample": 2} +``` + +## JSONL → BEN + +```python +from binary_ensemble import encode_jsonl_to_ben + +encode_jsonl_to_ben("plans.jsonl", "plans.ben") # default variant: twodelta +encode_jsonl_to_ben("plans.jsonl", "plans.ben", variant="mkv_chain", overwrite=True) +``` + +## BEN → XBEN (maximum compression) + +```python +from binary_ensemble import encode_ben_to_xben + +encode_ben_to_xben("plans.ben", "plans.xben", overwrite=True) +``` + +You can also go straight from JSONL to XBEN with `encode_jsonl_to_xben`. The XBEN encoders +accept tuning knobs: + +```python +from binary_ensemble import encode_jsonl_to_xben + +encode_jsonl_to_xben( + "plans.jsonl", + "plans.xben", + overwrite=True, + variant="twodelta", + n_threads=8, # parallelize across cores (default: all available) + compression_level=9, # 0 (fastest) … 9 (smallest); default 9 +) +``` + +```{important} +XBEN compression is slow — high-ratio encoding of a block-level ensemble can take an hour or +more. Decompression, by contrast, is fast. Encode to XBEN once for storage; work against BEN +day to day. See [Formats](../concepts/formats.md). +``` + +## Decoding back out + +The decoders mirror the encoders and all take `(in_file, out_file, overwrite=False)`: + +```python +from binary_ensemble import decode_ben_to_jsonl, decode_xben_to_jsonl, decode_xben_to_ben + +decode_ben_to_jsonl("plans.ben", "plans.jsonl", overwrite=True) # BEN -> JSONL +decode_xben_to_jsonl("plans.xben", "plans.jsonl", overwrite=True) # XBEN -> JSONL +decode_xben_to_ben("plans.xben", "plans.ben", overwrite=True) # XBEN -> BEN (to work with it) +``` + +```{note} +By default these refuse to overwrite an existing output file; pass `overwrite=True` to +replace it. You never specify a variant when decoding — it's detected from the stream. +``` + +## Working with bundles instead? + +These helpers operate on plain streams and JSONL. To recompress the stream *inside* a +`.bendl` bundle (keeping its graph and metadata), use +[`compress_stream`](shrink-for-sharing.md) instead. diff --git a/ben-py/docs/how-to/custom-assets-and-append.md b/ben-py/docs/how-to/custom-assets-and-append.md new file mode 100644 index 0000000..201c9b5 --- /dev/null +++ b/ben-py/docs/how-to/custom-assets-and-append.md @@ -0,0 +1,63 @@ +# Custom assets and appending + +A bundle isn't limited to the graph and metadata — you can attach arbitrary named blobs, and +you can add more to a bundle even after it's finalized. + +## Attach metadata and custom assets + +`add_metadata` writes the canonical `metadata.json`. `add_asset` writes any named blob; its +`content_type` is `"json"` (the payload must be valid UTF-8 JSON, and the decoder will parse +it for you) or `"text"` (any UTF-8 text): + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_metadata({"sampler": "ReCom", "seed": 1234}) +encoder.add_asset("scores.json", '{"mean_cut_edges": 41.2}', content_type="json") +encoder.add_asset("README.txt", "Generated for the 2026 analysis.", content_type="text") + +with encoder.stream("ben") as stream: + for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: + stream.write(assignment) +``` + +Assets may be added before *or* after the stream — only the stream itself is single-use. (The +one exception is a *reordering* `add_graph`, which must come before the stream because it sets +the node order the chain writes in.) + +## Read assets back + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(decoder.asset_names()) # ['metadata.json', 'scores.json', 'README.txt'] +print(decoder.read_json_asset("scores.json")) # {'mean_cut_edges': 41.2} (parsed) +print(decoder.read_asset_bytes("README.txt")) # b'Generated for the 2026 analysis.' (raw bytes) +``` + +Use `read_json_asset` for JSON assets (it parses them) and `read_asset_bytes` for raw bytes of +anything. The canonical getters `read_metadata()`, `read_graph()`, and +`read_node_permutation_map()` are shortcuts for the well-known assets. + +## Append to a finalized bundle + +To add assets to a bundle that's already finalized, open it with `BendlEncoder.append`. In +append mode each `add_*` commits immediately, and `stream()` is unavailable (a bundle's +assignment stream is written once): + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder.append("ensemble.bendl") +encoder.add_asset("notes.txt", "Reviewed and approved.", content_type="text") +encoder.close() +``` + +```{note} +Each post-finalize add rewrites the bundle's directory, so it's perfect for a handful of extra +assets but not for tight loops. Attach what you can up front, and use `append` for the +occasional addition after the fact. +``` diff --git a/ben-py/docs/how-to/index.md b/ben-py/docs/how-to/index.md new file mode 100644 index 0000000..d125679 --- /dev/null +++ b/ben-py/docs/how-to/index.md @@ -0,0 +1,102 @@ +# How-to guides + +Task-focused recipes for common jobs. Each one is short and assumes you've met the basics in +the [Quickstart](../getting-started/quickstart.md). + +## Sample data for these guides + +The recipes below assume a small `ensemble.bendl`, a `plans.jsonl`, a `chain.ben` / +`chain.xben` pair, and a `gerrymandria.json` dual graph in your working directory. To follow +along, create them all with this snippet: + + +```python +import json + +import networkx as nx + +from binary_ensemble import BendlEncoder, BenEncoder, encode_ben_to_xben + +# A small dual graph: an 8x8 grid with unit population, contiguous stripe districts, +# and a GEOID20-style key to sort on. +SIDE = 8 +graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(SIDE, SIDE)) +for node in graph.nodes: + _row, col = divmod(node, SIDE) + graph.nodes[node].update(TOTPOP=1, district=col // 2 + 1, GEOID20=f"{node:04d}") +adjacency = nx.adjacency_data(graph) +n_nodes = SIDE * SIDE + +# The GerryChain how-to reads the dual graph from this file. +with open("gerrymandria.json", "w") as handle: + json.dump(adjacency, handle) + +# 120 toy plans on the grid's nodes. +plans = [[(node + step) % 4 + 1 for node in range(n_nodes)] for step in range(120)] + +# A self-describing bundle (graph + metadata + the plans)... +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_graph(adjacency, sort=None) +encoder.add_metadata({"sampler": "demo", "seed": 0}) +with encoder.stream("ben") as stream: + for plan in plans: + stream.write(plan) + +# ...the same plans as JSONL... +with open("plans.jsonl", "w") as handle: + for sample, plan in enumerate(plans, start=1): + handle.write(json.dumps({"assignment": plan, "sample": sample}) + "\n") + +# ...and as plain BEN / XBEN streams. +with BenEncoder("chain.ben", overwrite=True) as stream: + for plan in plans: + stream.write(plan) +encode_ben_to_xben("chain.ben", "chain.xben", overwrite=True) +``` + +::::{grid} 1 1 2 2 +:gutter: 3 + +:::{grid-item-card} Compress a GerryChain run +:link: compress-gerrychain-run +:link-type: doc + +Stream a ReCom chain straight into a self-describing `.bendl` bundle. +::: + +:::{grid-item-card} Read and iterate an ensemble +:link: read-and-iterate +:link-type: doc + +Open a bundle, recover its graph and metadata, and walk its assignments. +::: + +:::{grid-item-card} Subsample a large ensemble +:link: subsample +:link-type: doc + +Pull a subset of plans by index, range, or stride — without decoding the whole file. +::: + +:::{grid-item-card} Convert between formats +:link: convert-formats +:link-type: doc + +Whole-file transforms between JSONL, BEN, and XBEN. +::: + +:::{grid-item-card} Shrink a bundle for sharing +:link: shrink-for-sharing +:link-type: doc + +Reorder, relabel, and recompress a bundle to its smallest shareable form. +::: + +:::{grid-item-card} Custom assets and appending +:link: custom-assets-and-append +:link-type: doc + +Attach metadata and arbitrary blobs, then add more to a finalized bundle. +::: + +:::: diff --git a/ben-py/docs/how-to/read-and-iterate.md b/ben-py/docs/how-to/read-and-iterate.md new file mode 100644 index 0000000..1a66269 --- /dev/null +++ b/ben-py/docs/how-to/read-and-iterate.md @@ -0,0 +1,72 @@ +# Read and iterate an ensemble + +Open a `.bendl` bundle with `BendlDecoder` and you get the assignment stream *and* everything +the bundle carries alongside it. + +## Inspect before you iterate + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(len(decoder)) # number of samples (expanded count) +print(decoder.assignment_format()) # 'ben' or 'xben' +print(decoder.asset_names()) # e.g. ['graph.json', 'metadata.json'] +print(decoder.read_metadata()) # the metadata.json payload, or None +``` + +`len()` is cheap and cached, so it's safe to use for a progress bar. + +## Iterate the assignments + +```python +for assignment in decoder: + # assignment is a list[int]: the district id of each node, in graph order + ... +``` + +You can iterate the same decoder as many times as you like — each `for` loop rewinds to the +start of the stream automatically, so there's no need to reopen the file: + +```python +total = len(decoder) +first = next(iter(decoder)) # peek the first plan +all_plans = list(decoder) # full pass again, from the start +``` + +The cursor is shared, so this is sequential re-iteration — don't drive two loops over the +*same* decoder at once. If you need two independent positions simultaneously, open a second +`BendlDecoder`. + +## Recover the dual graph + +Because the graph is embedded, you can rebuild full plan objects without a separate graph +file. `read_graph()` returns a live `networkx.Graph` whose node order matches the order the +assignments were written in: + +```python +import pandas as pd +from gerrychain import Partition + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +node_order = pd.Index(graph.nodes) + +for assignment in decoder: + series = pd.Series(assignment, index=node_order) + partition = Partition(graph, assignment=series) + # ... analyze the partition (cut edges, population, scores, ...) +``` + +## Get the raw graph or permutation map + +`read_graph()` hands back a NetworkX graph; for the underlying JSON, or for a reordered +bundle's permutation map, use: + +```python +raw_graph = decoder.read_json_asset("graph.json") # parsed adjacency dict +permutation_map = decoder.read_node_permutation_map() # None if the graph wasn't reordered +``` + +See [Custom assets and appending](custom-assets-and-append.md) for reading arbitrary blobs. diff --git a/ben-py/docs/how-to/shrink-for-sharing.md b/ben-py/docs/how-to/shrink-for-sharing.md new file mode 100644 index 0000000..18dd912 --- /dev/null +++ b/ben-py/docs/how-to/shrink-for-sharing.md @@ -0,0 +1,59 @@ +# Shrink a bundle for sharing + +A bundle you build while sampling is usually a BEN bundle in the graph's original node order — +convenient, but not as small as it could be. Before handing it to a collaborator or archiving +it, two steps get it to its smallest form: + +1. **Relabel and reorder** so assignments form long runs and equivalent plans encode + identically. +2. **Recompress** the stream to XBEN. + +## Step 1: relabel and reorder + +`relabel_bundle` reorders the embedded graph, rewrites every assignment into the new node +order, and stores the reversible permutation map — all while preserving your metadata and +custom assets: + +```python +from binary_ensemble import relabel_bundle + +# Sort by a geographic key (often the most effective ordering). Use sort="mlc" or +# sort="rcm" for a topology-based ordering instead. +relabel_bundle("ensemble.bendl", out_file="ensemble.sorted.bendl", sort="key", key="GEOID20") +``` + +See [Why reordering shrinks files](../concepts/compression.md) for what `mlc`, `rcm`, and +`key` do. + +## Step 2: recompress to XBEN + +`compress_stream` re-encodes the bundle's BEN stream as XBEN, carrying every asset across +unchanged: + +```python +from binary_ensemble import compress_stream + +compress_stream("ensemble.sorted.bendl", out_file="ensemble.xben.bendl") +``` + +The result is a single `.bendl` that's typically orders of magnitude smaller — and still +self-describing, since the graph and permutation map travel inside it. + +## In place vs. a new file + +Both transforms take **either** `out_file` (write a new bundle) **or** `in_place=True` +(atomically replace the original). Passing both, or neither, raises: + +```python +relabel_bundle("ensemble.bendl", in_place=True, sort="key", key="GEOID20") +compress_stream("ensemble.bendl", in_place=True) +``` + +`in_place=True` writes to a temporary file and swaps it over the original only on success, so +an interrupted run won't corrupt your bundle. + +```{tip} +Reorder *before* compressing. Relabeling and node reordering are what create the long runs and +cross-plan repetition that LZMA2 (inside XBEN) exploits, so doing step 1 first makes step 2 +dramatically more effective. +``` diff --git a/ben-py/docs/how-to/subsample.md b/ben-py/docs/how-to/subsample.md new file mode 100644 index 0000000..7794129 --- /dev/null +++ b/ben-py/docs/how-to/subsample.md @@ -0,0 +1,56 @@ +# Subsample a large ensemble + +When an ensemble has millions of plans, you often want only a slice — every 1000th plan, a +contiguous range, or a handful of specific indices. The decoders support this directly, and +they do it by **skipping** frames rather than decoding everything, so it stays fast. + +All three methods are available on both `BendlDecoder` (for bundles) and `BenDecoder` (for +plain streams). Each returns a decoder you iterate. + +## By specific indices + +```python +from binary_ensemble import BendlDecoder + +for assignment in BendlDecoder("ensemble.bendl").subsample_indices([1, 50, 100]): + print(assignment[:10]) +``` + +## By a contiguous range + +```python +for assignment in BendlDecoder("ensemble.bendl").subsample_range(10, 15): + print(assignment[:10]) +``` + +## By a fixed stride + +`subsample_every(step)` yields every `step`-th sample (with an optional `offset`): + +```python +for assignment in BendlDecoder("ensemble.bendl").subsample_every(25): + print(assignment[:10]) +``` + +## Subsampling plain streams (and XBEN) + +The same methods work on a `BenDecoder`. For an `.xben` stream, pass `mode="xben"`: + +```python +from binary_ensemble import BenDecoder + +# Plain BEN stream — skipping is cheapest here. +for assignment in BenDecoder("chain.ben").subsample_every(25): + print(assignment[:10]) + +# XBEN works too, at the cost of a one-time decompression startup. +for assignment in BenDecoder("chain.xben", mode="xben").subsample_range(10, 15): + print(assignment[:10]) +``` + +```{tip} +Subsampling a BEN stream is fastest because frames can be skipped without decompressing. An +XBEN stream pays a one-time startup cost to begin reading, after which skipping is cheap +again. If you'll subsample an XBEN file repeatedly, extract it to BEN first with +[`decode_xben_to_ben`](convert-formats.md). +``` diff --git a/ben-py/docs/index.md b/ben-py/docs/index.md new file mode 100644 index 0000000..545132d --- /dev/null +++ b/ben-py/docs/index.md @@ -0,0 +1,158 @@ +--- +sd_hide_title: true +--- + +# binary-ensemble + +```{div} sd-text-center sd-fs-2 sd-font-weight-bold +binary-ensemble +``` + +```{div} sd-text-center sd-fs-5 sd-text-secondary +Compress, store, and stream massive ensembles of districting plans. +``` + +```{div} sd-text-center +[Get started](getting-started/quickstart.md){.download-badge} +[Concepts](concepts/overview.md){.download-badge} +[API reference](api/index.md){.download-badge} +``` + +--- + +Redistricting samplers like [GerryChain](https://gerrychain.readthedocs.io)'s ReCom, +ForestReCom, and Sequential Monte Carlo routinely emit **millions of plans**. Stored as +JSONL, a single ensemble can run to *tens of gigabytes* — most of it redundant, because +consecutive plans barely differ. **BEN** (Binary-Ensemble) is a compression format and +toolkit built for exactly this data: it turns those JSONL mountains into compact binary +files you can store, share, and stream sample-by-sample without unpacking the whole thing. + +`binary-ensemble` is the Python interface to the +[binary-ensemble](https://crates.io/crates/binary-ensemble) Rust library. + +```{admonition} How much smaller? +:class: tip +A real 100k-plan ensemble on Colorado's ~140k census blocks is **27 GB** as JSONL. +Reordered by `GEOID20` it compresses to a **~550 MB** BEN stream, and then to a +**~6 MB** XBEN file — over a **4500× reduction**, fully lossless. +``` + +## Install + +```bash +pip install binary-ensemble +``` + +## A first taste + +Write an ensemble into one self-describing `.bendl` bundle, then read it back: + +```python +from binary_ensemble import BendlEncoder, BendlDecoder + +plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] + +# The stream context finalizes the bundle when it closes. +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +with encoder.stream("ben") as stream: + for assignment in plans: + stream.write(assignment) + +# Iterate the assignments straight back out, one at a time. +for assignment in BendlDecoder("ensemble.bendl"): + print(assignment) +``` + +## Where to next + +::::{grid} 1 1 2 2 +:gutter: 3 + +:::{grid-item-card} {octicon}`rocket` Getting started +:link: getting-started/quickstart +:link-type: doc + +Install the package and compress your first ensemble in a few lines. +::: + +:::{grid-item-card} {octicon}`book` Concepts +:link: concepts/overview +:link-type: doc + +Dual graphs, assignments, the BEN/XBEN/BENDL formats, and the compression levers — +the mental model behind the API. +::: + +:::{grid-item-card} {octicon}`tools` How-to guides +:link: how-to/index +:link-type: doc + +Task-focused recipes: compress a GerryChain run, subsample, convert formats, +shrink a bundle for sharing. +::: + +:::{grid-item-card} {octicon}`code` API reference +:link: api/index +:link-type: doc + +Every public class and function in `binary_ensemble`, organized by module. +::: + +:::: + +```{toctree} +:hidden: +:caption: Getting started + +getting-started/installation +getting-started/quickstart +``` + +```{toctree} +:hidden: +:caption: Concepts + +concepts/overview +concepts/vocabulary +concepts/formats +concepts/variants +concepts/compression +concepts/api-map +``` + +```{toctree} +:hidden: +:caption: How-to guides + +how-to/index +how-to/compress-gerrychain-run +how-to/read-and-iterate +how-to/subsample +how-to/convert-formats +how-to/shrink-for-sharing +how-to/custom-assets-and-append +``` + +```{toctree} +:hidden: +:caption: Tutorials + +user/using_ben_py +user/using_bendl +``` + +```{toctree} +:hidden: +:caption: API reference + +api/index +``` + +```{toctree} +:hidden: +:caption: Project + +format stability +Rust crate +GitHub +``` diff --git a/ben-py/docs/index.rst b/ben-py/docs/index.rst deleted file mode 100644 index 095dea1..0000000 --- a/ben-py/docs/index.rst +++ /dev/null @@ -1,35 +0,0 @@ -Welcome to binary-ensemble's documentation! -=========================================== - -BEN (short for Binary-Ensemble) is a compression algorithm designed for efficient storage and access -of ensembles of districting plans, and was designed to work primarily as a companion to the -GerrySuite collection of packages (GerryChain, GerryTools, FRCW) and to also be compatible with -other ensemble generators (e.g. ForestRecom, Sequential Monte Carlo [SMC]). - -This is a package containing some Python bindings for the for the -`Binary-Ensemble ` Rust library. In particular, -this package provides some easy tools for compressing and decompressing ensembles of -districting plans, as well as some utilities for working with ensembles stored in the -BEN and XBEN formats. - -Installation ------------- - -To install binary-ensemble, you can just use pip! - -.. code-block:: bash - - pip install binary-ensemble - - -.. toctree:: - :caption: binary-ensemble Tutorial - - user/using_ben_py - - -.. toctree:: - :caption: API Reference - :maxdepth: 2 - - autoapi/index diff --git a/ben-py/docs/user/.gitignore b/ben-py/docs/user/.gitignore index 271e1f9..f77d506 100644 --- a/ben-py/docs/user/.gitignore +++ b/ben-py/docs/user/.gitignore @@ -1,4 +1,5 @@ * !.gitignore -!*.ipynb \ No newline at end of file +!*.ipynb +!*.py \ No newline at end of file diff --git a/ben-py/docs/user/using_ben_py.ipynb b/ben-py/docs/user/using_ben_py.ipynb index 4af2348..853e96e 100644 --- a/ben-py/docs/user/using_ben_py.ipynb +++ b/ben-py/docs/user/using_ben_py.ipynb @@ -2,152 +2,131 @@ "cells": [ { "cell_type": "markdown", - "id": "c7a7778b", + "id": "cf968bd9", "metadata": {}, "source": [ - "# An Introduction to Using binary-ensemble\n", - "\n", - "This is a small tutorial that is meant to help users get to using binary-ensemble: the Python interface\n", - "for the [binary-ensemble](https://crates.io/crates/binary-ensemble) Rust package.\n", - "\n", - "BEN (short for Binary-Ensemble) is a compression algorithm designed for efficient storage and\n", - "access of ensembles of districting plans, and was designed to work primarily as a companion to\n", - "the GerrySuite collection of packages (GerryChain, GerryTools, FRCW) and to also be compatible\n", - "with other ensemble generators (e.g. ForestRecom, Sequential Monte Carlo \\[SMC\\]). \n", - "\n", - "When working with an ensemble of plans, there is generally an underlying dual graph, :math:`G`,\n", - "on which there is an ordering of nodes :math:`(n_1, n_2, \\ldots, n_\\ell)`. If we then wish to \n", - "partition the graph into districts, then the only thing that we need to do is assign each\n", - "node in the graph a district number. This is what we call the ***assignment vector*** for the \n", - "districting plan. Then to encode an ensemble of districting plans in a JSONL file (short for JSON \n", - "Lines and it really just means a file with a dictionary on every line), we may format each of the\n", - "lines in the following way:\n", - "\n", - "```\n", - "{\"assignment\": , \"sample\": }\n", - "```\n", - "\n", - "However, if the graph has a lot of nodes in it and we want to collect millions of samples (as we \n", - "tend to want to do), then this JSONL format can make for MASSIVE (tens or hundreds of Gb) files. So\n", - "this is why we have BEN and XBEN (e\\[X\\]treme BEN): to make the storage and processing of these\n", - "millions of plans possible without needing to buy an extra hard drive for every project that you \n", - "would like to work with." + "# BEN & XBEN streams: compressing and converting ensembles\n", + "\n", + "This tutorial covers the plain **BEN/XBEN streams** and the whole-file conversion helpers:\n", + "`binary_ensemble.codec` (JSONL ↔ BEN ↔ XBEN) and `binary_ensemble.stream`\n", + "(`BenEncoder` / `BenDecoder`). Its companion, **Working with `.bendl` bundles**, covers the\n", + "recommended self-describing bundle format and the graph-reordering utilities.\n", + "\n", + "**BEN** (Binary-Ensemble) compresses ensembles of districting plans. An ensemble is usually\n", + "stored as [JSONL](https://jsonlines.org) — one plan per line, like\n", + "`{\"assignment\": [...], \"sample\": n}` — which is simple but can balloon to tens of gigabytes.\n", + "BEN shrinks that losslessly; **XBEN** adds LZMA2 on top for archival-grade compression. See\n", + "[Formats](../concepts/formats.md) for the full picture." ] }, { "cell_type": "markdown", - "id": "3fea1b6a", + "id": "589a241f", "metadata": {}, "source": [ - "## Setup for the Tutorial\n", + "## Setup: generate a small ensemble\n", "\n", - "For this tutorial, you will need access to a few files. We are going to go ahead and download\n", - "them here and then place them in a folder called \"example_data\"." + "So this tutorial is self-contained and reproducible, we *generate* a small ensemble instead\n", + "of downloading one: a short [GerryChain](https://gerrychain.readthedocs.io) ReCom chain on a\n", + "16×16 grid (256 nodes), written out as a JSONL file. `binary-ensemble` only ever sees lists\n", + "of integers, so any sampler — or any existing JSONL file — works the same way." ] }, { "cell_type": "code", "execution_count": 1, - "id": "1c33cbfb", - "metadata": {}, - "outputs": [], - "source": [ - "from urllib.request import urlopen\n", - "from pathlib import Path\n", - "import shutil\n", - "\n", - "if Path(\"example_data\").exists():\n", - " shutil.rmtree(\"example_data\")\n", - "\n", - "Path(\"example_data\").mkdir()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f8a56fa7", - "metadata": {}, + "id": "ed9904bc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:42.359600Z", + "iopub.status.busy": "2026-06-06T03:02:42.359414Z", + "iopub.status.idle": "2026-06-06T03:02:44.247643Z", + "shell.execute_reply": "2026-06-06T03:02:44.242891Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Downloading CO_small.json...\n", - "Downloading small_example.jsonl...\n", - "Downloading 100k_CO_chain.jsonl.xben...\n", - "Downloading gerrymandria.json...\n" + "wrote example_data/small_example.jsonl: 200 plans on 256 nodes, 159892 bytes\n" ] } ], "source": [ - "def open_and_save(base_url, file_name):\n", - " url = f\"{base_url}/{file_name}\"\n", - " out_path = f\"./example_data/{file_name}\"\n", - "\n", - " chunk = 1024 * 64\n", - " with urlopen(url, timeout=120) as resp, open(out_path, \"wb\") as f:\n", - " while True:\n", - " buf = resp.read(chunk)\n", - " if not buf:\n", - " break\n", - " f.write(buf)\n", - "\n", - "\n", - "url_base = \"https://raw.githubusercontent.com/peterrrock2/binary-ensemble/main/example\"\n", - "for file_name in [\n", - " \"CO_small.json\",\n", - " \"small_example.jsonl\",\n", - "]:\n", - " out_path = f\"./example_data/{file_name}\"\n", - " if not Path(out_path).exists():\n", - " print(f\"Downloading {file_name}...\")\n", - " open_and_save(url_base, file_name)\n", - " else:\n", - " print(f\"{file_name} already exists, skipping download.\")\n", + "import json\n", + "import os\n", + "from functools import partial\n", + "from pathlib import Path\n", "\n", + "import networkx as nx\n", + "from gerrychain import Graph, MarkovChain, Partition, accept, constraints, updaters\n", + "from gerrychain.proposals import recom\n", "\n", - "url_base = \"https://github.com/peterrrock2/binary-ensemble/raw/refs/heads/main/example/\"\n", - "for file_name in [\n", - " \"100k_CO_chain.jsonl.xben\",\n", - "]:\n", - " out_path = f\"./example_data/{file_name}\"\n", - " if not Path(out_path).exists():\n", - " print(f\"Downloading {file_name}...\")\n", - " open_and_save(url_base, file_name)\n", - " else:\n", - " print(f\"{file_name} already exists, skipping download.\")\n", + "Path(\"example_data\").mkdir(exist_ok=True)\n", + "\n", + "# A 16x16 grid with unit population and stripe districts -> a contiguous start state.\n", + "SIDE, N_DISTRICTS = 16, 4\n", + "grid = nx.grid_2d_graph(SIDE, SIDE)\n", + "grid = nx.convert_node_labels_to_integers(grid, ordering=\"sorted\")\n", + "for node in grid.nodes:\n", + " _row, col = divmod(node, SIDE)\n", + " grid.nodes[node][\"TOTPOP\"] = 1\n", + " grid.nodes[node][\"district\"] = col // (SIDE // N_DISTRICTS)\n", + "\n", + "gc_graph = Graph.from_networkx(grid)\n", + "node_order = list(gc_graph.nodes) # the order we write each assignment in\n", + "initial = Partition(\n", + " gc_graph,\n", + " assignment=\"district\",\n", + " updaters={\"population\": updaters.Tally(\"TOTPOP\", alias=\"population\")},\n", + ")\n", + "ideal = sum(initial[\"population\"].values()) / len(initial)\n", + "chain = MarkovChain(\n", + " proposal=partial(\n", + " recom, pop_col=\"TOTPOP\", pop_target=ideal, epsilon=0.05, node_repeats=2\n", + " ),\n", + " constraints=[constraints.contiguous],\n", + " accept=accept.always_accept,\n", + " initial_state=initial,\n", + " total_steps=200,\n", + ")\n", "\n", + "with open(\"example_data/small_example.jsonl\", \"w\") as f:\n", + " for i, partition in enumerate(chain, start=1):\n", + " assignment = (\n", + " partition.assignment.to_series().loc[node_order].astype(int).tolist()\n", + " )\n", + " f.write(json.dumps({\"assignment\": assignment, \"sample\": i}) + \"\\n\")\n", "\n", - "url_base = (\n", - " \"https://raw.githubusercontent.com/mggg/GerryChain/refs/heads/main/docs/_static\"\n", - ")\n", - "for file_name in [\n", - " \"gerrymandria.json\",\n", - "]:\n", - " out_path = f\"./example_data/{file_name}\"\n", - " if not Path(out_path).exists():\n", - " print(f\"Downloading {file_name}...\")\n", - " open_and_save(url_base, file_name)\n", - " else:\n", - " print(f\"{file_name} already exists, skipping download.\")" + "jsonl_size = os.path.getsize(\"example_data/small_example.jsonl\")\n", + "print(\n", + " f\"wrote example_data/small_example.jsonl: 200 plans on {SIDE * SIDE} nodes, {jsonl_size} bytes\"\n", + ")" ] }, { "cell_type": "markdown", - "id": "197dd92d", + "id": "990d1238", "metadata": {}, "source": [ "## Converting between file types\n", "\n", - "binary-ensemble comes equiped with some utility functions for users who wish to convert between different\n", - "file types." + "The `binary_ensemble.codec` helpers convert whole files in a single call." ] }, { "cell_type": "code", - "execution_count": 3, - "id": "9296ca41", - "metadata": {}, + "execution_count": 2, + "id": "1ec1d4ed", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.251325Z", + "iopub.status.busy": "2026-06-06T03:02:44.250847Z", + "iopub.status.idle": "2026-06-06T03:02:44.256236Z", + "shell.execute_reply": "2026-06-06T03:02:44.255813Z" + } + }, "outputs": [], "source": [ "from binary_ensemble.codec import (\n", @@ -162,48 +141,72 @@ }, { "cell_type": "markdown", - "id": "84f7c7f6", + "id": "0dad081f", "metadata": {}, "source": [ - "### BEN compression\n", + "### JSONL → BEN\n", "\n", - "The most basic (and quickest) type of compression available is the BEN compression format. You \n", - "may convert between a standard JSONL file to a BEN file using the following function:\n" + "The quickest format is BEN. `encode_jsonl_to_ben` reads the JSONL ensemble and writes a\n", + "compact `.ben` stream." ] }, { "cell_type": "code", - "execution_count": 4, - "id": "1e1e32b0", - "metadata": {}, - "outputs": [], + "execution_count": 3, + "id": "ef701764", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.257724Z", + "iopub.status.busy": "2026-06-06T03:02:44.257583Z", + "iopub.status.idle": "2026-06-06T03:02:44.261414Z", + "shell.execute_reply": "2026-06-06T03:02:44.261122Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEN bytes: 4303\n" + ] + } + ], "source": [ "encode_jsonl_to_ben(\n", " in_file=\"example_data/small_example.jsonl\",\n", - " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", - ")" + " out_file=\"example_data/small_example.ben\",\n", + " overwrite=True,\n", + ")\n", + "print(\"BEN bytes:\", os.path.getsize(\"example_data/small_example.ben\"))" ] }, { "cell_type": "markdown", - "id": "60f4ff71", + "id": "01b079e3", "metadata": {}, "source": [ - "As a small note, the above function (and all the conversion functions) has a default behavior of \n", - "not overwriting output. " + "By default the conversion functions refuse to overwrite an existing output file — pass\n", + "`overwrite=True` to replace it." ] }, { "cell_type": "code", - "execution_count": 5, - "id": "2f1ce280", - "metadata": {}, + "execution_count": 4, + "id": "cce1caac", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.267885Z", + "iopub.status.busy": "2026-06-06T03:02:44.266218Z", + "iopub.status.idle": "2026-06-06T03:02:44.279005Z", + "shell.execute_reply": "2026-06-06T03:02:44.276912Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Found Error: Output file example_data/small_example_jsonl_to_ben.jsonl.ben already exists (use overwrite=True to replace).\n" + "refused to overwrite: Output file example_data/small_example.ben already exists (use overwrite=True to replace).\n" ] } ], @@ -211,400 +214,372 @@ "try:\n", " encode_jsonl_to_ben(\n", " in_file=\"example_data/small_example.jsonl\",\n", - " out_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", + " out_file=\"example_data/small_example.ben\",\n", " )\n", "except OSError as e:\n", - " print(f\"Found Error: {e}\")" + " print(\"refused to overwrite:\", e)" ] }, { "cell_type": "markdown", - "id": "5d107d57", + "id": "53374915", "metadata": {}, "source": [ - "In addition, there is a `variant`\n", - "parameter with two options: \"standard\" and \"mkv_chain\". The \"mkv_chain\" variation is a special \n", - "version of BEN that is optimized for ensembles generated using an MCMC method with a non-zero \n", - "rejection probability (so the generated maps may repeat a few times to target an appropriate \n", - "probability distribution like in [Reversible ReCom](https://mggg.org/rrc)).\n", + "### Encoding variants\n", "\n", - "For ensembles without repetition, the output size of the \"mkv_chain\" variant is very slightly larger\n", - "than the \"standard\" variant, but for MCMC chains, the savings can be significant, so \"mkv_chain\"\n", - "is set as the default variant." + "A BEN stream is encoded with one of three **variants**, chosen with `variant=`:\n", + "\n", + "- `\"twodelta\"` (the **default**) delta-encodes pairwise ReCom moves — ideal for ReCom chains.\n", + "- `\"mkv_chain\"` collapses identical consecutive plans — for full MCMC chains with rejections.\n", + "- `\"standard\"` stores each plan independently — a simple baseline.\n", + "\n", + "Decoding **auto-detects** the variant, so you never specify it when reading. See\n", + "[Encoding variants](../concepts/variants.md)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7646489b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.283412Z", + "iopub.status.busy": "2026-06-06T03:02:44.282472Z", + "iopub.status.idle": "2026-06-06T03:02:44.311097Z", + "shell.execute_reply": "2026-06-06T03:02:44.310379Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " standard: 8216 bytes\n", + " mkv_chain: 8576 bytes\n", + " twodelta: 4303 bytes\n" + ] + } + ], + "source": [ + "for variant in [\"standard\", \"mkv_chain\", \"twodelta\"]:\n", + " encode_jsonl_to_ben(\n", + " \"example_data/small_example.jsonl\",\n", + " f\"example_data/small_example.{variant}.ben\",\n", + " overwrite=True,\n", + " variant=variant,\n", + " )\n", + " print(\n", + " f\"{variant:>10}: {os.path.getsize(f'example_data/small_example.{variant}.ben'):>6} bytes\"\n", + " )" ] }, { "cell_type": "markdown", - "id": "8333a252", + "id": "a125e072", "metadata": {}, "source": [ - "### XBEN Compression\n", + "### BEN → XBEN\n", "\n", - "XBEN (short for e\\[X\\]treme BEN) is a much more powerful version of our compression. In fact, with\n", - "some coercing of the data, it is not uncommon to get 1000x compression compared to base JSONL files.\n", - "However, all of these savings come at a cost: time and compute power. In general, while XBEN is \n", - "relatively quick to decompress, it can take up to a few hours to compress a large sample. So this\n", - "format is great for when the user wants to store data long-term, but is less good in an actively \n", - "changing project. " + "XBEN wraps a BEN stream in LZMA2 for much smaller files, at the cost of slower compression.\n", + "The XBEN encoders accept `n_threads` and `compression_level` (0 fastest … 9 smallest)." ] }, { "cell_type": "code", - "execution_count": 7, - "id": "81b1f724", - "metadata": {}, - "outputs": [], + "execution_count": 6, + "id": "97d5e070", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.312703Z", + "iopub.status.busy": "2026-06-06T03:02:44.312531Z", + "iopub.status.idle": "2026-06-06T03:02:44.327111Z", + "shell.execute_reply": "2026-06-06T03:02:44.326520Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " small_example.jsonl: 159892 bytes\n", + " small_example.ben: 4303 bytes\n", + " small_example.xben: 2076 bytes\n" + ] + } + ], "source": [ - "encode_jsonl_to_xben(\n", - " in_file=\"example_data/small_example.jsonl\",\n", - " out_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", + "encode_ben_to_xben(\n", + " in_file=\"example_data/small_example.ben\",\n", + " out_file=\"example_data/small_example.xben\",\n", " overwrite=True,\n", - " variant=\"mkv_chain\",\n", - " n_threads=1,\n", " compression_level=9,\n", ")\n", "\n", - "encode_ben_to_xben(\n", - " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", - " out_file=\"example_data/small_example_jsonl_to_ben_to_xben.jsonl.xben\",\n", + "# You can also go straight from JSONL to XBEN in one step.\n", + "encode_jsonl_to_xben(\n", + " in_file=\"example_data/small_example.jsonl\",\n", + " out_file=\"example_data/small_example.direct.xben\",\n", " overwrite=True,\n", - " n_threads=1,\n", - " compression_level=9,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cbfe2361", - "metadata": {}, - "source": [ - "There are now a few new parameters added to the XBEN compression functions: `n_threads` and \n", - "`compression_level`. \n", - "\n", - "- `n_threads`: In the interest of actually finishing the compression at a reasonable \n", - "pace, XBEN has been parallelized to allow the user to take advantage of modern CPUs with \n", - "higher thread counts. So increasing the number of threads in the parameter will decrease the \n", - "compression time. \n", - "\n", - "- `compression_level`: There are 10 possible compression levels 0 (fastest) - 9 (slowest) (these\n", - "follow the XZ compression levels). The higher the compression level, the better the compression \n", - "ratio and the higher the demands on the CPU when compressing the object. \n", + ")\n", "\n", - "By default, all XBEN compression functions will use all available threads on the machine and will\n", - "use the highest compression level (9). The XBEN format is only really needed for very large ensemble \n", - "analysis, and machines running such analysis tend to have the compute power to accommodate these\n", - "defaults." + "for name in [\"small_example.jsonl\", \"small_example.ben\", \"small_example.xben\"]:\n", + " print(f\"{name:>22}: {os.path.getsize('example_data/' + name):>7} bytes\")" ] }, { "cell_type": "markdown", - "id": "09bb043b", + "id": "3def43b6", "metadata": {}, "source": [ - "### Decompression\n", + "### Decoding\n", "\n", - "Insofar as file decompression goes, what you see is what you get. All of the functions have the \n", - "exact same signature, and should be pretty self-explanatory." + "The decoders mirror the encoders and all take `(in_file, out_file, overwrite=False)`." ] }, { "cell_type": "code", - "execution_count": 8, - "id": "a4e512b3", - "metadata": {}, - "outputs": [], + "execution_count": 7, + "id": "5a315a19", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.328951Z", + "iopub.status.busy": "2026-06-06T03:02:44.328764Z", + "iopub.status.idle": "2026-06-06T03:02:44.336173Z", + "shell.execute_reply": "2026-06-06T03:02:44.335649Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "decoded BEN -> JSONL, XBEN -> JSONL, and XBEN -> BEN\n" + ] + } + ], "source": [ "decode_ben_to_jsonl(\n", - " in_file=\"example_data/small_example_jsonl_to_ben.jsonl.ben\",\n", - " out_file=\"example_data/small_example_jsonl_to_ben_to_jsonl.jsonl\",\n", - " overwrite=True,\n", + " \"example_data/small_example.ben\", \"example_data/roundtrip.jsonl\", overwrite=True\n", ")\n", - "\n", "decode_xben_to_jsonl(\n", - " in_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", - " out_file=\"example_data/small_example_jsonl_to_xben_to_jsonl.jsonl\",\n", - " overwrite=True,\n", + " \"example_data/small_example.xben\", \"example_data/from_xben.jsonl\", overwrite=True\n", ")\n", - "\n", "decode_xben_to_ben(\n", - " in_file=\"example_data/small_example_jsonl_to_xben.jsonl.xben\",\n", - " out_file=\"example_data/small_example_jsonl_to_xben_to_ben.jsonl.ben\",\n", - " overwrite=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "157bc601", - "metadata": {}, - "source": [ - "## binary-ensemble and GerryChain\n", - "\n", - "As mentioned before, binary-ensemble was originally designed to work with ensembles generated by programs\n", - "like [GerryChain](https://gerrychain.readthedocs.io), and so we will give a small tutorial here.\n", - "\n", - "> **Note:** in the current version of GerryChain (0.3.2), there are some small peculiarities in\n", - "> the way that the `Assignment` class works that require some care." + " \"example_data/small_example.xben\", \"example_data/from_xben.ben\", overwrite=True\n", + ")\n", + "print(\"decoded BEN -> JSONL, XBEN -> JSONL, and XBEN -> BEN\")" ] }, { "cell_type": "markdown", - "id": "ed52aff8", + "id": "dcd1f6b4", "metadata": {}, "source": [ - "### Encoding\n", - "\n", - "Working with the binary-ensemble encoder should feel a lot like working with any Python object that handles\n", - "writing to files. In particular, we will use the context manager pattern to make sure that the\n", - "file is appropriately opened and closed as we write assignment vectors to it." + "Encoding is lossless. Decoding a BEN stream back to JSONL recovers the original plans exactly:" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "eb43be57", - "metadata": {}, - "outputs": [], + "execution_count": 8, + "id": "d37a01b9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.339313Z", + "iopub.status.busy": "2026-06-06T03:02:44.339066Z", + "iopub.status.idle": "2026-06-06T03:02:44.355306Z", + "shell.execute_reply": "2026-06-06T03:02:44.352869Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "round-trip identical: True\n" + ] + } + ], "source": [ - "from gerrychain import Partition, Graph, MarkovChain, updaters, accept\n", - "from gerrychain.proposals import recom\n", - "from gerrychain.constraints import contiguous\n", - "from functools import partial\n", - "\n", + "def load(path):\n", + " with open(path) as f:\n", + " return [json.loads(line)[\"assignment\"] for line in f]\n", "\n", - "graph = Graph.from_json(\"./example_data/gerrymandria.json\")\n", "\n", - "my_updaters = {\n", - " \"population\": updaters.Tally(\"TOTPOP\"),\n", - "}\n", - "\n", - "initial_partition = Partition(graph, assignment=\"district\", updaters=my_updaters)\n", - "\n", - "ideal_population = sum(initial_partition[\"population\"].values()) / len(\n", - " initial_partition\n", - ")\n", - "\n", - "proposal = partial(\n", - " recom, pop_col=\"TOTPOP\", pop_target=ideal_population, epsilon=0.01, node_repeats=2\n", - ")\n", - "\n", - "recom_chain = MarkovChain(\n", - " proposal=proposal,\n", - " constraints=[contiguous],\n", - " accept=accept.always_accept,\n", - " initial_state=initial_partition,\n", - " total_steps=10_000,\n", + "print(\n", + " \"round-trip identical:\",\n", + " load(\"example_data/small_example.jsonl\") == load(\"example_data/roundtrip.jsonl\"),\n", ")" ] }, { "cell_type": "markdown", - "id": "a5ef02f8", + "id": "454e82b0", "metadata": {}, "source": [ - "Okay, now it is time to write the output. The recommended format is a **`.bendl` bundle**: a single\n", - "self-describing file that stores the dual graph (and any metadata) alongside the assignment stream,\n", - "so a collaborator can open it without hunting down the matching graph JSON.\n", + "## Streaming with `BenEncoder` / `BenDecoder`\n", "\n", - "The most important thing we need to keep track of is the order of the `Assignment` returned by\n", - "GerryChain. In general GerryChain makes no guarantees about the ordering of the nodes in the output\n", - "assignment, and to write the stream we MUST make sure that the ordering of the values in the\n", - "assignment vector lines up with the order of the nodes in the graph." + "When you'd rather write plans one at a time as they're produced (instead of from a JSONL\n", + "file), use `BenEncoder`. It's a context manager: write each assignment, and the stream is\n", + "flushed on exit. `BenDecoder` reads a stream back, one assignment at a time." ] }, { "cell_type": "code", - "execution_count": 10, - "id": "dec15cda", - "metadata": {}, + "execution_count": 9, + "id": "e693be01", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.357968Z", + "iopub.status.busy": "2026-06-06T03:02:44.357728Z", + "iopub.status.idle": "2026-06-06T03:02:44.365655Z", + "shell.execute_reply": "2026-06-06T03:02:44.364033Z" + } + }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ffff4f4cfe644c3ea93516c8532626cd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/10000 [00:00 [[0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 3, 3], [2, 2, 2, 2, 2, 2]]\n", + "range(50, 53) -> [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]]\n", + "every 50th -> 4 plans\n", + "indices again -> [[0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 3, 3], [2, 2, 2, 2, 2, 2]]\n" + ] + } + ], "source": [ - "### Subsampling\n", + "ben_file = \"example_data/small_example.ben\"\n", + "decoder = BenDecoder(ben_file) # one decoder, reused for every subsample below\n", "\n", - "Often times, when working with ensembles of plans, it is desirable to subsample from the ensemble\n", - "for the sake of winnowing, and the `BenDecoder` has native support for this.\n", - "\n", - "\n", - "We'll work with the \"100k_CO_chain.json.xben\" file which contains 100k districting plans on \n", - "Colorado Census blocks (there are ~140k census blocks in Colorado)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff8e8e4a", - "metadata": {}, - "outputs": [], - "source": [ - "from binary_ensemble.stream import BenDecoder\n", + "print(\n", + " \"indices [1, 100, 200] ->\",\n", + " [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])],\n", + ")\n", + "print(\n", + " \"range(50, 53) ->\",\n", + " [assignment[:6] for assignment in decoder.subsample_range(50, 53)],\n", + ")\n", + "print(\"every 50th ->\", sum(1 for _ in decoder.subsample_every(50)), \"plans\")\n", "\n", - "# Warning, this BEN file will be ~2Gb\n", - "decode_xben_to_ben(\n", - " in_file=\"example_data/100k_CO_chain.jsonl.xben\",\n", - " out_file=\"example_data/100k_CO_chain.jsonl.ben\",\n", - " overwrite=True,\n", + "# The same decoder rewinds and re-selects on each call, so you can run subsamples\n", + "# repeatedly without building a new decoder:\n", + "print(\n", + " \"indices again ->\",\n", + " [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])],\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "24761ca6", - "metadata": {}, - "outputs": [], - "source": [ - "for assignment in BenDecoder(\n", - " \"example_data/100k_CO_chain.jsonl.ben\"\n", - ").subsample_indices([1, 23978, 100000]):\n", - " print(assignment[:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a815edf", - "metadata": {}, - "outputs": [], - "source": [ - "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_range(\n", - " 1000, 1005\n", - "):\n", - " print(assignment[:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3be48458", - "metadata": {}, - "outputs": [], - "source": [ - "for assignment in BenDecoder(\"example_data/100k_CO_chain.jsonl.ben\").subsample_every(\n", - " 10000\n", - "):\n", - " print(assignment[:10])" - ] - }, { "cell_type": "markdown", - "id": "6166b148", + "id": "10268f1c", "metadata": {}, "source": [ - "Of course, you can also do subsampling from XBEN, but the extra compression induces a startup\n", - "cost for accessing anything in the file." + "The same methods work on an XBEN stream — pass `mode=\"xben\"`. Reading XBEN pays a one-time\n", + "decompression startup cost, so if you'll subsample repeatedly, extract to BEN first with\n", + "`decode_xben_to_ben`." ] }, { "cell_type": "code", - "execution_count": null, - "id": "51d9f27a", - "metadata": {}, - "outputs": [], - "source": [ - "for assignment in BenDecoder(\n", - " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", - ").subsample_indices([1, 23978, 100000]):\n", - " print(assignment[:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a51d0019", - "metadata": {}, - "outputs": [], + "execution_count": 11, + "id": "59c78e95", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:02:44.373339Z", + "iopub.status.busy": "2026-06-06T03:02:44.373192Z", + "iopub.status.idle": "2026-06-06T03:02:44.379619Z", + "shell.execute_reply": "2026-06-06T03:02:44.379163Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 0, 0, 0, 1, 1]\n", + "[0, 0, 0, 0, 2, 2]\n", + "[0, 0, 0, 0, 2, 2]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/claude-1000/ipykernel_3711264/1859154157.py:1: UserWarning: XBEN may take a second to start decoding.\n", + " for assignment in BenDecoder(\"example_data/small_example.xben\", mode=\"xben\").subsample_range(1, 3):\n" + ] + } + ], "source": [ "for assignment in BenDecoder(\n", - " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", - ").subsample_range(1000, 1005):\n", - " print(assignment[:10])" + " \"example_data/small_example.xben\", mode=\"xben\"\n", + ").subsample_range(1, 3):\n", + " print(assignment[:6])" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "eeb1c112", + "cell_type": "markdown", + "id": "97f1ad2a", "metadata": {}, - "outputs": [], "source": [ - "for assignment in BenDecoder(\n", - " \"example_data/100k_CO_chain.jsonl.xben\", mode=\"xben\"\n", - ").subsample_every(10000):\n", - " print(assignment[:10])" + "## Where to next\n", + "\n", + "- [Working with `.bendl` bundles](using_bendl.ipynb) — the recommended self-describing\n", + " container, plus graph reordering for much better compression.\n", + "- [Concepts](../concepts/overview.md) — the formats, variants, and how the compression works.\n", + "- [API reference](../api/index.md) — every public class and function." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "179c6f2d", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -623,4 +598,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/ben-py/docs/user/using_bendl.ipynb b/ben-py/docs/user/using_bendl.ipynb new file mode 100644 index 0000000..4a75772 --- /dev/null +++ b/ben-py/docs/user/using_bendl.ipynb @@ -0,0 +1,1276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dc9c2020", + "metadata": {}, + "source": [ + "# Working with `.bendl` bundles\n", + "\n", + "This tutorial is a companion to `using_ben_py.ipynb`. That notebook covers the\n", + "plain BEN/XBEN *streams* (`binary_ensemble.stream` + `binary_ensemble.codec`);\n", + "this one covers the **`.bendl` bundle** — the recommended, self-describing\n", + "container format — and walks the full `binary_ensemble.bundle` /\n", + "`binary_ensemble.graph` API, driving it with a live GerryChain ReCom run.\n", + "\n", + "It is written in the `# %%` \"percent\" cell format, so you can step through it\n", + "cell-by-cell in VS Code / Jupyter (via Jupytext) or just run it top-to-bottom\n", + "as a plain script: `python using_bendl.py`.\n", + "\n", + "## What is a bundle, and why use one?\n", + "\n", + "A plain `.ben` file is *just* the assignment stream: a sequence of districting\n", + "plans, with no record of the graph they were drawn on. To use it, a\n", + "collaborator has to separately track down the matching dual-graph JSON **and**\n", + "know the exact node ordering the assignments were written in. Lose either and\n", + "the file is undecodable.\n", + "\n", + "A `.bendl` bundle fixes this by wrapping the stream together with *assets* in a\n", + "single file:\n", + "\n", + "- the **dual graph** (`graph.json`), so the file is self-describing;\n", + "- an optional **`node_permutation_map.json`**, recording any reordering applied\n", + " to the graph for better compression;\n", + "- **`metadata.json`**, for run provenance (seed, parameters, generator, …);\n", + "- arbitrary **custom assets** (notes, analysis results, plots-as-bytes, …).\n", + "\n", + "Intended use cases:\n", + "\n", + "1. **Shareable, reproducible ensembles** — hand someone one file; they can\n", + " recover the graph and replay the plans with no side files.\n", + "2. **Provenance** — stamp the seed / chain parameters into the bundle.\n", + "3. **Better compression** — reorder the graph (RCM / multi-level clustering)\n", + " before writing so the BEN/XBEN delta-encoding shrinks; the permutation map\n", + " keeps the reordering reversible.\n", + "4. **A lifecycle** — work in BEN (fast) while a project is active, then\n", + " recompress the bundle to XBEN for long-term archival, assets preserved.\n", + "5. **Extensibility** — append analysis results to a finished bundle later,\n", + " without rewriting the stream." + ] + }, + { + "cell_type": "markdown", + "id": "ad13ca48", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We need a dual graph to draw plans on. Rather than download a multi-megabyte\n", + "real-world graph, we *generate* a `SIDE × SIDE` grid (here 32×32 = 1024 nodes) —\n", + "big enough to feel like a real ensemble, small enough to run in seconds, and\n", + "fully reproducible. Each node gets unit population (`TOTPOP = 1`) and an initial\n", + "`district` label of vertical stripes, which gives ReCom a contiguous, balanced\n", + "starting partition.\n", + "\n", + "Then we deliberately **shuffle the node order**. Real-world dual graphs rarely\n", + "arrive in a compression-friendly order (think census blocks listed by GEOID, or\n", + "nodes in arbitrary shapefile order), so the stored order has no relationship to\n", + "graph locality. Shuffling reproduces that — and it's exactly the situation where\n", + "reordering before encoding pays off, which we'll see below. We write the graph\n", + "out as NetworkX adjacency JSON under `example_data/`, the shape a bundle stores." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "351e870b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:05:40.368921Z", + "iopub.status.busy": "2026-06-06T03:05:40.368852Z", + "iopub.status.idle": "2026-06-06T03:05:40.416711Z", + "shell.execute_reply": "2026-06-06T03:05:40.416323Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "graph file: example_data/grid.json (95290 bytes, 1024 nodes)\n" + ] + } + ], + "source": [ + "import json\n", + "import random\n", + "\n", + "import networkx as nx\n", + "from pathlib import Path\n", + "\n", + "Path(\"example_data\").mkdir(exist_ok=True)\n", + "\n", + "SIDE, N_DISTRICTS = 32, 4 # 1024 nodes; SIDE must be divisible by N_DISTRICTS\n", + "GRAPH_PATH = Path(\"example_data/grid.json\")\n", + "\n", + "\n", + "def build_grid_graph(side, n_districts, shuffle_seed=0):\n", + " \"\"\"A side*side grid with unit population, stripe districts, and a shuffled order.\"\"\"\n", + " g = nx.grid_2d_graph(side, side)\n", + " g = nx.convert_node_labels_to_integers(g, ordering=\"sorted\") # row-major ints\n", + " cols_per_district = side // n_districts\n", + " for node in g.nodes:\n", + " _row, col = divmod(node, side)\n", + " g.nodes[node][\"TOTPOP\"] = 1\n", + " g.nodes[node][\"district\"] = col // cols_per_district\n", + " # Rebuild with nodes inserted in a random order, so the *stored* order has no\n", + " # spatial locality (attributes and edges are preserved untouched).\n", + " shuffled = list(g.nodes)\n", + " random.Random(shuffle_seed).shuffle(shuffled)\n", + " h = nx.Graph()\n", + " h.add_nodes_from((node, g.nodes[node]) for node in shuffled)\n", + " h.add_edges_from(g.edges)\n", + " return h\n", + "\n", + "\n", + "grid = build_grid_graph(SIDE, N_DISTRICTS)\n", + "GRAPH_PATH.write_text(json.dumps(nx.readwrite.json_graph.adjacency_data(grid)))\n", + "print(\n", + " f\"graph file: {GRAPH_PATH} ({GRAPH_PATH.stat().st_size} bytes, {SIDE * SIDE} nodes)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2c8934ab", + "metadata": {}, + "source": [ + "### The public surface\n", + "\n", + "Everything bundle-related is re-exported from the top-level package, but it\n", + "lives in two submodules:\n", + "\n", + "- `binary_ensemble.bundle` — `BendlEncoder`, `BendlDecoder`, `compress_stream`\n", + "- `binary_ensemble.graph` — `reorder`, `reorder_multi_level_cluster`,\n", + " `reorder_reverse_cuthill_mckee`, `reorder_by_key`\n", + "\n", + "(The plain-stream `BenEncoder` / `BenDecoder` and the whole-file `encode_*` /\n", + "`decode_*` codec helpers are the subject of the BEN tutorial.)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e0fa37a7", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:05:40.417603Z", + "iopub.status.busy": "2026-06-06T03:05:40.417449Z", + "iopub.status.idle": "2026-06-06T03:05:40.420025Z", + "shell.execute_reply": "2026-06-06T03:05:40.419752Z" + } + }, + "outputs": [], + "source": [ + "from binary_ensemble import BendlDecoder, BendlEncoder, compress_stream\n", + "from binary_ensemble import graph as bgraph" + ] + }, + { + "cell_type": "markdown", + "id": "42d0386c", + "metadata": {}, + "source": [ + "## The GerryChain ingredients\n", + "\n", + "We drive everything with a short ReCom chain. The chain's *recipe* (proposal,\n", + "constraints, updaters) is independent of how nodes are ordered, so we factor it\n", + "into a helper that builds a fresh chain on whatever graph we hand it. We'll call\n", + "this once per bundle and **stream each plan to disk as the chain produces it** —\n", + "no need to hold the whole ensemble in memory." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b34e60bd", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:05:40.420533Z", + "iopub.status.busy": "2026-06-06T03:05:40.420472Z", + "iopub.status.idle": "2026-06-06T03:05:40.684962Z", + "shell.execute_reply": "2026-06-06T03:05:40.684568Z" + } + }, + "outputs": [], + "source": [ + "from functools import partial\n", + "\n", + "from gerrychain import Graph, MarkovChain, Partition, accept, constraints, updaters\n", + "from gerrychain.proposals import recom\n", + "\n", + "\n", + "def make_chain(gc_graph, steps):\n", + " \"\"\"Build a fresh ReCom MarkovChain over ``gc_graph`` (a gerrychain.Graph).\"\"\"\n", + " chain_updaters = {\n", + " \"population\": updaters.Tally(\"TOTPOP\", alias=\"population\"),\n", + " \"cut_edges\": updaters.cut_edges,\n", + " }\n", + " initial = Partition(gc_graph, assignment=\"district\", updaters=chain_updaters)\n", + " ideal_pop = sum(initial[\"population\"].values()) / len(initial)\n", + " return MarkovChain(\n", + " proposal=partial(\n", + " recom, pop_col=\"TOTPOP\", pop_target=ideal_pop, epsilon=0.05, node_repeats=2\n", + " ),\n", + " constraints=[constraints.contiguous],\n", + " accept=accept.always_accept,\n", + " initial_state=initial,\n", + " total_steps=steps,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "66229db9", + "metadata": {}, + "source": [ + "## Writing your first bundle — encoding as the chain runs\n", + "\n", + "You do **not** need to use `BendlEncoder` itself as a context manager. Only the\n", + "`stream(...)` writer needs a `with` block: closing the stream context is what\n", + "finalizes the bundle. So the pattern is:\n", + "\n", + "1. create the encoder and add the graph (and any other assets),\n", + "2. open the single-use `stream(...)` in a `with` block,\n", + "3. iterate the chain and `write` each plan inside it,\n", + "4. when the `with enc.stream(...)` block exits, the bundle is finalized on disk.\n", + "\n", + "The one rule when writing: every assignment must be in a **fixed, known node\n", + "order**. GerryChain makes no ordering promise, so we pin the order to the graph's\n", + "node iteration order and reindex each plan to it.\n", + "\n", + "A convenient trick: `add_graph` *returns* the embedded graph (as a NetworkX\n", + "graph), so we can build the GerryChain graph straight from it and guarantee the\n", + "write order matches what gets stored. For this first bundle we pass\n", + "`sort=None` to store the graph in its raw (shuffled) order — a\n", + "deliberately un-optimized baseline we'll improve on next." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "68ad7766", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:05:40.686373Z", + "iopub.status.busy": "2026-06-06T03:05:40.686121Z", + "iopub.status.idle": "2026-06-06T03:06:04.364287Z", + "shell.execute_reply": "2026-06-06T03:06:04.363727Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wrote example_data/basic.bendl\n" + ] + } + ], + "source": [ + "encoder = BendlEncoder(\"example_data/basic.bendl\", overwrite=True) # no `with` needed\n", + "stored_graph = encoder.add_graph(GRAPH_PATH, sort=None)\n", + "gc_graph = Graph.from_networkx(stored_graph)\n", + "write_order = list(gc_graph.nodes) # the order stored == the order we write\n", + "\n", + "with encoder.stream(\"ben\") as stream: # only the stream is context-managed\n", + " for partition in make_chain(gc_graph, steps=1000):\n", + " series = partition.assignment.to_series()\n", + " stream.write(series.loc[write_order].astype(int).tolist())\n", + "# the bundle is finalized now that the stream context has closed\n", + "\n", + "print(\"wrote example_data/basic.bendl\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac4a2306", + "metadata": {}, + "source": [ + "A note on validation: because we embedded a graph *before* the stream, the\n", + "encoder knows the node count and checks every `write` against it. A\n", + "wrong-length assignment raises immediately instead of silently corrupting the\n", + "file (and because the exception escapes the stream context, the bundle is left\n", + "unfinalized rather than stamped complete — more on that at the end):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4723655b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:04.365628Z", + "iopub.status.busy": "2026-06-06T03:06:04.365482Z", + "iopub.status.idle": "2026-06-06T03:06:04.399476Z", + "shell.execute_reply": "2026-06-06T03:06:04.398745Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rejected as expected: assignment length 3 does not match graph node count 1024\n" + ] + } + ], + "source": [ + "encoder = BendlEncoder(\"example_data/willfail.bendl\", overwrite=True)\n", + "encoder.add_graph(GRAPH_PATH, sort=None)\n", + "try:\n", + " with encoder.stream(\"ben\") as stream:\n", + " stream.write([0, 1, 2]) # too short\n", + "except ValueError as e:\n", + " print(\"rejected as expected:\", e)" + ] + }, + { + "cell_type": "markdown", + "id": "e2e7c5b7", + "metadata": {}, + "source": [ + "## Reordering for compression (the default)\n", + "\n", + "BEN/XBEN compress *runs of equal adjacent labels* well, so a node ordering that\n", + "keeps neighbouring nodes near each other in the stream compresses much better.\n", + "Because our grid's stored order is shuffled, the raw `basic.bendl` above is a\n", + "worst case. Fixing it is the encoder's default behaviour: `add_graph` reorders\n", + "the graph with **multi-level clustering (`sort=\"mlc\"`)** unless you opt out with\n", + "`sort=None`. Reordering:\n", + "\n", + "- reorders the graph — `sort=\"mlc\"` (default), `sort=\"rcm\"`, or `sort=\"key\"`\n", + " with `key=\"\"` (e.g. `key=\"GEOID\"`) to sort by a node attribute,\n", + "- stores both the reordered `graph.json` **and** a `node_permutation_map.json`,\n", + "- and **returns the reordered graph**.\n", + "\n", + "Returning the reordered graph is what makes this ergonomic: we build the *entire\n", + "ReCom chain on that ordering*, so the chain's natural node order already equals\n", + "the stored order — streaming needs no extra bookkeeping. **Reordering is\n", + "pre-stream only** (it decides the write order), so `add_graph(...)` must come\n", + "before `stream()`.\n", + "\n", + "We'll make this the \"real\" bundle for the rest of the tutorial, so we also stamp\n", + "in metadata and a couple of custom assets while we're here." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af9eb82e", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:04.400699Z", + "iopub.status.busy": "2026-06-06T03:06:04.400539Z", + "iopub.status.idle": "2026-06-06T03:06:24.003515Z", + "shell.execute_reply": "2026-06-06T03:06:24.001220Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wrote example_data/rich.bendl\n" + ] + } + ], + "source": [ + "encoder = BendlEncoder(\"example_data/rich.bendl\", overwrite=True)\n", + "\n", + "# add_graph reorders with MLC by default; build the chain on the returned graph.\n", + "reordered_graph = encoder.add_graph(GRAPH_PATH)\n", + "gc_graph = Graph.from_networkx(reordered_graph)\n", + "write_order = list(gc_graph.nodes)\n", + "\n", + "# Provenance + extra assets (covered in detail in the next section).\n", + "encoder.add_metadata(\n", + " {\"generator\": \"gerrychain\", \"proposal\": \"recom\", \"epsilon\": 0.05, \"seed\": 1234}\n", + ")\n", + "encoder.add_asset(\n", + " \"readme.txt\", \"ReCom ensemble on a 32x32 grid, MLC-reordered.\", \"text\"\n", + ")\n", + "\n", + "with encoder.stream(\"ben\") as stream:\n", + " for partition in make_chain(gc_graph, steps=1000):\n", + " series = partition.assignment.to_series()\n", + " stream.write(series.loc[write_order].astype(int).tolist())\n", + "\n", + "print(\"wrote example_data/rich.bendl\")" + ] + }, + { + "cell_type": "markdown", + "id": "e514afa6", + "metadata": {}, + "source": [ + "### Did reordering actually help?\n", + "\n", + "Tempting as it is to compare `basic.bendl` against `rich.bendl`, that isn't a\n", + "fair fight: they hold **different ensembles** — each was streamed live from its\n", + "own independent ReCom run — so their stream sizes mix the ordering effect with\n", + "run-to-run randomness. Let's look anyway, then do it properly. We compare the\n", + "*embedded BEN stream* sizes (the assignment data only, excluding assets and\n", + "header) by extracting each stream and measuring it:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d3ae7f81", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.012031Z", + "iopub.status.busy": "2026-06-06T03:06:24.011612Z", + "iopub.status.idle": "2026-06-06T03:06:24.030110Z", + "shell.execute_reply": "2026-06-06T03:06:24.024395Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "basic.bendl (raw, run A): 135840 bytes\n", + "rich.bendl (mlc, run B): 40081 bytes\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "\n", + "def stream_size(path):\n", + " \"\"\"Size in bytes of a bundle's embedded BEN stream (extracted).\"\"\"\n", + " decoder = BendlDecoder(path)\n", + " tmp = \"example_data/_measure.ben\"\n", + " decoder.extract_stream(tmp, overwrite=True)\n", + " size = os.path.getsize(tmp)\n", + " os.remove(tmp)\n", + " return size\n", + "\n", + "\n", + "print(f\"basic.bendl (raw, run A): {stream_size('example_data/basic.bendl'):>8} bytes\")\n", + "print(f\"rich.bendl (mlc, run B): {stream_size('example_data/rich.bendl'):>8} bytes\")" + ] + }, + { + "cell_type": "markdown", + "id": "b0d561f5", + "metadata": {}, + "source": [ + "For a true **apples-to-apples** measurement we need the *same* plans in two\n", + "orderings. We can get that without running a second chain by **relabeling**\n", + "`basic.bendl`'s exact ensemble into MLC order. `relabel_bundle` does exactly\n", + "this in one call: it reorders the stored graph, rewrites every assignment into\n", + "the new node order, and stores a `node_permutation_map.json` so the change stays\n", + "reversible (it preserves metadata and custom assets too). It's the bundle-level\n", + "form of the CLI's `reben` ordering step:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a0f34268", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.037694Z", + "iopub.status.busy": "2026-06-06T03:06:24.037235Z", + "iopub.status.idle": "2026-06-06T03:06:24.087832Z", + "shell.execute_reply": "2026-06-06T03:06:24.086921Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "same ensemble, raw order: 135840 bytes\n", + "same ensemble, MLC order: 39908 bytes\n", + "-> 3.4x smaller from reordering alone\n" + ] + } + ], + "source": [ + "from binary_ensemble import relabel_bundle\n", + "\n", + "# out_file won't overwrite an existing file, so clear any copy from a previous run.\n", + "Path(\"example_data/relabeled.bendl\").unlink(missing_ok=True)\n", + "relabel_bundle(\n", + " \"example_data/basic.bendl\", out_file=\"example_data/relabeled.bendl\", sort=\"mlc\"\n", + ")\n", + "\n", + "raw_bytes = stream_size(\"example_data/basic.bendl\")\n", + "mlc_bytes = stream_size(\"example_data/relabeled.bendl\")\n", + "print(f\"same ensemble, raw order: {raw_bytes:>8} bytes\")\n", + "print(f\"same ensemble, MLC order: {mlc_bytes:>8} bytes\")\n", + "print(f\"-> {raw_bytes / mlc_bytes:.1f}x smaller from reordering alone\")" + ] + }, + { + "cell_type": "markdown", + "id": "f80e0806", + "metadata": {}, + "source": [ + "Now the *only* thing that changed is the node ordering, so that ratio is the\n", + "real compression win from MLC — and it's why MLC is the **default** in\n", + "`add_graph`. (On a graph that already arrives in a locality-friendly order the\n", + "gain is smaller, and the extra `node_permutation_map.json` can even make a tiny\n", + "file net-larger, but reordering is cheap and rarely hurts — so the encoder does\n", + "it for you unless you ask for raw with `sort=None`.) It matters most\n", + "right before an expensive XBEN recompress, where every byte of BEN is amplified." + ] + }, + { + "cell_type": "markdown", + "id": "43f2253d", + "metadata": {}, + "source": [ + "### Reordering under the hood: the standalone utilities\n", + "\n", + "`add_graph(..., sort=..., key=...)` is built on the `binary_ensemble.graph`\n", + "utilities, which you can also call directly — handy when you want to compute an\n", + "ordering once and reuse it, or inspect the permutation map before committing.\n", + "Each returns `(reordered_graph, node_permutation_map)`: a live NetworkX graph\n", + "plus the map dict." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4ccffbe8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.089778Z", + "iopub.status.busy": "2026-06-06T03:06:24.089558Z", + "iopub.status.idle": "2026-06-06T03:06:24.210989Z", + "shell.execute_reply": "2026-06-06T03:06:24.207417Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reorder(sort='rcm') -> Graph with 1024 nodes\n", + "orderings: sort='mlc', sort='rcm', or sort='key' with key=''\n", + "old_to_new is a bijection over [0, n): True\n", + "provenance fields: {'ordering_method': 'reverse-cuthill-mckee', 'key': None}\n" + ] + } + ], + "source": [ + "reordered, permutation_map = bgraph.reorder(GRAPH_PATH, sort=\"rcm\")\n", + "print(\n", + " \"reorder(sort='rcm') ->\",\n", + " type(reordered).__name__,\n", + " \"with\",\n", + " reordered.number_of_nodes(),\n", + " \"nodes\",\n", + ")\n", + "\n", + "# Sort by a node attribute with sort=\"key\" + key=... (on real data this is how\n", + "# you'd order by, say, \"GEOID\"; here the grid only has \"district\"/\"id\"):\n", + "graph_mlc, _ = bgraph.reorder(GRAPH_PATH, sort=\"mlc\")\n", + "graph_rcm, _ = bgraph.reorder(GRAPH_PATH, sort=\"rcm\")\n", + "graph_by_district, _ = bgraph.reorder(GRAPH_PATH, sort=\"key\", key=\"district\")\n", + "# reorder_multi_level_cluster / reorder_reverse_cuthill_mckee / reorder_by_key are\n", + "# thin convenience wrappers over these.\n", + "print(\"orderings: sort='mlc', sort='rcm', or sort='key' with key=''\")\n", + "\n", + "# The permutation map is what makes a reordering reversible: its required field\n", + "# `node_permutation_old_to_new` maps original 0-based node positions -> new ones.\n", + "old_to_new = permutation_map[\"node_permutation_old_to_new\"]\n", + "print(\n", + " \"old_to_new is a bijection over [0, n):\",\n", + " sorted(old_to_new.values()) == list(range(reordered.number_of_nodes())),\n", + ")\n", + "print(\"provenance fields:\", {k: permutation_map[k] for k in (\"ordering_method\", \"key\")})" + ] + }, + { + "cell_type": "markdown", + "id": "4194a236", + "metadata": {}, + "source": [ + "## Metadata and custom assets\n", + "\n", + "We already used these while building `rich.bendl`. `add_metadata` writes the\n", + "canonical `metadata.json` (provenance). `add_asset` writes a *custom* asset\n", + "under a name you choose, with a `content_type` of `\"json\"` or `\"text\"`:\n", + "\n", + "- `\"json\"` — payload must be valid UTF-8 JSON; the decoder will auto-parse it.\n", + "- `\"text\"` — payload must be valid UTF-8; stored without the JSON flag.\n", + "\n", + "The facade validates the payload, so a malformed `\"json\"` asset is caught at\n", + "write time. Assets may be added before *or* after the stream — only the stream\n", + "itself is single-use. Post-stream adds commit immediately (one directory\n", + "rewrite each), so use them sparingly. Here we tack a JSON asset onto an\n", + "already-finalized bundle to show both behaviours:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f4db28a9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.218758Z", + "iopub.status.busy": "2026-06-06T03:06:24.217041Z", + "iopub.status.idle": "2026-06-06T03:06:24.230725Z", + "shell.execute_reply": "2026-06-06T03:06:24.229938Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rejected as expected: content_type='json' requires valid UTF-8 JSON: Expecting value: line 1 column 1 (char 0)\n" + ] + } + ], + "source": [ + "# Add to rich.bendl after the fact (this finalized bundle is reopened to append).\n", + "# In append mode each add_* commits immediately, so there is nothing to finalize.\n", + "appender = BendlEncoder.append(\"example_data/rich.bendl\")\n", + "appender.add_asset(\"params.json\", json.dumps({\"node_repeats\": 2}), \"json\")\n", + "\n", + "# Validation in action — a \"json\" asset that isn't JSON is rejected up front:\n", + "encoder = BendlEncoder(\"example_data/tmp.bendl\", overwrite=True)\n", + "try:\n", + " encoder.add_asset(\"bad.json\", \"this is not json\", \"json\")\n", + "except ValueError as e:\n", + " print(\"rejected as expected:\", e)" + ] + }, + { + "cell_type": "markdown", + "id": "f063be5c", + "metadata": {}, + "source": [ + "## Reading a bundle\n", + "\n", + "`BendlDecoder(path)` opens a bundle. The **canonical getters** pull the\n", + "well-known assets back in convenient form:\n", + "\n", + "- `read_graph()` → a live **NetworkX graph** (or `None` if absent),\n", + "- `read_metadata()` → parsed `metadata.json` (or `None`),\n", + "- `read_node_permutation_map()` → parsed map dict (or `None`).\n", + "\n", + "Crucially, `read_graph()` returns the graph in the node order the assignments\n", + "were written in — which, because we built the chain on the reordered graph, is\n", + "exactly the reordered order. It lines up with the stream with no extra work." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "56fdac96", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.233188Z", + "iopub.status.busy": "2026-06-06T03:06:24.232969Z", + "iopub.status.idle": "2026-06-06T03:06:24.272046Z", + "shell.execute_reply": "2026-06-06T03:06:24.270881Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "read_graph() -> Graph with 1024 nodes\n", + "read_metadata() -> {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234}\n", + "read_node_permutation_map() has old_to_new: True\n" + ] + } + ], + "source": [ + "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "\n", + "packaged_graph = decoder.read_graph()\n", + "print(\n", + " \"read_graph() ->\",\n", + " type(packaged_graph).__name__,\n", + " \"with\",\n", + " packaged_graph.number_of_nodes(),\n", + " \"nodes\",\n", + ")\n", + "print(\"read_metadata() ->\", decoder.read_metadata())\n", + "print(\n", + " \"read_node_permutation_map() has old_to_new:\",\n", + " \"node_permutation_old_to_new\" in decoder.read_node_permutation_map(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "45770f97", + "metadata": {}, + "source": [ + "**Generic accessors** reach any asset by name:\n", + "\n", + "- `read_asset_bytes(name)` → raw `bytes`,\n", + "- `read_json_asset(name)` → parsed JSON.\n", + "\n", + "Note `read_json_asset(\"graph.json\")` gives you the *raw* adjacency dict, in case\n", + "you want the JSON rather than the rebuilt NetworkX object." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e76546f3", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.278023Z", + "iopub.status.busy": "2026-06-06T03:06:24.277591Z", + "iopub.status.idle": "2026-06-06T03:06:24.294493Z", + "shell.execute_reply": "2026-06-06T03:06:24.292037Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "readme.txt -> b'ReCom ensemble on a 32x32 grid, MLC-reordered.'\n", + "params.json -> {'node_repeats': 2}\n", + "graph.json (raw dict) top-level keys: ['directed', 'multigraph', 'graph', 'nodes', 'adjacency']\n" + ] + } + ], + "source": [ + "print(\"readme.txt ->\", decoder.read_asset_bytes(\"readme.txt\"))\n", + "print(\"params.json ->\", decoder.read_json_asset(\"params.json\"))\n", + "print(\n", + " \"graph.json (raw dict) top-level keys:\",\n", + " list(decoder.read_json_asset(\"graph.json\").keys()),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "46436643", + "metadata": {}, + "source": [ + "## Inspecting a bundle\n", + "\n", + "Before (or instead of) reading payloads, you can inspect structure — handy for\n", + "tooling, debugging, or deciding whether a file is what you expect:\n", + "\n", + "- `version` → `(major, minor)` format version,\n", + "- `is_complete()` → was it finalized cleanly,\n", + "- `assignment_format()` → `\"ben\"` or `\"xben\"`,\n", + "- `asset_names()` → directory names in order,\n", + "- `list_assets()` → full directory: name, type, offset, len, flag tags,\n", + "- `len(dec)` / `count_samples()` → number of plans in the stream." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "59091af4", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.299715Z", + "iopub.status.busy": "2026-06-06T03:06:24.298245Z", + "iopub.status.idle": "2026-06-06T03:06:24.319565Z", + "shell.execute_reply": "2026-06-06T03:06:24.317012Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "version: (1, 0)\n", + "is_complete: True\n", + "assignment_format: ben\n", + "sample count: 1000\n", + "asset_names: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json']\n", + "full directory:\n", + " {'name': 'graph.json', 'type': 2, 'offset': 64, 'len': 6788, 'flags': ['json', 'xz', 'checksum']}\n", + " {'name': 'node_permutation_map.json', 'type': 3, 'offset': 6852, 'len': 10152, 'flags': ['json', 'checksum']}\n", + " {'name': 'metadata.json', 'type': 1, 'offset': 17004, 'len': 79, 'flags': ['json', 'checksum']}\n", + " {'name': 'readme.txt', 'type': 4, 'offset': 17083, 'len': 46, 'flags': ['checksum']}\n", + " {'name': 'params.json', 'type': 4, 'offset': 57400, 'len': 19, 'flags': ['json', 'checksum']}\n" + ] + } + ], + "source": [ + "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "print(\"version: \", decoder.version())\n", + "print(\"is_complete: \", decoder.is_complete())\n", + "print(\"assignment_format:\", decoder.assignment_format())\n", + "print(\"sample count: \", len(decoder))\n", + "print(\"asset_names: \", decoder.asset_names())\n", + "print(\"full directory:\")\n", + "for entry in decoder.list_assets():\n", + " print(\" \", entry)" + ] + }, + { + "cell_type": "markdown", + "id": "6edb1f13", + "metadata": {}, + "source": [ + "## Iterating the stream and reconstructing plans\n", + "\n", + "A `BendlDecoder` iterates its embedded stream, yielding each assignment as a\n", + "`list[int]`. Combined with `read_graph()`, you can rebuild GerryChain\n", + "`Partition`s straight from the bundle — no separate graph file, no remembered\n", + "node order:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4686c660", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:24.325095Z", + "iopub.status.busy": "2026-06-06T03:06:24.324485Z", + "iopub.status.idle": "2026-06-06T03:06:33.608232Z", + "shell.execute_reply": "2026-06-06T03:06:33.607233Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reconstructed 1000 partitions from the bundle alone\n", + "first five cut-edge counts: [96, 87, 96, 100, 122]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "packaged_graph = decoder.read_graph()\n", + "order = pd.Index(packaged_graph.nodes) # matches the written assignment order\n", + "\n", + "cut_edge_counts = []\n", + "for assignment in decoder:\n", + " partition = Partition(\n", + " packaged_graph,\n", + " assignment=pd.Series(assignment, index=order),\n", + " updaters={\"cut_edges\": updaters.cut_edges},\n", + " )\n", + " cut_edge_counts.append(len(partition[\"cut_edges\"]))\n", + "\n", + "print(f\"reconstructed {len(cut_edge_counts)} partitions from the bundle alone\")\n", + "print(\"first five cut-edge counts:\", cut_edge_counts[:5])" + ] + }, + { + "cell_type": "markdown", + "id": "e7be8ec8", + "metadata": {}, + "source": [ + "## Subsampling\n", + "\n", + "For winnowing a large ensemble you rarely want every plan. `BendlDecoder`\n", + "supports three native subsamplers; each returns the decoder set up to yield\n", + "only the chosen plans, so you still just iterate. **Indices are 1-based** (plan\n", + "1 is the first sample):\n", + "\n", + "- `subsample_indices([...])` — exactly these 1-based indices (sorted, unique),\n", + "- `subsample_range(start, end)` — the 1-based *inclusive* range `[start, end]`,\n", + "- `subsample_every(step, offset=1)` — every `step`-th plan starting at `offset`\n", + " (\"thinning\")." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b2879d36", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.609437Z", + "iopub.status.busy": "2026-06-06T03:06:33.609296Z", + "iopub.status.idle": "2026-06-06T03:06:33.619502Z", + "shell.execute_reply": "2026-06-06T03:06:33.619228Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indices [1, 500, 1000] -> [[2, 2, 2, 2], [3, 3, 3, 3], [2, 2, 2, 2]]\n", + "range(100, 104) -> [[0, 0, 0, 0], [0, 0, 0, 0], [3, 3, 3, 3], [3, 3, 3, 3], [3, 3, 3, 3]]\n", + "every 250th -> 4 plans\n", + "indices again -> [[2, 2, 2, 2], [3, 3, 3, 3], [2, 2, 2, 2]]\n" + ] + } + ], + "source": [ + "bundle_file = \"example_data/rich.bendl\"\n", + "decoder = BendlDecoder(bundle_file) # one decoder, reused for every subsample below\n", + "\n", + "print(\n", + " \"indices [1, 500, 1000] ->\",\n", + " [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])],\n", + ")\n", + "print(\n", + " \"range(100, 104) ->\", # plans 100..104 inclusive = 5 plans\n", + " [assignment[:4] for assignment in decoder.subsample_range(100, 104)],\n", + ")\n", + "print(\n", + " \"every 250th ->\", sum(1 for _ in decoder.subsample_every(250)), \"plans\"\n", + ")\n", + "\n", + "# The same decoder rewinds and re-selects on each call, so you can run subsamples\n", + "# repeatedly without building a new decoder:\n", + "print(\n", + " \"indices again ->\",\n", + " [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6e4a9eeb", + "metadata": {}, + "source": [ + "## Extracting the raw stream\n", + "\n", + "Sometimes you want the bare assignment stream back out — e.g. to hand it to the\n", + "plain-stream tools or a different pipeline. `extract_stream` copies the\n", + "embedded stream region verbatim to a standalone `.ben`/`.xben` file, which you\n", + "can then open with the stream-only `BenDecoder`." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e27b2818", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.620112Z", + "iopub.status.busy": "2026-06-06T03:06:33.620051Z", + "iopub.status.idle": "2026-06-06T03:06:33.626399Z", + "shell.execute_reply": "2026-06-06T03:06:33.626105Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "extracted stream yields 1000 plans\n" + ] + } + ], + "source": [ + "from binary_ensemble import BenDecoder\n", + "\n", + "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "decoder.extract_stream(\"example_data/extracted.ben\", overwrite=True)\n", + "\n", + "# Open the extracted file with the plain stream decoder (mode matches the bundle).\n", + "ben = BenDecoder(\"example_data/extracted.ben\", mode=decoder.assignment_format())\n", + "print(\"extracted stream yields\", sum(1 for _ in ben), \"plans\")" + ] + }, + { + "cell_type": "markdown", + "id": "73da8256", + "metadata": {}, + "source": [ + "## Appending analysis back onto the bundle\n", + "\n", + "A finished, finalized bundle isn't frozen: `BendlEncoder.append(path)` opens it\n", + "to add more assets later — say, the cut-edge summary we just computed. The\n", + "stream is *not* re-opened (it's already written); each `add_*` commits\n", + "immediately to disk." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "8e9437f5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.627298Z", + "iopub.status.busy": "2026-06-06T03:06:33.627187Z", + "iopub.status.idle": "2026-06-06T03:06:33.629095Z", + "shell.execute_reply": "2026-06-06T03:06:33.628894Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "assets after append: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'cut_edge_summary.json']\n", + "appended summary: {'mean': 130.707, 'min': 87, 'max': 186}\n" + ] + } + ], + "source": [ + "appender = BendlEncoder.append(\"example_data/rich.bendl\")\n", + "appender.add_asset(\n", + " \"cut_edge_summary.json\",\n", + " json.dumps(\n", + " {\n", + " \"mean\": sum(cut_edge_counts) / len(cut_edge_counts),\n", + " \"min\": min(cut_edge_counts),\n", + " \"max\": max(cut_edge_counts),\n", + " }\n", + " ),\n", + " \"json\",\n", + ")\n", + "\n", + "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "print(\"assets after append:\", decoder.asset_names())\n", + "print(\"appended summary:\", decoder.read_json_asset(\"cut_edge_summary.json\"))" + ] + }, + { + "cell_type": "markdown", + "id": "7ee0a9d6", + "metadata": {}, + "source": [ + "## Assets-only bundles (no stream)\n", + "\n", + "You don't have to write a stream at all. This is the one case where you finalize\n", + "the bundle yourself — since there's no `stream()` context to do it — with an\n", + "explicit `close()` (or by using the encoder as a context manager). The result is\n", + "a valid **assets-only** bundle, useful for shipping a graph + metadata package\n", + "on its own. It decodes to an empty iteration with `len == 0` (no spurious\n", + "\"missing stream\" error)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "cf9903e8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.629606Z", + "iopub.status.busy": "2026-06-06T03:06:33.629548Z", + "iopub.status.idle": "2026-06-06T03:06:33.664827Z", + "shell.execute_reply": "2026-06-06T03:06:33.664354Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "assets-only: is_complete = True | len = 0 | assets = ['graph.json', 'metadata.json']\n" + ] + } + ], + "source": [ + "encoder = BendlEncoder(\"example_data/assets_only.bendl\", overwrite=True)\n", + "encoder.add_graph(GRAPH_PATH, sort=None)\n", + "encoder.add_metadata({\"note\": \"graph package, no plans\"})\n", + "encoder.close() # no stream was opened, so finalize explicitly\n", + "\n", + "decoder = BendlDecoder(\"example_data/assets_only.bendl\")\n", + "print(\n", + " \"assets-only: is_complete =\",\n", + " decoder.is_complete(),\n", + " \"| len =\",\n", + " len(decoder),\n", + " \"| assets =\",\n", + " decoder.asset_names(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b8587f8b", + "metadata": {}, + "source": [ + "## Recompressing to XBEN for archival\n", + "\n", + "BEN is fast to write and good for active work. For long-term storage, XBEN\n", + "squeezes much harder (at a real CPU/time cost). `compress_stream` repackages a\n", + "bundle's BEN stream as XBEN, **preserving every asset** (graph, metadata,\n", + "permutation map, custom blobs). Choose exactly one of:\n", + "\n", + "- `in_place=True` — recompress to a temp file and atomically swap it in, or\n", + "- `out_file=...` — write a new bundle and leave the original untouched." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e79a35c8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.666037Z", + "iopub.status.busy": "2026-06-06T03:06:33.665888Z", + "iopub.status.idle": "2026-06-06T03:06:33.711857Z", + "shell.execute_reply": "2026-06-06T03:06:33.711295Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recompressed format: xben\n", + "assets preserved: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'cut_edge_summary.json']\n", + "metadata preserved: {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234}\n", + "plans unchanged: 1000 == 1000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/claude-1000/ipykernel_3730522/3018985229.py:6: UserWarning: XBEN may take a second to start decoding.\n", + " xben_decoder = BendlDecoder(\"example_data/rich.xben.bendl\")\n" + ] + } + ], + "source": [ + "# Write a fresh XBEN copy, original preserved. (out_file won't overwrite an\n", + "# existing file, so clear any copy from a previous run first.)\n", + "Path(\"example_data/rich.xben.bendl\").unlink(missing_ok=True)\n", + "compress_stream(\"example_data/rich.bendl\", out_file=\"example_data/rich.xben.bendl\")\n", + "\n", + "xben_decoder = BendlDecoder(\"example_data/rich.xben.bendl\")\n", + "print(\"recompressed format:\", xben_decoder.assignment_format())\n", + "print(\"assets preserved: \", xben_decoder.asset_names())\n", + "print(\"metadata preserved: \", xben_decoder.read_metadata())\n", + "print(\n", + " \"plans unchanged: \",\n", + " len(xben_decoder),\n", + " \"==\",\n", + " len(BendlDecoder(\"example_data/rich.bendl\")),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0a8025fd", + "metadata": {}, + "source": [ + "(Passing both `in_place=True` and `out_file=`, or neither, raises — the choice\n", + "is exclusive. Note XBEN bundles emit a one-time startup warning on decode,\n", + "since opening them does real decompression work.)" + ] + }, + { + "cell_type": "markdown", + "id": "2d9ecfde", + "metadata": {}, + "source": [ + "## Lifecycle and failure semantics\n", + "\n", + "A subtle but important guarantee: if an exception escapes the `stream()`\n", + "context — say the chain or your write logic throws partway through — the bundle\n", + "is left **unfinalized** rather than stamped complete over a half-written\n", + "stream. You can detect this (`is_complete()` is `False`) and still recover what\n", + "was written via `extract_stream(..., allow_unfinalized=True)`." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6fc232f9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-06T03:06:33.713277Z", + "iopub.status.busy": "2026-06-06T03:06:33.713121Z", + "iopub.status.idle": "2026-06-06T03:06:34.663699Z", + "shell.execute_reply": "2026-06-06T03:06:34.663011Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "caught: simulated crash mid-stream\n", + "is_complete: False (left unfinalized, as intended)\n", + "recovered 50 plans written before the crash\n" + ] + } + ], + "source": [ + "encoder = BendlEncoder(\"example_data/partial.bendl\", overwrite=True)\n", + "stored_graph = encoder.add_graph(GRAPH_PATH, sort=None)\n", + "gc_graph = Graph.from_networkx(stored_graph)\n", + "write_order = list(gc_graph.nodes)\n", + "try:\n", + " with encoder.stream(\"ben\") as stream:\n", + " for i, partition in enumerate(make_chain(gc_graph, steps=1000)):\n", + " if i == 50:\n", + " raise RuntimeError(\"simulated crash mid-stream\")\n", + " series = partition.assignment.to_series()\n", + " stream.write(series.loc[write_order].astype(int).tolist())\n", + "except RuntimeError as e:\n", + " print(\"caught:\", e)\n", + "\n", + "decoder = BendlDecoder(\"example_data/partial.bendl\")\n", + "print(\"is_complete:\", decoder.is_complete(), \"(left unfinalized, as intended)\")\n", + "decoder.extract_stream(\n", + " \"example_data/partial.ben\", overwrite=True, allow_unfinalized=True\n", + ")\n", + "recovered = sum(1 for _ in BenDecoder(\"example_data/partial.ben\", mode=\"ben\"))\n", + "print(\"recovered\", recovered, \"plans written before the crash\")" + ] + }, + { + "cell_type": "markdown", + "id": "fc7ae1a1", + "metadata": {}, + "source": [ + "## Recap — when to reach for what\n", + "\n", + "- **`BendlEncoder` / `BendlDecoder`** are your default for storing an ensemble:\n", + " one self-describing file, graph + metadata included, encoded live as the\n", + " chain runs. You only ever need a `with` block around the `stream()` writer —\n", + " closing it finalizes the bundle (use `close()` for an assets-only bundle).\n", + "- **`add_graph(graph)`** before the stream (MLC-reordered by default; pass\n", + " `sort=\"rcm\"`, `sort=\"key\", key=\"GEOID\"`, or `sort=None` for raw), then build\n", + " the chain on the returned graph — you get a compression win *and* a write order\n", + " that already matches the stored graph.\n", + "- **`relabel_bundle`** to reorder an *existing* BEN bundle and rewrite its stream\n", + " to match (in place or to a new file) — e.g. to optimize a bundle you received\n", + " raw, before archiving it.\n", + "- **`binary_ensemble.graph.reorder*`** when you want the reordering standalone\n", + " (e.g. to reuse an ordering across several bundles).\n", + "- **`add_metadata` / `add_asset`** to stamp provenance and ship analysis\n", + " alongside the plans; **`append`** to add results to a finished bundle.\n", + "- **`compress_stream`** to graduate an active BEN bundle to an archival XBEN\n", + " one without losing any asset.\n", + "- Drop to the plain **`binary_ensemble.stream`** API (via `extract_stream`)\n", + " only when you specifically need the bare stream and are tracking the graph\n", + " and node order yourself.\n", + "print(\"done — see the example_data/ folder for the bundles this tutorial wrote\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index 1829e51..673195f 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -25,15 +25,25 @@ filterwarnings = [ ] [project.optional-dependencies] +# Render the site (used by ReadTheDocs). Notebook outputs are rendered from the +# committed .ipynb files, so no execution dependencies are needed here. docs = [ - "myst-nb>=1.3.0", - "myst-parser>=4.0.1", - "nbconvert>=7.16.6", - "recommonmark>=0.7.1", + # Sphinx core + a modern theme "sphinx>=8.2.3", - "sphinx-autoapi>=3.6.1", + "furo>=2024.8.6", "sphinx-copybutton>=0.5.2", - "sphinx-rtd-theme>=3.0.2", + "sphinx-design>=0.6.1", + "sphinxext-opengraph>=0.9.1", + # MyST markdown + notebook rendering + "myst-nb>=1.3.0", + "linkify-it-py>=2.0.3", +] +# Extra deps to *execute* the tutorial notebooks (used by CI and local verification +# with NB_EXECUTION_MODE=cache), so every example is run end to end. +docs-exec = [ + "ipykernel>=6.29.5", + "gerrychain>=0.3.2", + "tqdm>=4.67.1", ] [dependency-groups] diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index f54fed8..b1c302c 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -13,12 +13,26 @@ use std::fs::{File, OpenOptions}; use std::io::{self, BufReader, BufWriter, Write}; use std::path::PathBuf; -/// Reader and iterator for a `.bendl` bundle. +/// Reader and iterator for a ``.bendl`` bundle. /// -/// This decoder is bundle-only: opening it on a plain `.ben`/`.xben` stream raises and points the -/// caller at `BenDecoder`. It exposes the bundle inspection surface (`version`, `is_complete`, -/// `asset_names`, `list_assets`, canonical and generic asset getters, `extract_stream`) and -/// iterates the embedded assignment stream. +/// Iterate the decoder to yield the embedded assignment stream one plan at a time (each a +/// ``list[int]`` of district ids), and use ``len()`` for the sample count. Alongside the +/// stream, a bundle carries assets — the dual graph, metadata, a node permutation map, and any +/// custom blobs — exposed through the canonical getters (:meth:`read_graph`, +/// :meth:`read_metadata`, :meth:`read_node_permutation_map`) and the generic +/// :meth:`read_asset_bytes` / :meth:`read_json_asset`. Inspect the directory with +/// :meth:`asset_names`, :meth:`list_assets`, :meth:`version`, and :meth:`is_complete`. +/// +/// This decoder is bundle-only: opening it on a plain ``.ben``/``.xben`` stream raises and +/// points the caller at :class:`~binary_ensemble.stream.BenDecoder`. A finalized assets-only +/// bundle (one written with no assignment stream) iterates to nothing with ``len() == 0``. +/// +/// Example: +/// >>> from binary_ensemble import BendlDecoder +/// >>> dec = BendlDecoder("ensemble.bendl") +/// >>> graph = dec.read_graph() +/// >>> for assignment in dec: +/// ... ... #[pyclass(module = "binary_ensemble", name = "BendlDecoder", unsendable)] pub struct PyBendlDecoder { path: PathBuf, @@ -28,10 +42,20 @@ pub struct PyBendlDecoder { #[pymethods] impl PyBendlDecoder { - /// Open a decoder on a `.bendl` bundle. + /// Open a decoder on a ``.bendl`` bundle. + /// + /// The file's leading bytes are sniffed and a plain ``.ben``/``.xben`` stream is rejected. + /// The bundle header decides whether the embedded stream is BEN or XBEN; an XBEN stream + /// pays a one-time decompression startup cost. /// - /// The file's leading bytes are sniffed; a plain `.ben`/`.xben` stream is rejected with a - /// pointer at `BenDecoder`. The bundle header decides the embedded BEN/XBEN format. + /// Args: + /// file_path: Path to the input ``.bendl`` file. + /// + /// Raises: + /// Exception: If ``file_path`` is not a bundle (use + /// :class:`~binary_ensemble.stream.BenDecoder` for plain streams), or its header + /// cannot be parsed. + /// OSError: If the file cannot be opened. #[new] #[pyo3(signature = (file_path))] #[pyo3(text_signature = "(file_path)")] @@ -106,11 +130,28 @@ impl PyBendlDecoder { self.cursor.len(py) } + /// Count the samples in the embedded stream. + /// + /// The result is the *expanded* sample count (a frame repeating five identical samples + /// contributes five). It is computed lazily and cached, so repeated calls and ``len()`` + /// are cheap. + /// + /// Returns: + /// int: The number of samples in the bundle's stream. #[pyo3(text_signature = "(self)")] fn count_samples(&mut self, py: Python<'_>) -> PyResult { self.cursor.count_samples(py) } + /// Restrict iteration to the samples at the given 1-indexed positions. + /// + /// Selected samples are reached by skipping frames rather than decoding the whole stream. + /// + /// Args: + /// indices: The 1-indexed sample numbers to keep. + /// + /// Returns: + /// BendlDecoder: ``self``, so the call can be chained into a ``for`` loop. #[pyo3(text_signature = "(self, indices, /)")] fn subsample_indices<'py>( mut slf: PyRefMut<'py, Self>, @@ -121,6 +162,14 @@ impl PyBendlDecoder { Ok(slf.into()) } + /// Restrict iteration to a contiguous, half-open range of samples ``[start, end)``. + /// + /// Args: + /// start: First sample number to keep (1-indexed, inclusive). + /// end: One past the last sample number to keep (exclusive). + /// + /// Returns: + /// BendlDecoder: ``self``, for chaining into a ``for`` loop. #[pyo3(text_signature = "(self, start, end, /)")] fn subsample_range<'py>( mut slf: PyRefMut<'py, Self>, @@ -132,7 +181,16 @@ impl PyBendlDecoder { Ok(slf.into()) } + /// Restrict iteration to every ``step``-th sample. + /// + /// Args: + /// step: Stride between kept samples (e.g. ``10`` keeps every tenth sample). + /// offset: 1-indexed position of the first kept sample. Defaults to ``1``. + /// + /// Returns: + /// BendlDecoder: ``self``, for chaining into a ``for`` loop. #[pyo3(signature = (step, offset=1))] + #[pyo3(text_signature = "(self, step, offset=1)")] fn subsample_every<'py>( mut slf: PyRefMut<'py, Self>, step: usize, @@ -204,7 +262,16 @@ impl PyBendlDecoder { Ok(out) } - /// Read the (decoded) bytes of a named asset as a Python `bytes` object. + /// Read the (decoded) bytes of a named asset as a Python ``bytes`` object. + /// + /// Args: + /// name: The asset's name, as listed by :meth:`asset_names`. + /// + /// Returns: + /// bytes: The asset's decoded payload. + /// + /// Raises: + /// KeyError: If no asset with that name exists in the bundle. #[pyo3(text_signature = "(self, name, /)")] fn read_asset_bytes(&mut self, name: &str) -> PyResult> { let entry = self @@ -217,7 +284,17 @@ impl PyBendlDecoder { .map_err(|e| PyIOError::new_err(format!("Failed to read asset {name:?}: {e}"))) } - /// Parse a JSON asset into a Python object (dict, list, …). + /// Parse a JSON asset into a Python object (``dict``, ``list``, …). + /// + /// Args: + /// name: The asset's name, as listed by :meth:`asset_names`. + /// + /// Returns: + /// The parsed JSON value. + /// + /// Raises: + /// KeyError: If no asset with that name exists in the bundle. + /// Exception: If the asset is not valid UTF-8 JSON. #[pyo3(text_signature = "(self, name, /)")] fn read_json_asset<'py>(&mut self, py: Python<'py>, name: &str) -> PyResult> { let bytes = self.read_asset_bytes(name)?; @@ -257,8 +334,19 @@ impl PyBendlDecoder { ) } - /// Copy the embedded assignment stream region verbatim to `out_path`. The resulting file can be - /// opened directly with `BenDecoder(out_path, mode=dec.assignment_format())`. + /// Copy the embedded assignment stream out to a standalone ``.ben``/``.xben`` file. + /// + /// The bytes are copied verbatim, so the result can be opened directly with + /// ``BenDecoder(out_path, mode=dec.assignment_format())``. + /// + /// Args: + /// out_path: Path to write the extracted stream to. + /// overwrite: Replace ``out_path`` if it already exists. Defaults to ``False``. + /// allow_unfinalized: Permit extraction from a bundle that was never finalized + /// (recovering a partial stream). Defaults to ``False``. + /// + /// Raises: + /// OSError: If ``out_path`` exists and ``overwrite`` is ``False``, or the copy fails. #[pyo3(signature = (out_path, overwrite=false, allow_unfinalized=false))] #[pyo3(text_signature = "(self, out_path, overwrite=False, allow_unfinalized=False)")] fn extract_stream( diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 666fe17..9e2044b 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -5,11 +5,22 @@ use pyo3::exceptions::{PyException, PyIOError}; use pyo3::prelude::*; use std::path::PathBuf; -/// Iterator over assignments in a plain BEN or XBEN stream. +/// Iterator over the assignments in a plain BEN or XBEN stream. /// -/// This decoder is stream-only: opening it on a `.bendl` bundle raises and points the caller at -/// `BendlDecoder`. Bundle inspection (assets, directory, embedded-stream extraction) lives on -/// `BendlDecoder`, mirroring the `ben` vs `bendl` CLI split. +/// Iterate the decoder to yield one assignment at a time, each a ``list[int]`` of district +/// ids in dual-graph node order. ``len()`` reports the (expanded) sample count and is cheap +/// to call, so it is safe to use for a progress bar. The encoding variant is detected +/// automatically from the stream, so it is never passed when reading. +/// +/// This decoder is stream-only: opening it on a ``.bendl`` bundle raises and points the +/// caller at :class:`~binary_ensemble.bundle.BendlDecoder`, which carries the bundle +/// inspection surface (assets, embedded graph, metadata). This mirrors the ``ben`` vs +/// ``bendl`` split of the command-line tools. +/// +/// Example: +/// >>> from binary_ensemble import BenDecoder +/// >>> for assignment in BenDecoder("plans.ben"): +/// ... print(assignment[:8]) #[pyclass(module = "binary_ensemble", name = "BenDecoder", unsendable)] pub struct PyBenDecoder { cursor: SampleCursor, @@ -17,15 +28,20 @@ pub struct PyBenDecoder { #[pymethods] impl PyBenDecoder { - /// Open a decoder on a plain `.ben` or `.xben` file. + /// Open a decoder on a plain ``.ben`` or ``.xben`` file. /// - /// The file's leading bytes are sniffed; a `.bendl` bundle is rejected with a pointer at - /// `BendlDecoder`. `mode` selects between the BEN and XBEN readers and defaults to `"ben"`. + /// The file's leading bytes are sniffed and a ``.bendl`` bundle is rejected. ``mode`` + /// selects between the BEN and XBEN readers; opening an XBEN stream pays a one-time + /// decompression startup cost. /// - /// # Arguments + /// Args: + /// file_path: Path to the input ``.ben`` or ``.xben`` file. + /// mode: Either ``"ben"`` or ``"xben"``. Defaults to ``"ben"``. /// - /// * `file_path` - Path to the input file. - /// * `mode` - Either `"ben"` or `"xben"`. + /// Raises: + /// Exception: If ``file_path`` is a ``.bendl`` bundle (use + /// :class:`~binary_ensemble.bundle.BendlDecoder` instead). + /// OSError: If the file cannot be opened or its banner is malformed. #[new] #[pyo3(signature = (file_path, mode = "ben"))] #[pyo3(text_signature = "(file_path, mode='ben')")] @@ -56,27 +72,53 @@ impl PyBenDecoder { Ok(Self { cursor }) } - /// Return `self` as an iterator, rebuilding the underlying frame walker so iteration can be - /// restarted. A subsample selection installed via `subsample_*` is reapplied on each restart. + /// Return ``self`` as a fresh iterator over the stream. + /// + /// Restarting rebuilds the underlying frame walker, so a decoder can be iterated more + /// than once. Any subsample selection installed via a ``subsample_*`` method is + /// reapplied on each restart. fn __iter__(mut slf: PyRefMut) -> PyResult> { slf.cursor.restart()?; Ok(slf.into()) } + /// Return the next assignment, or raise ``StopIteration`` at the end of the stream. fn __next__(&mut self) -> PyResult>> { self.cursor.next() } - // Because we want progress bars!!! + /// Return the (expanded) number of samples, for use as a progress-bar total. fn __len__(&mut self, py: Python<'_>) -> PyResult { self.cursor.len(py) } + /// Count the samples in the stream. + /// + /// The result is the *expanded* sample count: a frame that repeats five identical + /// samples contributes five. The count is computed lazily and cached, so repeated calls + /// (and ``len()``) are cheap. + /// + /// Returns: + /// int: The number of samples in the stream. #[pyo3(text_signature = "(self)")] fn count_samples(&mut self, py: Python<'_>) -> PyResult { self.cursor.count_samples(py) } + /// Restrict iteration to the samples at the given 1-indexed positions. + /// + /// Selected samples are reached by skipping frames rather than decoding the whole + /// stream, so this stays fast on large ensembles. + /// + /// Args: + /// indices: The 1-indexed sample numbers to keep. + /// + /// Returns: + /// BenDecoder: ``self``, so the call can be chained directly into a ``for`` loop. + /// + /// Example: + /// >>> for plan in BenDecoder("plans.ben").subsample_indices([1, 500, 9999]): + /// ... ... #[pyo3(text_signature = "(self, indices, /)")] fn subsample_indices<'py>( mut slf: PyRefMut<'py, Self>, @@ -87,6 +129,14 @@ impl PyBenDecoder { Ok(slf.into()) } + /// Restrict iteration to a contiguous, half-open range of samples ``[start, end)``. + /// + /// Args: + /// start: First sample number to keep (1-indexed, inclusive). + /// end: One past the last sample number to keep (exclusive). + /// + /// Returns: + /// BenDecoder: ``self``, for chaining into a ``for`` loop. #[pyo3(text_signature = "(self, start, end, /)")] fn subsample_range<'py>( mut slf: PyRefMut<'py, Self>, @@ -98,7 +148,20 @@ impl PyBenDecoder { Ok(slf.into()) } + /// Restrict iteration to every ``step``-th sample. + /// + /// Args: + /// step: Stride between kept samples (e.g. ``10`` keeps every tenth sample). + /// offset: 1-indexed position of the first kept sample. Defaults to ``1``. + /// + /// Returns: + /// BenDecoder: ``self``, for chaining into a ``for`` loop. + /// + /// Example: + /// >>> for plan in BenDecoder("plans.ben").subsample_every(1000): + /// ... ... #[pyo3(signature = (step, offset=1))] + #[pyo3(text_signature = "(self, step, offset=1)")] fn subsample_every<'py>( mut slf: PyRefMut<'py, Self>, step: usize, @@ -109,7 +172,7 @@ impl PyBenDecoder { Ok(slf.into()) } - /// Return the container format of the underlying stream as `"ben"` or `"xben"`. + /// Return the container format of the underlying stream as ``"ben"`` or ``"xben"``. #[pyo3(text_signature = "(self)")] fn assignment_format(&self) -> &'static str { self.cursor.mode().as_str() diff --git a/ben-py/src/decode/py_funcs.rs b/ben-py/src/decode/py_funcs.rs index 5b45bf9..1284ee3 100644 --- a/ben-py/src/decode/py_funcs.rs +++ b/ben-py/src/decode/py_funcs.rs @@ -7,6 +7,18 @@ use pyo3::exceptions::PyIOError; use pyo3::prelude::*; use std::path::PathBuf; +/// Decompress an XBEN file into a plain BEN stream. +/// +/// XBEN decompression is fast; converting to BEN gives you a stream you can read, replay, and +/// subsample. The encoding variant is preserved and detected automatically on the next read. +/// +/// Args: +/// in_file: Path to the input ``.xben`` file. +/// out_file: Path to write the ``.ben`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] @@ -26,6 +38,18 @@ pub fn decode_xben_to_ben(in_file: PathBuf, out_file: PathBuf, overwrite: bool) Ok(()) } +/// Decode an XBEN file back to canonicalized JSONL. +/// +/// Produces one ``{"assignment": [...], "sample": n}`` object per line, with sample numbers +/// starting at 1. +/// +/// Args: +/// in_file: Path to the input ``.xben`` file. +/// out_file: Path to write the ``.jsonl`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] @@ -45,6 +69,18 @@ pub fn decode_xben_to_jsonl(in_file: PathBuf, out_file: PathBuf, overwrite: bool Ok(()) } +/// Decode a BEN stream back to canonicalized JSONL. +/// +/// Produces one ``{"assignment": [...], "sample": n}`` object per line, with sample numbers +/// starting at 1. This is the inverse of :func:`encode_jsonl_to_ben`. +/// +/// Args: +/// in_file: Path to the input ``.ben`` file. +/// out_file: Path to write the ``.jsonl`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index db777ce..f66bd19 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -78,10 +78,13 @@ pub struct PyBendlEncoder { impl PyBendlEncoder { /// Open a new bundle writer in create mode. /// - /// # Arguments + /// Args: + /// file_path: Output path. Must not exist unless ``overwrite=True``. + /// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. /// - /// * `file_path` - Output path. Must not exist unless `overwrite=True`. - /// * `overwrite` - Replace an existing file at `file_path`. + /// Raises: + /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be + /// created. #[new] #[pyo3(signature = (file_path, overwrite = false))] #[pyo3(text_signature = "(file_path, overwrite=False)")] @@ -382,7 +385,12 @@ fn state_error(state: &BundleState, op: &str) -> PyErr { PyException::new_err(format!("cannot {op}: {reason}")) } -/// Single-use context manager over the bundle's assignment stream. +/// Single-use context manager over a bundle's assignment stream. +/// +/// Obtained from :meth:`binary_ensemble.bundle.BendlEncoder.stream`; you don't construct it +/// directly. Write assignments with :meth:`write` inside a ``with`` block. Closing the context +/// cleanly **finalizes** the bundle; if the block exits via an exception the bundle is left +/// unfinalized (recoverable, rather than stamped complete over a truncated stream). #[pyclass(module = "binary_ensemble", name = "BendlStreamSession", unsendable)] pub struct PyBendlStreamSession { writer: Option>>>>, @@ -394,8 +402,16 @@ pub struct PyBendlStreamSession { #[pymethods] impl PyBendlStreamSession { - /// Encode a single assignment. When the bundle carries a pre-stream graph, the assignment - /// length must equal the graph's node count. + /// Encode a single assignment into the bundle's stream. + /// + /// Args: + /// assignment: The plan as a ``list[int]`` of district ids, one per node in + /// dual-graph node order. + /// + /// Raises: + /// ValueError: If the bundle carries a pre-stream graph and the assignment length does + /// not equal the graph's node count. + /// OSError: If the session is already closed, or the write fails. #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(self, assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 1c8d534..83acbbf 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -6,10 +6,18 @@ use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -/// Encoder for plain Binary Ensemble (`.ben`) streams. +/// Encoder for plain Binary Ensemble (``.ben``) streams. /// -/// This encoder writes a plain BEN stream with no bundle framing. To produce a `.bendl` bundle -/// (with an embedded graph, metadata, or other assets) use `binary_ensemble.bundle.BendlEncoder`. +/// Write assignments one at a time with :meth:`write`, then :meth:`close` to flush and finish +/// the file. The encoder is a context manager, so the idiomatic pattern is:: +/// +/// with BenEncoder("plans.ben", overwrite=True) as enc: +/// for assignment in plans: +/// enc.write(assignment) +/// +/// This produces a plain BEN stream with no bundle framing. To produce a self-describing +/// ``.bendl`` bundle (with an embedded graph, metadata, or other assets) use +/// :class:`~binary_ensemble.bundle.BendlEncoder` instead. #[pyclass(module = "binary_ensemble", name = "BenEncoder", unsendable)] pub struct PyBenEncoder { writer: Option>>, @@ -23,14 +31,18 @@ impl PyBenEncoder { #[pymethods] impl PyBenEncoder { - /// Open a new encoder that writes a plain `.ben` stream. + /// Open a new encoder that writes a plain ``.ben`` stream. /// - /// # Arguments + /// Args: + /// file_path: Output path. Must not exist unless ``overwrite=True``. + /// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. + /// variant: BEN encoding variant for the stream — ``"standard"``, ``"mkv_chain"``, + /// or ``"twodelta"``. Defaults to ``"twodelta"`` when ``None``. /// - /// * `file_path` - Output path. Must not exist unless `overwrite=True`. - /// * `overwrite` - Replace an existing file at `file_path`. - /// * `variant` - BEN variant for the assignment stream (`"standard"`, `"mkv_chain"`, or - /// `"twodelta"`). Defaults to `"twodelta"` when `None`. + /// Raises: + /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be + /// created. + /// ValueError: If ``variant`` is not a recognized variant name. #[new] #[pyo3(signature = (file_path, overwrite = false, variant = None))] #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] @@ -44,6 +56,13 @@ impl PyBenEncoder { } /// Encode a single assignment and append it to the output stream. + /// + /// Args: + /// assignment: The plan as a ``list[int]`` of district ids, one per node in + /// dual-graph node order. + /// + /// Raises: + /// OSError: If the encoder has already been closed, or the write fails. #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index 6234b7a..91200dc 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -8,6 +8,23 @@ use pyo3::exceptions::PyIOError; use pyo3::prelude::*; use std::path::PathBuf; +/// Compress a BEN stream into an XBEN file with LZMA2. +/// +/// XBEN is the smallest format and is meant for storage and transfer. Compression can be slow +/// for large block-level ensembles; relabel and reorder first (see +/// :func:`~binary_ensemble.bundle.relabel_bundle`) for the best ratios. +/// +/// Args: +/// in_file: Path to the input ``.ben`` file. +/// out_file: Path to write the ``.xben`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// n_threads: Number of worker threads. ``None`` uses all available cores. +/// compression_level: LZMA2 level from 0 (fastest) to 9 (smallest). ``None`` uses the +/// default (9). +/// xz_block_size: Override the xz block size in bytes. ``None`` uses the default. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, n_threads=None, compression_level=None, xz_block_size=None))] #[pyo3( @@ -44,6 +61,21 @@ pub fn encode_ben_to_xben( Ok(()) } +/// Encode a canonicalized JSONL ensemble into a BEN stream. +/// +/// Expects one ``{"assignment": [...], "sample": n}`` object per line. BEN is the fast working +/// format; encode further to XBEN with :func:`encode_ben_to_xben` for storage. +/// +/// Args: +/// in_file: Path to the input ``.jsonl`` file. +/// out_file: Path to write the ``.ben`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// variant: BEN encoding variant — ``"standard"``, ``"mkv_chain"``, or ``"twodelta"``. +/// Defaults to ``"twodelta"``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. +/// ValueError: If ``variant`` is not a recognized variant name. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, variant="twodelta"))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False, variant='twodelta')")] @@ -68,6 +100,26 @@ pub fn encode_jsonl_to_ben( Ok(()) } +/// Encode a canonicalized JSONL ensemble directly into an XBEN file. +/// +/// A one-step shortcut for :func:`encode_jsonl_to_ben` followed by +/// :func:`encode_ben_to_xben`. Expects one ``{"assignment": [...], "sample": n}`` object per +/// line. Compression can be slow for large block-level ensembles. +/// +/// Args: +/// in_file: Path to the input ``.jsonl`` file. +/// out_file: Path to write the ``.xben`` output. +/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// variant: BEN encoding variant — ``"standard"``, ``"mkv_chain"``, or ``"twodelta"``. +/// Defaults to ``"twodelta"``. +/// n_threads: Number of worker threads. ``None`` uses all available cores. +/// compression_level: LZMA2 level from 0 (fastest) to 9 (smallest). ``None`` uses the +/// default (9). +/// xz_block_size: Override the xz block size in bytes. ``None`` uses the default. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. +/// ValueError: If ``variant`` is not a recognized variant name. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite=false, variant="twodelta", n_threads=None, compression_level=None, xz_block_size=None))] #[pyo3( diff --git a/ben-py/tests/data/gerrymandria.json b/ben-py/tests/data/gerrymandria.json new file mode 100644 index 0000000..a6ca2fa --- /dev/null +++ b/ben-py/tests/data/gerrymandria.json @@ -0,0 +1,1641 @@ +{ + "directed": false, + "multigraph": false, + "graph": [], + "nodes": [ + { + "TOTPOP": 1, + "x": 0, + "y": 0, + "county": "1", + "district": "1", + "precinct": 0, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 0 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 1, + "county": "1", + "district": "1", + "precinct": 1, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 1 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 2, + "county": "1", + "district": "1", + "precinct": 2, + "muni": "5", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 2 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 3, + "county": "1", + "district": "1", + "precinct": 3, + "muni": "5", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 3 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 4, + "county": "3", + "district": "1", + "precinct": 4, + "muni": "9", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 4 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 5, + "county": "3", + "district": "1", + "precinct": 5, + "muni": "9", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 5 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 6, + "county": "3", + "district": "1", + "precinct": 6, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 6 + }, + { + "TOTPOP": 1, + "x": 0, + "y": 7, + "county": "3", + "district": "1", + "precinct": 7, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 7 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 0, + "county": "1", + "district": "2", + "precinct": 8, + "muni": "1", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "2", + "id": 8 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 1, + "county": "1", + "district": "2", + "precinct": 9, + "muni": "1", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 9 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 2, + "county": "1", + "district": "2", + "precinct": 10, + "muni": "5", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 10 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 3, + "county": "1", + "district": "2", + "precinct": 11, + "muni": "5", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 11 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 4, + "county": "3", + "district": "2", + "precinct": 12, + "muni": "9", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 12 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 5, + "county": "3", + "district": "2", + "precinct": 13, + "muni": "9", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 13 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 6, + "county": "3", + "district": "2", + "precinct": 14, + "muni": "13", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 14 + }, + { + "TOTPOP": 1, + "x": 1, + "y": 7, + "county": "3", + "district": "2", + "precinct": 15, + "muni": "13", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 15 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 0, + "county": "1", + "district": "3", + "precinct": 16, + "muni": "2", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 16 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 1, + "county": "1", + "district": "3", + "precinct": 17, + "muni": "2", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 17 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 2, + "county": "1", + "district": "3", + "precinct": 18, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 18 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 3, + "county": "1", + "district": "3", + "precinct": 19, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "2", + "id": 19 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 4, + "county": "3", + "district": "3", + "precinct": 20, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 20 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 5, + "county": "3", + "district": "3", + "precinct": 21, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 21 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 6, + "county": "3", + "district": "3", + "precinct": 22, + "muni": "14", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 22 + }, + { + "TOTPOP": 1, + "x": 2, + "y": 7, + "county": "3", + "district": "3", + "precinct": 23, + "muni": "14", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 23 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 0, + "county": "1", + "district": "4", + "precinct": 24, + "muni": "2", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "4", + "id": 24 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 1, + "county": "1", + "district": "4", + "precinct": 25, + "muni": "2", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 25 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 2, + "county": "1", + "district": "4", + "precinct": 26, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 26 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 3, + "county": "1", + "district": "4", + "precinct": 27, + "muni": "6", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 27 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 4, + "county": "3", + "district": "4", + "precinct": 28, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 28 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 5, + "county": "3", + "district": "4", + "precinct": 29, + "muni": "10", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 29 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 6, + "county": "3", + "district": "4", + "precinct": 30, + "muni": "14", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 30 + }, + { + "TOTPOP": 1, + "x": 3, + "y": 7, + "county": "3", + "district": "4", + "precinct": 31, + "muni": "14", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 31 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 0, + "county": "2", + "district": "5", + "precinct": 32, + "muni": "3", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 32 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 1, + "county": "2", + "district": "5", + "precinct": 33, + "muni": "3", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 33 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 2, + "county": "2", + "district": "5", + "precinct": 34, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 34 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 3, + "county": "2", + "district": "5", + "precinct": 35, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 35 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 4, + "county": "4", + "district": "5", + "precinct": 36, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 36 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 5, + "county": "4", + "district": "5", + "precinct": 37, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 37 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 6, + "county": "4", + "district": "5", + "precinct": 38, + "muni": "15", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 38 + }, + { + "TOTPOP": 1, + "x": 4, + "y": 7, + "county": "4", + "district": "5", + "precinct": 39, + "muni": "15", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 39 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 0, + "county": "2", + "district": "6", + "precinct": 40, + "muni": "3", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 40 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 1, + "county": "2", + "district": "6", + "precinct": 41, + "muni": "3", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 41 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 2, + "county": "2", + "district": "6", + "precinct": 42, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 42 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 3, + "county": "2", + "district": "6", + "precinct": 43, + "muni": "7", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "4", + "id": 43 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 4, + "county": "4", + "district": "6", + "precinct": 44, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 44 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 5, + "county": "4", + "district": "6", + "precinct": 45, + "muni": "11", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 45 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 6, + "county": "4", + "district": "6", + "precinct": 46, + "muni": "15", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 46 + }, + { + "TOTPOP": 1, + "x": 5, + "y": 7, + "county": "4", + "district": "6", + "precinct": 47, + "muni": "15", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 47 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 0, + "county": "2", + "district": "7", + "precinct": 48, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 48 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 1, + "county": "2", + "district": "7", + "precinct": 49, + "muni": "4", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 49 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 2, + "county": "2", + "district": "7", + "precinct": 50, + "muni": "8", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 50 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 3, + "county": "2", + "district": "7", + "precinct": 51, + "muni": "8", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 51 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 4, + "county": "4", + "district": "7", + "precinct": 52, + "muni": "12", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "3", + "id": 52 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 5, + "county": "4", + "district": "7", + "precinct": 53, + "muni": "12", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 53 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 6, + "county": "4", + "district": "7", + "precinct": 54, + "muni": "16", + "boundary_node": false, + "boundary_perim": 0, + "water_dist": "1", + "id": 54 + }, + { + "TOTPOP": 1, + "x": 6, + "y": 7, + "county": "4", + "district": "7", + "precinct": 55, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 55 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 0, + "county": "2", + "district": "8", + "precinct": 56, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 56 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 1, + "county": "2", + "district": "8", + "precinct": 57, + "muni": "4", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 57 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 2, + "county": "2", + "district": "8", + "precinct": 58, + "muni": "8", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 58 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 3, + "county": "2", + "district": "8", + "precinct": 59, + "muni": "8", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 59 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 4, + "county": "4", + "district": "8", + "precinct": 60, + "muni": "12", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "3", + "id": 60 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 5, + "county": "4", + "district": "8", + "precinct": 61, + "muni": "12", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 61 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 6, + "county": "4", + "district": "8", + "precinct": 62, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 62 + }, + { + "TOTPOP": 1, + "x": 7, + "y": 7, + "county": "4", + "district": "8", + "precinct": 63, + "muni": "16", + "boundary_node": true, + "boundary_perim": 1, + "water_dist": "1", + "id": 63 + } + ], + "adjacency": [ + [ + { + "id": 8 + }, + { + "id": 1 + } + ], + [ + { + "id": 0 + }, + { + "id": 9 + }, + { + "id": 2 + } + ], + [ + { + "id": 1 + }, + { + "id": 10 + }, + { + "id": 3 + } + ], + [ + { + "id": 2 + }, + { + "id": 11 + }, + { + "id": 4 + } + ], + [ + { + "id": 3 + }, + { + "id": 12 + }, + { + "id": 5 + } + ], + [ + { + "id": 4 + }, + { + "id": 13 + }, + { + "id": 6 + } + ], + [ + { + "id": 5 + }, + { + "id": 14 + }, + { + "id": 7 + } + ], + [ + { + "id": 6 + }, + { + "id": 15 + } + ], + [ + { + "id": 0 + }, + { + "id": 16 + }, + { + "id": 9 + } + ], + [ + { + "id": 1 + }, + { + "id": 8 + }, + { + "id": 17 + }, + { + "id": 10 + } + ], + [ + { + "id": 2 + }, + { + "id": 9 + }, + { + "id": 18 + }, + { + "id": 11 + } + ], + [ + { + "id": 3 + }, + { + "id": 10 + }, + { + "id": 19 + }, + { + "id": 12 + } + ], + [ + { + "id": 4 + }, + { + "id": 11 + }, + { + "id": 20 + }, + { + "id": 13 + } + ], + [ + { + "id": 5 + }, + { + "id": 12 + }, + { + "id": 21 + }, + { + "id": 14 + } + ], + [ + { + "id": 6 + }, + { + "id": 13 + }, + { + "id": 22 + }, + { + "id": 15 + } + ], + [ + { + "id": 7 + }, + { + "id": 14 + }, + { + "id": 23 + } + ], + [ + { + "id": 8 + }, + { + "id": 24 + }, + { + "id": 17 + } + ], + [ + { + "id": 9 + }, + { + "id": 16 + }, + { + "id": 25 + }, + { + "id": 18 + } + ], + [ + { + "id": 10 + }, + { + "id": 17 + }, + { + "id": 26 + }, + { + "id": 19 + } + ], + [ + { + "id": 11 + }, + { + "id": 18 + }, + { + "id": 27 + }, + { + "id": 20 + } + ], + [ + { + "id": 12 + }, + { + "id": 19 + }, + { + "id": 28 + }, + { + "id": 21 + } + ], + [ + { + "id": 13 + }, + { + "id": 20 + }, + { + "id": 29 + }, + { + "id": 22 + } + ], + [ + { + "id": 14 + }, + { + "id": 21 + }, + { + "id": 30 + }, + { + "id": 23 + } + ], + [ + { + "id": 15 + }, + { + "id": 22 + }, + { + "id": 31 + } + ], + [ + { + "id": 16 + }, + { + "id": 32 + }, + { + "id": 25 + } + ], + [ + { + "id": 17 + }, + { + "id": 24 + }, + { + "id": 33 + }, + { + "id": 26 + } + ], + [ + { + "id": 18 + }, + { + "id": 25 + }, + { + "id": 34 + }, + { + "id": 27 + } + ], + [ + { + "id": 19 + }, + { + "id": 26 + }, + { + "id": 35 + }, + { + "id": 28 + } + ], + [ + { + "id": 20 + }, + { + "id": 27 + }, + { + "id": 36 + }, + { + "id": 29 + } + ], + [ + { + "id": 21 + }, + { + "id": 28 + }, + { + "id": 37 + }, + { + "id": 30 + } + ], + [ + { + "id": 22 + }, + { + "id": 29 + }, + { + "id": 38 + }, + { + "id": 31 + } + ], + [ + { + "id": 23 + }, + { + "id": 30 + }, + { + "id": 39 + } + ], + [ + { + "id": 24 + }, + { + "id": 40 + }, + { + "id": 33 + } + ], + [ + { + "id": 25 + }, + { + "id": 32 + }, + { + "id": 41 + }, + { + "id": 34 + } + ], + [ + { + "id": 26 + }, + { + "id": 33 + }, + { + "id": 42 + }, + { + "id": 35 + } + ], + [ + { + "id": 27 + }, + { + "id": 34 + }, + { + "id": 43 + }, + { + "id": 36 + } + ], + [ + { + "id": 28 + }, + { + "id": 35 + }, + { + "id": 44 + }, + { + "id": 37 + } + ], + [ + { + "id": 29 + }, + { + "id": 36 + }, + { + "id": 45 + }, + { + "id": 38 + } + ], + [ + { + "id": 30 + }, + { + "id": 37 + }, + { + "id": 46 + }, + { + "id": 39 + } + ], + [ + { + "id": 31 + }, + { + "id": 38 + }, + { + "id": 47 + } + ], + [ + { + "id": 32 + }, + { + "id": 48 + }, + { + "id": 41 + } + ], + [ + { + "id": 33 + }, + { + "id": 40 + }, + { + "id": 49 + }, + { + "id": 42 + } + ], + [ + { + "id": 34 + }, + { + "id": 41 + }, + { + "id": 50 + }, + { + "id": 43 + } + ], + [ + { + "id": 35 + }, + { + "id": 42 + }, + { + "id": 51 + }, + { + "id": 44 + } + ], + [ + { + "id": 36 + }, + { + "id": 43 + }, + { + "id": 52 + }, + { + "id": 45 + } + ], + [ + { + "id": 37 + }, + { + "id": 44 + }, + { + "id": 53 + }, + { + "id": 46 + } + ], + [ + { + "id": 38 + }, + { + "id": 45 + }, + { + "id": 54 + }, + { + "id": 47 + } + ], + [ + { + "id": 39 + }, + { + "id": 46 + }, + { + "id": 55 + } + ], + [ + { + "id": 40 + }, + { + "id": 56 + }, + { + "id": 49 + } + ], + [ + { + "id": 41 + }, + { + "id": 48 + }, + { + "id": 57 + }, + { + "id": 50 + } + ], + [ + { + "id": 42 + }, + { + "id": 49 + }, + { + "id": 58 + }, + { + "id": 51 + } + ], + [ + { + "id": 43 + }, + { + "id": 50 + }, + { + "id": 59 + }, + { + "id": 52 + } + ], + [ + { + "id": 44 + }, + { + "id": 51 + }, + { + "id": 60 + }, + { + "id": 53 + } + ], + [ + { + "id": 45 + }, + { + "id": 52 + }, + { + "id": 61 + }, + { + "id": 54 + } + ], + [ + { + "id": 46 + }, + { + "id": 53 + }, + { + "id": 62 + }, + { + "id": 55 + } + ], + [ + { + "id": 47 + }, + { + "id": 54 + }, + { + "id": 63 + } + ], + [ + { + "id": 48 + }, + { + "id": 57 + } + ], + [ + { + "id": 49 + }, + { + "id": 56 + }, + { + "id": 58 + } + ], + [ + { + "id": 50 + }, + { + "id": 57 + }, + { + "id": 59 + } + ], + [ + { + "id": 51 + }, + { + "id": 58 + }, + { + "id": 60 + } + ], + [ + { + "id": 52 + }, + { + "id": 59 + }, + { + "id": 61 + } + ], + [ + { + "id": 53 + }, + { + "id": 60 + }, + { + "id": 62 + } + ], + [ + { + "id": 54 + }, + { + "id": 61 + }, + { + "id": 63 + } + ], + [ + { + "id": 55 + }, + { + "id": 62 + } + ] + ] +} \ No newline at end of file diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index a44a48e..5d83c77 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -15,13 +15,7 @@ from binary_ensemble.bundle import BendlDecoder, BendlEncoder -EXAMPLE_GRAPH = ( - Path(__file__).resolve().parent.parent - / "docs" - / "user" - / "example_data" - / "gerrymandria.json" -) +EXAMPLE_GRAPH = Path(__file__).resolve().parent / "data" / "gerrymandria.json" def _graph(): diff --git a/ben-py/tests/test_docs_snippets.py b/ben-py/tests/test_docs_snippets.py new file mode 100644 index 0000000..79dc430 --- /dev/null +++ b/ben-py/tests/test_docs_snippets.py @@ -0,0 +1,90 @@ +"""Execute every Python code block in the Markdown docs so they can't silently drift. + +The **docs are the single source of truth**. The sample data the recipes read +(``ensemble.bendl``, ``plans.jsonl``, ``chain.ben`` / ``chain.xben``, ``gerrymandria.json``) +is created by the "Sample data" snippet in ``docs/how-to/index.md`` — marked +```` — which is shown to readers *and* run by this test. This runner +contains no fixture-creation logic and no per-page knowledge: it discovers the docs, runs the +setup snippet(s), then runs each page's blocks. Editing the docs never requires editing this +test; if a recipe needs new sample data, that goes in the setup snippet (in the docs). + +For each page the ```python fences run in order, sharing one namespace, in a fresh temp +working directory. A failing snippet fails the test with the page, block number, and source. +A page that imports GerryChain is skipped only when GerryChain isn't installed. A block may be +opted out with ```` (reserved for genuinely abstract fragments). +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + +DOCS_DIR = Path(__file__).resolve().parent.parent / "docs" + +try: + import gerrychain # noqa: F401 + + HAS_GERRYCHAIN = True +except Exception: # pragma: no cover - environment-dependent + HAS_GERRYCHAIN = False + +# A python fence, optionally preceded by a "setup"/"skip" docs-test directive comment. +_BLOCK = re.compile( + r"(?:\s*)?```python\n(?P.*?)\n```", + re.DOTALL, +) + + +def _blocks(text: str): + """Yield ``(directive, code)`` for each python fence; directive is 'setup'/'skip'/None.""" + for match in _BLOCK.finditer(text): + yield match.group("directive"), match.group("code") + + +def _markdown_files() -> list[Path]: + return sorted(DOCS_DIR.rglob("*.md")) + + +# The shared "Sample data" setup, taken from the docs themselves (single source of truth). +_SETUP_CODE = "\n".join( + code + for path in _markdown_files() + for directive, code in _blocks(path.read_text()) + if directive == "setup" +) + + +@pytest.mark.parametrize( + "doc", _markdown_files(), ids=lambda p: str(p.relative_to(DOCS_DIR)) +) +def test_markdown_snippets_execute(doc: Path, tmp_path, monkeypatch) -> None: + runnable = [ + (i, code) + for i, (directive, code) in enumerate(_blocks(doc.read_text()), start=1) + if directive is None + ] + if not runnable: + pytest.skip("no runnable python snippets") + if not HAS_GERRYCHAIN and any("gerrychain" in code for _, code in runnable): + pytest.skip("page needs GerryChain, which is not installed") + + monkeypatch.chdir(tmp_path) + # Create the sample files from the docs' own setup snippet. It runs in a throwaway + # namespace so only its files (not its variables) are visible to the page — a snippet + # that relies on an undefined name then fails honestly instead of being masked. + exec( + compile(_SETUP_CODE, "docs:sample-data-setup", "exec"), + {"__name__": "__setup__"}, + ) + + namespace: dict = {"__name__": "__docs_snippet__"} + for index, code in runnable: + try: + exec(compile(code, f"{doc.name}:block{index}", "exec"), namespace) + except Exception as exc: # noqa: BLE001 - surface as a readable test failure + pytest.fail( + f"{doc.relative_to(DOCS_DIR)} python block #{index} failed: " + f"{type(exc).__name__}: {exc}\n\n--- snippet ---\n{code}\n" + ) diff --git a/ben-py/tests/test_graph.py b/ben-py/tests/test_graph.py index bbe3559..a0a45ed 100644 --- a/ben-py/tests/test_graph.py +++ b/ben-py/tests/test_graph.py @@ -9,13 +9,7 @@ from binary_ensemble import graph as g -EXAMPLE_GRAPH = ( - Path(__file__).resolve().parent.parent - / "docs" - / "user" - / "example_data" - / "gerrymandria.json" -) +EXAMPLE_GRAPH = Path(__file__).resolve().parent / "data" / "gerrymandria.json" def _graph(): diff --git a/ben-py/tests/test_recompress.py b/ben-py/tests/test_recompress.py index 3da797d..21fc139 100644 --- a/ben-py/tests/test_recompress.py +++ b/ben-py/tests/test_recompress.py @@ -9,13 +9,7 @@ from binary_ensemble.bundle import BendlDecoder, BendlEncoder, compress_stream -EXAMPLE_GRAPH = ( - Path(__file__).resolve().parent.parent - / "docs" - / "user" - / "example_data" - / "gerrymandria.json" -) +EXAMPLE_GRAPH = Path(__file__).resolve().parent / "data" / "gerrymandria.json" def _graph(): diff --git a/ben-py/tests/test_relabel.py b/ben-py/tests/test_relabel.py index 7754bcb..36e4b58 100644 --- a/ben-py/tests/test_relabel.py +++ b/ben-py/tests/test_relabel.py @@ -14,13 +14,7 @@ relabel_bundle, ) -EXAMPLE_GRAPH = ( - Path(__file__).resolve().parent.parent - / "docs" - / "user" - / "example_data" - / "gerrymandria.json" -) +EXAMPLE_GRAPH = Path(__file__).resolve().parent / "data" / "gerrymandria.json" def _graph(): diff --git a/ben-py/uv.lock b/ben-py/uv.lock index aa1c51f..9776957 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -6,6 +6,18 @@ resolution-markers = [ "python_full_version < '3.12'", ] +[[package]] +name = "accessible-pygments" +version = "0.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/c1/bbac6a50d02774f91572938964c582fff4270eee73ab822a4aeea4d8b11b/accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872", size = 1377899, upload-time = "2024-05-10T11:23:10.216Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/3f/95338030883d8c8b91223b4e21744b04d11b161a3ef117295d8241f50ab4/accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7", size = 1395903, upload-time = "2024-05-10T11:23:08.421Z" }, +] + [[package]] name = "alabaster" version = "1.0.0" @@ -24,30 +36,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, ] -[[package]] -name = "astroid" -version = "3.3.11" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.12'", -] -sdist = { url = "https://files.pythonhosted.org/packages/18/74/dfb75f9ccd592bbedb175d4a32fc643cf569d7c218508bfbd6ea7ef9c091/astroid-3.3.11.tar.gz", hash = "sha256:1e5a5011af2920c7c67a53f65d536d65bfa7116feeaf2354d8b94f29573bb0ce", size = 400439, upload-time = "2025-07-13T18:04:23.177Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/af/0f/3b8fdc946b4d9cc8cc1e8af42c4e409468c84441b933d037e101b3d72d86/astroid-3.3.11-py3-none-any.whl", hash = "sha256:54c760ae8322ece1abd213057c4b5bba7c49818853fc901ef09719a60dbf9dec", size = 275612, upload-time = "2025-07-13T18:04:21.07Z" }, -] - -[[package]] -name = "astroid" -version = "4.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a7/d1/6eee8726a863f28ff50d26c5eacb1a590f96ccbb273ce0a8c047ffb10f5a/astroid-4.0.1.tar.gz", hash = "sha256:0d778ec0def05b935e198412e62f9bcca8b3b5c39fdbe50b0ba074005e477aab", size = 405414, upload-time = "2025-10-11T15:15:42.6Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/f4/034361a9cbd9284ef40c8ad107955ede4efae29cbc17a059f63f6569c06a/astroid-4.0.1-py3-none-any.whl", hash = "sha256:37ab2f107d14dc173412327febf6c78d39590fdafcb44868f03b6c03452e3db0", size = 276268, upload-time = "2025-10-11T15:15:40.585Z" }, -] - [[package]] name = "asttokens" version = "3.0.0" @@ -97,14 +85,18 @@ dependencies = [ [package.optional-dependencies] docs = [ + { name = "furo" }, + { name = "linkify-it-py" }, { name = "myst-nb" }, - { name = "myst-parser" }, - { name = "nbconvert" }, - { name = "recommonmark" }, { name = "sphinx" }, - { name = "sphinx-autoapi" }, { name = "sphinx-copybutton" }, - { name = "sphinx-rtd-theme" }, + { name = "sphinx-design" }, + { name = "sphinxext-opengraph" }, +] +docs-exec = [ + { name = "gerrychain" }, + { name = "ipykernel" }, + { name = "tqdm" }, ] [package.dev-dependencies] @@ -120,17 +112,19 @@ dev = [ [package.metadata] requires-dist = [ + { name = "furo", marker = "extra == 'docs'", specifier = ">=2024.8.6" }, + { name = "gerrychain", marker = "extra == 'docs-exec'", specifier = ">=0.3.2" }, + { name = "ipykernel", marker = "extra == 'docs-exec'", specifier = ">=6.29.5" }, + { name = "linkify-it-py", marker = "extra == 'docs'", specifier = ">=2.0.3" }, { name = "myst-nb", marker = "extra == 'docs'", specifier = ">=1.3.0" }, - { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=4.0.1" }, - { name = "nbconvert", marker = "extra == 'docs'", specifier = ">=7.16.6" }, { name = "networkx", specifier = ">=3.0" }, - { name = "recommonmark", marker = "extra == 'docs'", specifier = ">=0.7.1" }, { name = "sphinx", marker = "extra == 'docs'", specifier = ">=8.2.3" }, - { name = "sphinx-autoapi", marker = "extra == 'docs'", specifier = ">=3.6.1" }, { name = "sphinx-copybutton", marker = "extra == 'docs'", specifier = ">=0.5.2" }, - { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=3.0.2" }, + { name = "sphinx-design", marker = "extra == 'docs'", specifier = ">=0.6.1" }, + { name = "sphinxext-opengraph", marker = "extra == 'docs'", specifier = ">=0.9.1" }, + { name = "tqdm", marker = "extra == 'docs-exec'", specifier = ">=4.67.1" }, ] -provides-extras = ["docs"] +provides-extras = ["docs", "docs-exec"] [package.metadata.requires-dev] dev = [ @@ -143,23 +137,6 @@ dev = [ { name = "tqdm", specifier = ">=4.67.1" }, ] -[[package]] -name = "bleach" -version = "6.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "webencodings" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/76/9a/0e33f5054c54d349ea62c277191c020c2d6ef1d65ab2cb1993f91ec846d1/bleach-6.2.0.tar.gz", hash = "sha256:123e894118b8a599fd80d3ec1a6d4cc7ce4e5882b1317a7e1ba69b56e95f991f", size = 203083, upload-time = "2024-10-29T18:30:40.477Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/55/96142937f66150805c25c4d0f31ee4132fd33497753400734f9dfdcbdc66/bleach-6.2.0-py3-none-any.whl", hash = "sha256:117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e", size = 163406, upload-time = "2024-10-29T18:30:38.186Z" }, -] - -[package.optional-dependencies] -css = [ - { name = "tinycss2" }, -] - [[package]] name = "certifi" version = "2025.10.5" @@ -342,15 +319,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] -[[package]] -name = "commonmark" -version = "0.9.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/60/48/a60f593447e8f0894ebb7f6e6c1f25dafc5e89c5879fdc9360ae93ff83f0/commonmark-0.9.1.tar.gz", hash = "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60", size = 95764, upload-time = "2019-10-04T15:37:39.817Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/92/dfd892312d822f36c55366118b95d914e5f16de11044a27cf10a7d71bbbf/commonmark-0.9.1-py2.py3-none-any.whl", hash = "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9", size = 51068, upload-time = "2019-10-04T15:37:37.674Z" }, -] - [[package]] name = "contourpy" version = "1.3.3" @@ -476,15 +444,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, ] -[[package]] -name = "defusedxml" -version = "0.7.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, -] - [[package]] name = "docutils" version = "0.21.2" @@ -561,6 +520,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/93/0dd45cd283c32dea1545151d8c3637b4b8c53cdb3a625aeb2885b184d74d/fonttools-4.60.1-py3-none-any.whl", hash = "sha256:906306ac7afe2156fcf0042173d6ebbb05416af70f6b370967b47f8f00103bbb", size = 1143175, upload-time = "2025-09-29T21:13:24.134Z" }, ] +[[package]] +name = "furo" +version = "2025.12.19" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accessible-pygments" }, + { name = "beautifulsoup4" }, + { name = "pygments" }, + { name = "sphinx" }, + { name = "sphinx-basic-ng" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ec/20/5f5ad4da6a5a27c80f2ed2ee9aee3f9e36c66e56e21c00fde467b2f8f88f/furo-2025.12.19.tar.gz", hash = "sha256:188d1f942037d8b37cd3985b955839fea62baa1730087dc29d157677c857e2a7", size = 1661473, upload-time = "2025-12-19T17:34:40.889Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/b2/50e9b292b5cac13e9e81272c7171301abc753a60460d21505b606e15cf21/furo-2025.12.19-py3-none-any.whl", hash = "sha256:bb0ead5309f9500130665a26bee87693c41ce4dbdff864dbfb6b0dae4673d24f", size = 339262, upload-time = "2025-12-19T17:34:38.905Z" }, +] + [[package]] name = "geopandas" version = "1.1.1" @@ -862,15 +837,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" }, ] -[[package]] -name = "jupyterlab-pygments" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/90/51/9187be60d989df97f5f0aba133fa54e7300f17616e065d1ada7d7646b6d6/jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d", size = 512900, upload-time = "2023-11-23T09:26:37.44Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/dd/ead9d8ea85bf202d90cc513b533f9c363121c7792674f78e0d8a854b63b4/jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780", size = 15884, upload-time = "2023-11-23T09:26:34.325Z" }, -] - [[package]] name = "jupyterlab-widgets" version = "3.0.15" @@ -970,6 +936,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/e9/0d4add7873a73e462aeb45c036a2dead2562b825aa46ba326727b3f31016/kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1", size = 73929, upload-time = "2025-08-10T21:27:48.236Z" }, ] +[[package]] +name = "linkify-it-py" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "uc-micro-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/c9/06ea13676ef354f0af6169587ae292d3e2406e212876a413bf9eece4eb23/linkify_it_py-2.1.0.tar.gz", hash = "sha256:43360231720999c10e9328dc3691160e27a718e280673d444c38d7d3aaa3b98b", size = 29158, upload-time = "2026-03-01T07:48:47.683Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/de/88b3be5c31b22333b3ca2f6ff1de4e863d8fe45aaea7485f591970ec1d3e/linkify_it_py-2.1.0-py3-none-any.whl", hash = "sha256:0d252c1594ecba2ecedc444053db5d3a9b7ec1b0dd929c8f1d74dce89f86c05e", size = 19878, upload-time = "2026-03-01T07:48:46.098Z" }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1174,15 +1152,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] -[[package]] -name = "mistune" -version = "3.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/02/a7fb8b21d4d55ac93cdcde9d3638da5dd0ebdd3a4fed76c7725e10b81cbe/mistune-3.1.4.tar.gz", hash = "sha256:b5a7f801d389f724ec702840c11d8fc48f2b33519102fc7ee739e8177b672164", size = 94588, upload-time = "2025-08-29T07:20:43.594Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/f0/8282d9641415e9e33df173516226b404d367a0fc55e1a60424a152913abc/mistune-3.1.4-py3-none-any.whl", hash = "sha256:93691da911e5d9d2e23bc54472892aff676df27a75274962ff9edc210364266d", size = 53481, upload-time = "2025-08-29T07:20:42.218Z" }, -] - [[package]] name = "myst-nb" version = "1.3.0" @@ -1236,31 +1205,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/6d/e7fa07f03a4a7b221d94b4d586edb754a9b0dc3c9e2c93353e9fa4e0d117/nbclient-0.10.2-py3-none-any.whl", hash = "sha256:4ffee11e788b4a27fabeb7955547e4318a5298f34342a4bfd01f2e1faaeadc3d", size = 25434, upload-time = "2024-12-19T10:32:24.139Z" }, ] -[[package]] -name = "nbconvert" -version = "7.16.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "beautifulsoup4" }, - { name = "bleach", extra = ["css"] }, - { name = "defusedxml" }, - { name = "jinja2" }, - { name = "jupyter-core" }, - { name = "jupyterlab-pygments" }, - { name = "markupsafe" }, - { name = "mistune" }, - { name = "nbclient" }, - { name = "nbformat" }, - { name = "packaging" }, - { name = "pandocfilters" }, - { name = "pygments" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/59/f28e15fc47ffb73af68a8d9b47367a8630d76e97ae85ad18271b9db96fdf/nbconvert-7.16.6.tar.gz", hash = "sha256:576a7e37c6480da7b8465eefa66c17844243816ce1ccc372633c6b71c3c0f582", size = 857715, upload-time = "2025-01-28T09:29:14.724Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/9a/cd673b2f773a12c992f41309ef81b99da1690426bd2f96957a7ade0d3ed7/nbconvert-7.16.6-py3-none-any.whl", hash = "sha256:1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b", size = 258525, upload-time = "2025-01-28T09:29:12.551Z" }, -] - [[package]] name = "nbformat" version = "5.10.4" @@ -1438,15 +1382,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, ] -[[package]] -name = "pandocfilters" -version = "1.5.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/70/6f/3dd4940bbe001c06a65f88e36bad298bc7a0de5036115639926b0c5c0458/pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e", size = 8454, upload-time = "2024-01-18T20:08:13.726Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size = 8663, upload-time = "2024-01-18T20:08:11.28Z" }, -] - [[package]] name = "parso" version = "0.8.5" @@ -1911,20 +1846,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" }, ] -[[package]] -name = "recommonmark" -version = "0.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "commonmark" }, - { name = "docutils" }, - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/00/3dd2bdc4184b0ce754b5b446325abf45c2e0a347e022292ddc44670f628c/recommonmark-0.7.1.tar.gz", hash = "sha256:bdb4db649f2222dcd8d2d844f0006b958d627f732415d399791ee436a3686d67", size = 34444, upload-time = "2020-12-17T19:24:56.523Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/77/ed589c75db5d02a77a1d5d2d9abc63f29676467d396c64277f98b50b79c2/recommonmark-0.7.1-py2.py3-none-any.whl", hash = "sha256:1b1db69af0231efce3fa21b94ff627ea33dee7079a01dd0a7f8482c3da148b3f", size = 10214, upload-time = "2020-12-17T19:24:55.137Z" }, -] - [[package]] name = "referencing" version = "0.37.0" @@ -2282,19 +2203,15 @@ wheels = [ ] [[package]] -name = "sphinx-autoapi" -version = "3.6.1" +name = "sphinx-basic-ng" +version = "1.0.0b2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "astroid", version = "3.3.11", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, - { name = "astroid", version = "4.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, - { name = "jinja2" }, - { name = "pyyaml" }, { name = "sphinx" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a9/ad/c627976d5f4d812b203ef1136108bbd81ef9bbbfd3f700f1295c322c22e6/sphinx_autoapi-3.6.1.tar.gz", hash = "sha256:1ff2992b7d5e39ccf92413098a376e0f91e7b4ca532c4f3e71298dbc8a4a9900", size = 55456, upload-time = "2025-10-06T16:21:22.888Z" } +sdist = { url = "https://files.pythonhosted.org/packages/98/0b/a866924ded68efec7a1759587a4e478aec7559d8165fac8b2ad1c0e774d6/sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9", size = 20736, upload-time = "2023-07-08T18:40:54.166Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ca/89/aea2f346fcdb44eb72464842e106b6291b2687feec2dd8b2de920ab89f28/sphinx_autoapi-3.6.1-py3-none-any.whl", hash = "sha256:6b7af0d5650f6eac1f4b85c1eb9f9a4911160ec7138bdc4451c77a5e94d5832c", size = 35334, upload-time = "2025-10-06T16:21:21.33Z" }, + { url = "https://files.pythonhosted.org/packages/3c/dd/018ce05c532a22007ac58d4f45232514cd9d6dd0ee1dc374e309db830983/sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b", size = 22496, upload-time = "2023-07-08T18:40:52.659Z" }, ] [[package]] @@ -2310,17 +2227,15 @@ wheels = [ ] [[package]] -name = "sphinx-rtd-theme" -version = "3.0.2" +name = "sphinx-design" +version = "0.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "docutils" }, { name = "sphinx" }, - { name = "sphinxcontrib-jquery" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/91/44/c97faec644d29a5ceddd3020ae2edffa69e7d00054a8c7a6021e82f20335/sphinx_rtd_theme-3.0.2.tar.gz", hash = "sha256:b7457bc25dda723b20b086a670b9953c859eab60a2a03ee8eb2bb23e176e5f85", size = 7620463, upload-time = "2024-11-13T11:06:04.545Z" } +sdist = { url = "https://files.pythonhosted.org/packages/13/7b/804f311da4663a4aecc6cf7abd83443f3d4ded970826d0c958edc77d4527/sphinx_design-0.7.0.tar.gz", hash = "sha256:d2a3f5b19c24b916adb52f97c5f00efab4009ca337812001109084a740ec9b7a", size = 2203582, upload-time = "2026-01-19T13:12:53.297Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/77/46e3bac77b82b4df5bb5b61f2de98637724f246b4966cfc34bc5895d852a/sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl", hash = "sha256:422ccc750c3a3a311de4ae327e82affdaf59eb695ba4936538552f3b00f4ee13", size = 7655561, upload-time = "2024-11-13T11:06:02.094Z" }, + { url = "https://files.pythonhosted.org/packages/30/cf/45dd359f6ca0c3762ce0490f681da242f0530c49c81050c035c016bfdd3a/sphinx_design-0.7.0-py3-none-any.whl", hash = "sha256:f82bf179951d58f55dca78ab3706aeafa496b741a91b1911d371441127d64282", size = 2220350, upload-time = "2026-01-19T13:12:51.077Z" }, ] [[package]] @@ -2350,18 +2265,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" }, ] -[[package]] -name = "sphinxcontrib-jquery" -version = "4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/de/f3/aa67467e051df70a6330fe7770894b3e4f09436dea6881ae0b4f3d87cad8/sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", size = 122331, upload-time = "2023-03-14T15:01:01.944Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/85/749bd22d1a68db7291c89e2ebca53f4306c3f205853cf31e9de279034c3c/sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae", size = 121104, upload-time = "2023-03-14T15:01:00.356Z" }, -] - [[package]] name = "sphinxcontrib-jsmath" version = "1.0.1" @@ -2389,6 +2292,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, ] +[[package]] +name = "sphinxext-opengraph" +version = "0.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/c0/eb6838e3bae624ce6c8b90b245d17e84252863150e95efdb88f92c8aa3fb/sphinxext_opengraph-0.13.0.tar.gz", hash = "sha256:103335d08567ad8468faf1425f575e3b698e9621f9323949a6c8b96d9793e80b", size = 1026875, upload-time = "2025-08-29T12:20:31.066Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/a4/66c1fd4f8fab88faf71cee04a945f9806ba0fef753f2cfc8be6353f64508/sphinxext_opengraph-0.13.0-py3-none-any.whl", hash = "sha256:936c07828edc9ad9a7b07908b29596dc84ed0b3ceaa77acdf51282d232d4d80e", size = 1004152, upload-time = "2025-08-29T12:20:29.072Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.44" @@ -2449,18 +2364,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] -[[package]] -name = "tinycss2" -version = "1.4.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "webencodings" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" }, -] - [[package]] name = "tornado" version = "6.5.2" @@ -2519,6 +2422,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] +[[package]] +name = "uc-micro-py" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/78/67/9a363818028526e2d4579334460df777115bdec1bb77c08f9db88f6389f2/uc_micro_py-2.0.0.tar.gz", hash = "sha256:c53691e495c8db60e16ffc4861a35469b0ba0821fe409a8a7a0a71864d33a811", size = 6611, upload-time = "2026-03-01T06:31:27.526Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/73/d21edf5b204d1467e06500080a50f79d49ef2b997c79123a536d4a17d97c/uc_micro_py-2.0.0-py3-none-any.whl", hash = "sha256:3603a3859af53e5a39bc7677713c78ea6589ff188d70f4fee165db88e22b242c", size = 6383, upload-time = "2026-03-01T06:31:26.257Z" }, +] + [[package]] name = "urllib3" version = "2.5.0" @@ -2537,15 +2449,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, ] -[[package]] -name = "webencodings" -version = "0.5.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload-time = "2017-04-05T20:21:34.189Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" }, -] - [[package]] name = "widgetsnbextension" version = "4.0.14" diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 01e4313..1e6f556 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1428,7 +1428,9 @@ fn reben_cli_generates_map_from_dual_graph_and_reports_invalid_flag_combinations temp.path(), ); assert_failure(&missing_dual_graph); - assert!(String::from_utf8_lossy(&missing_dual_graph.stderr).contains("No dual-graph file provided")); + assert!( + String::from_utf8_lossy(&missing_dual_graph.stderr).contains("No dual-graph file provided") + ); let sorted_json: Value = serde_json::from_str(&fs::read_to_string(generated_graph).unwrap()).unwrap(); diff --git a/docs/ben-format-spec.md b/docs/ben-format-spec.md new file mode 100644 index 0000000..a3126fa --- /dev/null +++ b/docs/ben-format-spec.md @@ -0,0 +1,256 @@ +# BEN Stream Format Specification + +## Status + +Stable wire format. This document specifies the on-disk byte layout of a BEN stream for the two +snapshot variants, **Standard** and **MkvChain**. These two variants share every layer of the +encoding except the per-frame repetition count and the inter-sample constraint; they are documented +together because they differ by a single field on the wire. + +The **TwoDelta** variant is a delta encoding with a different frame shape and a different XBEN body +layout. It is out of scope here and specified separately. + +This specification covers the `.ben` container and the BEN32 body carried inside a `.xben` +container. It does not cover the `.bendl` bundle container, which embeds a BEN/XBEN stream as an +opaque payload; see the BENDL format specification for that. + +## Design Goals + +- A compact, self-describing encoding of an ensemble of district assignments. +- Per-frame headers that allow frame-level subsampling without unpacking payload bits. +- A repetition count (MkvChain) that collapses identical consecutive samples from full-chain + samplers into a single frame, while preserving the expanded sample count. +- A streamable layout: frames can be appended one at a time and read back without a global index. + +## Terminology + +This document uses the workspace glossary. The terms that matter most here: + +- **assignment** — a length-N `Vec` where index *i* is the district id of dual-graph node *i*. +- **district id** — an integer value stored in an assignment. Range `0..=65535`. +- **sample** — one `(sample_number, assignment)` pair. `sample_number` lives in *expanded* space. +- **sample count** — the *expanded* number of samples: a MkvChain frame with `count = 5` contributes + 5, not 1. +- **variant** — `Standard` or `MkvChain` here. One variant per stream, fixed by the banner. +- **banner** — the 17-byte ASCII stream identifier. Distinct from a BENDL **magic**. +- **frame header** — the leading bytes of one frame (bit-width fields and payload length). +- **frame payload** — the bit-packed bytes after the frame header. + +The encoding stack is layered as in the glossary: + +| Layer | Name | What it is here | |---|---|---| | 0 | bit-packing | run values and run lengths +crammed into bit-precise widths | | 1 | RLE | `(value, length)` pairs over an assignment | | 2 | +frame | one sample's bytes: frame header + payload, plus a `u16` count for `MkvChain` | | 3 | stream +| banner + concatenated frames; the contents of a `.ben` file | | 4 | container | the on-disk file: +`.ben`, or `.xben` (the stream wrapped in LZMA2) | + +## Byte Order + +Multi-byte integers in the frame header and the trailing count are **big-endian**. The bit-packed +payload is filled most-significant-bit first (see **Frame Payload**). + +This differs from the BENDL bundle header, which is little-endian. The two formats are independent. + +## Stream Layout + +A BEN stream (the contents of a `.ben` file, or the LZMA2-decompressed body of a `.xben` file in its +BEN32 form) is: + +```text +[17-byte Banner] +[Frame 1] +[Frame 2] +... +[Frame N] +``` + +There is no stream-level length prefix, frame count, or trailing terminator. The stream ends at a +frame boundary; a reader that reaches end-of-input while attempting to read the first byte of the +next frame has reached a clean end of stream. + +### Banner + +The first 17 bytes are an ASCII banner that fixes the variant for the entire stream: + +```text +offset size field +0 17 banner +``` + +- `STANDARD BEN FILE` — Standard variant. +- `MKVCHAIN BEN FILE` — MkvChain variant. + +(`TWODELTA BEN FILE` denotes the TwoDelta variant, specified elsewhere.) + +A reader MUST reject a stream whose first 17 bytes are not one of the known banners. + +## Run-Length Encoding (Layer 1) + +Before bit-packing, an assignment is converted to a vector of `(value, length)` runs, where `value` +is a district id and `length` is the number of consecutive nodes that carry it, in node order. + +Example: `[1, 1, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3]` becomes `[(1, 3), (2, 4), (3, 1), (1, 1), (3, 3)]`. + +Both `value` and `length` are `u16`. A run longer than `65535` is split into consecutive runs of the +same value, each at most `65535` long. The assignment length N is **not** stored anywhere in the +frame; it is recovered as the sum of all run lengths. Readers MUST reconstruct N this way and MUST +NOT assume a fixed N across frames. + +## Frame Layout (Layer 2) + +Both variants share the same 6-byte frame header and bit-packed payload. MkvChain appends a 2-byte +repetition count; Standard does not. + +### Standard frame + +```text +offset size field +0 1 max_val_bit_count +1 1 max_len_bit_count +2 4 n_bytes +6 ... payload (n_bytes bytes) +``` + +A Standard frame is exactly `6 + n_bytes` bytes. + +### MkvChain frame + +```text +offset size field +0 1 max_val_bit_count +1 1 max_len_bit_count +2 4 n_bytes +6 ... payload (n_bytes bytes) +6+n_bytes 2 count +``` + +An MkvChain frame is exactly `6 + n_bytes + 2` bytes. + +### Frame header fields + +- `max_val_bit_count` — the number of bits used to encode each run's district id in the payload. + Computed as the bit width of the largest district id in this frame, with a floor of `1` (so a + frame of all-zero district ids still uses 1 bit per value). Range `1..=16`. +- `max_len_bit_count` — the number of bits used to encode each run's length in the payload. Computed + as the bit width of the largest run length in this frame, with a floor of `1`. Range `1..=16`. +- `n_bytes` — the exact byte length of the bit-packed payload that follows the header (`u32`, + big-endian). Equal to `ceil((max_val_bit_count + max_len_bit_count) * n_runs / 8)`. +- `count` *(MkvChain only)* — the number of identical consecutive samples this frame represents + (`u16`, big-endian). MUST be `>= 1`; a reader MUST treat `count == 0` as a corrupt frame and + error. The frame's assignment is emitted `count` times, and the stream's expanded sample count + increases by `count`. + +A Standard frame always represents exactly one sample. It carries no count on the wire; readers +treat its count as `1`. + +## Frame Payload (Layer 0) + +The payload is the RLE run vector bit-packed at the widths declared in the header. For each run, in +order, the encoder emits: + +1. the district id in `max_val_bit_count` bits, then +1. the run length in `max_len_bit_count` bits. + +Bits are packed most-significant-bit first into a byte stream: the first run's value occupies the +high bits of byte 0. After the final run, any leftover bits in the last byte are zero-padded on the +low side to reach a byte boundary. `n_bytes` counts that final padded byte. + +Because each run occupies a fixed `max_val_bit_count + max_len_bit_count` bits, a decoder reads runs +back by consuming that many bits at a time until it has consumed `n_bytes` worth of payload, +ignoring the trailing zero-pad bits of the final byte. The run vector is then expanded into the +assignment by repeating each `value` `length` times. + +### Worked example + +Take the assignment `[1, 1, 1, 2, 2, 2, 2, 3]`. + +- RLE: `[(1, 3), (2, 4), (3, 1)]`. +- Largest district id is `3` → `max_val_bit_count = 2`. Largest run length is `4` → + `max_len_bit_count = 3`. Each run takes `2 + 3 = 5` bits; 3 runs = 15 bits → `n_bytes = 2`. +- Bit string, value then length per run (MSB first): `01 011 10 100 11 001` = `01011 10100 11001` + → pad to 16 bits → `0101 1101 0011 0010`. +- Payload bytes: `0x5D 0x32`. +- Standard frame: `02 03 00 00 00 02 5D 32`. +- MkvChain frame for the same sample repeated 4 times: `02 03 00 00 00 02 5D 32 00 04`. + +## XBEN Body (BEN32 Intermediate) + +A `.xben` file wraps a BEN stream in LZMA2. For Standard and MkvChain, the bytes inside the LZMA2 +stream are **not** the bit-packed layer-2 frames above; they are the **BEN32 intermediate**, a +fixed-width columnar form that compresses better under LZMA2. (TwoDelta uses a different XBEN body +and is out of scope here.) + +The decompressed BEN32 body is: + +```text +[17-byte Banner] +[BEN32 Frame 1] +[BEN32 Frame 2] +... +``` + +The banner is the same 17-byte identifier as in the plain `.ben` stream and sits inside the +compressed payload. + +A BEN32 frame is a sequence of 4-byte runs followed by a 4-byte zero sentinel: + +```text +[run: u16 value BE][run: u16 length BE] (repeated, one per RLE run) +[00 00 00 00] (4-byte zero sentinel: end of frame) +``` + +For MkvChain, a `u16` big-endian `count` follows the sentinel: + +```text +... [00 00 00 00][count: u16 BE] +``` + +The zero sentinel is unambiguous because a valid run never has length `0`. As in the native frame +layout, MkvChain `count` MUST be `>= 1`. + +The `stream_checksum` recorded by a BENDL bundle for an embedded XBEN stream is computed over these +compressed bytes, not over the decompressed BEN32 body. + +## Reader Rules + +A reader MUST: + +1. Read and validate the 17-byte banner; reject unknown banners. The banner fixes the variant for + the whole stream. +1. Read frames in the variant's wire format until a clean end of input at a frame boundary. +1. For each frame, recover the assignment by unpacking `n_bytes` of payload at the declared bit + widths and expanding the runs; the assignment length is the sum of run lengths. +1. For MkvChain, read the trailing `u16` count after the payload, reject `count == 0`, emit the + assignment `count` times, and add `count` to the expanded sample count. For Standard, treat the + count as `1`. + +A reader MUST surface an error (not a truncated result) if input ends partway through a frame +header, payload, or trailing count. + +Frame-level subsampling does not require unpacking payload bits: a reader can skip a frame by +reading its 6-byte header, seeking past `n_bytes` (and, for MkvChain, the 2-byte count), and only +unpacking the payloads of frames it keeps. + +## Relationship Between the Variants + +A Standard stream and a MkvChain stream are wire-incompatible: they carry different banners, and +MkvChain frames are 2 bytes longer. Semantically, a Standard stream is equivalent to a MkvChain +stream in which every frame has `count = 1` — MkvChain only adds value when a sampler produces +identical consecutive samples (for example, MCMC self-loops from rejected proposals). A converter +that re-encodes between the two MUST expand each MkvChain frame's `count` into that many Standard +frames, and conversely MAY collapse runs of identical Standard frames into a single MkvChain frame +with the appropriate count. + +## Versioning Strategy + +The frame header shape, the bit-packing rule, and the BEN32 body layout are contractual: committed +fixtures encoded under a stable major version MUST continue to decode in every later release of that +major version. A change to the frame header shape, the bit-packing convention, or the BEN32 layout +is a breaking change that requires a new fixture set under a new major version; existing fixtures +are never regenerated in place. See the format-stability policy. + +## Out of Scope + +- The TwoDelta variant (different frame shape and XBEN body). +- The BENDL bundle container that embeds a BEN/XBEN stream as an opaque payload. +- LZMA2 framing details; XBEN treats LZMA2 as an opaque outer wrapper around the BEN32 body. diff --git a/docs/bendl-format-spec.md b/docs/bendl-format-spec.md index 6a5e56b..91d0dcd 100644 --- a/docs/bendl-format-spec.md +++ b/docs/bendl-format-spec.md @@ -1,23 +1,27 @@ -# BENDL Format Specification Draft +# BENDL Format Specification ## Status -Draft design for a future `.bendl` file format. +Stable wire format. This document specifies the implemented v1 `.bendl` container as produced and +consumed by the bundle reader, writer, and appender, and pinned by the committed v1.0.0 stability +fixtures. -This document defines a concrete binary layout for a single-file dataset container that: +A `.bendl` is a single-file dataset container that: - feels like one file to users -- keeps metadata and optional assets accessible near the front -- stores the assignment stream at the end +- keeps metadata and optional assets in the same container +- stores the assignment stream as a bounded embedded payload - supports interrupted writes - can be finalized by patching the header -This specification is intentionally separate from the existing `.ben` and `.xben` formats. +The `.bendl` container is distinct from the `.ben` and `.xben` stream formats. It embeds a BEN or +XBEN stream as an opaque payload and records which one in its header; the stream's own layout +(banner and frames) is specified in the BEN stream format specification. ## Design Goals - Single-file dataset container. -- Efficient access to front-loaded metadata. +- Directory-indexed access to metadata and optional assets. - Stream-friendly assignment payloads. - Recoverable partial files after interruption. - Forward-compatible directory structure. @@ -26,10 +30,11 @@ This specification is intentionally separate from the existing `.ben` and `.xben ## Terminology - `bundle`: a `.bendl` file. -- `asset`: a named front-loaded object such as a graph or relabel map. -- `assignment stream`: the trailing BEN or XBEN payload. +- `asset`: a named object such as a graph or node permutation map. +- `assignment stream`: the embedded BEN or XBEN payload. - `finalized bundle`: a bundle whose header has been patched to indicate successful completion. -- `incomplete bundle`: a bundle whose assignment stream may still be usable, but whose final size/count information is not authoritative. +- `incomplete bundle`: a bundle whose assignment stream may still be usable, but whose final + size/count information is not authoritative. ## File Layout @@ -37,12 +42,15 @@ A `.bendl` file is laid out as: ```text [Fixed Header] -[Directory Table] [Asset Payloads] [Assignment Stream] +[Directory Table] ``` -The assignment stream is always the final data region in the file. +The directory table is normally the final data region in a finalized bundle. A post-finalize append +writes new asset payloads and a replacement directory after the old EOF, then patches the header +last; if that final patch fails, the old directory remains authoritative and newer bytes are +orphaned. ## Byte Order @@ -57,63 +65,57 @@ offset size field 0 8 magic 8 2 major_version 10 2 minor_version -12 2 flags -14 1 complete -15 1 assignment_format -16 8 directory_offset -24 8 directory_len -32 8 stream_offset -40 8 stream_len -48 8 sample_count -56 8 reserved +12 1 finalized +13 1 assignment_format +14 2 alignment_padding +16 4 flags +20 4 stream_checksum +24 8 directory_offset +32 8 directory_len +40 8 stream_offset +48 8 stream_len +56 8 sample_count ``` +Total: 64 bytes. All multi-byte integers are little-endian. + ### Header Fields -- `magic` - - fixed bytes identifying the file as BENDL - - proposed value: `b"BENDL\\0\\0\\1"` -- `major_version` - - initial value: `1` -- `minor_version` - - initial value: `0` -- `flags` - - bundle-level feature flags (64 bits available) -- `complete` - - `0` means incomplete/unfinalized - - `1` means finalized -- `assignment_format` - - `1 = BEN` - - `2 = XBEN` -- `directory_offset` - - byte offset of the directory table -- `directory_len` - - byte length of the directory table -- `stream_offset` - - byte offset where the assignment stream begins -- `stream_len` - - length in bytes of the assignment stream - - `0` if unknown/unfinalized -- `sample_count` - - number of expanded samples in the assignment stream - - `-1` if unknown/unfinalized -- `reserved` - - reserved for future extension +- `magic` — 8 bytes identifying the file as BENDL. Value: `b"BENDL\0\0\x01"`. +- `major_version` — incompatible-change version. Current value: `1`. +- `minor_version` — additive backward-compatible version. Current value: `0`. +- `finalized` — `0` means incomplete/unfinalized; `1` means finalized. +- `assignment_format` — `1 = BEN` (plain bit-packed), `2 = XBEN` (xz-compressed BEN32). +- `alignment_padding` — two bytes of padding that keep the following 8-byte fields at offset ≥ 24 + 8-byte aligned. Writers set to zero; readers ignore non-zero bytes. Not a forward-compat slot — + new fields must live elsewhere. +- `flags` — 32-bit bundle-level feature flags. See **Header Flags** below. Bits without a defined + constant are reserved; writers set them to zero. +- `stream_checksum` — CRC32C (Castagnoli polynomial) of the on-disk assignment stream bytes + `[stream_offset, stream_offset + stream_len)`. For XBEN streams the CRC is over the compressed + bytes, not the decompressed content. Valid only when `HEADER_FLAG_STREAM_CHECKSUM` (bit 0) is set + in `flags`. Writers set this field to zero while the bundle is unfinalized and patch it with the + final value at finalization time. +- `directory_offset` — absolute byte offset of the directory table. Zero if no directory has been + written yet. +- `directory_len` — byte length of the directory table. Zero if absent. +- `stream_offset` — byte offset where the assignment stream begins. +- `stream_len` — exact byte length of the assignment stream. Zero if unfinalized. Readers MUST + surface an error if the backing file is shorter than this declared length. +- `sample_count` — number of expanded samples in the assignment stream. `-1` if unfinalized. ## Header Flags -Initial proposed header flags: - -- bit 0: directory contains checksums -- bit 1: bundle contains graph asset -- bit 2: bundle contains relabel map asset -- bit 3: bundle contains metadata asset +- **Bit 0 — `HEADER_FLAG_STREAM_CHECKSUM`**: the `stream_checksum` field contains a valid CRC32C + over the on-disk assignment stream bytes. Library writers always set this flag and write a valid + checksum. The clear-flag state exists only for adversarial/foreign bytes and partial-recovery + flows; verified reader APIs return `Unavailable` when this flag is clear. -Unrecognized flags must be ignored by readers unless a future version marks them as mandatory. +Bits 1–31 are reserved. Unrecognized flag bits must be ignored by readers. ## Directory Table -The directory table is a compact binary table describing front-loaded assets. +The directory table is a compact binary table describing asset payloads. Layout: @@ -123,6 +125,12 @@ offset size field 4 ... repeated directory entries ``` +`entry_count` is the number of asset entries that follow; the assignment stream is stored outside +the directory and does not count toward it. Readers MUST reject a bundle whose `entry_count` exceeds +`MAX_DIRECTORY_ENTRIES` (256) **before** allocating per-entry storage, so a corrupt or adversarial +count cannot force a large reservation. Writers MUST NOT emit more than `MAX_DIRECTORY_ENTRIES` +entries. + Each directory entry has the following header: ```text @@ -149,9 +157,12 @@ offset size field - `payload_offset` - absolute file offset of the asset payload - `payload_len` - - byte length of the asset payload + - exact byte length of the on-disk asset payload. Readers MUST surface an error if the backing + file is shorter than this declared length; they MUST NOT silently return a truncated payload. - `checksum_len` - - byte length of optional checksum bytes that follow the name + - byte length of the optional checksum bytes that follow the name. MUST be `4` when the + `ASSET_FLAG_CHECKSUM` bit (bit 2) is set and `0` when it is clear; readers MUST reject any entry + where the flag and `checksum_len` disagree. - `name bytes` - UTF-8 asset name - `checksum bytes` @@ -159,53 +170,67 @@ offset size field ### Asset Types -Initial proposed asset types: +Defined asset types: - `1 = metadata.json` - `2 = graph.json` -- `3 = relabel_map.json` +- `3 = node_permutation_map.json` - `4 = custom user asset` +Types `1`–`3` are singleton known assets: each may appear at most once and MUST use its standardized +name. Type `4` is a custom asset with a writer-chosen name, and multiple are allowed. + ### Asset Flags -Initial proposed asset flags: +Defined asset flags: - bit 0: payload is UTF-8 JSON -- bit 1: payload is zstd-compressed +- bit 1: payload is xz-compressed - bit 2: checksum present -Readers must skip unknown asset types and unknown flags when possible. +When bit 2 is set, the trailing checksum is exactly four little-endian bytes holding a CRC32C +(Castagnoli polynomial) over the **on-disk payload bytes** +(`payload_offset .. payload_offset + payload_len`). For an xz-compressed asset the CRC covers the +compressed bytes, so verification happens before decompression. Library writers always set bit 2 and +write a valid checksum. + +Readers must skip unknown asset types and unknown flag bits when possible. ## Asset Payload Region -Assets are written after the directory table and before the assignment stream. +Assets are written after the fixed header and before the assignment stream. Each asset payload is +referenced by the trailing directory table. -Each asset payload is raw bytes referenced by the directory table. The bundle does not require per-asset wrapper headers in the payload region because offsets and lengths are already described by the directory entries. +Each asset payload is raw bytes referenced by the directory table. The bundle does not require +per-asset wrapper headers in the payload region because offsets and lengths are already described by +the directory entries. -Examples of front-loaded assets: +Examples of assets: - graph file -- relabel map +- node permutation map - extra metadata JSON - provenance/configuration info ## Assignment Stream Region -The assignment stream starts at `stream_offset` and occupies `stream_len` bytes if the bundle is finalized. +The assignment stream starts at `stream_offset` and occupies `stream_len` bytes if the bundle is +finalized. The stream payload must be one of: - BEN byte stream - XBEN byte stream -The bundle does not reinterpret BEN/XBEN internals. It only stores the opaque assignment stream and records its format in `assignment_format`. +The bundle does not reinterpret BEN/XBEN internals. It only stores the opaque assignment stream and +records its format in `assignment_format`. ### Incomplete Bundles -If `complete == 0`: +If `finalized == 0`: - `stream_len` may be `0` -- `sample_count` may be `u64::MAX` +- `sample_count` is `-1` - readers should treat assignment data as extending from `stream_offset` to EOF This allows partially written bundles to remain recoverable. @@ -215,31 +240,38 @@ This allows partially written bundles to remain recoverable. Writers are expected to use this sequence: 1. Write a provisional header with: - - `complete = 0` + - `finalized = 0` - `stream_len = 0` - - `sample_count = u64::MAX` -2. Write the directory table. -3. Write all front-loaded assets. -4. Record `stream_offset`. -5. Write the assignment stream. -6. On successful completion: + - `sample_count = -1` +1. Write all asset payloads. +1. Record `stream_offset`. +1. Write the assignment stream. +1. Compute the assignment-stream checksum. +1. Write the trailing directory table. +1. On successful completion: - compute final `stream_len` - compute final `sample_count` + - record final `directory_offset` and `directory_len` - seek back to patch the header - - set `complete = 1` + - set `finalized = 1` -If writing is interrupted before step 6, the file remains an incomplete bundle. +If writing is interrupted before step 7, the file remains an incomplete bundle. ## Reader Rules Readers must: -1. Validate `magic` and supported version. -2. Read the fixed header. -3. Read the directory table. -4. Make front-loaded assets available immediately. -5. Interpret the assignment stream according to `assignment_format`. -6. If `complete == 0`, treat the stream as running from `stream_offset` to EOF. +1. Validate `magic` and supported `major_version`. Higher `minor_version` values are accepted. +1. Read the fixed header. +1. Read the authoritative directory table identified by `directory_offset` and `directory_len`. + Reject a declared `entry_count` above `MAX_DIRECTORY_ENTRIES` (256) before allocating, and reject + any bytes left over in the directory region after the declared entries. +1. Validate the directory: asset names must be unique, and a singleton known type (1–3) must use its + standardized name. Reject a directory that violates either rule. +1. Make directory-listed assets available. +1. Interpret the assignment stream according to `assignment_format`. +1. If `finalized == 0`, treat the stream as running from `stream_offset` to EOF (or to + `directory_offset` when a provisional directory was written). Readers should expose: @@ -251,58 +283,69 @@ Readers should expose: If a bundle write is interrupted: -- header and front-loaded assets should still be usable if fully written +- header and assets should still be usable if fully written and directory-listed - assignment data should be readable from `stream_offset` to EOF - `sample_count` should be treated as unknown - the bundle should be marked incomplete -If the interruption happens before the directory or assets are fully written, the bundle may be unreadable. Writers should therefore prefer writing small front-loaded metadata first and beginning the assignment stream only after the directory is complete. +If the interruption happens before the final directory or header patch is written, the bundle may be +incomplete. Post-finalize append is ordered so that the old directory remains authoritative until +the replacement directory is committed by the final header patch. ## Metadata Conventions -Although the directory is binary, metadata payloads should initially use JSON for ease of debugging. +Although the directory is binary, metadata payloads use JSON for ease of debugging. -Recommended metadata file names: +Standardized metadata file names: - `metadata.json` - `graph.json` -- `relabel_map.json` +- `node_permutation_map.json` -Recommended metadata fields: +The `metadata.json` payload mirrors the fixed header for human readability; the header (and, for the +variant, the embedded stream banner) remains authoritative. Its fields are: ```json { - "bundle_version": 1, - "assignments_format": "xben", + "major_version": 1, + "minor_version": 0, + "assignment_format": "xben", "variant": "mkv_chain", "complete": false } ``` +- `major_version` / `minor_version` — mirror the header version fields. +- `assignment_format` — `"ben"` or `"xben"`, mirroring the header `assignment_format` byte. +- `variant` — `"standard"`, `"mkv_chain"`, or `"two_delta"`, mirroring the embedded stream banner. + Optional; omitted when unknown. +- `complete` — mirrors the header `finalized` flag. + ## Versioning Strategy - incompatible structural changes require `major_version` bump - additive backward-compatible fields may use `minor_version` bump - unknown asset types should be ignored when possible -## Suggested Rust Types +## Rust Types -Conceptual Rust representations: +The in-memory representations of the header and directory entries: ```rust pub struct BendlHeader { pub magic: [u8; 8], pub major_version: u16, pub minor_version: u16, - pub flags: u64, - pub complete: u8, + pub finalized: u8, pub assignment_format: u8, + pub alignment_padding: u16, + pub flags: u32, + pub stream_checksum: u32, pub directory_offset: u64, pub directory_len: u64, pub stream_offset: u64, pub stream_len: u64, - pub sample_count: i28, - pub reserved: u64, + pub sample_count: i64, } pub struct BendlDirectoryEntry { @@ -315,25 +358,29 @@ pub struct BendlDirectoryEntry { } ``` -## Suggested Module Layout +## Module Layout -If implemented in `ben`, the new code should likely live under: +The implementation lives under: ```text -ben/src/bundle/ +ben/src/io/bundle/ mod.rs format.rs reader.rs writer.rs manifest.rs + verify.rs + error.rs ``` Responsibilities: -- `format.rs`: binary header/directory definitions -- `reader.rs`: bundle reader -- `writer.rs`: bundle writer/finalizer +- `format.rs`: binary header/directory definitions and their encode/decode helpers +- `reader.rs`: bundle reader (header + directory parsing, asset and stream access) +- `writer.rs`: bundle writer/finalizer and the post-finalize appender - `manifest.rs`: JSON metadata structs +- `verify.rs`: bounded readers, CRC tees, and checksum verification adapters +- `error.rs`: read-side error and checksum-error types ## Out of Scope for V1 @@ -342,14 +389,14 @@ Responsibilities: - random-write mutation of existing bundles - archive-level compression beyond the assignment stream format -## Current Recommendation +## Summary -Implement `.bendl` V1 as: +`.bendl` v1 is: - a seekable file container -- a fixed header plus binary directory -- front-loaded optional assets -- trailing BEN/XBEN assignment stream -- header patched on successful finalize +- a fixed header plus asset payloads, assignment stream, and trailing binary directory +- optional assets referenced by directory entries +- an embedded BEN/XBEN assignment stream +- a header patched on successful finalize -This keeps the format simple, recoverable, and aligned with the current streaming requirements. +This keeps the format simple, recoverable, and aligned with the streaming requirements. diff --git a/docs/bendl-implementation-plan.md b/docs/bendl-implementation-plan.md deleted file mode 100644 index 5a95a4e..0000000 --- a/docs/bendl-implementation-plan.md +++ /dev/null @@ -1,261 +0,0 @@ -# BENDL Implementation Plan - -## Goal - -Turn the `.bendl` roadmap and format specification into an implementation sequence that is low-risk and easy to validate incrementally. - -This plan assumes: - -- `.ben` and `.xben` remain unchanged -- `.bendl` is a new seekable container format -- the assignment stream is stored at the end of the file -- header fields are patched on successful finalization - -## Guiding Strategy - -Build `.bendl` in layers: - -1. binary format types -2. read-only support -3. write/finalize support -4. CLI integration -5. Python integration - -This keeps the early steps small and testable. - -## Phase 1: Core Format Types - -Add a new top-level module: - -```text -ben/src/bundle/ - mod.rs - format.rs - manifest.rs -``` - -### Tasks - -- Define `BendlHeader`. -- Define constants for: - - magic bytes - - version numbers - - assignment format identifiers - - asset types - - asset flags -- Define `BendlDirectoryEntry`. -- Implement binary encode/decode helpers for: - - header read/write - - directory entry read/write -- Add manifest-side serde structs for JSON metadata assets. - -### Deliverable - -Pure format layer with no I/O orchestration yet. - -### Tests - -- header round-trip tests -- directory entry round-trip tests -- invalid magic/version tests -- asset flag parsing tests - -## Phase 2: Read-Only Bundle Support - -Add: - -```text -ben/src/bundle/reader.rs -``` - -### Tasks - -- Implement `BendlReader`. -- Validate and parse the fixed header. -- Read and decode the directory table. -- Expose accessors for: - - `is_complete()` - - `sample_count() -> Option` - - `assignment_format()` - - `assets()` -- Implement helpers to: - - open asset payloads by name/type - - open the assignment stream region -- For incomplete bundles: - - treat assignment stream as `stream_offset..EOF` - -### Deliverable - -A read-only API that can inspect bundle metadata and expose the embedded assignment stream. - -### Tests - -- parse finalized bundle fixture -- parse incomplete bundle fixture -- recover front-loaded assets when `complete == 0` -- ignore unknown asset types cleanly - -## Phase 3: Bundle Writer - -Add: - -```text -ben/src/bundle/writer.rs -``` - -### Tasks - -- Implement `BendlWriter`. -- Write provisional header. -- Write directory table. -- Write front-loaded assets. -- Track `stream_offset`. -- Stream BEN or XBEN payload at the end. -- Count samples while writing. -- On `finish()`: - - compute `stream_len` - - patch header - - set `complete = 1` - -### Important Constraints - -- Writing should require `Seek`. -- `finish()` should be explicit. -- `Drop` should not silently attempt complex repair/finalization. - -### Deliverable - -A bundle writer that can produce finalized `.bendl` files and leave partially usable files behind if interrupted. - -### Tests - -- finalized bundle writes correct header fields -- incomplete writer leaves `complete = 0` -- assets remain readable after partial write -- correct `sample_count` patching - -## Phase 4: Assignment Stream Integration - -Connect bundle writing to the existing BEN/XBEN infrastructure. - -### Tasks - -- Allow writer to store: - - BEN assignment stream - - XBEN assignment stream -- Reuse existing encoders rather than reimplementing stream encoding. -- Add helper APIs such as: - - `write_ben_stream(...)` - - `write_xben_stream(...)` - - `open_assignment_reader(...)` - -### Deliverable - -The bundle layer becomes a thin container around the current assignment formats. - -### Tests - -- bundle with BEN payload decodes correctly -- bundle with XBEN payload decodes correctly -- incomplete XBEN stream remains partially readable when possible - -## Phase 5: CLI Support - -Add CLI commands after the core library is stable. - -Potential command surface: - -```text -ben bundle create -ben bundle inspect -ben bundle extract -``` - -### Tasks - -- create `.bendl` from assignment stream + optional assets -- inspect header and asset list -- extract embedded assets or assignment payload -- report completeness/finalization state - -### Deliverable - -User-facing bundle workflow in the Rust CLI. - -### Tests - -- integration tests for create/inspect/extract -- interrupted/incomplete bundle inspection -- metadata visibility before finalized stream count - -## Phase 6: Python Support - -Add optional `pyben` support once the Rust API settles. - -### Tasks - -- expose bundle inspection API -- expose `sample_count` if finalized -- expose graph/relabel-map asset loading -- optionally expose embedded assignment stream through `PyBenDecoder` - -### Deliverable - -Python can open `.bendl` as a higher-level dataset object. - -### Tests - -- open finalized bundle -- open incomplete bundle -- read graph metadata without forcing assignment scan - -## Recommended Implementation Order - -Recommended practical sequence: - -1. `format.rs` -2. `reader.rs` -3. tests + sample fixtures -4. `writer.rs` -5. CLI support -6. `pyben` support - -This order gives you inspection/debugging tools before write-path complexity. - -## Suggested Public API Shape - -Possible `ben` API surface: - -```rust -pub mod bundle; - -pub use bundle::reader::BendlReader; -pub use bundle::writer::BendlWriter; -``` - -And bundle module internals: - -```rust -bundle::format -bundle::manifest -bundle::reader -bundle::writer -``` - -## Risks - -- Header patching requires seekable outputs. -- Incomplete bundles need carefully defined recovery behavior. -- XBEN payloads may still require full scan when bundle metadata is absent or unfinalized. -- Asset directory changes should be versioned carefully to preserve forward compatibility. - -## Recommended First Milestone - -The first milestone should be: - -- parse and inspect `.bendl` files -- list bundled assets -- open assignment stream region -- expose `complete` and `sample_count` - -That gives immediate value and makes it easier to validate the spec before building the writer. diff --git a/docs/bendl-roadmap.md b/docs/bendl-roadmap.md deleted file mode 100644 index f8fb8f8..0000000 --- a/docs/bendl-roadmap.md +++ /dev/null @@ -1,175 +0,0 @@ -# BENDL Roadmap - -## Goal - -Add a higher-level `.bendl` container format that feels like a single file to users while preserving the streamable nature of the underlying assignment data. - -The low-level assignment formats remain: - -- `.ben` -- `.xben` - -The new `.bendl` format is a richer file-oriented container for: - -- assignment data -- metadata -- graph data -- relabel maps -- future optional assets - -## Design Principles - -- Keep `.ben` and `.xben` streamable. -- Treat `.bendl` as a seekable container format for regular files. -- Put stable assets near the front of the file. -- Put the live assignment stream at the end of the file. -- Allow incomplete `.bendl` files to remain partially usable after interruption. -- Patch the header on successful finalization instead of requiring a footer. - -## Proposed Layout - -`.bendl` should use this high-level layout: - -```text -[Fixed Header] -[Directory / Metadata Section] -[Optional Extra Assets] -[Streaming Assignments Section] -``` - -Where: - -- the header is written first with placeholder values -- the directory and optional assets are written before streaming starts -- the assignment stream is appended at the end -- on successful completion, the writer seeks back and patches the header - -## Why This Layout - -This layout ensures: - -- graph data and relabel maps are readable even if the stream is interrupted -- the assignment stream can still be decoded up to EOF if the file is incomplete -- final facts like `sample_count` are only written once they are actually known - -## Header Concept - -The exact binary layout is still to be finalized, but the header should carry fields conceptually like: - -```rust -struct BendlHeader { - magic: [u8; 8], - version: u16, - flags: u16, - complete: u8, - reserved: [u8; 5], - directory_offset: u64, - directory_len: u64, - stream_offset: u64, - stream_len: u64, - sample_count: u64, -} -``` - -Notes: - -- `complete == 0` means the file was not finalized -- `stream_len == 0` can mean unknown or unfinalized -- `sample_count == u64::MAX` can represent unknown sample count - -## Directory / Asset Section - -The directory section should describe any front-loaded assets, such as: - -- graph -- relabel map -- metadata blob -- future extras - -This can be backed by a simple JSON or binary directory table. The important part is that these assets are discoverable without scanning the assignment stream. - -## Assignment Stream - -The assignment stream should be stored at the end of the file so writing can proceed incrementally. - -The stream payload may be: - -- BEN data -- XBEN data - -The `.bendl` container should treat this as the primary large append-only region. - -## Finalization Model - -Expected write flow: - -1. Write a provisional header. -2. Write directory data and optional assets. -3. Record `stream_offset`. -4. Stream the assignment data. -5. On successful completion, seek back and patch the header with: - - `complete = true` - - `stream_len` - - `sample_count` - - any other finalized metadata - -If writing is interrupted: - -- the header remains incomplete -- the front-loaded assets are still readable -- the assignment stream may still be readable up to EOF -- exact `sample_count` is unavailable unless the reader scans - -## Reader Semantics - -Reader behavior should be: - -- read the fixed header -- inspect `complete` -- load directory and front-loaded assets -- read assignment data starting at `stream_offset` -- if `complete == false`, treat the file as recoverable but incomplete - -This means `.bendl` readers should expose both: - -- whether the bundle is complete -- whether assignment data is still usable - -## Relationship to Existing Formats - -- `.ben` and `.xben` remain the portable stream/data formats -- `.bendl` becomes the richer container format for complete datasets - -This keeps responsibilities separated: - -- assignment encoding stays in BEN/XBEN -- dataset metadata and optional extras live in BENDL - -## PyBen Implications - -Potential future Python API support: - -- open a `.bendl` file directly -- expose `sample_count` immediately if finalized -- expose optional `graph` and `relabel_map` -- fall back to scanning assignment data if `sample_count` is unknown - -## Open Questions - -- exact binary encoding of the directory section -- whether the asset directory should be JSON or a compact binary table -- whether checksums should be included in the header -- whether assignment payload should always be XBEN inside `.bendl` -- whether `.bendl` writing should require seekable output explicitly - -## Current Recommendation - -Proceed with `.bendl` as: - -- a single-file container -- a seekable file format -- front-loaded metadata/assets -- trailing assignment stream -- header patched on finalize - -This best matches the requirements discussed so far. diff --git a/docs/coding-standards.md b/docs/coding-standards.md new file mode 100644 index 0000000..bc1788c --- /dev/null +++ b/docs/coding-standards.md @@ -0,0 +1,273 @@ +# Binary Ensemble — Coding Standards + +This document describes the coding standards of the `binary-ensemble` workspace as they are actually +practiced in the code today. It is descriptive first (what the code does) and prescriptive second +(what new code should do to fit in). When in doubt, imitate the surrounding module. + +The workspace is a Rust library + four CLI tools for compressing ensembles of districting plans (the +BEN / XBEN / BENDL formats), plus a PyO3 binding crate that ships the `binary_ensemble` Python +package. + +A companion document, [`docs/glossary.md`](glossary.md), is the source of truth for **terminology**. +This document covers **mechanics**. The two are meant to be read together: the glossary tells you +what to call a thing, this tells you how to write the code around it. + +______________________________________________________________________ + +## 1. Workspace layout + +The repository is a Cargo workspace (`resolver = "2"`) with two members: + +- **`ben/`** — package `binary-ensemble`, library name `binary_ensemble`. Contains the codec, I/O, + ops, JSON-graph, format, and CLI logic, plus four thin binaries (`ben`, `reben`, `pcben`, `bendl`) + under `ben/src/bin/`. +- **`ben-py/`** — package `ben-py`, cdylib `ben_py_core`. PyO3 bindings that depend on + `binary-ensemble` by path and are published as the `binary_ensemble` Python package. + +Conventions: + +- **The version is shared.** Both crates use `version.workspace = true` from `[workspace.package]`; + never set a per-crate version. Don't hard-code the version string anywhere in source or comments + either — keep code self-contained and free of version pins. +- **Binaries stay thin, with a uniform `main`.** Each `ben/src/bin/*.rs` just calls + `cli::::run()`, and on `Err` prints `Error: {err}` to stderr and exits non-zero. All real + behavior lives in the library so it is testable without spawning a process. Each CLI module + exposes `pub fn run() -> CliResult`, parses a `clap` `#[derive(Parser)]` `Args`, and dispatches to + per-mode handlers; CLI failures use the crate's own `CliError`/`CliResult`, not bare `io::Error`. +- **The Python crate owns all PyO3.** Nothing in `ben/` depends on `pyo3`. Python concerns live + entirely in `ben-py/`. + +______________________________________________________________________ + +## 2. Toolchain, formatting, and the task runner + +- **Edition 2021**, MIT licensed. No pinned `rust-toolchain.toml`; build on ambient stable. +- **`rustfmt` with an explicit config** (`rustfmt.toml`): `max_width = 100`, `comment_width = 100`, + `wrap_comments = true`. Comments are auto-wrapped to 100 columns — write naturally and let + `rustfmt` reflow. Always run `cargo fmt --all` (or `task format-rust`) before committing. +- **Quality gates are run locally via `Taskfile.yml`** (the `task` / `go-task` runner), not in CI. + The GitHub workflow (`ci_cd.yml`) only builds and publishes wheels with `maturin`. Before pushing, + run the same checks the maintainer does: + - `task test` — Rust fast suite + `#[ignore]`-gated slow suite + Python `pytest`. + - `task format` — `cargo fmt --all` + `ruff format` for Python. + - `task lint` — `ruff check` for Python. + - `task coverage-*` — `cargo llvm-cov` (the `bin/` wrappers are excluded from coverage; they're + meant to be trivial). +- **Python tooling is `uv` + `maturin` + `ruff` + `pytest`.** Develop the extension with + `task ben-py-develop` (runs `maturin develop` inside the `uv` env). Format/lint Python with + `ruff`. + +______________________________________________________________________ + +## 3. Module organization + +- **Directory modules use `mod.rs`.** A module folder is fronted by `mod.rs` (e.g. `codec/mod.rs`, + `format/mod.rs`, `io/mod.rs`), which declares the submodules and re-exports the module's common + surface. +- **Re-export the public surface with `pub use`.** Parent modules flatten the names callers need + (e.g. `codec/mod.rs` does `pub use frames::{BenDecodeFrame, BenEncodeFrame};`; `format/mod.rs` + does `pub use errors::FormatError;`). Add new public items to the appropriate re-export rather + than forcing callers down deep paths. +- **Every module opens with a `//!` doc comment** that says what the module is for and links + siblings with intra-doc links (e.g. `` [`encode`] ``, `` [`decode`] ``, `` [`translate`] ``). The + `pub mod` declarations in `lib.rs` each carry a `///` one-liner. +- **Errors live in `errors.rs`.** A module that defines its own error type puts it in a sibling + `errors.rs` (e.g. `format/errors.rs`, `codec/translate/errors.rs`, `json/graph/errors.rs`) and + re-exports it from `mod.rs`. +- **Tests live next to what they test** under `#[cfg(test)] mod tests`. Small modules use a sibling + `tests.rs`; larger ones use a `tests/` subdirectory split by topic (e.g. + `codec/decode/tests/{standard,mkvchain,twodelta}.rs`, + `io/bundle/tests/{reader,writer,format}.rs`). Cross-cutting, process-level, and stability tests go + in `ben/tests/` integration files. +- **Shared test helpers go in `test_utils`**, declared `#[doc(hidden)] pub mod test_utils;` so + they're reusable across the crate's test trees without polluting the public docs. +- **Guard platform assumptions explicitly.** `lib.rs` rejects non-64-bit targets with + `compile_error!`. Encode invariants you rely on rather than letting them fail silently. + +______________________________________________________________________ + +## 4. Types and domain modeling + +- **Model domain concepts as enums/structs with doc-commented variants.** E.g. + `BenVariant { Standard, MkvChain, TwoDelta }`, each variant documented with what it stores and + when it applies. +- **Push invariants into the type system when you can.** `XBenVariant` is a deliberately restricted + subset (`Standard`, `MkvChain`) that *cannot* represent `TwoDelta`, so functions parameterised by + `XBenVariant` are uncallable for TwoDelta at compile time. Prefer this kind of + make-illegal-states-unrepresentable design over runtime `assert!`s. +- **Provide `From`/`TryFrom` between related types**, and when a conversion can fail, return a + dedicated, named error type (e.g. `TwoDeltaNotXBenError`) rather than a bare `()` or a string — + even a tiny marker struct gets `Display` + `std::error::Error` impls. +- **Derive the obvious traits.** Small value types carry + `#[derive(Debug, Clone, Copy, PartialEq, Eq)]` (and `Serialize` / `Deserialize` where they cross + the JSON boundary). Give public types a `Debug` impl by default. +- **Mark extensible public enums/structs `#[non_exhaustive]`.** Options and transform types that may + grow new variants/fields (e.g. `RelabelTransform`, `RunPolicy`, `RelabelOptions`) are + `#[non_exhaustive]` so adding to them isn't a breaking change. +- **Build complex options with a constructor-plus-`with_*` builder.** `RelabelOptions` is created by + an intent-named constructor (`first_seen`, `node_permutation`, `convert_to`) and then refined with + `with_*` methods, rather than exposing a wide public constructor or public fields. +- **Reserve unused bits/fields explicitly** for forward compatibility (e.g. named `RESERVED_BIT_*` + constants) rather than leaving holes undocumented. + +______________________________________________________________________ + +## 5. Error handling + +- **Library errors are `thiserror` enums, one per module, in `errors.rs`.** Each variant has a + descriptive `#[error("...")]` message that includes the relevant values (e.g. `UnknownBanner` + prints the actual bytes seen *and* the expected set). Wrap source errors with `#[from]` (e.g. + `Io(#[from] io::Error)`). +- **Bridge domain errors to `io::Error` at streaming boundaries.** The pattern is an explicit + `impl From for io::Error` that forwards a real IO error unchanged and maps everything + else to `io::ErrorKind::InvalidData`. This lets streaming readers/writers keep `io::Result` + signatures while still carrying precise error context. +- **`expect()` only for genuinely infallible cases, with a message that states the invariant** (e.g. + `.expect("valid fallback log filter")`). Avoid `unwrap()`/`expect()` on real IO, parsing, or + caller-supplied data in library paths — return a `Result` and propagate with `?`. +- **`?` is the default control flow** for fallible calls. Reserve `panic!`/`unreachable!` for true + logic invariants, not expected failures. + +______________________________________________________________________ + +## 6. Logging + +- **Use `tracing`, not `log` and not `println!`.** Emit diagnostics with the `tracing` macros. In + practice the codebase logs almost entirely at `trace!` (fine-grained internal flow) with the + occasional `warn!`; reach for higher levels only when a message genuinely belongs there. +- **Subscriber init is centralized and idempotent.** `logging::init_logging()` sets the global + subscriber exactly once via `std::sync::Once`, reads `RUST_LOG` (defaulting to `off`), writes to + **stderr**, and uses a compact format with time/target/level/ANSI disabled. Don't stand up ad-hoc + subscribers elsewhere. +- **`stdout` is for program output only** (decoded data, version banners, inspect listings) — never + for logging. **`stderr`** carries logs and progress. +- **Long streaming operations report progress with `indicatif`.** + +______________________________________________________________________ + +## 7. I/O and performance + +- **Stream; don't slurp.** The crate processes ensembles too large to hold in memory. Functions take + buffered readers/writers and work frame-by-frame / line-by-line. +- **Be generic over IO** with `R: Read`/`R: BufRead` and `W: Write` bounds so the same code serves + files, pipes, in-memory buffers, and test fixtures. +- **Binary fields use `byteorder` with an explicit endianness** — never rely on native byte order + for on-disk data. +- **Integrity is checked with CRC32C (`crc32c` crate).** That crate was chosen deliberately over + `crc32fast` (it can't be misconfigured into IEEE CRC-32); the rationale is recorded in + `ben/Cargo.toml`. Keep integrity checks on payloads and the assignment stream. +- **Preserve the lazy-decode property of BEN frames** — a frame keeps its raw bytes without eagerly + unpacking runs, which is what makes subsample-by-skip and random-access reads fast. Don't + introduce a unified frame representation that forces eager bit-unpacking on read. + +______________________________________________________________________ + +## 8. Documentation + +Documentation is treated as part of the code, not an afterthought. + +- **`//!` on every module, `///` on public items.** Item docs use the conventional rustdoc sections + already prevalent here: `# Arguments`, `# Returns`, `# Examples`, `# Errors`, `# Panics`. Format + illustrations in docs use fenced ```` ```text ```` blocks. +- **Comments explain intent and stay self-contained and timeless.** Don't reference planning-doc + filenames, plan section numbers, or version numbers from source/inline comments. Pointing a reader + at the stable `docs/glossary.md` for terminology is fine and done in the code; pointing at a + transient plan is not. +- **Substantive design goes in `docs/`.** Architecture/context in `CONTEXT.md`; vocabulary in + `docs/glossary.md`; the on-disk contract in the format spec; plans in `docs/-plan.md` + *before* implementation. + +______________________________________________________________________ + +## 9. Naming + +- **Standard Rust casing:** `snake_case` items, `UpperCamelCase` types, `SCREAMING_SNAKE_CASE` + consts. +- **Names follow the glossary's lexicon exactly.** This is an explicit, enforced standard: use + `plan` / `assignment` / `sample` / `ensemble` / `variant` / `banner` / `frame` / `stream` with + their glossary meanings, and the verbs `encode` / `decode` (never `compress` / `decompress`). + Spell format names out in identifiers (`ben`, `ben32`, `xben`, `bendl`, `jsonl`). If prose and an + identifier disagree, the glossary wins and the identifier is what changes. +- **Functions are named descriptively after their transform**, encoding direction and operands. Two + suffix/affix conventions are consistent and worth following: + - Direction is spelled out with `_to_` / `_from_` (`decode_xben_to_jsonl`, `encode_jsonl_to_ben`, + `ben_to_ben32_lines`). + - A `_path` suffix marks the convenience wrapper that takes a file path over the streaming core + (`decode_ben_to_jsonl` vs `decode_ben_to_jsonl_path`). + - An `_unverified` suffix marks the variant that **skips integrity checks** (`asset_bytes` vs + `asset_bytes_unverified`). The default name verifies; the escape hatch is explicitly labeled. + Prefer clarity over brevity. +- **Name magic values once as consts** (banners, magic bytes, header sizes, asset-type/flag values); + never inline a protocol literal at a use site. + +______________________________________________________________________ + +## 10. Testing + +- **Property-based testing with `proptest`** for invariants — above all the round-trip property + (encode → decode reproduces the original assignments). `.proptest-regressions` files are committed + so found counterexamples stay covered. +- **Determinism in randomized tests:** seed with `rand_chacha` / explicit seeds; use `lipsum` for + synthetic text. Tests must be reproducible. +- **Slow / stress tests are gated with `#[ignore]`** and run separately (`cargo test -- --ignored`, + i.e. `task test-rust-slow`). Keep the default `cargo test` fast. +- **Filesystem tests must be hermetic** — use temp files, never repo-relative scratch paths. +- **Test the behavior, at the right layer.** Unit tests live beside their module; format-stability, + CLI, and full-pipeline behavior live in `ben/tests/`. The CLI is exercised end-to-end + (`test_cli.rs`), and format stability is pinned by golden tests — treat an on-disk format change + that breaks them as a deliberate, documented decision. + +______________________________________________________________________ + +## 11. Python bindings (`ben-py`) + +- **All PyO3 code is isolated in the `ben-py` crate** and built against the **stable ABI** + (`abi3-py311`, `extension-module`). The core library has no Python dependency. +- **Match the prevailing PyO3 version's idioms** (the bound API: `Bound<'_, _>`, + `wrap_pyfunction!`). Don't mix older and newer PyO3 styles within the crate. +- **Names align across the language boundary.** Rust structs carry a `Py` prefix internally (e.g. + `PyBenEncoder`) but are exposed to Python with the prefix stripped via + `#[pyclass(name = "BenEncoder")]`; Python methods use the same `encode_*` / `decode_*` verbs as + Rust. +- **Spell out Python-visible signatures** with `#[pyo3(signature = ...)]` for defaults and a + matching `#[pyo3(text_signature = "...")]` so the signature shows up in Python help/IDE tooling. +- **Map Rust errors to specific Python exceptions** at the boundary via small private `map_*_err` + helpers that match on the source error and pick the right exception (`PyIOError` for IO, + `PyValueError` for bad input, `PyKeyError` for missing keys, `PyException` as fallback). A panic + must never cross the FFI line. +- **Ship typing metadata:** the package includes `py.typed` and a `_core.pyi` stub; keep the stub in + sync with the exported surface. Python users import re-exported names from the `binary_ensemble` + package, not from `_core` directly. + +______________________________________________________________________ + +## 12. Dependencies + +- **Conservative, single-purpose crates, each justified.** Current set includes `byteorder` + (explicit-endian binary IO), `crc32c` (integrity), `xz2` (LZMA2 for XBEN), `serde`/`serde_json`, + `clap` (derive CLI), `indicatif` (progress), `petgraph` + `rustworkx-core` (graph ordering for + relabeling), `pipe` (in-memory streaming), `pcompress` (foreign-format bridge), `thiserror`, and + `tracing`/`tracing-subscriber`. Dev-only: `proptest`, `lipsum`, `rand` + `rand_chacha` + + `rand_distr`. +- **Record non-obvious choices in `Cargo.toml`.** The `crc32c`-vs- `crc32fast` decision is + documented inline; do the same for any future "why this crate" decision. +- **Prefer reusing a present dependency** over adding a new one. + +______________________________________________________________________ + +## Quick checklist for a new change + +- [ ] `cargo fmt --all` clean; `task test` green (fast + `--ignored` + Python); + `ruff check`/`ruff format` clean for any Python. +- [ ] New public items have `//!`/`///` docs with the standard sections; comments are self-contained + (no plan/section/version references). +- [ ] Errors are `thiserror` enums in `errors.rs` with informative messages; boundaries bridge to + `io::Error`; no stray `unwrap()`/`expect()` on real IO or input. +- [ ] Diagnostics via `tracing` (stderr); program output only on stdout; progress via `indicatif`. +- [ ] Streaming over buffered, generic IO; explicit endianness; BEN frame lazy-decode preserved; + integrity (CRC32C) intact. +- [ ] Identifiers match `docs/glossary.md`; magic values named as consts. +- [ ] Round-trip / invariant covered by `proptest`; randomness seeded; slow tests `#[ignore]`d; temp + files for FS tests. +- [ ] PyO3 changes stay in `ben-py`, keep `abi3`, map errors to typed Python exceptions, and update + the `_core.pyi` stub. diff --git a/docs/format-stability.md b/docs/format-stability.md new file mode 100644 index 0000000..4138af5 --- /dev/null +++ b/docs/format-stability.md @@ -0,0 +1,94 @@ +# Format Stability Policy + +This crate ships committed binary fixtures under `ben/tests/fixtures/v/` and a matching +`ben/tests/test_format_stability.rs` that decodes each one. The fixtures are the v1.0.0 wire-format +stability contract. + +## Contract + +**Once a fixture directory is committed for a stable major version, every file inside it MUST +continue to decode correctly in every later release of that major version, and across major versions +to the extent practical.** That is the entire policy. + +In particular: + +- The decoded output of every committed fixture must equal the canonical input it was minted from. +- A reader at any later commit must accept the fixture without errors on any verifying path (asset + checksums, stream checksum, decode-to-JSONL). +- Forward-compatible bits (reserved header flags, reserved asset-flag bits, higher minor versions) + must continue to be ignored by readers, as the fixtures specifically pin that behavior. + +## What "never regenerate in place" means + +When the wire format changes: + +- **Additive minor change** (a new flag bit, a new asset type): mint a *new* fixture into the + current `v/` directory if it pins behavior the existing fixtures do not, but **leave every + existing fixture untouched**. +- **Breaking major change** (header shape, frame shape, checksum algorithm): add a new + `tests/fixtures/v/` directory and a parallel generator and test set. The older `v/` + directory stays exactly as it was, and the test suite continues to decode it through whatever + versioned reader path the library provides. + +If a fixture is found to be wrong after the fact — e.g. it was minted with a bug that has since been +fixed — the right response is **not** to regenerate it. The right response is one of: + +1. If the fixture's bytes are now invalid under v1.0.0 readers, the bug shipped in a release and the + v1.0.0 reader needs to keep accepting that exact pattern (additive fixup, not byte-level + rewrite). +1. If the fixture was committed before any release shipped to users, the breakage is internal and + the fixture may be regenerated, but only with a clear note in the commit message. + +The default answer is option 1. + +## Why the regen test is `#[ignore]` + +`generate_format_stability_fixtures` exists so that the regeneration procedure is documented in code +rather than in a separate script, but it is `#[ignore]` so that a routine `cargo test` cannot +overwrite the committed bytes by accident. The only legitimate reasons to run it are: + +- Bootstrapping a brand-new fixture directory for a new major version. +- Adding a brand-new fixture inside the current directory that does not already exist. + +Both cases should land in a dedicated PR whose title makes the intent explicit (e.g. +`fixtures: add v2.0.0 stability set`). Running the generator over an already-populated directory in +any other context is a bug. + +A second `#[ignore]` regenerator, `regenerate_twodelta_fixtures`, re-mints **only** the `twodelta.*` +fixtures (and their source). `TwoDelta` is unreleased, so it qualifies for option 2 above — its wire +format may change and its fixtures may be re-minted in place — but every released +Standard/MkvChain/BENDL fixture must stay byte-for-byte identical. The focused regenerator exists so +re-minting TwoDelta cannot touch the released set; after running it, confirm `git status` shows only +`twodelta.ben`/`twodelta.xben` (and `source_twodelta.jsonl`) changed. + +## Inventory + +The current `v1.0.0` set covers: + +- `standard.ben`, `mkvchain.ben`, `twodelta.ben` — one BEN file per variant. The `twodelta.*` + fixtures are minted from `TWODELTA_CANONICAL_JSONL` (not the shared `CANONICAL_JSONL`) and + deliberately exercise **mixed snapshot/delta frames**: an anchor snapshot, a 2-swap delta, a + repeat, a >2-district transition that forces a mid-stream snapshot, and a delta rebased onto it. +- `standard.xben`, `mkvchain.xben`, `twodelta.xben` — one XBEN file per variant. +- `flags_set.bendl` — a BENDL bundle with every currently-defined header and asset flag bit set on + at least one object: header `HEADER_FLAG_STREAM_CHECKSUM`; a graph asset flagged + `ASSET_FLAG_JSON | ASSET_FLAG_XZ | ASSET_FLAG_CHECKSUM`; a metadata asset flagged + `ASSET_FLAG_JSON | ASSET_FLAG_CHECKSUM`; an XBEN assignment stream. +- `unknown_flags.bendl` — a derivative of `flags_set.bendl` with reserved bits set in the header + `flags` and in a custom asset's `asset_flags`. Pins forward-compatible reader behavior: unknown + bits must be ignored, all known operations still succeed. +- `source.jsonl`, `source_twodelta.jsonl`, `source_graph.json`, `source_metadata.json` — + human-readable sources committed alongside the binary fixtures so the contents can be inspected + without running the codec. `source.jsonl` mints the Standard/MkvChain/BENDL fixtures; + `source_twodelta.jsonl` mints the TwoDelta fixtures. + +If you add a new fixture, list it here. + +## Cross-host reproducibility note + +XBEN compression uses `xz` and is sensitive to thread count, compression level, and block size. The +generator pins `n_threads = Some(1)`, `compression_level = Some(6)`, and lets the codec choose the +rest, which makes the minted bytes deterministic across machines. **Stability does not depend on +this**, however — only the decoded output is contractual. If a future liblzma version produces +different compressed bytes for the same input, that does not break this contract; what matters is +that the committed bytes continue to decode. diff --git a/docs/glossary.md b/docs/glossary.md new file mode 100644 index 0000000..cfd684d --- /dev/null +++ b/docs/glossary.md @@ -0,0 +1,362 @@ +# Glossary + +Shared lexicon for the `binary-ensemble` workspace. This file is the source of truth for terminology +used in code, documentation, commit messages, and conversations about the project. When prose and +code disagree, prose follows this glossary; code identifiers should be brought into alignment by the +renames listed at the end. + +## Domain Objects + +- **Plan** + - The mathematical object: a partition of dual-graph nodes into districts. + - Orientation-free and label-free up to relabeling. A plan has many possible assignments (one per + node ordering and district relabeling). +- **Assignment** + - The vector encoding of a plan: a length-N `Vec` where index *i* is the **district id** of + node *i*, in dual-graph node order. + - An assignment uniquely determines a plan; a plan does not uniquely determine an assignment. +- **District id** + - The integer values stored in an assignment vector. Names *what the integer means* (a district). + Replaces "assignment id" in prose; code rename pending. +- **Sample** + - One entry in an ensemble stream: the pair `(sample_number, assignment)`. + - `sample_number` is 1-indexed and lives in *expanded* space (see **sample count**). +- **Ensemble** + - An ordered stream of samples produced by a single sampler run. The unit that `.ben`, `.xben`, + and `.bendl` files all wrap. + - Conceptually, an ensemble is a probabilistic draw from the space of possible plans. +- **Sample count** + - The number of independent draws represented by an ensemble. + - Always *expanded*: when a `MkvChain` frame collapses 5 identical consecutive samples into a + single frame with `count = 5`, the ensemble's sample count contribution is 5, not 1. + - The `sample_count` field of a bundle header carries this expanded number. + +## Sampler vs Chain + +- **Sampler** + - Umbrella term for any algorithm that produces an ensemble of plans. Covers both Markov-chain + methods (MCMC) and particle/weighted methods (SMC). +- **Chain** + - Specifically MCMC. Use only when the Markov property matters; otherwise prefer "sampler." +- **ReCom-step** + - A single accepted ReCom (recombination) move: consecutive samples differ by exactly one pairwise + district swap. The transition a `TwoDelta` **delta frame** encodes. Transitions that are *not* a + clean pairwise swap — a multi-district move, random/independent sampling, or a district that was + previously empty — are encoded as full snapshot frames instead (see **Variant fitness by + sampler**). +- **Sample repetition** + - The data shape that arises when consecutive samples in an ensemble are identical. May come from + MCMC self-loops (proposal rejected) or from any other source. + - `MkvChain` compresses sample repetitions via per-frame repetition counts. + - `TwoDelta` also accommodates sample repetitions, via per-frame repetition counts plus a + dedicated repeat-frame layout (`twodelta_repeat_frame` in `io/writer/stream_writer/ben.rs`). The + per-frame TwoDelta delta encoder returns `EncodeError::TwoDeltaIdentical` if called with two + identical assignments, but the stream writer routes repeats through the repeat-frame path before + that error can surface. + +## Encoding Stack + +The five layers of the BEN-family encoding pipeline. Use the layer name unambiguously; never +compress multiple layers into one word. + +| Layer | Name | What it is | |---|---|---| | 0 | bit-packing | cramming run values into bit-precise +widths | | 1 | RLE | `(value, length)` pairs | | 2 | frame | one sample's encoded bytes: frame +header + payload, plus a repetition count for `MkvChain` and `TwoDelta` | | 3 | stream | banner + +concatenated frames; the contents of a `.ben` file or the LZMA2-decompressed body of a `.xben` file +| | 4 | container | the on-disk file: `.ben`, `.xben`, or `.bendl` | + +- **Banner** + - 17-byte ASCII identifier at the start of every BEN/XBEN stream. One per file. + - Three legal values, one per variant: `STANDARD BEN FILE`, `MKVCHAIN BEN FILE`, + `TWODELTA BEN FILE`. +- **Magic** + - 8-byte file-format identifier at offset 0 of a `.bendl` file. Different concept and different + shape from a banner; the two terms are kept distinct. +- **Header** — always qualified: + - **Frame header**: the leading bytes of one frame (bit-width fields and per-variant metadata; 6 + bytes for `Standard`/`MkvChain`, 9 bytes for a `TwoDelta` delta frame; a `TwoDelta` snapshot + frame is `MkvChain`-formatted with a 6-byte header). The 1-byte per-frame tag that precedes + every frame in a `TwoDelta` stream is a stream-layer concern, not part of the frame header. + - **Bundle header**: the 64-byte fixed header at offset 0 of a `.bendl` file. + - Bare "header" is ambiguous and should not appear in shared docs. +- **Frame** + - One sample's encoded bytes. `MkvChain` and `TwoDelta` frames carry a trailing `u16` repetition + count. `Standard` frames are 1-sample. +- **Stream** + - Banner plus concatenated frames. In a `TwoDelta` stream each frame is additionally prefixed with + a 1-byte tag (snapshot vs delta) so the two body layouts self-distinguish; the first frame is + always a snapshot. The `assignment_stream` region of a `.bendl` is exactly a layer-3 BEN/XBEN + stream stored at a known offset; the bundle layer treats it as opaque bytes. +- **BEN32 intermediate** + - The columnar wire format used inside an XBEN container's LZMA2-compressed body: u16 value + u16 + length per run, u32 zero sentinel between samples. + - Used only by `Standard` and `MkvChain`. `TwoDelta` bypasses BEN32 and uses its own columnar + layout. + - **Never a standalone file format.** Always say "BEN32 intermediate" or "BEN32 wire format," + never "BEN32 file." + +## Variants + +- **Variant** + - One of `{Standard, MkvChain, TwoDelta}`. A property of a stream, fixed for the whole stream by + the banner. One variant per file. +- **Inter-sample constraint** + - The rule a variant imposes on consecutive samples. + - `Standard`: none. + - `MkvChain`: identical-consecutive samples are collapsible into a single frame with `count > 1`. + - `TwoDelta`: none on the input. A *non-repeat* transition that is a single ReCom-step (exactly + two district ids exchange positions; no position outside that pair changes, and both ids already + exist) is delta-encoded; any other transition is stored as a full snapshot frame. + Identical-consecutive samples are accommodated via repetition counts and a repeat-frame layout, + not by the per-frame delta encoder. +- **Variant fitness by sampler** + - `Standard`: any ensemble; baseline. + - `MkvChain`: full-chain ensembles (every step including rejections logged). Compresses sample + repetitions efficiently. Suitable for any sampler. + - `TwoDelta`: any ensemble. Delta-compresses pairwise ReCom steps and emits a full snapshot frame + for every other transition, so it is **compatible** with random sampling and Forest ReCom — + those just produce more snapshot frames and less delta compression. Best compression comes from + a full-chain *pairwise* ReCom ensemble, where nearly every accepted move changes exactly two + districts. + +## Files and Containers + +- **`.ben`** — a layer-3 stream stored on disk, no outer wrapping. +- **`.xben`** — a `.ben` stream's content (BEN32 intermediate for `Standard`/`MkvChain`, columnar + for `TwoDelta`) wrapped in LZMA2. +- **`.bendl`** — a bundle: bundle header + asset payloads + assignment stream + trailing directory. +- **Container** is the umbrella term for any of these on-disk files. + +## Tools and Packages + +The workspace ships one Rust library, four CLI binaries, and one Python package backed by PyO3 +bindings. + +- **The library crate** — `binary-ensemble` on crates.io (lib name `binary_ensemble`). Contains the + codec, I/O, ops, and bundle modules; the four CLI binaries are thin wrappers over `cli::*::run()`. +- **The CLI tool family** — collective name for the four binaries below. Each owns one architectural + role: + - **Codec tool** — `ben`. Encode/decode BEN-family streams plus xz wrapping convenience. + - **Pipeline tool** — `reben`. Drives the relabel pipeline (decode → transform → re-encode) with + canned transforms (first-seen relabel, key-based or topology-based node ordering). + - **Bridge tool** — `pcben` (rename pending — currently `pben`). Translates between BEN and the + foreign **PCompress** format. + - **Bundle tool** — `bendl`. Create / inspect / extract / append for `.bendl` containers. +- **The Python package** — `binary_ensemble` on PyPI. The user-facing Python entry point. +- **The Python bindings crate** — `ben-py` (cdylib `ben_py_core`). Internal scaffolding; users never + import this name — they import `binary_ensemble`. +- **`_core`** — the pymodule produced by the bindings crate; imported as `binary_ensemble._core`. + Implementation detail — Python users reference re-exported names from the package, not `_core` + directly. + +### Cross-language consistency + +Verbs and class names are intentionally aligned across Rust and Python to keep the lexicon uniform. +The Python API uses `encode_*` / `decode_*` (matching Rust prose), not `compress_*` / `decompress_*` +(the historical Python-only naming, scheduled for rename). Python classes are exposed as +`BenEncoder` / `BenDecoder` (the `Py` prefix on Rust-side structs is a PyO3 implementation +convention and is stripped at the Python boundary). + +## Operations and Verbs + +CLI mode names and prose verbs are not always identical. Prose follows this glossary; CLI flags are +listed for reference. + +- **encode** + - Produce some BEN-family output from JSONL or another BEN-family input. + - CLI: `ben -m encode` (JSONL → BEN), `ben -m x-encode` (BEN → XBEN, or JSONL → XBEN direct). +- **decode** + - Produce JSONL from a BEN-family input. + - CLI: `ben -m decode` (BEN → JSONL), `ben -m x-decode` (XBEN → BEN, or with `-p` to JSONL). +- **`x-` prefix** + - Means "with LZMA2 wrapping." Not a separate verb; a modifier on `encode`/`decode`. +- **Sample lookup** *(prose)* / random-access decode + - Decode just sample N from a BEN file. + - CLI: `ben -m read -n N`. The mode is a candidate for a CLI rename in the next major release. +- **Subsampling** + - Iterate over a subset of frames without consuming the whole stream. The umbrella that + `read -n N` is the special case "subsample of size 1." +- **Asset extract** vs **sample-range extract** + - Two unrelated operations sharing the verb "extract." Always qualify in prose. + - **Asset extract**: pull a named asset out of a bundle. Code: `extract_asset`. CLI: + `bendl extract`. + - **Sample-range extract**: pull a contiguous range of samples out of a BEN file. Code: + `extract_sample_range`. +- **xz-compress / xz-decompress** + - Wrap an arbitrary file in xz / unwrap it. Not a BEN-aware operation; included in the `ben` CLI + for convenience. +- **Inspect** + - List the assets in a bundle. CLI: `bendl inspect`. +- **Create** + - Build a new bundle from a stream plus assets. CLI: `bendl create`. +- **Append** *(strict)* + - Add a new asset to a *finalized* bundle: write new asset payloads after the old EOF, write a + replacement trailing directory, then repatch the header. The old directory becomes orphaned + bytes after a successful patch, and remains authoritative if the final header patch fails. + - **Never** means extending the assignment stream. If stream-extension is ever wanted, call it + **rewrite** or **reflow** (the implementation builds a new bundle and copies assets across). +- **Bridge** + - The architectural role of the `pben` tool: a translator between our formats and a foreign format + (PCompress). Distinct from a codec, which is internal. + +## Dual Graphs + +The geographic adjacency graph that gives meaning to a node ordering. Every assignment vector is +interpreted with respect to a particular dual graph: index *i* is the district id of dual-graph node +*i*. + +- **Dual graph** + - The adjacency graph over geographic units (blocks, VTDs, tracts, precincts). Nodes are units; + edges are adjacencies. The redistricting term, used in prose. + - In code-internal contexts (bundle asset names, type names) the bare word "graph" suffices + because the redistricting context is implicit. +- **NetworkX adjacency format** (or **NX adjacency JSON**) + - The on-disk JSON shape we read and write for dual graphs (`NxGraphAdjFormat`). Bundle asset + name: `graph.json`. + - One of several JSON formats NetworkX itself supports; we pick this one specifically. Avoid the + ambiguous "graph format" — qualify when format-precise. +- **Node ordering** + - A permutation of nodes — "which node sits at index 0, 1, 2, ..." Produced by an ordering + operation; consumed by node reordering. +- **Key-based ordering** + - Sort nodes by a node attribute. Driven by `sort_json_file_by_key`. Example: sort by `GEOID20`. +- **Topology-based ordering** + - Sort nodes by graph topology, not by attribute. Driven by `sort_json_file_by_ordering` with an + **ordering method**. +- **Sort key** + - The attribute name passed to a key-based ordering (e.g., `"id"`, `"GEOID20"`). +- **Ordering method** + - The enum value passed to a topology-based ordering. Current options: + - **MLC** — Multi-Level Clustering. Recursive clustering, applied per connected component. + - **RCM** — Reverse Cuthill-McKee. Bandwidth-minimization, applied per connected component. +- **Connected component** + - A maximal connected subgraph. Both MLC and RCM order each component independently and + concatenate the results. +- **Node permutation map** + - The data artifact that records a node ordering: a sparse `HashMap` (or dense + `Vec`) mapping new index → old index. + - The on-disk form is JSON; the bundle stores it as the `node_permutation_map.json` asset. + - The Rust convention is `new_to_old_node_map` for the sparse form and `Vec` with + `perm[new_idx] == old_idx` for the dense form. +- **Sparse vs dense permutation** + - Sparse: `HashMap`. The on-disk form. Compact when many nodes are unmoved. + - Dense: `Vec`. The fast-lookup form built by `dense_permutation`. +- **Geographic unit** + - The thing a dual-graph node represents in the real world. Examples: a US Census block, a VTD, a + tract, a precinct. + - "Node" (graph term) and "geographic unit" (geography term) are the same object viewed from two + angles. "Node" is canonical in codec/format/relabel discussions; the specific geography term is + used when the substrate matters for the discussion. +- **Resolution** + - The chosen geographic-unit type for an ensemble. Block is the highest resolution; VTD, tract, + county get progressively lower. + - A property of the *ensemble*, not of the BEN-family file format. +- **GEOID** / **GEOID20** + - US Census Bureau identifier strings. A common choice of **sort key** for key-based node + ordering. + +## Relabeling Taxonomy + +Three operations historically all called some variant of "relabel." The umbrella "relabel" alone now +means **district relabeling**. + +- **District relabeling** + - Rename the integer values in an assignment vector. A pure value permutation. The plain word + "relabel" without qualifier means this. +- **Node reordering** (or **node permutation**) + - Permute the node positions in an assignment vector — rearrange which dual-graph node sits at + each index. Driven by a **node permutation map** (see Dual Graphs section), which is itself + produced by sorting the dual graph (key-based or topology-based). +- **Relabel pipeline** (or **relabel machinery**) + - The codec scaffolding that streams decode → transform → re-encode. Implemented as a single + driver `relabel_ben_file(reader, writer, options)` parameterised by `RelabelOptions` in + `ops/relabel/`. Neutral about which transform runs. +- **First-seen relabeling** (or **first-seen district labeling**) + - The specific district relabeling that renames labels in order of first appearance, starting at + 1\. Replaces the historical "canonicalize_assignment" terminology; code rename pending. +- **`reben`** as a tool + - The CLI that runs the relabel pipeline with one of the canned transforms. + +## Disambiguated Terms + +Words that historically had multiple meanings; the meanings are now segregated. + +### "canonical\*" — three former senses, now two + +- **Canonicalized JSONL** — input format conventions: `assignment` and `sample` keys, sample numbers + from 1, etc. Reserved meaning. Stays. +- **First-seen relabeling** — the operation formerly called "canonicalize_assignment." Loses the + "canonical" word. +- **Standardized name** — the required filename for a known asset in a bundle (formerly "canonical + name"). Renamed; code rename pending. + +### "header" — qualify always + +- **Frame header**: per-frame BEN bytes. +- **Bundle header**: 64-byte BENDL prefix. + +### "extract" — qualify always + +- **Asset extract**: bundle. +- **Sample-range extract**: BEN file. + +### "payload" — qualify always + +- **Asset payload**: directory-entry-referenced bytes. +- **Frame payload**: frame-internal bytes after the frame header. + +## Bundle Internals + +- **Bundle** + - A `.bendl` file. An instance. +- **Bundle header** + - The 64 fixed bytes at offset 0. +- **Magic** + - The 8 leading bytes of the bundle header. +- **Assignment stream** + - The embedded BEN/XBEN stream — opaque to the bundle layer. +- **Assignment format** + - The bundle header field saying whether the embedded stream is BEN or XBEN. **Distinct from + variant** (which lives in the stream's banner). +- **Asset** + - A directory entry plus its payload bytes. Always an instance. +- **Asset type** + - The kind of an asset. Values: `{metadata = 1, graph = 2, node_permutation_map = 3, custom = 4}`. +- **Known asset** + - Type ∈ {1, 2, 3}. Singleton, fixed standardized name, format-defined semantics. +- **Custom asset** + - Type 4. Writer-chosen name, multiple allowed per bundle. +- **Standardized name** + - The required filename for a known asset (e.g., `node_permutation_map.json`). +- **Directory** / **directory table** + - The list of directory entries. The header's `directory_offset` and `directory_len` identify the + authoritative directory; successful finalization writes it at EOF, while failed post-finalize + appends may leave newer orphaned bytes after the old authoritative directory. +- **Directory entry** + - One row of the directory: type + flags + name + offset + length + checksum. +- **Bundle flags** + - The `flags: u32` field at offset 16 of the bundle header. Bundle-level capabilities. +- **Asset flags** + - The `asset_flags: u16` field on each directory entry. Per-asset encoding/checksum. +- **Finalize** *(verb)* + - Write the trailing directory and flip the finalized flag. The terminating step of bundle + creation. +- **Finalized** *(state)* + - The bundle's directory and stream lengths are authoritative; safe to read with no recovery + logic. +- **Incomplete** *(state)* + - The finalized flag is unset. The directory may be missing; the assignment stream extends to EOF. +- **Provisional directory** + - An optional pre-stream directory written for crash recovery. Becomes obsolete once finalize + writes the authoritative trailing directory. +- **Trailing directory** / **authoritative directory** + - The directory pointed to by the bundle header. The one readers should consult. +- **Post-finalize append** + - Adding a new asset to a finalized bundle (see **append**). + +## Future work + +- **Extract disambiguation in the CLI surface.** Today `bendl extract` is the only "extract" verb on + the CLI side; sample-range extract lives at the library API. If a future change makes both forms + callable from the same binary, the CLI surface needs to disambiguate per the **asset extract** vs + **sample-range extract** prose distinction (see Operations and Verbs). diff --git a/docs/twodelta-format-spec.md b/docs/twodelta-format-spec.md new file mode 100644 index 0000000..23831a2 --- /dev/null +++ b/docs/twodelta-format-spec.md @@ -0,0 +1,272 @@ +# TwoDelta BEN Format Specification + +## Status + +Stable wire format. This document specifies the on-disk byte layout of the **TwoDelta** variant of a +BEN stream, for both the plain `.ben` container and the columnar body carried inside a `.xben` +container. The variant is pinned by the committed v1.0.0 stability fixtures (`twodelta.ben`, +`twodelta.xben`). + +TwoDelta shares the banner mechanism, run-length encoding, and bit-packing convention of the +**Standard** and **MkvChain** variants; those shared layers are specified in the BEN stream format +specification and are only summarized here. TwoDelta differs in that most frames are **deltas** +against the previous sample rather than independent snapshots, which gives it a different frame +layout and a different `.xben` body. + +The `.bendl` bundle container embeds a BEN/XBEN stream as an opaque payload and is unaffected by the +variant; see the BENDL format specification. + +## Design Goals + +- Encode a full-chain ensemble from a *pairwise* ReCom sampler compactly by storing, for most + samples, only the two district ids that changed and where. +- Remain a valid BEN stream: same banner mechanism, same `.ben`/`.xben` containers, same expanded + sample-count semantics. +- Degrade gracefully: any transition that is not a clean two-district swap is stored as a full + snapshot, so the variant never fails to represent a sample — it only loses compression on that + step. + +## Terminology + +This document uses the workspace glossary. The terms that matter most here: + +- **assignment** — a length-N `Vec` where index *i* is the district id of dual-graph node *i*. +- **ReCom-step** — a single accepted ReCom move: consecutive samples differ by exactly one pairwise + district swap (exactly two district ids exchange positions; no position outside that pair + changes). +- **anchor** — the first sample of the stream, stored as a full snapshot. Every later delta is + reconstructed by replaying from the most recent snapshot. +- **delta frame** — a frame that stores a pair of district ids and the alternating run lengths over + the positions those ids occupy, to be applied to the previous assignment. +- **snapshot frame** — a frame that stores a full assignment, used for the anchor and as the + fallback when a transition is not a clean two-district swap. +- **inter-sample constraint** — the rule TwoDelta imposes: a delta frame is only emitted for a clean + two-district swap where both ids are already present; everything else falls back to a snapshot. + +The encoding stack layers (bit-packing, RLE, frame, stream, container) are as defined in the +glossary and the BEN stream spec. + +## Byte Order + +All multi-byte integers — the frame-header fields, the trailing repetition counts, and every field +of the columnar `.xben` body — are **big-endian**. (As with the other variants, this is independent +of the little-endian BENDL bundle header.) + +## Inter-Sample Constraint and Frame Selection + +For each sample after the anchor, the encoder classifies the transition from the previous sample in +a single scan: + +- **Repeat** — no position changed value. The sample repeats the previous one. Collapsed into a + repetition count, or emitted as a no-op delta frame (see **Repeats**). +- **Delta** — every changed position swaps between exactly the same two district ids A and B, and + **both** A and B already appear in the previous assignment. Emitted as a delta frame. +- **Snapshot** — more than two distinct district ids change, **or** the transition introduces a + district id that was absent from the previous assignment (so there is no prior layout to delta + against). Emitted as a snapshot frame, which also re-establishes the anchor for subsequent deltas. + +The first frame of any TwoDelta stream is always a snapshot (the anchor). A decoder MUST have a +reconstructed assignment in hand before it can apply a delta; a stream whose first frame is a delta +is malformed. + +Because a delta frame is reconstructed from the previous assignment, TwoDelta frames are **not** +independently decodable: random access or subsampling requires replaying from the most recent +snapshot forward. This is the deliberate trade-off the delta encoding makes against the cheap +frame-skip subsampling that Standard and MkvChain allow. + +## Delta Semantics + +A delta frame stores: + +- a **pair** `(A, B)` of district ids, and +- a vector of **alternating run lengths** over the positions the pair occupies in the *new* + assignment, ordered by position. + +The run lengths describe, in node order and restricted to positions holding A or B, the lengths of +maximal runs of one id then the other: the first run is id A, the second is id B, the third is A +again, and so on. The pair is ordered so that **A is whichever id occupies the lowest-indexed +position held by either id in the new assignment**. This makes the first run length at least `1` +(there is no leading zero), and it is the round-trip-determinism invariant the decoder relies on: a +pair ordered the other way would silently decode to a different assignment. + +To decode a delta against the previous assignment: walk the assignment in node order; at each +position that currently holds A or B, overwrite it with the active run's id, consuming run lengths +in order and flipping A↔B at each run boundary. Positions holding any other id are left untouched. A +decoder MUST error if the run lengths are exhausted before all pair positions are covered, or if run +length remains after the assignment ends. + +### Worked example + +Previous assignment `[1, 1, 2, 2]`, new assignment `[1, 2, 2, 2]` (the node at index 1 moves from +district 1 to district 2). + +- Changed ids are `{1, 2}`, both already present → a delta. The lowest pair position in the new + assignment is index 0, which holds `1`, so the pair is `(1, 2)`. +- Restricted to pair positions (all four here), the new assignment reads `1, 2, 2, 2` → run lengths + `[1, 3]`. +- Decoding `[1, 1, 2, 2]` with pair `(1, 2)` and runs `[1, 3]`: position 0 stays `1` (run of 1), + positions 1–3 become `2` (run of 3) → `[1, 2, 2, 2]`. ✓ + +## Plain `.ben` Layout + +A TwoDelta `.ben` stream is: + +```text +[17-byte Banner: "TWODELTA BEN FILE"] +[Tagged Frame 1] (snapshot — the anchor) +[Tagged Frame 2] +... +``` + +Unlike Standard and MkvChain, every frame is prefixed with a 1-byte **frame tag** that selects the +body layout: + +```text +[tag: u8][frame body] +``` + +- `0x00` — **snapshot frame**. The body is byte-for-byte an MkvChain frame (see the BEN stream + spec): + + ```text + offset size field + 0 1 max_val_bit_count + 1 1 max_len_bit_count + 2 4 n_bytes + 6 ... payload (n_bytes bytes, bit-packed RLE of the full assignment) + 6+n_bytes 2 count + ``` + +- `0x01` — **delta frame**. The body is: + + ```text + offset size field + 0 2 pair_a (district id A) + 2 2 pair_b (district id B) + 4 1 max_len_bit_count + 5 4 n_bytes + 9 ... payload (n_bytes bytes, bit-packed run lengths) + 9+n_bytes 2 count + ``` + +The delta frame header is 9 bytes (the stream-layer tag is not part of the frame header). The +payload bit-packs the alternating run lengths, each in `max_len_bit_count` bits, +most-significant-bit first, with the final byte zero-padded on the low side — the same bit-packing +convention the BEN stream spec defines for run lengths, except only lengths are packed (the pair +lives in the header, and there are no per-run values). `max_len_bit_count` is in `1..=16`; a decoder +MUST reject `0` or a value above `16`. All stored run lengths are `>= 1`; any zero decoded from the +padding tail is discarded. + +The stream ends at a clean end-of-input on the **tag boundary**. End-of-input after a tag byte but +before a complete body is a truncated frame and MUST error. + +### Worked example (continued) + +The delta from the example above, with `max_len = 3` → `max_len_bit_count = 2`, runs `[1, 3]` packed +as `01 11` → `0x70`, `n_bytes = 1`, `count = 1`: + +```text +01 00 01 00 02 02 00 00 00 01 70 00 01 +^tag ^A ^B ^bits ^n_bytes ^pl ^count +``` + +## XBEN Layout (Columnar Body) + +A TwoDelta `.xben` file wraps a columnar body in LZMA2. The body is **not** the per-frame tagged +layout above, and it is **not** the BEN32 intermediate used by Standard/MkvChain. It is a distinct +columnar layout that batches deltas for better compression: + +```text +[17-byte Banner: "TWODELTA BEN FILE"] (inside the LZMA2 payload) +[Body Frame 1] (full frame — the anchor) +[Body Frame 2] +... +``` + +Each body frame is discriminated by its first byte: + +- `0x00` — **full frame** (snapshot/anchor): + + ```text + [0x00] + [run_count: u32] + [ run_count × ( value: u16, length: u16 ) ] full-assignment RLE runs + [count: u16] + ``` + + Unlike the BEN32 frames of the other variants, a full frame is **length-prefixed** by `run_count` + rather than terminated by a zero sentinel. + +- `0x02` — **chunk frame**: a columnar batch of `n` delta frames. Fields are stored column-by-column + across all `n` frames: + + ```text + [0x02] + [n: u32] number of delta frames in the chunk + [ n × ( pair_a: u16, pair_b: u16 ) ] pairs column + [ n × ( count: u16 ) ] counts column + [ n × ( run_count: u32 ) ] per-frame run-length count column + [ run_data: u16 × (sum of run_counts) ] all frames' run lengths, concatenated in frame order + ``` + + The run lengths here are stored as plain `u16` values, not bit-packed. + +Tag value `0x01` is not used in the `.xben` body. The first body frame MUST be a full frame; a chunk +before any full frame has no anchor to delta against and is malformed. A chunk's deltas precede any +following full frame, so replaying body frames in order reconstructs the samples in order. The +default batch size is 10000 delta frames per chunk; the batch size affects only compression and +framing, never the decoded result. + +## Repeats + +A repeated sample (identical to the previous one) is represented in one of two ways, both of which +preserve the *expanded* sample count: + +- via a frame **`count` greater than 1**, which expands to that many identical samples (as in + MkvChain); or +- via a **no-op delta frame** whose run lengths reproduce the previous assignment unchanged. This + arises when a repeat must be emitted as its own frame rather than merged into a neighbor's count. + +As with the other variants, a frame `count` of `0` is invalid and MUST be rejected by readers. + +## Reader Rules + +A reader MUST: + +1. Read and validate the 17-byte `TWODELTA BEN FILE` banner. +1. For a plain `.ben` stream: read frames as `[tag][body]`, dispatching `0x00` to the snapshot body + and `0x01` to the delta body; reject any other tag. End the stream on a clean EOF at a tag + boundary; treat an EOF inside a body as truncation. +1. For an `.xben` body: after LZMA2 decompression and the banner, read body frames by leading byte, + dispatching `0x00` to the full layout and `0x02` to the chunk layout; reject any other tag. +1. Maintain the previous assignment. Reconstruct snapshot/full frames directly; apply delta/chunk + frames against the previous assignment. Error if a delta is encountered before any anchor. +1. Treat each frame's `count` as its expanded sample multiplicity (reject `count == 0`), and add it + to the expanded sample count. + +## Relationship to the Other Variants + +- A TwoDelta **snapshot** frame body is byte-identical to an MkvChain frame; the only stream-level + difference is the leading tag byte. The anchor and every fallback snapshot reuse that layout. +- TwoDelta is the only variant whose frames depend on prior frames, and the only one whose `.xben` + body is columnar rather than BEN32. Converting a TwoDelta stream to Standard or MkvChain requires + replaying the deltas into full assignments; converting the other direction requires a pairwise + ReCom sampler's transition structure and is only valid when every accepted move changes exactly + two districts. +- TwoDelta is **not compatible** with random sampling (consecutive samples can differ in arbitrarily + many ids) or with Forest ReCom (a single move can touch more than two districts). For those, use + Standard or MkvChain. + +## Versioning Strategy + +The frame tags, the snapshot/delta/full/chunk layouts, the bit-packing convention, and the pair +ordering invariant are contractual: committed fixtures encoded under a stable major version MUST +continue to decode in every later release of that major version. Any change to a layout or to the +pair-ordering rule is a breaking change requiring a new fixture set under a new major version; +existing fixtures are never regenerated in place. See the format-stability policy. + +## Out of Scope + +- The Standard and MkvChain variants (independent snapshot frames; BEN32 `.xben` body). +- The `.bendl` bundle container that embeds a BEN/XBEN stream as an opaque payload. +- LZMA2 framing details; XBEN treats LZMA2 as an opaque outer wrapper around the columnar body. From 0ae330cde861670fef94b991dbac68f455efd5f0 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 9 Jun 2026 07:08:54 -0600 Subject: [PATCH 139/221] update docs --- .gitignore | 3 +- Taskfile.yml | 12 +- ben-py/docs/api/bundle.md | 135 ++++++++++ ben-py/docs/api/codec.md | 55 +++++ ben-py/docs/api/graph.md | 39 +++ ben-py/docs/api/stream.md | 49 ++++ ben-py/docs/concepts/cli-parity.md | 58 +++++ ben-py/docs/concepts/compatibility.md | 99 ++++++++ ben-py/docs/concepts/compression.md | 4 +- ben-py/docs/concepts/data-model.md | 147 +++++++++++ ben-py/docs/concepts/formats.md | 73 ++++++ ben-py/docs/concepts/jsonl-schema.md | 88 +++++++ ben-py/docs/concepts/limitations.md | 78 ++++++ ben-py/docs/concepts/ordering-deep-dive.md | 91 +++++++ ben-py/docs/concepts/overview.md | 7 + ben-py/docs/concepts/performance.md | 111 +++++++++ ben-py/docs/concepts/variants.md | 64 +++-- ben-py/docs/conf.py | 45 ++-- ben-py/docs/how-to/api-cookbook.md | 173 +++++++++++++ ben-py/docs/how-to/compress-gerrychain-run.md | 66 +++-- ben-py/docs/how-to/end-to-end-workflow.md | 116 +++++++++ ben-py/docs/how-to/error-reference.md | 159 ++++++++++++ ben-py/docs/how-to/examples-gallery.md | 76 ++++++ ben-py/docs/how-to/index.md | 35 +++ ben-py/docs/how-to/shrink-for-sharing.md | 4 +- ben-py/docs/how-to/subsample.md | 3 + ben-py/docs/how-to/troubleshooting.md | 163 +++++++++++++ ben-py/docs/index.md | 16 +- ben-py/docs/user/using_bendl.ipynb | 8 +- ben-py/pyproject.toml | 2 + ben-py/uv.lock | 230 ++++++++++++++++++ 31 files changed, 2143 insertions(+), 66 deletions(-) create mode 100644 ben-py/docs/concepts/cli-parity.md create mode 100644 ben-py/docs/concepts/compatibility.md create mode 100644 ben-py/docs/concepts/data-model.md create mode 100644 ben-py/docs/concepts/jsonl-schema.md create mode 100644 ben-py/docs/concepts/limitations.md create mode 100644 ben-py/docs/concepts/ordering-deep-dive.md create mode 100644 ben-py/docs/concepts/performance.md create mode 100644 ben-py/docs/how-to/api-cookbook.md create mode 100644 ben-py/docs/how-to/end-to-end-workflow.md create mode 100644 ben-py/docs/how-to/error-reference.md create mode 100644 ben-py/docs/how-to/examples-gallery.md create mode 100644 ben-py/docs/how-to/troubleshooting.md diff --git a/.gitignore b/.gitignore index cb433f5..6abf718 100755 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ dev_files demo __pycache__ -*.so \ No newline at end of file +*.so +/ben-py/docs/code-theme-preview.md diff --git a/Taskfile.yml b/Taskfile.yml index fb0892e..df0a8dc 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -344,11 +344,17 @@ tasks: - uv run --extra docs --extra docs-exec sphinx-build -W -b dirhtml docs docs/_build docs-serve: - desc: Build the docs and serve them at http://localhost:8000 + desc: Serve the docs at http://localhost:8000, live-rebuilding on every change dir: ben-py cmds: - - task: docs - - uv run python -m http.server 8000 --directory docs/_build + # sphinx-autobuild watches the docs source and rebuilds + reloads the browser on + # save. Editing conf.py (palettes, Pygments styles) triggers a full rebuild. The + # generated pygments-themes.css lives under docs/_generated, which conf.py rewrites + # each build — ignore it (and the build/cache dirs) so the watcher doesn't loop. + - >- + uv run --extra docs sphinx-autobuild -b dirhtml docs docs/_build + --ignore "*/_generated/*" --ignore "*/_build/*" --ignore "*/.jupyter_cache/*" + --port 8000 --open-browser docs-test: desc: Execute every Python code block in the Markdown docs diff --git a/ben-py/docs/api/bundle.md b/ben-py/docs/api/bundle.md index 73181be..250d964 100644 --- a/ben-py/docs/api/bundle.md +++ b/ben-py/docs/api/bundle.md @@ -1,11 +1,91 @@ # `binary_ensemble.bundle` +The bundle module is the recommended high-level API. It writes and reads `.bendl` files: +single-file containers that hold an assignment stream plus graph, metadata, permutation +maps, and custom assets. + +## When to use it + +Use this module when you want the file to be self-describing. That is the normal case for +redistricting ensembles because an assignment is only meaningful with the graph node order it +was written against. + +| Task | API | +|---|---| +| Create a new bundle | `BendlEncoder(path, overwrite=True)` | +| Attach a dual graph | `encoder.add_graph(graph, sort=...)` | +| Stream assignments while sampling | `with encoder.stream("ben") as stream: ...` | +| Read assignments and assets | `BendlDecoder(path)` | +| Reorder/relabel an existing bundle | `relabel_bundle(...)` | +| Recompress a bundle to XBEN | `compress_stream(...)` | + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +assert len(decoder) == decoder.count_samples() +assert decoder.assignment_format() in {"ben", "xben"} +``` + ```{eval-rst} .. automodule:: binary_ensemble.bundle ``` ## Encoder +`BendlEncoder` has two modes: + +| Mode | Open with | Stream writes | Asset writes | +|---|---|---:|---:| +| Create | `BendlEncoder(path, overwrite=True)` | one stream | before or after the stream | +| Append | `BendlEncoder.append(path)` | unavailable | immediate appends to a finalized bundle | + +The stream context finalizes the bundle when it closes cleanly. You only need to use the +encoder itself as a context manager for assets-only bundles or if that style is clearer in +your code. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("api-demo.bendl", overwrite=True) +encoder.add_metadata({"sampler": "demo"}) + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + stream.write([1, 2, 2, 2]) +``` + +### Graph handling + +`add_graph()` accepts NetworkX adjacency JSON, a path to that JSON, raw bytes, or a readable +object. By default it reorders with `sort="mlc"` for better compression and returns the +reordered NetworkX graph. Write assignments in the returned graph's node order. + +| `sort` | Meaning | Needs `key`? | Stores permutation map? | +|---|---|---:|---:| +| `"mlc"` | Multi-level clustering; topology-based default | no | yes | +| `"rcm"` | Reverse Cuthill-McKee topology ordering | no | yes | +| `"key"` | Sort nodes by a node attribute | yes | yes | +| `None` | Store the graph as-is | no | no | + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) +for node in graph.nodes: + graph.nodes[node]["GEOID20"] = f"{node:04d}" + +encoder = BendlEncoder("api-graph.bendl", overwrite=True) +ordered_graph = encoder.add_graph(nx.adjacency_data(graph), sort="key", key="GEOID20") + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + +assert ordered_graph.number_of_nodes() == 4 +``` + ```{eval-rst} .. autoclass:: binary_ensemble.bundle.BendlEncoder :members: @@ -13,6 +93,18 @@ ## The stream session +`BendlEncoder.stream()` returns a `BendlStreamSession`. It is intentionally small: write +assignments, then close. A bundle can have only one assignment stream. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("api-session.bendl", overwrite=True) +with encoder.stream("ben", variant="twodelta") as stream: + for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: + stream.write(assignment) +``` + ```{eval-rst} .. autoclass:: binary_ensemble._core.BendlStreamSession :members: @@ -20,6 +112,37 @@ ## Decoder +`BendlDecoder` iterates the embedded stream and exposes bundle inspection methods. + +| Method | Use | +|---|---| +| `len(decoder)` / `count_samples()` | Expanded number of samples | +| `assignment_format()` | `"ben"` or `"xben"` for the embedded stream | +| `version()` / `is_complete()` | Bundle header inspection | +| `asset_names()` / `list_assets()` | Asset directory inspection | +| `read_graph()` | `networkx.Graph` rebuilt from `graph.json`, or `None` | +| `read_metadata()` | Parsed `metadata.json`, or `None` | +| `read_node_permutation_map()` | Parsed permutation map, or `None` | +| `read_json_asset(name)` | Parse a JSON asset | +| `read_asset_bytes(name)` | Raw bytes for any asset | +| `extract_stream(path)` | Copy the embedded stream out as `.ben` or `.xben` bytes | +| `subsample_*()` | Iterate only selected samples | + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(decoder.asset_names()) +print(decoder.read_metadata()) + +for assignment in decoder.subsample_range(1, 3): + print(assignment[:4]) +``` + +Iteration rewinds on a fresh `for` loop. Do not drive two simultaneous loops from the same +decoder object; open a second decoder if you need independent cursors. + ```{eval-rst} .. autoclass:: binary_ensemble.bundle.BendlDecoder :members: @@ -27,6 +150,18 @@ ## Whole-bundle transforms +These functions preserve bundle assets while rewriting the embedded stream. + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="api-sorted.bendl", sort="mlc") +compress_stream("api-sorted.bendl", out_file="api-archive.bendl") +``` + +Both transforms require exactly one output mode: pass `out_file=...` to create a new file or +`in_place=True` to atomically replace the input. + ```{eval-rst} .. autofunction:: binary_ensemble.bundle.compress_stream diff --git a/ben-py/docs/api/codec.md b/ben-py/docs/api/codec.md index f26718c..287d4b0 100644 --- a/ben-py/docs/api/codec.md +++ b/ben-py/docs/api/codec.md @@ -1,11 +1,55 @@ # `binary_ensemble.codec` +The codec module contains whole-file transforms. These functions do not expose an iterator: +they read one file and write another. + +Use them for conversion jobs. Use {mod}`binary_ensemble.stream` for sample-by-sample access +to plain streams, and {mod}`binary_ensemble.bundle` when graph and metadata should stay with +the assignments. + +## Inputs and outputs + +| Function family | Input | Output | Carries assets? | +|---|---|---|---:| +| `encode_jsonl_to_*` | JSON Lines with an `assignment` field | BEN or XBEN stream | no | +| `encode_ben_to_xben` | BEN stream | XBEN stream | no | +| `decode_*_to_jsonl` | BEN or XBEN stream | JSON Lines | no | +| `decode_xben_to_ben` | XBEN stream | BEN stream | no | + +The expected JSONL shape is: + +```json +{"assignment": [1, 1, 2, 2], "sample": 1} +{"assignment": [1, 2, 2, 2], "sample": 2} +``` + +Only the `assignment` values are encoded into the stream. Store graph data, sampler +settings, scores, and provenance in a `.bendl` bundle if they need to travel with the file. + ```{eval-rst} .. automodule:: binary_ensemble.codec ``` ## Encoders +```python +from binary_ensemble import encode_ben_to_xben, encode_jsonl_to_ben, encode_jsonl_to_xben + +encode_jsonl_to_ben("plans.jsonl", "api-plans.ben", overwrite=True) +encode_ben_to_xben("api-plans.ben", "api-plans.xben", overwrite=True) + +encode_jsonl_to_xben( + "plans.jsonl", + "api-direct.xben", + overwrite=True, + variant="twodelta", + compression_level=9, +) +``` + +`variant=` is only used when creating BEN frames from assignments. XBEN-specific knobs +(`n_threads`, `compression_level`, and `xz_block_size`) tune the LZMA2 stage. + ```{eval-rst} .. autofunction:: binary_ensemble.codec.encode_jsonl_to_ben @@ -16,6 +60,17 @@ ## Decoders +```python +from binary_ensemble import decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl + +decode_ben_to_jsonl("chain.ben", "api-chain.jsonl", overwrite=True) +decode_xben_to_ben("chain.xben", "api-chain.ben", overwrite=True) +decode_xben_to_jsonl("chain.xben", "api-chain-from-xben.jsonl", overwrite=True) +``` + +Decoding auto-detects the stream variant from the file; you never pass `variant=` when +reading. + ```{eval-rst} .. autofunction:: binary_ensemble.codec.decode_ben_to_jsonl diff --git a/ben-py/docs/api/graph.md b/ben-py/docs/api/graph.md index 822a066..5b4cc16 100644 --- a/ben-py/docs/api/graph.md +++ b/ben-py/docs/api/graph.md @@ -1,11 +1,50 @@ # `binary_ensemble.graph` +The graph module exposes the same reordering algorithms used by +`BendlEncoder.add_graph()` and `relabel_bundle()`. Use it when you want to inspect or manage +the graph order yourself before writing assignments. + +Each function returns `(reordered_graph, node_permutation_map)`. + +| Function | Ordering | +|---|---| +| `reorder(graph, sort="mlc")` | Dispatch helper for all orderings | +| `reorder_multi_level_cluster(graph)` | Recursive topology-based clustering | +| `reorder_reverse_cuthill_mckee(graph)` | Reverse Cuthill-McKee bandwidth reduction | +| `reorder_by_key(graph, key)` | Sort by a node attribute, or `"id"` for node id | + ```{eval-rst} .. automodule:: binary_ensemble.graph ``` ## Reordering functions +```python +import networkx as nx + +from binary_ensemble import graph + +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(4, 4)) +for node in dual_graph.nodes: + dual_graph.nodes[node]["GEOID20"] = f"{node:04d}" + +adjacency = nx.adjacency_data(dual_graph) +reordered, permutation_map = graph.reorder(adjacency, sort="key", key="GEOID20") + +assert reordered.number_of_nodes() == dual_graph.number_of_nodes() +assert "node_permutation_old_to_new" in permutation_map +``` + +The returned `reordered` graph is a NetworkX graph in the new node order. If you write an +assignment stream against this graph, emit assignment values in `list(reordered.nodes)` +order. + +```{tip} +If you are creating a bundle, `BendlEncoder.add_graph(..., sort=...)` is usually simpler: +it reorders the graph, stores `graph.json`, stores `node_permutation_map.json`, and returns +the reordered graph in one call. +``` + ```{eval-rst} .. autofunction:: binary_ensemble.graph.reorder diff --git a/ben-py/docs/api/stream.md b/ben-py/docs/api/stream.md index ae63f50..f3818fe 100644 --- a/ben-py/docs/api/stream.md +++ b/ben-py/docs/api/stream.md @@ -1,11 +1,37 @@ # `binary_ensemble.stream` +The stream module is the low-level API for plain `.ben` and `.xben` files. A plain stream +contains assignments only; it does not carry a graph, metadata, or custom assets. Prefer +{mod}`binary_ensemble.bundle` unless some other tool specifically needs raw stream files. + +## Stream vs. bundle + +| Need | Use | +|---|---| +| Self-describing file with graph and metadata | `BendlEncoder` / `BendlDecoder` | +| Small raw stream for another tool | `BenEncoder` / `BenDecoder` | +| Whole-file JSONL conversion | {mod}`binary_ensemble.codec` | + ```{eval-rst} .. automodule:: binary_ensemble.stream ``` ## Encoder +`BenEncoder` writes `.ben` streams. It does not write `.xben` directly; encode to BEN first, +then call {func}`binary_ensemble.codec.encode_ben_to_xben` for archival compression. + +```python +from binary_ensemble import BenEncoder + +with BenEncoder("api-chain.ben", overwrite=True, variant="twodelta") as encoder: + encoder.write([1, 1, 2, 2]) + encoder.write([1, 2, 2, 2]) +``` + +Variant choices are documented in [Encoding variants](../concepts/variants.md). Decoders +auto-detect the variant from the stream banner, so you only choose it when encoding. + ```{eval-rst} .. autoclass:: binary_ensemble.stream.BenEncoder :members: @@ -13,6 +39,29 @@ ## Decoder +`BenDecoder` iterates plain `.ben` or `.xben` streams. Use `mode="xben"` for XBEN: + +```python +from binary_ensemble import BenDecoder + +ben_decoder = BenDecoder("chain.ben") +xben_decoder = BenDecoder("chain.xben", mode="xben") + +assert len(ben_decoder) == len(xben_decoder) +``` + +The same subsampling methods available on bundles are available here: + +```python +from binary_ensemble import BenDecoder + +for assignment in BenDecoder("chain.ben").subsample_every(25): + print(assignment[:4]) +``` + +Plain BEN is the fastest format for repeated reads and subsampling. XBEN is smaller, but it +pays a decompression startup cost. + ```{eval-rst} .. autoclass:: binary_ensemble.stream.BenDecoder :members: diff --git a/ben-py/docs/concepts/cli-parity.md b/ben-py/docs/concepts/cli-parity.md new file mode 100644 index 0000000..4f798e8 --- /dev/null +++ b/ben-py/docs/concepts/cli-parity.md @@ -0,0 +1,58 @@ +# Python and CLI parity + +The Python package wraps the same Rust engine as the command-line tools. The API names mirror +the CLI split so workflows can move between notebooks, scripts, and shell pipelines. + +## Command map + +| CLI task | Python equivalent | Notes | +|---|---|---| +| Encode JSONL to BEN | `encode_jsonl_to_ben(...)` | Whole-file conversion | +| Encode JSONL to XBEN | `encode_jsonl_to_xben(...)` | Whole-file conversion plus XBEN compression | +| Convert BEN to XBEN | `encode_ben_to_xben(...)` | Plain stream only | +| Decode BEN to JSONL | `decode_ben_to_jsonl(...)` | Plain stream only | +| Decode XBEN to BEN | `decode_xben_to_ben(...)` | Useful before repeated subsampling | +| Decode XBEN to JSONL | `decode_xben_to_jsonl(...)` | Plain stream only | +| Create a BENDL bundle | `BendlEncoder(...)` | Recommended Python workflow | +| Inspect a BENDL bundle | `BendlDecoder(...).list_assets()` | Also exposes graph and metadata helpers | +| Extract a bundle stream | `BendlDecoder(...).extract_stream(...)` | Copies embedded BEN/XBEN stream bytes | +| Append bundle assets | `BendlEncoder.append(...)` | Asset appends only; no stream appends | +| Relabel/reorder a bundle | `relabel_bundle(...)` | Requires BEN stream plus graph | +| Recompress bundle stream | `compress_stream(...)` | BEN bundle to XBEN bundle | +| Reorder a graph | `binary_ensemble.graph.reorder(...)` | Same orderings as bundle relabeling | + +## Plain stream conversion + +```python +from binary_ensemble import encode_ben_to_xben, encode_jsonl_to_ben + +encode_jsonl_to_ben("plans.jsonl", "cli-parity.ben", overwrite=True) +encode_ben_to_xben("cli-parity.ben", "cli-parity.xben", overwrite=True) +``` + +## Bundle inspection + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(decoder.assignment_format()) +print(decoder.count_samples()) +print(decoder.list_assets()) +``` + +## Bundle transform + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="cli-parity-sorted.bendl", sort="mlc") +compress_stream("cli-parity-sorted.bendl", out_file="cli-parity-archive.bendl") +``` + +## Choosing shell or Python + +Use the CLI for shell pipelines and batch conversions. Use Python when the assignment stream +is produced inside Python, when you need to attach structured metadata, or when downstream +analysis is also in Python. diff --git a/ben-py/docs/concepts/compatibility.md b/ben-py/docs/concepts/compatibility.md new file mode 100644 index 0000000..7599573 --- /dev/null +++ b/ben-py/docs/concepts/compatibility.md @@ -0,0 +1,99 @@ +# Compatibility and stability + +This page covers what is stable at the Python package boundary and what belongs to the +underlying binary format. + +## Python package compatibility + +`binary-ensemble` requires Python 3.11 or newer and depends on NetworkX at runtime. + +```python +import binary_ensemble + +assert "BendlEncoder" in binary_ensemble.__all__ +``` + +Pre-built wheels are intended to make normal installation a one-command process: + +```bash +pip install binary-ensemble +``` + +Building from source requires a Rust toolchain and `maturin`; see +[Installation](../getting-started/installation.md). + +## Public Python surface + +Import from these public modules, or from the top-level `binary_ensemble` namespace: + +| Module | Stability expectation | +|---|---| +| `binary_ensemble.bundle` | Public bundle API | +| `binary_ensemble.stream` | Public plain-stream API | +| `binary_ensemble.codec` | Public whole-file conversion API | +| `binary_ensemble.graph` | Public graph-reordering API | + +Do not import from `binary_ensemble._core` in application code. It is the compiled extension +implementation detail behind the public modules. + +## File-format stability + +The byte-level format stability policy lives in the repository-level +[format stability document](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/format-stability.md). +At the Python level, the important rule is simpler: readers auto-detect stream variants, and +bundle readers expose the bundle version through `BendlDecoder.version()`. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.version()) +print(decoder.assignment_format()) +``` + +## Reading older files + +When opening an existing file, start with inspection: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +print(decoder.is_complete()) +print(decoder.version()) +print(decoder.asset_names()) +``` + +For plain streams, use `BenDecoder(path)` for `.ben` and `BenDecoder(path, mode="xben")` for +`.xben`. + +## Reproducibility metadata + +The binary format stores assignments losslessly. Reproducing an analysis also requires the +context around the stream. For serious runs, store at least: + +- package versions, +- sampler name and parameters, +- random seed, +- graph source and hash, +- node-order choice, +- scoring definitions, +- creation date and operator notes. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("compatibility.bendl", overwrite=True) +encoder.add_metadata( + { + "sampler": "ReCom", + "seed": 1234, + "node_order": "GEOID20", + "binary_ensemble": "record the package version here", + } +) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + diff --git a/ben-py/docs/concepts/compression.md b/ben-py/docs/concepts/compression.md index 905fcc2..77a73ee 100644 --- a/ben-py/docs/concepts/compression.md +++ b/ben-py/docs/concepts/compression.md @@ -135,8 +135,8 @@ assignment into the new order, and stores the permutation map: ```python from binary_ensemble import relabel_bundle, compress_stream -relabel_bundle("ensemble.bendl", out_file="ensemble.relabeled.bendl", sort="key", key="GEOID20") -compress_stream("ensemble.relabeled.bendl", out_file="ensemble.xben.bendl") +relabel_bundle("ensemble.bendl", out_file="ensemble-relabeled.bendl", sort="key", key="GEOID20") +compress_stream("ensemble-relabeled.bendl", out_file="ensemble-archive.bendl") ``` See [Shrink a bundle for sharing](../how-to/shrink-for-sharing.md) for the full recipe. diff --git a/ben-py/docs/concepts/data-model.md b/ben-py/docs/concepts/data-model.md new file mode 100644 index 0000000..f2edb3e --- /dev/null +++ b/ben-py/docs/concepts/data-model.md @@ -0,0 +1,147 @@ +# The data contract + +`binary-ensemble` is intentionally small at the storage layer: it stores integer +assignments, optional graph assets, and optional metadata. The important part is making sure +those pieces agree. This page spells out the contract that every encoder, decoder, and +converter assumes. + +## Assignment shape + +An assignment is a plain sequence of integers: + +```python +assignment = [1, 1, 2, 2, 3, 3] +``` + +The list position is the graph node position. The value is the district id assigned to that +node. + +| Rule | Why it matters | +|---|---| +| Every assignment in one stream must have the same length. | A stream represents one ensemble over one fixed dual graph. | +| Values must be positive district ids that fit in 16 bits. | The binary format stores district ids compactly. | +| The order must match the graph order you intend to use when reading. | BEN cannot infer geographic meaning from the values alone. | +| Missing nodes are not represented. | Use one entry per graph node, even for islands or zero-population units. | + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +first_assignment = next(iter(decoder)) + +assert graph is None or len(first_assignment) == graph.number_of_nodes() +``` + +## Node order + +Node order is the most important invariant in the system. Assignment index `i` means "the +node at position `i` in the dual graph." If you change the graph order without changing the +assignment order, the file still decodes successfully but describes the wrong plans. + +Bundles are the recommended default because they keep the graph and assignment stream +together: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() # NetworkX graph in assignment order, or None +metadata = decoder.read_metadata() +``` + +When `BendlEncoder.add_graph(..., sort="mlc")`, `sort="rcm"`, or `sort="key"` reorders a +graph, it returns the reordered graph. Write assignments in that returned order, and store +the permutation map the bundle creates for you. + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) +adjacency = nx.adjacency_data(graph) + +encoder = BendlEncoder("ordered.bendl", overwrite=True) +ordered_graph = encoder.add_graph(adjacency, sort="rcm") +write_order = list(ordered_graph.nodes) +assert len(write_order) == 4 + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +```{warning} +Reordering the graph is lossless, but it is not cosmetic. Once you choose an order, every +assignment written to that stream must use that exact order. +``` + +## JSONL input + +The whole-file codec helpers expect JSON Lines with one JSON object per line and an +`assignment` field: + +```json +{"assignment": [1, 1, 2, 2], "sample": 1} +{"assignment": [1, 2, 2, 2], "sample": 2} +``` + +Extra fields such as `sample`, scores, or metadata can be present in the input JSONL, but +only the assignment stream is encoded into `.ben` or `.xben`. If you need graph metadata, +sampler settings, or scores to travel with the file, put the stream in a `.bendl` bundle and +attach those payloads as assets. + +```python +from binary_ensemble import encode_jsonl_to_ben + +encode_jsonl_to_ben("plans.jsonl", "plans.ben", overwrite=True) +``` + +## Bundle assets + +A `.bendl` bundle can carry well-known assets and custom assets: + +| Asset | Reader helper | Typical payload | +|---|---|---| +| `graph.json` | `read_graph()` or `read_json_asset("graph.json")` | NetworkX adjacency JSON | +| `metadata.json` | `read_metadata()` | Sampler name, seed, date, chain settings | +| `node_permutation_map.json` | `read_node_permutation_map()` | Reversible old-to-new node order map | +| Custom JSON/text asset | `read_json_asset()` or `read_asset_bytes()` | Scores, notes, provenance, run manifests | + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +for asset in decoder.list_assets(): + print(asset["name"], asset["type"], asset["flags"]) +``` + +## Variants and formats + +The data contract is independent of the stream variant. `standard`, `mkv_chain`, and +`twodelta` all decode to the same thing: a sequence of `list[int]` assignments. Choose the +variant for compression behavior, not for downstream semantics. + +Likewise, `.ben`, `.xben`, and `.bendl` carry the same assignment stream at different +packaging layers: + +| Format | Carries assignments | Carries graph/assets | Best use | +|---|---:|---:|---| +| `.ben` | yes | no | Active work, fast streaming and subsampling | +| `.xben` | yes | no | Small plain-stream archive | +| `.bendl` with BEN stream | yes | yes | Recommended working bundle | +| `.bendl` with XBEN stream | yes | yes | Recommended share/archive bundle | + +## Validation checklist + +Before you encode a real ensemble, check these points: + +- Decide the node order once, before the first sample is written. +- If you reorder a graph, run the sampler or assignment extraction in the reordered graph's + node order. +- Keep assignment length constant for the whole stream. +- Store the graph in the bundle unless every reader already has the exact matching graph. +- Store sampler settings, random seed, scoring definitions, and provenance as metadata or + custom assets. +- Use BEN while sampling and iterating; recompress to XBEN when the file is ready to share. diff --git a/ben-py/docs/concepts/formats.md b/ben-py/docs/concepts/formats.md index e58abc6..fc3b077 100644 --- a/ben-py/docs/concepts/formats.md +++ b/ben-py/docs/concepts/formats.md @@ -59,6 +59,79 @@ when you specifically don't want the bundle packaging — for example, feeding a another tool that expects it. ``` +## How each format works + +The three containers build on each other: BEN defines the encoding, XBEN compresses a BEN +stream, and BENDL packages a BEN or XBEN stream with its assets. Here's the mechanism behind +each. + +### BEN: a layered encoding of one assignment at a time + +A BEN stream encodes each sample (one district id per node, read in node order) through four +stacked layers: + +1. **Run-length encoding (RLE).** Consecutive nodes in the same district collapse to + `(district, length)` pairs — `[1, 1, 1, 2, 2, 2, 2, 3]` becomes `[(1, 3), (2, 4), (3, 1)]`. + Fewer, longer runs mean a smaller frame, which is exactly why + [node reordering](compression.md) is the biggest compression lever. +2. **Bit-packing.** Each frame inspects its own largest district id and largest run length, then + packs every value and length to *exactly* that many bits — no wasted bytes. The example above + has a max id of `3` (2 bits) and a max length of `4` (3 bits), so each run costs 5 bits. +3. **Frames.** Each sample becomes one self-describing frame: a short header (the two bit-widths + plus the payload's byte length) followed by the packed payload. Because the header states the + byte length up front, a reader can **skip** a sample it doesn't want without unpacking a single + payload bit — that's what makes [subsampling](../how-to/subsample.md) cheap. +4. **Stream.** A 17-byte banner that names the [variant](variants.md), then the frames written + back-to-back. There's no global index or end marker, so frames can be appended one at a time + while sampling and read back until the input simply runs out. + +The variant only changes the **frame shape** — independent snapshots (`standard`), snapshots plus +repeat counts (`mkv_chain`), or deltas against the previous sample (`twodelta`). All three ride on +the same RLE and bit-packing layers underneath. + +### XBEN: LZMA2 over a byte-aligned rewrite + +XBEN is a BEN stream run through [LZMA2](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) +(the `xz` algorithm) — but **not** over the bit-packed frames directly. Bit-packing makes each plan +small, but it pushes runs across byte boundaries (a run can start mid-byte), so identical patterns +in different plans don't line up as identical bytes — and a byte-oriented compressor can't see the +repetition. + +So XBEN first re-expands the stream into an intermediate columnar form where each run is a +fixed-width, byte-aligned `(value, length)` pair and each frame ends with a zero sentinel. Now +equivalent runs across plans line up **byte-for-byte**, and LZMA2 — which hunts for repeated byte +sequences — collapses the redundancy across the whole ensemble. This is also why +[district relabeling](compression.md) pays off: it makes structurally identical plans encode to +identical bytes, which LZMA2 then deduplicates. + +```{admonition} Asymmetric cost +:class: note +Decompressing XBEN is fast (minutes, even for large files), but high-ratio LZMA2 *compression* is +slow — block-level ensembles can take an hour. Encode to XBEN once for archival or transfer; do +day-to-day reading against a BEN stream. +``` + +### BENDL: a self-describing, crash-recoverable bundle + +A bundle is a single seekable file laid out as a fixed-size header, then the asset payloads, then +the embedded assignment stream, then a directory table at the end: + +- The **header** records which stream format is embedded (BEN or XBEN), where the stream lives, the + expanded **sample count** (so counting an ensemble is an O(1) header read, not a full scan), and a + CRC32C checksum over the stream bytes. +- The **directory table** indexes every asset — the dual graph, the node permutation map, the + metadata, and any custom assets — by offset and length, each with its own CRC32C. A reader can + pull out just the graph without scanning the file, and verify it before trusting it. +- The **assignment stream** is stored opaquely: the bundle never parses BEN/XBEN internals, it just + carries the bytes and notes the format. That's what lets you swap a BEN bundle for an XBEN one by + recompressing only the inner stream. + +The writer lays the file down in order — a provisional header marked *unfinalized*, then assets, +then the stream, then the directory — and **patches the header last** to flip it to finalized and +fill in the final lengths, checksum, and sample count. So if the process dies mid-write, the +partial file is still recoverable (assignments read to end-of-file) and clearly flagged incomplete; +that final header patch is the single commit point. + ## Going deeper The exact byte layouts are documented in the format specifications, for readers building diff --git a/ben-py/docs/concepts/jsonl-schema.md b/ben-py/docs/concepts/jsonl-schema.md new file mode 100644 index 0000000..afae4ad --- /dev/null +++ b/ben-py/docs/concepts/jsonl-schema.md @@ -0,0 +1,88 @@ +# JSONL input schema + +The whole-file codec helpers read JSON Lines: one JSON object per line, each with an +`assignment` field. + +```json +{"assignment": [1, 1, 2, 2], "sample": 1} +{"assignment": [1, 2, 2, 2], "sample": 2} +``` + +Use this format when a sampler has already written plans to disk and you want to convert the +complete file to BEN or XBEN in one call. + +## Required field + +`assignment` +: A list of integer district ids. Every line in one file must describe the same fixed graph + order, so every `assignment` should have the same length. + +```python +import json + +with open("plans.jsonl") as handle: + first = json.loads(next(handle)) + +assert isinstance(first["assignment"], list) +assert all(isinstance(value, int) for value in first["assignment"]) +``` + +## Optional fields + +The codec ignores every field except `assignment`. Fields like `sample`, `score`, +`cut_edges`, or sampler metadata may be useful in the source file, but they are not stored in +plain `.ben` or `.xben` streams. + +If those fields need to travel with the compressed ensemble, use a `.bendl` bundle and store +them as metadata or custom assets. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("jsonl-contract.bendl", overwrite=True) +encoder.add_metadata({"source": "plans.jsonl", "assignment_field": "assignment"}) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## Validation before conversion + +For large files, validate the cheap structural invariants before starting an expensive +conversion: + +```python +import json + +expected_length = None + +with open("plans.jsonl") as handle: + for line_number, line in enumerate(handle, start=1): + row = json.loads(line) + assignment = row["assignment"] + + if expected_length is None: + expected_length = len(assignment) + + if len(assignment) != expected_length: + raise ValueError(f"line {line_number}: assignment length changed") + + if not all(isinstance(value, int) and value > 0 for value in assignment): + raise ValueError(f"line {line_number}: assignment values must be positive integers") +``` + +That check does not prove the assignments match the intended graph order. It only verifies +that the JSONL is structurally safe to encode. Node order is covered in +[The data contract](data-model.md). + +## Conversion + +```python +from binary_ensemble import encode_jsonl_to_ben, encode_jsonl_to_xben + +encode_jsonl_to_ben("plans.jsonl", "plans.ben", overwrite=True) +encode_jsonl_to_xben("plans.jsonl", "plans.xben", overwrite=True) +``` + +Use `encode_jsonl_to_ben()` when you plan to keep working with the ensemble. Use +`encode_jsonl_to_xben()` when the output is immediately going to archive or transfer. + diff --git a/ben-py/docs/concepts/limitations.md b/ben-py/docs/concepts/limitations.md new file mode 100644 index 0000000..fa20c35 --- /dev/null +++ b/ben-py/docs/concepts/limitations.md @@ -0,0 +1,78 @@ +# Limitations and invariants + +This page is intentionally blunt. `binary-ensemble` is designed for a specific data shape: +large streams of district-assignment vectors over a fixed dual graph. It is very good at +that job, but it does not try to be a general geospatial archive format. + +## Assignment-only streams + +Plain `.ben` and `.xben` files store only assignment streams. They do not store: + +- the dual graph, +- node attributes, +- sampler settings, +- per-plan scores, +- provenance metadata. + +Use `.bendl` when that context must travel with the assignments. + +## One graph order per stream + +A stream represents one ensemble over one fixed node order. Every assignment in the stream +must have the same length and the same positional meaning. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +first_assignment = next(iter(decoder)) + +assert graph is None or graph.number_of_nodes() == len(first_assignment) +``` + +If the graph order is wrong, decoding still succeeds because integer vectors are still valid. +The resulting plans are wrong, not unreadable. + +## One stream per bundle + +A `.bendl` bundle carries one assignment stream. You can append assets after finalization, but +you cannot append more samples or add a second stream. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder.append("ensemble.bendl") +encoder.add_asset("notes.txt", "post-run note", content_type="text") +encoder.close() +``` + +## XBEN is not the working format + +XBEN is optimized for storage size, not write speed. Compression can be slow on block-level +ensembles, especially at high compression levels. Use BEN while sampling, iterating, and +subsampling; recompress to XBEN once the file is ready to share. + +## Relabel before XBEN + +`relabel_bundle()` expects a BEN bundle with an embedded graph. Run it before +`compress_stream()`. + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="limited-sorted.bendl", sort="mlc") +compress_stream("limited-sorted.bendl", out_file="limited-archive.bendl") +``` + +## District ids are integers + +Assignments store integer district ids. The practical limit is 16-bit positive district ids, +which is far above normal statewide redistricting use. Non-integer labels should be mapped to +integers before encoding. + +## No geospatial geometry + +Bundles can store graph JSON and custom text or JSON assets, but they do not embed arbitrary +geospatial file trees by default. Store geometry paths, hashes, and provenance in metadata, or +ship the geometry separately when readers need it. diff --git a/ben-py/docs/concepts/ordering-deep-dive.md b/ben-py/docs/concepts/ordering-deep-dive.md new file mode 100644 index 0000000..701047a --- /dev/null +++ b/ben-py/docs/concepts/ordering-deep-dive.md @@ -0,0 +1,91 @@ +# Graph ordering deep dive + +Graph ordering controls the sequence in which assignment values are written. Because BEN uses +run-length encoding, this one choice can dominate the final file size. + +## What ordering changes + +An assignment is positional: + +```python +assignment = [1, 1, 2, 2] +``` + +If the graph order is `[A, B, C, D]`, the assignment says `A -> 1`, `B -> 1`, +`C -> 2`, `D -> 2`. If the graph order changes, the assignment must change with it. + +## The available orderings + +| Ordering | Use when | Strength | Cost | +|---|---|---|---| +| `sort="key"` | Nodes have a meaningful geographic key such as `GEOID20` | Often strongest on Census data | Cheap sort | +| `sort="mlc"` | No reliable key, topology should drive order | Strong default | Graph algorithm | +| `sort="rcm"` | Want topology-based bandwidth reduction | Solid fallback | Graph algorithm | +| `sort=None` | You must preserve existing order exactly | No compression help | None | + +## Try an ordering directly + +```python +import networkx as nx + +from binary_ensemble import graph + +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(4, 4)) +for node in dual_graph.nodes: + dual_graph.nodes[node]["GEOID20"] = f"{node:04d}" + +adjacency = nx.adjacency_data(dual_graph) +ordered_graph, permutation_map = graph.reorder(adjacency, sort="key", key="GEOID20") + +print(list(ordered_graph.nodes)[:4]) +old_to_new = permutation_map["node_permutation_old_to_new"] +print({node: old_to_new[node] for node in list(old_to_new)[:4]}) +``` + +## Use an ordering while creating a bundle + +`add_graph()` returns the reordered graph. That returned graph is the graph whose node order +your assignments should follow. + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +dual_graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) +adjacency = nx.adjacency_data(dual_graph) + +encoder = BendlEncoder("ordering.bendl", overwrite=True) +ordered_graph = encoder.add_graph(adjacency, sort="rcm") +write_order = list(ordered_graph.nodes) + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + +assert len(write_order) == 4 +``` + +## Use an ordering after a bundle already exists + +If you already have a BEN bundle with a graph, use `relabel_bundle()`. It reorders the graph, +rewrites every assignment into that new order, and stores a fresh permutation map. + +```python +from binary_ensemble import relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="ordering-sorted.bendl", sort="mlc") +``` + +## Common mistakes + +- Sorting a graph after writing assignments, without rewriting the assignments. +- Writing assignments in `gerrychain.Graph.nodes` order while storing a differently ordered + graph. +- Sorting by a key that is missing or not unique enough for the intended order. +- Recompressing to XBEN before testing graph order, which makes later repair slower. + +## Recommended decision + +Start with `sort="key", key="GEOID20"` when a Census-like geographic key exists. Otherwise +use `sort="mlc"`. Use `sort=None` only when preserving an external node order is more +important than compression. diff --git a/ben-py/docs/concepts/overview.md b/ben-py/docs/concepts/overview.md index f578251..5fcbd19 100644 --- a/ben-py/docs/concepts/overview.md +++ b/ben-py/docs/concepts/overview.md @@ -65,3 +65,10 @@ The Python package mirrors the project's CLI tools: See [The API map](api-map.md) for when to reach for each, and the [Vocabulary](vocabulary.md) page for the precise meaning of *plan*, *assignment*, *sample*, and *ensemble*. + +For the invariants that must hold across a real run — assignment length, graph node order, +JSONL shape, and bundle assets — see [The data contract](data-model.md). + +For operational guidance after the basics, see [Performance guide](performance.md), +[Graph ordering deep dive](ordering-deep-dive.md), [Limitations and invariants](limitations.md), +and [Compatibility and stability](compatibility.md). diff --git a/ben-py/docs/concepts/performance.md b/ben-py/docs/concepts/performance.md new file mode 100644 index 0000000..3ae7139 --- /dev/null +++ b/ben-py/docs/concepts/performance.md @@ -0,0 +1,111 @@ +# Performance guide + +`binary-ensemble` has two very different performance profiles: + +- BEN is fast to write and read. Use it while sampling, inspecting, and subsampling. +- XBEN is much smaller but slower to create. Use it for archival and transfer. + +## Format trade-offs + +| Format | Write speed | Read speed | Size | Best use | +|---|---|---|---|---| +| JSONL | slowest and largest | simple but bulky | largest | Interchange and debugging | +| BEN | fast | fast | small | Day-to-day work | +| XBEN | slow to create | fast after startup | smallest | Archive and sharing | +| BENDL + BEN | fast | fast | small plus assets | Recommended working file | +| BENDL + XBEN | slow to create | fast after startup | smallest plus assets | Recommended archive file | + +## The biggest lever: node order + +Run-length encoding rewards long stretches of the same district id. A better graph node order +creates longer runs in every assignment. + +```python +import networkx as nx + +from binary_ensemble import graph + +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(8, 8)) +for node in dual_graph.nodes: + dual_graph.nodes[node]["GEOID20"] = f"{node:04d}" + +ordered_graph, permutation_map = graph.reorder( + nx.adjacency_data(dual_graph), + sort="key", + key="GEOID20", +) + +assert ordered_graph.number_of_nodes() == 64 +assert "node_permutation_old_to_new" in permutation_map +``` + +For real Census block graphs, a geographic key such as `GEOID20` is often the best first +try. Without a meaningful key, use `sort="mlc"` or `sort="rcm"`. + +## The second lever: XBEN recompression + +XBEN runs LZMA2 over an XZ representation of the BEN stream. That exploits repetition across +plans and can reduce large block-level ensembles by orders of magnitude. + +```python +from binary_ensemble import compress_stream + +compress_stream("ensemble.bendl", out_file="performance-archive.bendl") +``` + +Expect XBEN compression to be asymmetric: slow to create, much faster to read. On large +ensembles, create the XBEN file once and keep a BEN working copy if you will iterate often. + +## Tuning XBEN + +The plain-stream XBEN encoders expose tuning options: + +```python +from binary_ensemble import encode_ben_to_xben + +encode_ben_to_xben( + "chain.ben", + "performance.xben", + overwrite=True, + n_threads=4, + compression_level=6, + xz_block_size=None, +) +``` + +Guidance: + +| Option | Effect | Practical default | +|---|---|---| +| `n_threads` | Parallelizes compression work | `None` to use the library default | +| `compression_level` | Higher is smaller but slower | `9` for final archive, lower for iteration | +| `xz_block_size` | Controls XZ block sizing | `None` unless benchmarking a specific workload | + +`compress_stream()` for bundles uses the library's bundle recompression defaults. If you +need fine control over XBEN tuning, extract the stream, tune the plain-stream conversion, and +package deliberately. + +## Subsampling + +Subsampling is designed to avoid decoding unneeded samples. + +```python +from binary_ensemble import BendlDecoder + +for assignment in BendlDecoder("ensemble.bendl").subsample_every(25): + print(assignment[:4]) +``` + +BEN streams are cheapest to subsample. XBEN streams pay a decompression startup cost, then +can still skip through the decoded stream efficiently. + +## Practical workflow + +For serious runs: + +1. Reorder the graph before or during bundle creation. +2. Write a BEN bundle while sampling. +3. Attach metadata, graph, and provenance assets. +4. Use BEN for quality checks and analysis. +5. Relabel/reorder the final bundle if needed. +6. Recompress to XBEN for sharing. diff --git a/ben-py/docs/concepts/variants.md b/ben-py/docs/concepts/variants.md index ff73875..bcb130b 100644 --- a/ben-py/docs/concepts/variants.md +++ b/ben-py/docs/concepts/variants.md @@ -8,36 +8,64 @@ back. You choose a variant with the `variant=` argument on the encoders and the `encode_jsonl_to_*` helpers. +All three sit on the same foundation — run-length encoding and bit-packing into frames, behind +a banner (see [how BEN works](formats.md#how-each-format-works)). What differs is the **frame +shape**: whether a frame stands alone, carries a repeat count, or is a difference against the +plan before it. That choice also decides whether you can [subsample](../how-to/subsample.md) by +skipping frames or have to replay them. + ## `standard` -Each plan is stored independently — RLE + bit-packing, nothing more. It's the simplest -encoding and the baseline. For ensembles with no repetition, its output is very slightly -smaller than `mkv_chain`; for chains with repeats, the other variants win comfortably. +Each plan is stored independently — RLE + bit-packing, nothing more. One sample is one frame: a +small header (the bit-widths and the payload byte length) followed by the packed runs. Nothing in +a frame refers to any other frame. + +Because every frame is self-contained and its byte length is in the header, a reader can **skip +straight over** frames it doesn't want without unpacking them — so random access and subsampling +are cheap. For ensembles with no repetition its output is very slightly smaller than `mkv_chain`; +for chains with repeats, the other variants win comfortably. - **Good for:** any ensemble; a safe baseline. ## `mkv_chain` -Like `standard`, but identical consecutive plans are collapsed into a single frame carrying a -repetition count. This is built for **MCMC chains logged in full** — including self-loops, -where a proposal was rejected and the same plan repeats (as in -[Reversible ReCom](https://mggg.org/rrc)). +Like `standard`, but each frame also carries a **repetition count**. A run of identical +consecutive plans collapses into one frame plus a count of *N*, which the reader expands back into +*N* samples — so the stored sample count is preserved while the bytes aren't. This is built for +**MCMC chains logged in full**, where a rejected proposal leaves the same plan repeated on +consecutive steps (self-loops, as in [Reversible ReCom](https://mggg.org/rrc)). + +Frames are still independently decodable, so `mkv_chain` keeps `standard`'s cheap frame-skip +subsampling. It only beats `standard` when consecutive plans actually repeat; with no repeats the +extra count field makes it marginally larger. - **Good for:** full-chain MCMC ensembles where rejections produce repeated plans. ## `twodelta` -The **default**, and usually the best general-purpose choice. It delta-encodes **pairwise -ReCom steps**: when two consecutive plans differ by exactly one recombination move (two -districts swap some nodes, nothing else changes), only the difference is stored. Any other -transition — a multi-district move, independent/random sampling, a newly created district — -is stored as a full snapshot frame instead, and identical consecutive plans are handled with -repetition counts. - -Because it falls back to snapshots, `twodelta` is **compatible with every sampler**; non-ReCom -ensembles just produce more snapshot frames and less delta savings. Its best-case compression -comes from a full-chain *pairwise* ReCom ensemble, where nearly every accepted move changes -exactly two districts. +The **default**, and usually the best general-purpose choice. Instead of storing every plan in +full, it stores most plans as the **difference** from the one before. + +The first sample is a full snapshot — the **anchor**. For each later sample, the encoder looks at +how it changed from the previous one and picks a frame type: + +- **Delta frame** — the change is one clean **pairwise** recombination: exactly two districts swap + some nodes (both district ids already exist, nothing else moves). The frame stores just those two + ids and where they now sit, applied on top of the previous plan. This is the case that makes + `twodelta` small. +- **Repeat** — the plan is unchanged, handled with a repetition count like `mkv_chain`. +- **Snapshot frame** — anything else (a multi-district move, a brand-new district id, independent + or random sampling) is stored as a full plan, which also becomes the new anchor for the deltas + that follow. + +Because it always falls back to snapshots, `twodelta` is **compatible with every sampler** — +non-ReCom ensembles just produce more snapshots and less delta savings. Its best case is a +full-chain *pairwise* ReCom ensemble, where nearly every accepted move is a two-district swap. + +The trade-off: a delta frame only makes sense relative to the plan before it, so a reader +reconstructs a sample by **replaying forward from the most recent snapshot**. That means +`twodelta` gives up the cheap frame-skip subsampling that `standard` and `mkv_chain` allow — random +access costs a short replay. - **Good for:** ReCom chains (best case) and as a robust default for anything else. diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index 0c96861..7f2ae4f 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -104,6 +104,7 @@ pygments_style = "friendly" pygments_dark_style = "github-dark" + # -- Color palettes ---------------------------------------------------------- # # Each palette gives furo a light-mode and a dark-mode brand color as @@ -129,12 +130,12 @@ def _brand(primary, content): PALETTES = { - "ocean": {"light": _brand("#0099cd", "#0066a0"), "dark": _brand("#36c5f0", "#5cc8f5")}, + "ocean": {"light": _brand("#0099cd", "#0066a0"), "dark": _brand("#36c5f0", "#5cc8f5")}, "indigo": {"light": _brand("#4f46e5", "#4338ca"), "dark": _brand("#818cf8", "#a5b4fc")}, "forest": {"light": _brand("#047857", "#065f46"), "dark": _brand("#34d399", "#6ee7b7")}, "sunset": {"light": _brand("#ea580c", "#c2410c"), "dark": _brand("#fb923c", "#fdba74")}, - "plum": {"light": _brand("#7c3aed", "#6d28d9"), "dark": _brand("#a78bfa", "#c4b5fd")}, - "slate": {"light": _brand("#334155", "#1e293b"), "dark": _brand("#94a3b8", "#cbd5e1")}, + "plum": {"light": _brand("#7c3aed", "#6d28d9"), "dark": _brand("#a78bfa", "#c4b5fd")}, + "slate": {"light": _brand("#334155", "#1e293b"), "dark": _brand("#94a3b8", "#cbd5e1")}, # From a Huemint palette: a charcoal dark mode with neon-teal accents, and a # matching light mode that carries the teal as a darker, legible shade on white. "aurora": { @@ -215,6 +216,8 @@ def _brand(primary, content): }, # From a Huemint palette: a warm near-black dark mode with a bright orange / # cerulean (complementary) accent pair; light mode darkens both for white. + # "color-brand-content": "#075985", + # "color-brand-content": "#176995", "tangerine": { "dark_pygments": "fruity", "light_pygments": "warm-light", @@ -223,7 +226,7 @@ def _brand(primary, content): "color-background-secondary": "#f1f0e6", "color-foreground-primary": "#140f0c", "color-brand-primary": "#c2410c", - "color-brand-content": "#004483", + "color-brand-content": "#0077c4", }, "dark": { "color-background-primary": "#1c1917", @@ -340,7 +343,15 @@ def _brand(primary, content): def _warm_light(): from pygments.style import Style from pygments.token import ( - Comment, Error, Generic, Keyword, Name, Number, Operator, String, Token, + Comment, + Error, + Generic, + Keyword, + Name, + Number, + Operator, + String, + Token, ) return type( @@ -352,35 +363,35 @@ def _warm_light(): "highlight_color": "#e7dcc4", "styles": { Token: "#20180f", - Comment: "italic #857762", + Comment: "italic #685c4b", Comment.Preproc: "noitalic #b0420a", Keyword: "bold #b0420a", - Keyword.Type: "nobold #9a5b00", - Keyword.Constant: "nobold #9a5b00", + Keyword.Type: "nobold #623c00", + Keyword.Constant: "nobold #623c00", Operator: "#6a4a2a", Operator.Word: "bold #b0420a", - Name.Builtin: "#9a5b00", - Name.Function: "#0a5a86", + Name.Builtin: "bold #623c00", + Name.Function: "#08527d", Name.Class: "bold #0a5a86", Name.Namespace: "bold #0a5a86", Name.Exception: "bold #b3261e", Name.Variable: "#20180f", - Name.Constant: "#9a5b00", + Name.Constant: "#623c00", Name.Decorator: "#b0420a", Name.Attribute: "#0a5a86", - Name.Tag: "bold #0e6a60", - String: "#0e6a60", - String.Doc: "italic #857762", + Name.Tag: "bold #0a544c", + String: "#0a544c", + String.Doc: "italic #685c4b", String.Escape: "bold #b0420a", - Number: "#8a5a00", + Number: "bold #7c2560", Generic.Heading: "bold #20180f", Generic.Subheading: "bold #0a5a86", Generic.Deleted: "#b3261e", - Generic.Inserted: "#0e6a60", + Generic.Inserted: "#0a544c", Generic.Error: "#b3261e", Generic.Emph: "italic", Generic.Strong: "bold", - Generic.Prompt: "bold #857762", + Generic.Prompt: "bold #685c4b", Error: "border:#b3261e", }, }, diff --git a/ben-py/docs/how-to/api-cookbook.md b/ben-py/docs/how-to/api-cookbook.md new file mode 100644 index 0000000..4d4bbb1 --- /dev/null +++ b/ben-py/docs/how-to/api-cookbook.md @@ -0,0 +1,173 @@ +# API cookbook + +Short recipes for common jobs. These examples assume the sample files from +[How-to guides](index.md) exist in the working directory. + +## Create a minimal bundle + +```python +from binary_ensemble import BendlEncoder + +plans = [[1, 1, 2, 2], [1, 2, 2, 2]] + +encoder = BendlEncoder("cookbook-minimal.bendl", overwrite=True) +with encoder.stream("ben") as stream: + for plan in plans: + stream.write(plan) +``` + +## Create a bundle with metadata + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("cookbook-metadata.bendl", overwrite=True) +encoder.add_metadata({"sampler": "demo", "seed": 1234}) + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## Create a bundle with a graph + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) + +encoder = BendlEncoder("cookbook-graph.bendl", overwrite=True) +ordered_graph = encoder.add_graph(nx.adjacency_data(graph), sort="rcm") + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + +assert ordered_graph.number_of_nodes() == 4 +``` + +## Read assignments + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +for assignment in decoder: + print(assignment[:4]) + break +``` + +## Read graph and metadata + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +graph = decoder.read_graph() +metadata = decoder.read_metadata() + +print(graph.number_of_nodes()) +print(metadata["sampler"]) +``` + +## Inspect bundle assets + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +for asset in decoder.list_assets(): + print(asset["name"], asset["type"], asset["flags"]) +``` + +## Add custom assets + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("cookbook-assets.bendl", overwrite=True) +encoder.add_asset("scores.json", '{"cut_edges": [10, 11]}', content_type="json") +encoder.add_asset("notes.txt", "Created for cookbook example.", content_type="text") + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## Append an asset after finalization + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder.append("ensemble.bendl") +encoder.add_asset("cookbook-note.txt", "Added after the run.", content_type="text") +encoder.close() +``` + +## Subsample every Nth plan + +```python +from binary_ensemble import BendlDecoder + +for assignment in BendlDecoder("ensemble.bendl").subsample_every(30): + print(assignment[:4]) +``` + +## Subsample by range + +```python +from binary_ensemble import BendlDecoder + +window = list(BendlDecoder("ensemble.bendl").subsample_range(10, 15)) +assert len(window) == 6 +``` + +## Convert JSONL to BEN and XBEN + +```python +from binary_ensemble import encode_ben_to_xben, encode_jsonl_to_ben + +encode_jsonl_to_ben("plans.jsonl", "cookbook-plans.ben", overwrite=True) +encode_ben_to_xben("cookbook-plans.ben", "cookbook-plans.xben", overwrite=True) +``` + +## Decode XBEN back to BEN + +```python +from binary_ensemble import decode_xben_to_ben + +decode_xben_to_ben("chain.xben", "cookbook-work.ben", overwrite=True) +``` + +## Extract the stream from a bundle + +```python +from binary_ensemble import BendlDecoder + +BendlDecoder("ensemble.bendl").extract_stream("cookbook-extracted.ben", overwrite=True) +``` + +## Relabel and recompress a bundle + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="cookbook-sorted.bendl", sort="mlc") +compress_stream("cookbook-sorted.bendl", out_file="cookbook-archive.bendl") +``` + +## Reorder a graph directly + +```python +import networkx as nx + +from binary_ensemble import graph + +dual_graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) +ordered_graph, permutation_map = graph.reorder(nx.adjacency_data(dual_graph), sort="rcm") + +assert ordered_graph.number_of_nodes() == 4 +assert "node_permutation_old_to_new" in permutation_map +``` diff --git a/ben-py/docs/how-to/compress-gerrychain-run.md b/ben-py/docs/how-to/compress-gerrychain-run.md index a68568e..ab52915 100644 --- a/ben-py/docs/how-to/compress-gerrychain-run.md +++ b/ben-py/docs/how-to/compress-gerrychain-run.md @@ -9,7 +9,11 @@ This recipe needs GerryChain installed: `pip install gerrychain`. `binary-ensemb only ever sees plain lists of integers, so the same pattern works with any sampler. ``` -## Set up the chain +## Reorder the graph before building the chain + +The best compression wins come from graph order. `BendlEncoder.add_graph(..., sort="mlc")` +embeds an MLC-reordered graph and returns that reordered graph as a live NetworkX graph. Build +the GerryChain run on that returned graph so the sampler and the bundle agree on node order. ```python from functools import partial @@ -18,7 +22,17 @@ from gerrychain import Partition, Graph, MarkovChain, updaters, accept from gerrychain.proposals import recom from gerrychain.constraints import contiguous -graph = Graph.from_json("gerrymandria.json") +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("ensemble.bendl", overwrite=True) + +# Explicitly show the default: MLC reorders the graph for better run-length compression. +mlc_graph = encoder.add_graph("gerrymandria.json", sort="mlc") + +# Hand the reordered graph back into GerryChain. This is the load-bearing step: +# the chain now runs in the same node order the bundle stores. +graph = Graph.from_networkx(mlc_graph) +node_order = list(graph.nodes) initial_partition = Partition( graph, @@ -43,18 +57,19 @@ chain = MarkovChain( ## Stream the chain into a bundle -The one thing to get right is **node order**: an assignment vector is only meaningful in the -dual graph's node order, so reorder each plan to match the order you embed. +The one thing to get right is still **node order**. Since the chain was built on +`Graph.from_networkx(mlc_graph)`, each plan should be written in `node_order`, the node order +from that same GerryChain graph. ```python -from binary_ensemble import BendlEncoder - -# The order assignments must be written in. -node_order = list(graph.nodes) - -encoder = BendlEncoder("ensemble.bendl", overwrite=True) -encoder.add_graph("gerrymandria.json", sort=None) # embed the dual graph as-is -encoder.add_metadata({"sampler": "ReCom", "epsilon": 0.01, "steps": 1000}) +encoder.add_metadata( + { + "sampler": "ReCom", + "epsilon": 0.01, + "steps": 1000, + "node_order": "mlc", + } +) with encoder.stream("ben", variant="twodelta") as stream: # twodelta suits ReCom chains for partition in chain: @@ -67,12 +82,29 @@ with encoder.stream("ben", variant="twodelta") as stream: # twodelta suits ReCo That's it — `ensemble.bendl` now holds all 1,000 plans plus the graph and metadata in one file. To read it back, see [Read and iterate an ensemble](read-and-iterate.md). -## Make it smaller +## Why this is better than reordering later + +You *can* write a raw-order BEN bundle and later call `relabel_bundle()` to reorder the graph +and rewrite the stream. But when you control the sampling code, it is cleaner to reorder first: + +1. `add_graph(..., sort="mlc")` stores the reordered graph and permutation map. +2. `Graph.from_networkx(mlc_graph)` makes GerryChain run on that exact graph. +3. `series.loc[node_order]` writes assignments in that exact order. + +That means the working BEN file is already locality-friendly, so every downstream step starts +from the compressed-friendly order. + +## Archive the result + +After the run, recompress the embedded BEN stream to XBEN for sharing: + +```python +from binary_ensemble import compress_stream + +compress_stream("ensemble.bendl", out_file="ensemble-archive.bendl") +``` -The bundle above stores the graph in its original node order. For a much smaller file, reorder -the graph (so assignments form long runs) and recompress to XBEN — see -[Shrink a bundle for sharing](shrink-for-sharing.md). You can do this after the fact, so it -never complicates the sampling loop. +For more on final archival workflows, see [Shrink a bundle for sharing](shrink-for-sharing.md). ```{tip} Encoding `twodelta` (the default) delta-compresses pairwise ReCom moves. If you log a full diff --git a/ben-py/docs/how-to/end-to-end-workflow.md b/ben-py/docs/how-to/end-to-end-workflow.md new file mode 100644 index 0000000..7e25197 --- /dev/null +++ b/ben-py/docs/how-to/end-to-end-workflow.md @@ -0,0 +1,116 @@ +# End-to-end workflow + +This tutorial follows the recommended lifecycle: + +1. prepare a graph, +2. write a BEN bundle while producing assignments, +3. inspect and analyze the bundle, +4. add provenance, +5. relabel and recompress for sharing. + +The code uses a tiny NetworkX grid so it runs anywhere. The same structure applies to a +GerryChain run. + +## Prepare the graph + +```python +import networkx as nx + +SIDE = 4 +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(SIDE, SIDE)) + +for node in dual_graph.nodes: + row, col = divmod(node, SIDE) + dual_graph.nodes[node]["TOTPOP"] = 1 + dual_graph.nodes[node]["GEOID20"] = f"{row:02d}{col:02d}" + +adjacency = nx.adjacency_data(dual_graph) +``` + +## Write the working BEN bundle + +`add_graph()` returns the graph in the order assignments should use. In this toy example the +assignment generator already uses integer node positions, so we only need the node count. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("workflow.bendl", overwrite=True) +ordered_graph = encoder.add_graph(adjacency, sort="key", key="GEOID20") +encoder.add_metadata({"sampler": "toy-grid", "seed": 2026, "node_order": "GEOID20"}) + +node_count = ordered_graph.number_of_nodes() + +with encoder.stream("ben", variant="twodelta") as stream: + for step in range(20): + assignment = [(node + step) % 4 + 1 for node in range(node_count)] + stream.write(assignment) +``` + +## Inspect the result + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("workflow.bendl") + +print(decoder.count_samples()) +print(decoder.assignment_format()) +print(decoder.asset_names()) + +assert decoder.read_graph().number_of_nodes() == node_count +assert decoder.read_metadata()["sampler"] == "toy-grid" +``` + +## Analyze a subset + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("workflow.bendl") + +district_one_sizes = [] +for assignment in decoder.subsample_every(5): + district_one_sizes.append(sum(1 for district in assignment if district == 1)) + +print(district_one_sizes) +``` + +## Attach post-run provenance + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder.append("workflow.bendl") +encoder.add_asset("analysis-notes.txt", "Checked with the end-to-end tutorial.", content_type="text") +encoder.close() +``` + +## Produce a shareable archive + +Relabel/reorder first, then recompress the embedded stream to XBEN. + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("workflow.bendl", out_file="workflow-sorted.bendl", sort="mlc") +compress_stream("workflow-sorted.bendl", out_file="workflow-archive.bendl") +``` + +## Adapting this to GerryChain + +The only GerryChain-specific step is extracting assignments in the same node order as the +graph stored in the bundle. + + +```python +write_order = list(ordered_graph.nodes) + +with encoder.stream("ben", variant="twodelta") as stream: + for partition in chain: + series = partition.assignment.to_series() + stream.write(series.loc[write_order].astype(int).tolist()) +``` + +The invariant is the same for every sampler: the list you pass to `stream.write()` must be in +the embedded graph's node order. diff --git a/ben-py/docs/how-to/error-reference.md b/ben-py/docs/how-to/error-reference.md new file mode 100644 index 0000000..92949ae --- /dev/null +++ b/ben-py/docs/how-to/error-reference.md @@ -0,0 +1,159 @@ +# Error reference + +This page maps common symptoms to likely causes and fixes. + +## Output file already exists + +**Symptom:** a writer or converter raises an `OSError` saying the output path exists. + +**Cause:** writers default to `overwrite=False`. + +**Fix:** choose a new path or pass `overwrite=True`. + +```python +from binary_ensemble import encode_jsonl_to_ben + +encode_jsonl_to_ben("plans.jsonl", "error-reference.ben", overwrite=True) +``` + +## Wrong reader for the file type + +**Symptom:** opening a file raises an error that points you at another decoder. + +**Cause:** `.bendl`, `.ben`, and `.xben` are different containers. + +**Fix:** use the matching reader. + +```python +from binary_ensemble import BendlDecoder, BenDecoder + +bundle = BendlDecoder("ensemble.bendl") +ben_stream = BenDecoder("chain.ben") +xben_stream = BenDecoder("chain.xben", mode="xben") + +assert bundle.assignment_format() in {"ben", "xben"} +assert ben_stream.assignment_format() == "ben" +assert xben_stream.assignment_format() == "xben" +``` + +## `read_graph()` returns `None` + +**Cause:** the bundle has no `graph.json` asset. + +**Fix:** inspect assets, then attach the graph in future bundles. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.asset_names()) +``` + +## Relabeling fails because the bundle has no graph + +**Cause:** `relabel_bundle()` must know the graph order to rewrite assignment positions. + +**Fix:** create bundles with `add_graph()`, or relabel before discarding the graph context. + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) + +encoder = BendlEncoder("error-with-graph.bendl", overwrite=True) +encoder.add_graph(nx.adjacency_data(graph), sort=None) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## Relabeling fails after XBEN recompression + +**Cause:** `relabel_bundle()` works on BEN bundles. XBEN is the final archive step. + +**Fix:** relabel first, then recompress. + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="error-sorted.bendl", sort="mlc") +compress_stream("error-sorted.bendl", out_file="error-archive.bendl") +``` + +## `content_type` is rejected + +**Cause:** `add_asset()` accepts only `content_type="json"` or `content_type="text"` from the +Python wrapper. JSON payloads must be valid UTF-8 JSON; text payloads must be valid UTF-8. + +**Fix:** choose the right content type and validate payloads before writing. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("error-assets.bendl", overwrite=True) +encoder.add_asset("valid.json", '{"ok": true}', content_type="json") +encoder.add_asset("valid.txt", "plain text", content_type="text") + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## `sort="key"` fails + +**Cause:** key ordering requires a `key=` argument, and every node must have the relevant +attribute unless you use `key="id"`. + +**Fix:** provide the key and check the graph attributes. + +```python +import networkx as nx + +from binary_ensemble import graph + +dual_graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) +for node in dual_graph.nodes: + dual_graph.nodes[node]["GEOID20"] = f"{node:04d}" + +ordered_graph, _ = graph.reorder( + nx.adjacency_data(dual_graph), + sort="key", + key="GEOID20", +) + +assert ordered_graph.number_of_nodes() == 4 +``` + +## Assignments decode but downstream maps are wrong + +**Cause:** graph order and assignment order do not match. + +**Fix:** compare assignment length to graph size, then audit how assignments were extracted +from the sampler. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +assignment = next(iter(decoder)) + +assert graph is not None +assert len(assignment) == graph.number_of_nodes() +``` + +Length agreement is necessary, not sufficient. The only complete fix is writing assignments +in the embedded graph's node order. + +## XBEN startup warning + +**Cause:** XBEN must initialize decompression before yielding assignments. + +**Fix:** this is expected. Convert to BEN if you will repeatedly inspect or subsample the +same stream. + +```python +from binary_ensemble import decode_xben_to_ben + +decode_xben_to_ben("chain.xben", "error-work.ben", overwrite=True) +``` diff --git a/ben-py/docs/how-to/examples-gallery.md b/ben-py/docs/how-to/examples-gallery.md new file mode 100644 index 0000000..906cf76 --- /dev/null +++ b/ben-py/docs/how-to/examples-gallery.md @@ -0,0 +1,76 @@ +# Examples gallery + +Small standalone patterns you can paste into scripts. For longer explanations, follow the +links from each example. + +## Minimal bundle + +```python +from binary_ensemble import BendlEncoder, BendlDecoder + +encoder = BendlEncoder("gallery-minimal.bendl", overwrite=True) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + stream.write([1, 2, 2, 2]) + +assert len(BendlDecoder("gallery-minimal.bendl")) == 2 +``` + +See [Quickstart](../getting-started/quickstart.md). + +## Bundle with graph, metadata, and notes + +```python +import networkx as nx + +from binary_ensemble import BendlDecoder, BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) + +encoder = BendlEncoder("gallery-rich.bendl", overwrite=True) +encoder.add_graph(nx.adjacency_data(graph), sort=None) +encoder.add_metadata({"seed": 2026, "sampler": "demo"}) +encoder.add_asset("notes.txt", "Toy gallery bundle.", content_type="text") + +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + +decoder = BendlDecoder("gallery-rich.bendl") +assert decoder.read_graph().number_of_nodes() == 4 +assert decoder.read_metadata()["seed"] == 2026 +``` + +See [Custom assets and appending](custom-assets-and-append.md). + +## Plain stream conversion + +```python +from binary_ensemble import decode_xben_to_ben, encode_ben_to_xben + +encode_ben_to_xben("chain.ben", "gallery-chain.xben", overwrite=True) +decode_xben_to_ben("gallery-chain.xben", "gallery-chain.ben", overwrite=True) +``` + +See [Convert between formats](convert-formats.md). + +## Subsample for diagnostics + +```python +from binary_ensemble import BendlDecoder + +diagnostic_plans = list(BendlDecoder("ensemble.bendl").subsample_every(40)) +assert len(diagnostic_plans) > 0 +``` + +See [Subsample a large ensemble](subsample.md). + +## Archive a final bundle + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="gallery-sorted.bendl", sort="mlc") +compress_stream("gallery-sorted.bendl", out_file="gallery-archive.bendl") +``` + +See [Shrink a bundle for sharing](shrink-for-sharing.md). diff --git a/ben-py/docs/how-to/index.md b/ben-py/docs/how-to/index.md index d125679..3b80808 100644 --- a/ben-py/docs/how-to/index.md +++ b/ben-py/docs/how-to/index.md @@ -57,6 +57,27 @@ encode_ben_to_xben("chain.ben", "chain.xben", overwrite=True) ::::{grid} 1 1 2 2 :gutter: 3 +:::{grid-item-card} End-to-end workflow +:link: end-to-end-workflow +:link-type: doc + +Build a working BEN bundle, inspect it, attach provenance, and archive it as XBEN. +::: + +:::{grid-item-card} API cookbook +:link: api-cookbook +:link-type: doc + +Copy focused snippets for the most common Python API tasks. +::: + +:::{grid-item-card} Examples gallery +:link: examples-gallery +:link-type: doc + +Small standalone patterns for minimal bundles, rich bundles, conversion, subsampling, and archival. +::: + :::{grid-item-card} Compress a GerryChain run :link: compress-gerrychain-run :link-type: doc @@ -99,4 +120,18 @@ Reorder, relabel, and recompress a bundle to its smallest shareable form. Attach metadata and arbitrary blobs, then add more to a finalized bundle. ::: +:::{grid-item-card} Troubleshooting +:link: troubleshooting +:link-type: doc + +Diagnose wrong readers, incomplete bundles, missing graphs, and node-order mismatches. +::: + +:::{grid-item-card} Error reference +:link: error-reference +:link-type: doc + +Map common exceptions and confusing symptoms to causes and fixes. +::: + :::: diff --git a/ben-py/docs/how-to/shrink-for-sharing.md b/ben-py/docs/how-to/shrink-for-sharing.md index 18dd912..3887bbf 100644 --- a/ben-py/docs/how-to/shrink-for-sharing.md +++ b/ben-py/docs/how-to/shrink-for-sharing.md @@ -19,7 +19,7 @@ from binary_ensemble import relabel_bundle # Sort by a geographic key (often the most effective ordering). Use sort="mlc" or # sort="rcm" for a topology-based ordering instead. -relabel_bundle("ensemble.bendl", out_file="ensemble.sorted.bendl", sort="key", key="GEOID20") +relabel_bundle("ensemble.bendl", out_file="ensemble-sorted.bendl", sort="key", key="GEOID20") ``` See [Why reordering shrinks files](../concepts/compression.md) for what `mlc`, `rcm`, and @@ -33,7 +33,7 @@ unchanged: ```python from binary_ensemble import compress_stream -compress_stream("ensemble.sorted.bendl", out_file="ensemble.xben.bendl") +compress_stream("ensemble-sorted.bendl", out_file="ensemble-archive.bendl") ``` The result is a single `.bendl` that's typically orders of magnitude smaller — and still diff --git a/ben-py/docs/how-to/subsample.md b/ben-py/docs/how-to/subsample.md index 7794129..81205fc 100644 --- a/ben-py/docs/how-to/subsample.md +++ b/ben-py/docs/how-to/subsample.md @@ -18,6 +18,9 @@ for assignment in BendlDecoder("ensemble.bendl").subsample_indices([1, 50, 100]) ## By a contiguous range +Ranges are **1-based and inclusive**: `subsample_range(10, 15)` yields samples 10, 11, 12, +13, 14, and 15. + ```python for assignment in BendlDecoder("ensemble.bendl").subsample_range(10, 15): print(assignment[:10]) diff --git a/ben-py/docs/how-to/troubleshooting.md b/ben-py/docs/how-to/troubleshooting.md new file mode 100644 index 0000000..1ea4061 --- /dev/null +++ b/ben-py/docs/how-to/troubleshooting.md @@ -0,0 +1,163 @@ +# Troubleshooting + +Most `binary-ensemble` failures come from one of three sources: the wrong container, an +unfinished bundle, or a mismatch between assignment order and graph order. This guide gives +the quickest checks before you dig into a large run. + +## I opened a file with the wrong reader + +Use the reader that matches the container: + +| File | Reader | +|---|---| +| `.bendl` | `BendlDecoder` | +| `.ben` | `BenDecoder(path)` | +| `.xben` | `BenDecoder(path, mode="xben")` | + +```python +from binary_ensemble import BendlDecoder, BenDecoder + +bundle = BendlDecoder("ensemble.bendl") +plain_ben = BenDecoder("chain.ben") +plain_xben = BenDecoder("chain.xben", mode="xben") + +assert bundle.assignment_format() in {"ben", "xben"} +assert plain_ben.assignment_format() == "ben" +assert plain_xben.assignment_format() == "xben" +``` + +If you want the raw stream from a bundle, extract it: + +```python +from binary_ensemble import BendlDecoder + +BendlDecoder("ensemble.bendl").extract_stream("extracted.ben", overwrite=True) +``` + +## My bundle is incomplete + +A bundle is finalized when the stream context closes cleanly. If the process exits while +writing, the file may contain readable stream bytes but the header remains marked incomplete. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.is_complete()) +``` + +Use context managers around stream writes so finalization happens at the right time: + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder("new.bendl", overwrite=True) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +For an assets-only bundle, use the encoder itself as the context manager or call `close()`: + +```python +from binary_ensemble import BendlEncoder + +with BendlEncoder("assets-only.bendl", overwrite=True) as encoder: + encoder.add_metadata({"kind": "asset index"}) +``` + +## The assignments decode, but the maps look wrong + +This is almost always a node-order problem. Decoding can only recover the integer vectors +that were written; it cannot prove that those vectors line up with the intended geography. + +Check the basics: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +assignment = next(iter(decoder)) + +assert graph is not None +assert len(assignment) == graph.number_of_nodes() +``` + +If the lengths match but the maps still look wrong, confirm that the sampler wrote +assignments in the same node order as `list(graph.nodes)` from the embedded graph. When in +doubt, rebuild a tiny known assignment, write it, and read it back before launching the full +run. + +## `read_graph()` returns `None` + +The bundle does not contain `graph.json`. Plain `.ben` and `.xben` streams never contain a +graph, and a `.bendl` bundle only contains one if the writer called `add_graph()`. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.asset_names()) +``` + +For future runs, attach the graph before or during bundle creation: + +```python +import networkx as nx + +from binary_ensemble import BendlEncoder + +graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) + +encoder = BendlEncoder("with-graph.bendl", overwrite=True) +encoder.add_graph(nx.adjacency_data(graph), sort=None) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) +``` + +## Recompression or relabeling refuses my arguments + +`compress_stream()` and `relabel_bundle()` need exactly one output mode: + +```python +from binary_ensemble import compress_stream, relabel_bundle + +compress_stream("ensemble.bendl", out_file="ensemble-archive.bendl") +relabel_bundle("ensemble.bendl", out_file="ensemble-sorted.bendl", sort="mlc") +``` + +or: + +```python +from binary_ensemble import compress_stream, relabel_bundle + +relabel_bundle("ensemble.bendl", in_place=True, sort="mlc") +compress_stream("ensemble.bendl", in_place=True) +``` + +Passing both `out_file` and `in_place=True`, or passing neither, raises `ValueError`. Relabel +before recompressing to XBEN; relabeling needs a BEN stream and an embedded graph. + +## XBEN compression is slow + +That is expected. XBEN uses high-ratio LZMA2 compression and is meant for archival or +transfer. Work against BEN while sampling, reading, and subsampling; recompress to XBEN once +the bundle is ready to share. + +```python +from binary_ensemble import BendlEncoder, compress_stream + +encoder = BendlEncoder("to-archive.bendl", overwrite=True) +with encoder.stream("ben") as stream: + stream.write([1, 1, 2, 2]) + +compress_stream("to-archive.bendl", out_file="archive-copy.bendl") +``` + +If you need to repeatedly subsample a plain `.xben` stream, decode it back to `.ben` once: + +```python +from binary_ensemble import decode_xben_to_ben + +decode_xben_to_ben("chain.xben", "chain.work.ben", overwrite=True) +``` diff --git a/ben-py/docs/index.md b/ben-py/docs/index.md index 545132d..cbd9e1f 100644 --- a/ben-py/docs/index.md +++ b/ben-py/docs/index.md @@ -80,7 +80,7 @@ Install the package and compress your first ensemble in a few lines. :link-type: doc Dual graphs, assignments, the BEN/XBEN/BENDL formats, and the compression levers — -the mental model behind the API. +the mental model, data contract, performance model, and compatibility story behind the API. ::: :::{grid-item-card} {octicon}`tools` How-to guides @@ -88,7 +88,7 @@ the mental model behind the API. :link-type: doc Task-focused recipes: compress a GerryChain run, subsample, convert formats, -shrink a bundle for sharing. +shrink a bundle for sharing, diagnose errors, and copy cookbook patterns. ::: :::{grid-item-card} {octicon}`code` API reference @@ -114,10 +114,17 @@ getting-started/quickstart concepts/overview concepts/vocabulary +concepts/data-model +concepts/jsonl-schema concepts/formats concepts/variants concepts/compression +concepts/ordering-deep-dive +concepts/performance concepts/api-map +concepts/cli-parity +concepts/limitations +concepts/compatibility ``` ```{toctree} @@ -125,12 +132,17 @@ concepts/api-map :caption: How-to guides how-to/index +how-to/end-to-end-workflow +how-to/api-cookbook +how-to/examples-gallery how-to/compress-gerrychain-run how-to/read-and-iterate how-to/subsample how-to/convert-formats how-to/shrink-for-sharing how-to/custom-assets-and-append +how-to/troubleshooting +how-to/error-reference ``` ```{toctree} diff --git a/ben-py/docs/user/using_bendl.ipynb b/ben-py/docs/user/using_bendl.ipynb index 4a75772..2a4a00a 100644 --- a/ben-py/docs/user/using_bendl.ipynb +++ b/ben-py/docs/user/using_bendl.ipynb @@ -1128,17 +1128,17 @@ "output_type": "stream", "text": [ "/tmp/claude-1000/ipykernel_3730522/3018985229.py:6: UserWarning: XBEN may take a second to start decoding.\n", - " xben_decoder = BendlDecoder(\"example_data/rich.xben.bendl\")\n" + " xben_decoder = BendlDecoder(\"example_data/rich-archive.bendl\")\n" ] } ], "source": [ "# Write a fresh XBEN copy, original preserved. (out_file won't overwrite an\n", "# existing file, so clear any copy from a previous run first.)\n", - "Path(\"example_data/rich.xben.bendl\").unlink(missing_ok=True)\n", - "compress_stream(\"example_data/rich.bendl\", out_file=\"example_data/rich.xben.bendl\")\n", + "Path(\"example_data/rich-archive.bendl\").unlink(missing_ok=True)\n", + "compress_stream(\"example_data/rich.bendl\", out_file=\"example_data/rich-archive.bendl\")\n", "\n", - "xben_decoder = BendlDecoder(\"example_data/rich.xben.bendl\")\n", + "xben_decoder = BendlDecoder(\"example_data/rich-archive.bendl\")\n", "print(\"recompressed format:\", xben_decoder.assignment_format())\n", "print(\"assets preserved: \", xben_decoder.asset_names())\n", "print(\"metadata preserved: \", xben_decoder.read_metadata())\n", diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index 673195f..818a54b 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -37,6 +37,8 @@ docs = [ # MyST markdown + notebook rendering "myst-nb>=1.3.0", "linkify-it-py>=2.0.3", + # Live-reloading local preview server (used by `task docs-serve`) + "sphinx-autobuild>=2024.2.4", ] # Extra deps to *execute* the tutorial notebooks (used by CI and local verification # with NB_EXECUTION_MODE=cache), so every example is run end to end. diff --git a/ben-py/uv.lock b/ben-py/uv.lock index 9776957..a380723 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -27,6 +27,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, ] +[[package]] +name = "anyio" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, +] + [[package]] name = "appnope" version = "0.1.4" @@ -89,6 +102,7 @@ docs = [ { name = "linkify-it-py" }, { name = "myst-nb" }, { name = "sphinx" }, + { name = "sphinx-autobuild" }, { name = "sphinx-copybutton" }, { name = "sphinx-design" }, { name = "sphinxext-opengraph" }, @@ -119,6 +133,7 @@ requires-dist = [ { name = "myst-nb", marker = "extra == 'docs'", specifier = ">=1.3.0" }, { name = "networkx", specifier = ">=3.0" }, { name = "sphinx", marker = "extra == 'docs'", specifier = ">=8.2.3" }, + { name = "sphinx-autobuild", marker = "extra == 'docs'", specifier = ">=2024.2.4" }, { name = "sphinx-copybutton", marker = "extra == 'docs'", specifier = ">=0.5.2" }, { name = "sphinx-design", marker = "extra == 'docs'", specifier = ">=0.6.1" }, { name = "sphinxext-opengraph", marker = "extra == 'docs'", specifier = ">=0.9.1" }, @@ -624,6 +639,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -2202,6 +2226,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3", size = 3589741, upload-time = "2025-03-02T22:31:56.836Z" }, ] +[[package]] +name = "sphinx-autobuild" +version = "2025.8.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, + { name = "sphinx" }, + { name = "starlette" }, + { name = "uvicorn" }, + { name = "watchfiles" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e0/3c/a59a3a453d4133777f7ed2e83c80b7dc817d43c74b74298ca0af869662ad/sphinx_autobuild-2025.8.25.tar.gz", hash = "sha256:9cf5aab32853c8c31af572e4fecdc09c997e2b8be5a07daf2a389e270e85b213", size = 15200, upload-time = "2025-08-25T18:44:55.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/20/56411b52f917696995f5ad27d2ea7e9492c84a043c5b49a3a3173573cd93/sphinx_autobuild-2025.8.25-py3-none-any.whl", hash = "sha256:b750ac7d5a18603e4665294323fd20f6dcc0a984117026d1986704fa68f0379a", size = 12535, upload-time = "2025-08-25T18:44:54.164Z" }, +] + [[package]] name = "sphinx-basic-ng" version = "1.0.0b2" @@ -2355,6 +2396,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, ] +[[package]] +name = "starlette" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/44/ec35f1b6e83094b997da438a02c8c9b0ade2b1e84cfc48bd4656780760a6/starlette-1.2.1.tar.gz", hash = "sha256:9b9b5ebb992e67d6093741e63c2f59e4f6fff986f81163c087867bd7b924b3f6", size = 2701854, upload-time = "2026-05-31T01:07:51.847Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" }, +] + [[package]] name = "tabulate" version = "0.9.0" @@ -2440,6 +2494,123 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] +[[package]] +name = "uvicorn" +version = "0.49.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/1f/fa18009dea8469069cca78a4e877a008ab78f08b064bfc9ab891579077ff/uvicorn-0.49.0.tar.gz", hash = "sha256:ebf4271aa580d9de97f93192d4595176df6e91f9aae919ca73e4fc07df1e66a3", size = 91284, upload-time = "2026-06-03T22:01:30.448Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/fa/e1388bbcf24ef3274f45c0c1c7b501fd14971037c1b6ee23610553307497/uvicorn-0.49.0-py3-none-any.whl", hash = "sha256:ba3d14c3ee7e41c6c654c46c9eb489d33213cdd30aa1696eab1374337c13f68f", size = 71376, upload-time = "2026-06-03T22:01:29.037Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/41/5e1a4bb12aac5f1493fa1bdc11154eca3b258ca4eba65d39c473fe19d8e9/watchfiles-1.2.0.tar.gz", hash = "sha256:c995fba777f1ea992f090f9236e9284cf7a5d1a0130dd5a3d82c598cacd76838", size = 108252, upload-time = "2026-05-18T04:32:04.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/3d/8024c801df84d1587740d0359e7fdd80afeae3d159011f3d5376dd82f18e/watchfiles-1.2.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:704fd259e332e01f9b9c178f4bce9e49027e5587cc2600eeeaf8e76e1c846201", size = 400242, upload-time = "2026-05-18T04:31:19.014Z" }, + { url = "https://files.pythonhosted.org/packages/87/5b/f4dfd45323e949984a3a7f9dc31d1cbb049921e7d98253488dda72ccdaa9/watchfiles-1.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6543cf55d170003296d185c0af981f3e1311564907e1f4e08671fc7693a890a5", size = 394562, upload-time = "2026-05-18T04:30:08.46Z" }, + { url = "https://files.pythonhosted.org/packages/98/d8/19483ef075d601c409bce8bcbb5c0f81a10876fff870400568f08ce484a1/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89d8c2394a065ca86f5d2910ff263ae67c127e1376ccc4f9fc35c71db879f80a", size = 456611, upload-time = "2026-05-18T04:30:45.723Z" }, + { url = "https://files.pythonhosted.org/packages/b1/6a/cc81fbe7ee42f2f22e661a6e12def7807e01b14b2f39e0ff83fd373fd307/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:772b80df316480d894a0e3165fdd19cf77f5d17f9a787f94029465ad0e3529d1", size = 461379, upload-time = "2026-05-18T04:31:29.292Z" }, + { url = "https://files.pythonhosted.org/packages/b1/57/7e669002082c0a0f4fb5113bb70125f7110124b846b0a11bc5ae8e90eac1/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d158cd89df6053823533e06fb1d73c549133bff5f0396170c0e53d9559340717", size = 493556, upload-time = "2026-05-18T04:30:05.44Z" }, + { url = "https://files.pythonhosted.org/packages/45/7d/f60a2b19807b21fe8281f3a8da4f59eef0d5f96825ac4680ba2d4f2ebf91/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d516b3283a758e087841aedb8031549fb41ced08f3db10aa6d2bf32dc042525b", size = 575255, upload-time = "2026-05-18T04:30:40.568Z" }, + { url = "https://files.pythonhosted.org/packages/bd/49/77f5b5e6efbcd57482f74948ebb1b97e5c0046d6b61475042d830c84b3ff/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53b2290c92e0506d102cd448fbc610d87079553f86caa39d67440856a8b8bba5", size = 467052, upload-time = "2026-05-18T04:31:17.942Z" }, + { url = "https://files.pythonhosted.org/packages/ee/5a/73e2959af1b97fd5d556f9a8bdba017be23ceeef731869d5eaa0a753d5a3/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a711b51aec4370d0dcda5b6c09463206f133a5759341d7744b953a7b62e1100e", size = 456858, upload-time = "2026-05-18T04:30:30.182Z" }, + { url = "https://files.pythonhosted.org/packages/50/57/1bc8c27fad7e6c19bddee15d276dbb6ab72480ec01c127afff1673aee417/watchfiles-1.2.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:e2ca07fa7d89195ec0865d3d285666286740bfa83d83e5cee204043a31ecc165", size = 467579, upload-time = "2026-05-18T04:32:15.897Z" }, + { url = "https://files.pythonhosted.org/packages/09/6c/3c2e44edba3553c5e3c3b8c8a2a6dee6b9e12ae2cf4bd2378bebf9dc3038/watchfiles-1.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e0618518f282c4ebff60f5e5b1247b6d91bb8b9f4476947563a1e74acc66f3c6", size = 633253, upload-time = "2026-05-18T04:31:37.123Z" }, + { url = "https://files.pythonhosted.org/packages/30/c2/d8c84a882ab39bbefcc4915ab3e91830b7a7e990c5570b0b69075aba3faf/watchfiles-1.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d191c054d0715c3c95c99df9b8dbf6fd096d8c1e021e8f212e1bd8bc444ccb5", size = 660713, upload-time = "2026-05-18T04:31:24.62Z" }, + { url = "https://files.pythonhosted.org/packages/a9/07/f97736a5fc605364fe67b25e9fa4a6965dfd4840d50c406ada507e9d735f/watchfiles-1.2.0-cp311-cp311-win32.whl", hash = "sha256:9342472aff9b093c5acd4f6d8f70ae0937964ab56542502bcf5579782da69ae8", size = 277222, upload-time = "2026-05-18T04:31:21.131Z" }, + { url = "https://files.pythonhosted.org/packages/cf/99/2b04981977fc2608afd60360d928c6aecf6b950292ca221d98f4005f6694/watchfiles-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:dbd6c97045dad81227c8d040173da044c1de08de64a5ea8b555da4aee1d5fa22", size = 290274, upload-time = "2026-05-18T04:31:45.966Z" }, + { url = "https://files.pythonhosted.org/packages/3c/74/f7f58a7075ee9cf612b0cfcddb78b8cd8234f0742d6f0075cf0da2dde1c6/watchfiles-1.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:57a2d9fa4fb4c2ecae57b13dfff2c7ab53e21a2ba674fe9f05506680fcdcc0d7", size = 283460, upload-time = "2026-05-18T04:31:39.126Z" }, + { url = "https://files.pythonhosted.org/packages/b8/2f/e42c992d2afda3108ea1c02acecc991b9f31d05c14adc2a7cee9ee211fc4/watchfiles-1.2.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:bc13eb17538be00c874699dc0abe4ee2bc8d50bb1166a6b9e175ef3fd7eb8f26", size = 400115, upload-time = "2026-05-18T04:32:02.06Z" }, + { url = "https://files.pythonhosted.org/packages/5f/8f/6af2ea19065c91d8b0ea3516fdfc8c0d349f407e8e9fbf4e5a17360de8ad/watchfiles-1.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2d95ddc1eb6914154253d239089900813f6a767e174b8e6a50e7fdacb7e4236c", size = 393659, upload-time = "2026-05-18T04:30:50.951Z" }, + { url = "https://files.pythonhosted.org/packages/13/01/b32a967c56fb3e3e5be3db52c3d3b87fa4513aa367d8ed1ad96d42952e5f/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f70d8b291ef6e88d19b1f297a6905ddb978888d9272b0d05e6f53309856bcfc", size = 453207, upload-time = "2026-05-18T04:31:04.231Z" }, + { url = "https://files.pythonhosted.org/packages/04/98/97557a812180338cb1abd32e1cffcc4588f59b5f23e0cb006b2ba95ba64a/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:56d8641cf834c2836922899105bd3ce3d0dfc69291d52edf0b4d0436829b34c0", size = 459273, upload-time = "2026-05-18T04:31:50.377Z" }, + { url = "https://files.pythonhosted.org/packages/e8/a8/b4b08dcb7653b8087c6586f7ce649505900e866bbcfe40dc9587af02e686/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2581a94056e55d7d0a31a823ea92bf73749c489ca2285bfdc0fbe6b2bb49d50c", size = 489927, upload-time = "2026-05-18T04:31:42.485Z" }, + { url = "https://files.pythonhosted.org/packages/50/94/3dceea03545d2e5ddfd839f0ddd5e1cecbf1697b5a428d5ba11cef6af95d/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:41bc1199f7523b3f82843c88cbb979180c949caef0342cf90968f178e5d49b01", size = 570476, upload-time = "2026-05-18T04:31:03.071Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f2/d39a5450c3532092b91f81d274360e613c2371bc874a89c7a1a3c5e8d138/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7571e4464cb6e434958f867f7f730b8ab0b75e3f8e5eac0499168486ab3c33a8", size = 465650, upload-time = "2026-05-18T04:30:12.701Z" }, + { url = "https://files.pythonhosted.org/packages/22/24/ed72f68cbc1333ca9b9f2200aa048bb6658ae41709bc1caad4310f4bdffd/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53a384f76b631c3ae5334ce6a52f0baa3a911eb94a4eac7f160079868b716d5", size = 456398, upload-time = "2026-05-18T04:30:13.784Z" }, + { url = "https://files.pythonhosted.org/packages/0d/64/982ef4a4e5bab5b6e5b6becc8cd5e732f6130a78b855f0abec6439a9a135/watchfiles-1.2.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:d20029a60a71a052a24c4db7673bc4de39ab89adbaccbfb5d67987c5d73f424d", size = 465140, upload-time = "2026-05-18T04:31:52.111Z" }, + { url = "https://files.pythonhosted.org/packages/a0/0c/95282abf4ed680b6096010bcfc30c5fa7a041fc5aa5a2ad17a2cc6c75bba/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2cb93af48550faf1cea04c303107c8b75833de7013e57ce27d3b8d21d8d0f58c", size = 630259, upload-time = "2026-05-18T04:31:25.676Z" }, + { url = "https://files.pythonhosted.org/packages/30/45/607c1de1530c4bdcf2cf1d1ecc2505ddba5d96bd43ba9f2b0e79876f850f/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2995c176de7692b86a2e4c58d9ec718f753150a979cb4a754e2b4ffa38e70906", size = 659859, upload-time = "2026-05-18T04:30:24.333Z" }, + { url = "https://files.pythonhosted.org/packages/fa/08/d9e2e0f9e8e6791d33aefc694ad7eefa7f901f63caff84a81ded38692f9c/watchfiles-1.2.0-cp312-cp312-win32.whl", hash = "sha256:7a2cffd17d27d2ecbb310c2b1d8174f222a5495b1a721894afa88ec11e25b898", size = 275480, upload-time = "2026-05-18T04:30:31.307Z" }, + { url = "https://files.pythonhosted.org/packages/1c/e6/9d42569c0102645cc8cea5d8c7d8a1e9d4ada2cb7f05f75e554b8aa2202a/watchfiles-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:f155b3a1b2a5fc89cdc70d47ee5d54e3b75e88efa34982028a35daef9ba00379", size = 288718, upload-time = "2026-05-18T04:32:10.745Z" }, + { url = "https://files.pythonhosted.org/packages/0a/26/88e0dc6ee3898169d7fa22bb6a69cabf2502d2ee25cb8c876d1262d204f8/watchfiles-1.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:8fa585ede612ee9f9e91b18bebf9ba11b9ae29a4e3a0d0cf6fca3e382133f0d5", size = 281026, upload-time = "2026-05-18T04:30:22.23Z" }, + { url = "https://files.pythonhosted.org/packages/d1/4d/70a7feced9f87e2ff26dba42667290f41694fc64646c67261fbb8cab5d5c/watchfiles-1.2.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:01ea8d66f0693b9b60a6541c8d10263091ca9a9060d242f3c1f3143f9aad2c98", size = 399730, upload-time = "2026-05-18T04:31:38.162Z" }, + { url = "https://files.pythonhosted.org/packages/31/3a/0da302f2307aee316922806ebd5726c542cbd787c938271cf14a074c7daf/watchfiles-1.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ba0480b9a74af058f43b337e937a451e109295c420916d68ad24e3dc02f5e44", size = 392842, upload-time = "2026-05-18T04:30:27.051Z" }, + { url = "https://files.pythonhosted.org/packages/db/ef/d5bdb705c224dbc256aa0c1ec47bf4e61ec52558f2afb44a71a1fe4d7015/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f34e26a19f91f710c08e0183429f0d1d15df734e6bc78c31e77b9ea9c433658", size = 452989, upload-time = "2026-05-18T04:31:11.945Z" }, + { url = "https://files.pythonhosted.org/packages/71/29/5495f2c1661949ef7a35e4d71111d129cfe7606414a26887a919d0a55406/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b4e77f6a55f858504069abd35d336a637555c09bca453dde1ee1e5ada8a6a1fb", size = 458978, upload-time = "2026-05-18T04:30:52.606Z" }, + { url = "https://files.pythonhosted.org/packages/d5/8c/7f9c07c433811c2fffd93e13fdfb7135de9aab5f2ae41be08960fa0047dc/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0cb4d80e212f116474a545c21c912b445f16bb0cef9e6a73a498164223e14e2f", size = 490248, upload-time = "2026-05-18T04:31:36.003Z" }, + { url = "https://files.pythonhosted.org/packages/3c/11/d93632febc52fbc21be90231bb7c17fd5387f46c9076fd40a5f9c2ae6910/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b974946a10af379d425e2eef5b62f5c6ebeaccf91d45eaad6f5b27ecd4f91aa0", size = 571847, upload-time = "2026-05-18T04:31:10.862Z" }, + { url = "https://files.pythonhosted.org/packages/55/b4/383173e73aabb07ad1d9c7aa859d95437ac46a6d6a1e11005facda0c9d19/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86bc13c25a8d1fcd70b51d0ce7c9b65e90de5666fcbfd3e34957cc73ee19aeb5", size = 465974, upload-time = "2026-05-18T04:30:17.006Z" }, + { url = "https://files.pythonhosted.org/packages/a7/6c/89b1a230a78f57c52dd8893adb1f92f94411721b6ec12596c56d98c74356/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca148d73dea36c9763aaa351e4d7a51780ec1584217c45276f4fe8239c768b71", size = 454782, upload-time = "2026-05-18T04:30:35.656Z" }, + { url = "https://files.pythonhosted.org/packages/24/62/1732118367cfff0a9fce3bf62ff4bfded09ef5df21d9d446b858b3f70a96/watchfiles-1.2.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:c525543d91961c6955b2636b308569e84a1d1c5f5f2932041ab9ef46422f43e3", size = 465182, upload-time = "2026-05-18T04:30:20.846Z" }, + { url = "https://files.pythonhosted.org/packages/28/96/716f7e5f51339bf22963f3345f9f27d7f3b30e2eadc597e257c881dd3c53/watchfiles-1.2.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:a204794696ffb8f9b10fba6f7cb5216d42f3b2b71860ccac6b6e42f5f10973b0", size = 629841, upload-time = "2026-05-18T04:31:05.397Z" }, + { url = "https://files.pythonhosted.org/packages/4c/fe/c40783950fd771ccf66ab3ec2722d188a9af1c7f96c6e811f36e40c6e03f/watchfiles-1.2.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:10d86db20695afe7997ac9e1717637d6714a8d0220458c33f3d2061f54cec427", size = 658028, upload-time = "2026-05-18T04:31:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/71/72/4508db1856d1d87fcbb3b63f4839bab1b5682cb0e8d224d122263c09654a/watchfiles-1.2.0-cp313-cp313-win32.whl", hash = "sha256:eb283ee99e21ad6443c8cdb06ac5b34b1308c329cbdf03fa02b445363714c799", size = 275183, upload-time = "2026-05-18T04:30:59.57Z" }, + { url = "https://files.pythonhosted.org/packages/f9/36/14b76ca57652e5cc5fd1c11f32a261292c08a0d19a00351013c2549cbfb2/watchfiles-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:a0f27f01bee51861392bb6b7c4fdb290b27d1eb194e9e28788d68102a0e898d9", size = 288059, upload-time = "2026-05-18T04:32:07.937Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8d/0a85e395398d8d20fadfe5c5d32c726eee17a519e78fb356f2cf7531bffe/watchfiles-1.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:3651aa7058595e9cfb75d35dd5ada2bf9f48a5b8a0f3562821d3e210c507e077", size = 280186, upload-time = "2026-05-18T04:31:54.484Z" }, + { url = "https://files.pythonhosted.org/packages/37/68/36db056f1fdcc5f07302f56e631774d6835bcd6fa3ace402304621d5f9e5/watchfiles-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:faea288b6f0ab1902ef08f4ca6de005dccf856c4e0c4f21b8c5fce02d90a1b08", size = 399031, upload-time = "2026-05-18T04:30:44.576Z" }, + { url = "https://files.pythonhosted.org/packages/c1/64/01a9d6f66a82a5c101ce939274106cc72759d62427e153f01edd2b9f87c2/watchfiles-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:01859b11fd9fbca670f4d5da00fbac282cfea9bd67a2125d8b2833a3b5617ea9", size = 391205, upload-time = "2026-05-18T04:30:25.413Z" }, + { url = "https://files.pythonhosted.org/packages/84/2c/0a44fe058cb4bb7b8ede6b6670698bbb7c0400740e378d00022189b7b31d/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fff610d7bb2256a317bb1e96f0d7862c7aa8076733ee5df0fd41bbe76a24a4f4", size = 451892, upload-time = "2026-05-18T04:32:14.005Z" }, + { url = "https://files.pythonhosted.org/packages/67/a1/351e0d56cd35e6488b5c8b4fb11a809a5bc923e8fe8fed9faf8920be0c89/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b141a4891c995a039cd89e9a49e62df1dc8a559a5d1a6e4c7106d16c12777a55", size = 458867, upload-time = "2026-05-18T04:31:22.279Z" }, + { url = "https://files.pythonhosted.org/packages/d5/7d/9d09605187f1b838998624049fcf8bf47b73c1a3b76901fcac1782f62277/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f22943b7770483f6ea0721c6b11d022947a98eb0acae14694de034f4d0d38925", size = 490217, upload-time = "2026-05-18T04:31:43.657Z" }, + { url = "https://files.pythonhosted.org/packages/60/5d/a17a16eccb182f04188cd308ec24b1a71a9b5c4e7098269cf35d9fa56d02/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1bc6195825b7dcd217968bb1f801a60fd4c16e8eeab5bedc7fe917d7d5995ab4", size = 571458, upload-time = "2026-05-18T04:32:11.875Z" }, + { url = "https://files.pythonhosted.org/packages/d3/3d/4dd457062083ab1938e5dfd45032eb425cee2ac817287ca8ff4356183e5d/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4a4b147f5dca2a5d325a06a832fb43f345751adfbc63204aec30e0d9ca965a2", size = 464707, upload-time = "2026-05-18T04:30:43.492Z" }, + { url = "https://files.pythonhosted.org/packages/c6/71/ea8c57b128f5383de74d0c7d2d9c57ad7c9a65a930c451bd25d524b295b7/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4543579a9bdb0c9560039b4ffddbdb39545707659fbc430ce4c10f3f68d557f9", size = 454663, upload-time = "2026-05-18T04:30:16.061Z" }, + { url = "https://files.pythonhosted.org/packages/53/fd/2e812bf938406d7db351f0703ddd3fc6c061cf30d96153a77bc79a943a44/watchfiles-1.2.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:20aa0e708b920bde876a4aa82dc7dd6ebea228a63a67cda6632c2fc87b787efa", size = 463537, upload-time = "2026-05-18T04:31:44.9Z" }, + { url = "https://files.pythonhosted.org/packages/86/56/d17a7f1dd1bc3035f1072694a551301272f1739c2d8e319c927cb9e29b38/watchfiles-1.2.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:d413349d565dab74297f2a63e84a097936be69bf8f3b3801f27f380e32040f44", size = 629194, upload-time = "2026-05-18T04:31:14.141Z" }, + { url = "https://files.pythonhosted.org/packages/be/06/f1ff66bf5cae50aa4062779a0ecd0bbaf15e466195719074078947d9a17d/watchfiles-1.2.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f28b2725eb8cce327b9b3ab02415c853011dc55c95832fe90de6bc56f5315f72", size = 656194, upload-time = "2026-05-18T04:31:47.14Z" }, + { url = "https://files.pythonhosted.org/packages/e7/54/a9c7ea9a82a4ac65e7004c0a03920b5cdd2f9c3b678757d9cd425aa51d53/watchfiles-1.2.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:b8c8358484d5fa12ef34f05b7f4168eaf1932f408725ff6d023c33ec17bd79d4", size = 400205, upload-time = "2026-05-18T04:32:05.153Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5d/c9ab3534374a4a67450696905d6ef16a04405448b8dc52bd752ae50423d4/watchfiles-1.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9f04b092229ad2c50126dd3c922c8822e51e605993764a33058d4a791ab42281", size = 392508, upload-time = "2026-05-18T04:30:54.849Z" }, + { url = "https://files.pythonhosted.org/packages/26/ca/1ad30103535cf0cecd7b993e8d50edc5351b1820e38f2d22e3df58962feb/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a7ce236284f002a156f70add88efe5c70879cccbb658be0822c54b1306fc09d", size = 452448, upload-time = "2026-05-18T04:30:53.727Z" }, + { url = "https://files.pythonhosted.org/packages/37/a1/ceee2cdf2afbd715fa07758d39c9859513eae411b23196f7fd039e5feedd/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b9909cc2b48468b575eefa944919e1fe8a36c5849d5c7c168f80a8c1db69398e", size = 459605, upload-time = "2026-05-18T04:30:23.312Z" }, + { url = "https://files.pythonhosted.org/packages/e8/f6/421e30fd1cb3907a84ed92ab3f1983e37ba2dca015e9a894a048418417a2/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a37faaed405c67e28e6be45a1fa4f206ef5a2860f27c237db9fa30704c38242", size = 490757, upload-time = "2026-05-18T04:30:47.358Z" }, + { url = "https://files.pythonhosted.org/packages/41/b0/55ed1b97ed08be7bba6f9a541cac15f2a858e1d74d2b07b6da70a82aab00/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9649193aa27bd9ff2e80ff29bfaa93085496c7a3a377592823cc58b77ee88add", size = 568672, upload-time = "2026-05-18T04:30:38.915Z" }, + { url = "https://files.pythonhosted.org/packages/d1/cf/d8ae8a80dd7bafab395ea7681c10237311bbf34d37704a8c744e7cf31fc7/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e4ff8e37f99cf1da89e255e07c9c4b37c214038c4283707bdec308cb1b0ea1f", size = 464197, upload-time = "2026-05-18T04:30:09.914Z" }, + { url = "https://files.pythonhosted.org/packages/7c/8a/3076c496ca8dafe0e8cd03fcebdfc47be4b1174b4e5b24ff6e396e6b3af2/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:054dc20fd2e3132b4c3883b4a00d72fd6e1f56fdaf89fccd12e8057d74cd74d7", size = 453181, upload-time = "2026-05-18T04:30:14.829Z" }, + { url = "https://files.pythonhosted.org/packages/e5/10/9745e17c98e7b8a86454df0a3c7b5686bd650383f1e9f26e4ebcbd6cc0c0/watchfiles-1.2.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:e140ed30ebde76796b686e67c182cff10ea2fbab186fafd1560f74bb5a473a6e", size = 465109, upload-time = "2026-05-18T04:30:28.123Z" }, + { url = "https://files.pythonhosted.org/packages/8f/95/8ef4a95481d3e0cb52d62a06fa6e972e81424be2d9698b91a2fecca9904c/watchfiles-1.2.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:bb7e52ecf68ba46d22df23467b87cffeb2146908aa523ebfe803019618cfda06", size = 630653, upload-time = "2026-05-18T04:31:49.304Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e4/3b3bf36b0f829b50c6ebcb8d031583863c59f923d6a6af3d485e470d0fac/watchfiles-1.2.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:23282a321c8baf9b3a3c4afff673f9fe65eb7fdc2338d765ccad9d3d1916a5ba", size = 657838, upload-time = "2026-05-18T04:31:06.497Z" }, + { url = "https://files.pythonhosted.org/packages/21/b1/6cbbb50c1f3002ab568777d44aa21206dfb8807a840990c4037523b51812/watchfiles-1.2.0-cp314-cp314-win32.whl", hash = "sha256:c0db965c5f79aa49fe672d297cf1febc5ad149b658594944f49a54a2b96270a7", size = 275108, upload-time = "2026-05-18T04:30:06.891Z" }, + { url = "https://files.pythonhosted.org/packages/92/45/190ce6db8dcb4536682cf75d3889ff1a27182a58cb519d343cb6d9ea63d8/watchfiles-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:71283b39fd17e5408eb123bd37aeecfd9d54c81fc184421943208aadb879d103", size = 288441, upload-time = "2026-05-18T04:32:12.901Z" }, + { url = "https://files.pythonhosted.org/packages/74/0d/3eae1c2313ab08378431d907c3f8095ecca00f3eda33111cf4f0f2591799/watchfiles-1.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:c5c19526f4e54a00f2666a6c0e9e40d582c09e865055ea7378bf0009aab857b3", size = 280684, upload-time = "2026-05-18T04:31:26.902Z" }, + { url = "https://files.pythonhosted.org/packages/b1/75/fb64e6c25d6b5ca636d03df34ffb1c6e9873303e76d27967e045f8df088f/watchfiles-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:d73a585accffa5ae39c17264c36ec3166d2fad7000c780f5ef83b2722afb9dd2", size = 398857, upload-time = "2026-05-18T04:32:17.108Z" }, + { url = "https://files.pythonhosted.org/packages/73/4e/9f7adf01754cbf81843722ccfec169d8f26c69778281a302855cecd2ee08/watchfiles-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ae99b14c5f21e026e0e9d96f40e07d8570ebee6cafd9d8fc318354606daa7a28", size = 392413, upload-time = "2026-05-18T04:31:07.911Z" }, + { url = "https://files.pythonhosted.org/packages/47/c8/bec626bcc2d69f44b9acb24ce7d60ed7b16b73628eea747fcbd169d8edda/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4429f3b105524a10b72c3a819b091c495d2811d419c1e1e8df773a5a5974f831", size = 452409, upload-time = "2026-05-18T04:31:20.142Z" }, + { url = "https://files.pythonhosted.org/packages/00/b7/b6362068e81e7c556d155a34c35d40ac3ef42d747b06d7f6e5bf58e359c2/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:43d818978d06062d9b22c4fab2ebe44cf5213d42dc8e62bda8c2760cfa2eeb33", size = 458827, upload-time = "2026-05-18T04:32:06.219Z" }, + { url = "https://files.pythonhosted.org/packages/67/f8/9a813fa42afb1e0b4625e75f0479826644d3ee8dc287e093799bc01f390c/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b9f732dc58b2dbe69e464ccf8fff7a03b0dd0be439da4c0720d3558527d3d6b4", size = 490104, upload-time = "2026-05-18T04:31:56.034Z" }, + { url = "https://files.pythonhosted.org/packages/2f/bf/27dfb6094ca4c9aad21298b5525b6c53cb36121ee454331d05161e58d130/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f200104103feb097de4cab8fe4f5dd18a2026934c7dea98c55a2f5fd6d5a33b", size = 571360, upload-time = "2026-05-18T04:31:57.133Z" }, + { url = "https://files.pythonhosted.org/packages/fb/39/44a096d67270ea93df91d33877dbe91fbda3aa4f8ec2edf799d93eda8736/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ac26eefbf4af1741247d6fb68b11c49a25b2f7413fbd318a83a12aaa9cf666", size = 464644, upload-time = "2026-05-18T04:30:57.33Z" }, + { url = "https://files.pythonhosted.org/packages/0e/80/c7472203bad6268e3ef1ad260739704847898938ad7ea8b63a5131f46b50/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c4997d4e4a55f0d02b6cde327322daf3a0400e5df6c6b15948994bf72497925", size = 454771, upload-time = "2026-05-18T04:30:48.736Z" }, + { url = "https://files.pythonhosted.org/packages/51/cf/3b10b268b4b7f0fc26e9debb5eef1998b515887840f444cd3ec80c688755/watchfiles-1.2.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4c887eba18b7945ac73067a8b4a66f21cd46c2539b2bc68588f7be6c7eb6d26b", size = 463494, upload-time = "2026-05-18T04:31:33.826Z" }, + { url = "https://files.pythonhosted.org/packages/3d/3e/a4302545cd589262a0dc7d140e86f7688eba3f9c72776c27f7e23b8864c4/watchfiles-1.2.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:3416ff151bb6b5a8d8d11664974fbef4d9305b9b2957839ab5a270468fd8df30", size = 629383, upload-time = "2026-05-18T04:31:15.596Z" }, + { url = "https://files.pythonhosted.org/packages/db/99/d5649df0a9a410d45b7c882304d0b790903ac9b6e8f2cfd12114e0c6b9f2/watchfiles-1.2.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:0e831a271c035d89789cffc386b6aa1375f39f1cd25eb7ca0997e4970d152fc5", size = 656093, upload-time = "2026-05-18T04:31:58.707Z" }, + { url = "https://files.pythonhosted.org/packages/92/b9/362702539275019a54dd2e94511b31a9b89c5f9e6a21966de7eb692549fc/watchfiles-1.2.0-cp315-cp315-macosx_10_12_x86_64.whl", hash = "sha256:37a6721cdf3f65dbb13aa9503510ccb4451603ac837e44d265d7992a597e1374", size = 400109, upload-time = "2026-05-18T04:31:16.879Z" }, + { url = "https://files.pythonhosted.org/packages/8f/75/71d5ba62db781e5587bded1d944c675374bc4aa37ff33d5018d98e8b6538/watchfiles-1.2.0-cp315-cp315-macosx_11_0_arm64.whl", hash = "sha256:2b37d10b5a63bd4d87e18472d80fa525bd670586fae62e5dd580452764879b65", size = 392167, upload-time = "2026-05-18T04:31:28.058Z" }, + { url = "https://files.pythonhosted.org/packages/3c/01/c66dd95d0423fe30d31820e2d1d5bda773764131bbb6ac0cb1cf303ac328/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a105bc2283f67e8fbec74253ec2d94925de92ed72c0393f1206bf326b7b7b69", size = 452372, upload-time = "2026-05-18T04:31:00.836Z" }, + { url = "https://files.pythonhosted.org/packages/91/15/2fe99557e72f85627c6a8eed50d889e8d101623e060a22ad75b875cb932d/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5327989a465505f05cfe06f04fa9d0c2fd5432bb243e10e6f012b1bdca3c8579", size = 459596, upload-time = "2026-05-18T04:31:34.96Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/d4acfa0023367428ed48351b3b9b267893037b6cadae55620c61c24bcfd4/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecb47f183a8025b2aa18b546725c3657e542112ae9c0613a2af79b4fa8d04ad7", size = 490869, upload-time = "2026-05-18T04:31:59.923Z" }, + { url = "https://files.pythonhosted.org/packages/a4/5f/3164cbdce06c9fb95c4f7b9e2f9760b5e2797af43a9ecc317ef42a23a278/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8520a4ab0e37f770afc34459c4f8f7019e153f9124dc101c15538365875d1ab2", size = 571641, upload-time = "2026-05-18T04:32:00.948Z" }, + { url = "https://files.pythonhosted.org/packages/41/e6/85d3731c55e65cd7690f3f803d24c139588aaf863e4bf2148fe7a7fa1a19/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71cd71740ed2c15211ebb237ced4e39a1cdf6f80566e5fe95428da1626f4fde6", size = 464444, upload-time = "2026-05-18T04:30:34.298Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7d/562641012b8b09872742c3b8adf9629ec479fd78f8d68ae4a0c13da8add6/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f88af53d6ddaf72179ef613ddc905e6f4785f712b49b80b3bef9f3525e6194b4", size = 453593, upload-time = "2026-05-18T04:31:23.464Z" }, + { url = "https://files.pythonhosted.org/packages/56/fe/cb8ef3d6f929d14158fdaaad9925985b7310abc9384dcd4d82dd0016fb59/watchfiles-1.2.0-cp315-cp315-manylinux_2_31_riscv64.whl", hash = "sha256:cee9d5efd929efdac5f7e58f72b3376f676b64050a91c5b99a7094c5b2317488", size = 465096, upload-time = "2026-05-18T04:31:30.384Z" }, + { url = "https://files.pythonhosted.org/packages/25/91/80908e835e100527a9267147b08c0eee1fa6ab0ffec15edc04d1d44885f7/watchfiles-1.2.0-cp315-cp315-musllinux_1_1_aarch64.whl", hash = "sha256:b718bf356bbc15e559bd8ef41782b573b8ae0e3f177ab244b440568d7ea02cfb", size = 630638, upload-time = "2026-05-18T04:30:49.89Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/95ab2f256bb4af3cb2eb23b9317bda984ee6e0f11733a5c004a6c95b06e3/watchfiles-1.2.0-cp315-cp315-musllinux_1_1_x86_64.whl", hash = "sha256:922c0e019fe68b3ae392965a766b02a71ba1168c932cebc3733cd52c5fe5b377", size = 657684, upload-time = "2026-05-18T04:31:32.027Z" }, + { url = "https://files.pythonhosted.org/packages/23/f4/7513ef1e85fc4c6331b59479d6d72661fc391fbe543678052ac72c8b6c19/watchfiles-1.2.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:4674d49eb94706dfe666c069fc0a1b646ffcf920473492e209f6d5f60d3f0cc2", size = 403050, upload-time = "2026-05-18T04:30:36.753Z" }, + { url = "https://files.pythonhosted.org/packages/27/0b/a54103cfd732bb703c7a749222011a0483ef3705948dae3b203158601119/watchfiles-1.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:094b9b70103d4e963499bdea001ee3c2697b144cd9ae6218a62c0f89ec9e31db", size = 396629, upload-time = "2026-05-18T04:32:03.268Z" }, + { url = "https://files.pythonhosted.org/packages/5e/2c/73f31a3b893886206c3f54d73e8ad8dee58cdb2f69ad2622e0a8a9e07f4e/watchfiles-1.2.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0ef001f8c25ad0fa9529f914c1600647ecd0f542d11c19b7894768c67b6acb7", size = 457318, upload-time = "2026-05-18T04:31:01.932Z" }, + { url = "https://files.pythonhosted.org/packages/e9/f9/45d021e4a5cc7b9dd567f7cbb06d3b75f751a690063fb6cc7ec60f4e46b7/watchfiles-1.2.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a88fc94e647bc4eec523f1caa540258eb71d14278b9daf72fa1e2658a98df0f0", size = 457771, upload-time = "2026-05-18T04:30:56.331Z" }, +] + [[package]] name = "wcwidth" version = "0.2.14" @@ -2449,6 +2620,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, ] +[[package]] +name = "websockets" +version = "16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/04/24/4b2031d72e840ce4c1ccb255f693b15c334757fc50023e4db9537080b8c4/websockets-16.0.tar.gz", hash = "sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5", size = 179346, upload-time = "2026-01-10T09:23:47.181Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/db/de907251b4ff46ae804ad0409809504153b3f30984daf82a1d84a9875830/websockets-16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:31a52addea25187bde0797a97d6fc3d2f92b6f72a9370792d65a6e84615ac8a8", size = 177340, upload-time = "2026-01-10T09:22:34.539Z" }, + { url = "https://files.pythonhosted.org/packages/f3/fa/abe89019d8d8815c8781e90d697dec52523fb8ebe308bf11664e8de1877e/websockets-16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:417b28978cdccab24f46400586d128366313e8a96312e4b9362a4af504f3bbad", size = 175022, upload-time = "2026-01-10T09:22:36.332Z" }, + { url = "https://files.pythonhosted.org/packages/58/5d/88ea17ed1ded2079358b40d31d48abe90a73c9e5819dbcde1606e991e2ad/websockets-16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af80d74d4edfa3cb9ed973a0a5ba2b2a549371f8a741e0800cb07becdd20f23d", size = 175319, upload-time = "2026-01-10T09:22:37.602Z" }, + { url = "https://files.pythonhosted.org/packages/d2/ae/0ee92b33087a33632f37a635e11e1d99d429d3d323329675a6022312aac2/websockets-16.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:08d7af67b64d29823fed316505a89b86705f2b7981c07848fb5e3ea3020c1abe", size = 184631, upload-time = "2026-01-10T09:22:38.789Z" }, + { url = "https://files.pythonhosted.org/packages/c8/c5/27178df583b6c5b31b29f526ba2da5e2f864ecc79c99dae630a85d68c304/websockets-16.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be95cfb0a4dae143eaed2bcba8ac23f4892d8971311f1b06f3c6b78952ee70b", size = 185870, upload-time = "2026-01-10T09:22:39.893Z" }, + { url = "https://files.pythonhosted.org/packages/87/05/536652aa84ddc1c018dbb7e2c4cbcd0db884580bf8e95aece7593fde526f/websockets-16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d6297ce39ce5c2e6feb13c1a996a2ded3b6832155fcfc920265c76f24c7cceb5", size = 185361, upload-time = "2026-01-10T09:22:41.016Z" }, + { url = "https://files.pythonhosted.org/packages/6d/e2/d5332c90da12b1e01f06fb1b85c50cfc489783076547415bf9f0a659ec19/websockets-16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c1b30e4f497b0b354057f3467f56244c603a79c0d1dafce1d16c283c25f6e64", size = 184615, upload-time = "2026-01-10T09:22:42.442Z" }, + { url = "https://files.pythonhosted.org/packages/77/fb/d3f9576691cae9253b51555f841bc6600bf0a983a461c79500ace5a5b364/websockets-16.0-cp311-cp311-win32.whl", hash = "sha256:5f451484aeb5cafee1ccf789b1b66f535409d038c56966d6101740c1614b86c6", size = 178246, upload-time = "2026-01-10T09:22:43.654Z" }, + { url = "https://files.pythonhosted.org/packages/54/67/eaff76b3dbaf18dcddabc3b8c1dba50b483761cccff67793897945b37408/websockets-16.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7f0659570eefb578dacde98e24fb60af35350193e4f56e11190787bee77dac", size = 178684, upload-time = "2026-01-10T09:22:44.941Z" }, + { url = "https://files.pythonhosted.org/packages/84/7b/bac442e6b96c9d25092695578dda82403c77936104b5682307bd4deb1ad4/websockets-16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:71c989cbf3254fbd5e84d3bff31e4da39c43f884e64f2551d14bb3c186230f00", size = 177365, upload-time = "2026-01-10T09:22:46.787Z" }, + { url = "https://files.pythonhosted.org/packages/b0/fe/136ccece61bd690d9c1f715baaeefd953bb2360134de73519d5df19d29ca/websockets-16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8b6e209ffee39ff1b6d0fa7bfef6de950c60dfb91b8fcead17da4ee539121a79", size = 175038, upload-time = "2026-01-10T09:22:47.999Z" }, + { url = "https://files.pythonhosted.org/packages/40/1e/9771421ac2286eaab95b8575b0cb701ae3663abf8b5e1f64f1fd90d0a673/websockets-16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:86890e837d61574c92a97496d590968b23c2ef0aeb8a9bc9421d174cd378ae39", size = 175328, upload-time = "2026-01-10T09:22:49.809Z" }, + { url = "https://files.pythonhosted.org/packages/18/29/71729b4671f21e1eaa5d6573031ab810ad2936c8175f03f97f3ff164c802/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9b5aca38b67492ef518a8ab76851862488a478602229112c4b0d58d63a7a4d5c", size = 184915, upload-time = "2026-01-10T09:22:51.071Z" }, + { url = "https://files.pythonhosted.org/packages/97/bb/21c36b7dbbafc85d2d480cd65df02a1dc93bf76d97147605a8e27ff9409d/websockets-16.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e0334872c0a37b606418ac52f6ab9cfd17317ac26365f7f65e203e2d0d0d359f", size = 186152, upload-time = "2026-01-10T09:22:52.224Z" }, + { url = "https://files.pythonhosted.org/packages/4a/34/9bf8df0c0cf88fa7bfe36678dc7b02970c9a7d5e065a3099292db87b1be2/websockets-16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a0b31e0b424cc6b5a04b8838bbaec1688834b2383256688cf47eb97412531da1", size = 185583, upload-time = "2026-01-10T09:22:53.443Z" }, + { url = "https://files.pythonhosted.org/packages/47/88/4dd516068e1a3d6ab3c7c183288404cd424a9a02d585efbac226cb61ff2d/websockets-16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:485c49116d0af10ac698623c513c1cc01c9446c058a4e61e3bf6c19dff7335a2", size = 184880, upload-time = "2026-01-10T09:22:55.033Z" }, + { url = "https://files.pythonhosted.org/packages/91/d6/7d4553ad4bf1c0421e1ebd4b18de5d9098383b5caa1d937b63df8d04b565/websockets-16.0-cp312-cp312-win32.whl", hash = "sha256:eaded469f5e5b7294e2bdca0ab06becb6756ea86894a47806456089298813c89", size = 178261, upload-time = "2026-01-10T09:22:56.251Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f0/f3a17365441ed1c27f850a80b2bc680a0fa9505d733fe152fdf5e98c1c0b/websockets-16.0-cp312-cp312-win_amd64.whl", hash = "sha256:5569417dc80977fc8c2d43a86f78e0a5a22fee17565d78621b6bb264a115d4ea", size = 178693, upload-time = "2026-01-10T09:22:57.478Z" }, + { url = "https://files.pythonhosted.org/packages/cc/9c/baa8456050d1c1b08dd0ec7346026668cbc6f145ab4e314d707bb845bf0d/websockets-16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:878b336ac47938b474c8f982ac2f7266a540adc3fa4ad74ae96fea9823a02cc9", size = 177364, upload-time = "2026-01-10T09:22:59.333Z" }, + { url = "https://files.pythonhosted.org/packages/7e/0c/8811fc53e9bcff68fe7de2bcbe75116a8d959ac699a3200f4847a8925210/websockets-16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:52a0fec0e6c8d9a784c2c78276a48a2bdf099e4ccc2a4cad53b27718dbfd0230", size = 175039, upload-time = "2026-01-10T09:23:01.171Z" }, + { url = "https://files.pythonhosted.org/packages/aa/82/39a5f910cb99ec0b59e482971238c845af9220d3ab9fa76dd9162cda9d62/websockets-16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e6578ed5b6981005df1860a56e3617f14a6c307e6a71b4fff8c48fdc50f3ed2c", size = 175323, upload-time = "2026-01-10T09:23:02.341Z" }, + { url = "https://files.pythonhosted.org/packages/bd/28/0a25ee5342eb5d5f297d992a77e56892ecb65e7854c7898fb7d35e9b33bd/websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:95724e638f0f9c350bb1c2b0a7ad0e83d9cc0c9259f3ea94e40d7b02a2179ae5", size = 184975, upload-time = "2026-01-10T09:23:03.756Z" }, + { url = "https://files.pythonhosted.org/packages/f9/66/27ea52741752f5107c2e41fda05e8395a682a1e11c4e592a809a90c6a506/websockets-16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0204dc62a89dc9d50d682412c10b3542d748260d743500a85c13cd1ee4bde82", size = 186203, upload-time = "2026-01-10T09:23:05.01Z" }, + { url = "https://files.pythonhosted.org/packages/37/e5/8e32857371406a757816a2b471939d51c463509be73fa538216ea52b792a/websockets-16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:52ac480f44d32970d66763115edea932f1c5b1312de36df06d6b219f6741eed8", size = 185653, upload-time = "2026-01-10T09:23:06.301Z" }, + { url = "https://files.pythonhosted.org/packages/9b/67/f926bac29882894669368dc73f4da900fcdf47955d0a0185d60103df5737/websockets-16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6e5a82b677f8f6f59e8dfc34ec06ca6b5b48bc4fcda346acd093694cc2c24d8f", size = 184920, upload-time = "2026-01-10T09:23:07.492Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a1/3d6ccdcd125b0a42a311bcd15a7f705d688f73b2a22d8cf1c0875d35d34a/websockets-16.0-cp313-cp313-win32.whl", hash = "sha256:abf050a199613f64c886ea10f38b47770a65154dc37181bfaff70c160f45315a", size = 178255, upload-time = "2026-01-10T09:23:09.245Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" }, + { url = "https://files.pythonhosted.org/packages/f3/1d/e88022630271f5bd349ed82417136281931e558d628dd52c4d8621b4a0b2/websockets-16.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8cc451a50f2aee53042ac52d2d053d08bf89bcb31ae799cb4487587661c038a0", size = 177406, upload-time = "2026-01-10T09:23:12.178Z" }, + { url = "https://files.pythonhosted.org/packages/f2/78/e63be1bf0724eeb4616efb1ae1c9044f7c3953b7957799abb5915bffd38e/websockets-16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:daa3b6ff70a9241cf6c7fc9e949d41232d9d7d26fd3522b1ad2b4d62487e9904", size = 175085, upload-time = "2026-01-10T09:23:13.511Z" }, + { url = "https://files.pythonhosted.org/packages/bb/f4/d3c9220d818ee955ae390cf319a7c7a467beceb24f05ee7aaaa2414345ba/websockets-16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:fd3cb4adb94a2a6e2b7c0d8d05cb94e6f1c81a0cf9dc2694fb65c7e8d94c42e4", size = 175328, upload-time = "2026-01-10T09:23:14.727Z" }, + { url = "https://files.pythonhosted.org/packages/63/bc/d3e208028de777087e6fb2b122051a6ff7bbcca0d6df9d9c2bf1dd869ae9/websockets-16.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:781caf5e8eee67f663126490c2f96f40906594cb86b408a703630f95550a8c3e", size = 185044, upload-time = "2026-01-10T09:23:15.939Z" }, + { url = "https://files.pythonhosted.org/packages/ad/6e/9a0927ac24bd33a0a9af834d89e0abc7cfd8e13bed17a86407a66773cc0e/websockets-16.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:caab51a72c51973ca21fa8a18bd8165e1a0183f1ac7066a182ff27107b71e1a4", size = 186279, upload-time = "2026-01-10T09:23:17.148Z" }, + { url = "https://files.pythonhosted.org/packages/b9/ca/bf1c68440d7a868180e11be653c85959502efd3a709323230314fda6e0b3/websockets-16.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19c4dc84098e523fd63711e563077d39e90ec6702aff4b5d9e344a60cb3c0cb1", size = 185711, upload-time = "2026-01-10T09:23:18.372Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f8/fdc34643a989561f217bb477cbc47a3a07212cbda91c0e4389c43c296ebf/websockets-16.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a5e18a238a2b2249c9a9235466b90e96ae4795672598a58772dd806edc7ac6d3", size = 184982, upload-time = "2026-01-10T09:23:19.652Z" }, + { url = "https://files.pythonhosted.org/packages/dd/d1/574fa27e233764dbac9c52730d63fcf2823b16f0856b3329fc6268d6ae4f/websockets-16.0-cp314-cp314-win32.whl", hash = "sha256:a069d734c4a043182729edd3e9f247c3b2a4035415a9172fd0f1b71658a320a8", size = 177915, upload-time = "2026-01-10T09:23:21.458Z" }, + { url = "https://files.pythonhosted.org/packages/8a/f1/ae6b937bf3126b5134ce1f482365fde31a357c784ac51852978768b5eff4/websockets-16.0-cp314-cp314-win_amd64.whl", hash = "sha256:c0ee0e63f23914732c6d7e0cce24915c48f3f1512ec1d079ed01fc629dab269d", size = 178381, upload-time = "2026-01-10T09:23:22.715Z" }, + { url = "https://files.pythonhosted.org/packages/06/9b/f791d1db48403e1f0a27577a6beb37afae94254a8c6f08be4a23e4930bc0/websockets-16.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:a35539cacc3febb22b8f4d4a99cc79b104226a756aa7400adc722e83b0d03244", size = 177737, upload-time = "2026-01-10T09:23:24.523Z" }, + { url = "https://files.pythonhosted.org/packages/bd/40/53ad02341fa33b3ce489023f635367a4ac98b73570102ad2cdd770dacc9a/websockets-16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b784ca5de850f4ce93ec85d3269d24d4c82f22b7212023c974c401d4980ebc5e", size = 175268, upload-time = "2026-01-10T09:23:25.781Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/6158d4e459b984f949dcbbb0c5d270154c7618e11c01029b9bbd1bb4c4f9/websockets-16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:569d01a4e7fba956c5ae4fc988f0d4e187900f5497ce46339c996dbf24f17641", size = 175486, upload-time = "2026-01-10T09:23:27.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/2d/7583b30208b639c8090206f95073646c2c9ffd66f44df967981a64f849ad/websockets-16.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50f23cdd8343b984957e4077839841146f67a3d31ab0d00e6b824e74c5b2f6e8", size = 185331, upload-time = "2026-01-10T09:23:28.259Z" }, + { url = "https://files.pythonhosted.org/packages/45/b0/cce3784eb519b7b5ad680d14b9673a31ab8dcb7aad8b64d81709d2430aa8/websockets-16.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:152284a83a00c59b759697b7f9e9cddf4e3c7861dd0d964b472b70f78f89e80e", size = 186501, upload-time = "2026-01-10T09:23:29.449Z" }, + { url = "https://files.pythonhosted.org/packages/19/60/b8ebe4c7e89fb5f6cdf080623c9d92789a53636950f7abacfc33fe2b3135/websockets-16.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bc59589ab64b0022385f429b94697348a6a234e8ce22544e3681b2e9331b5944", size = 186062, upload-time = "2026-01-10T09:23:31.368Z" }, + { url = "https://files.pythonhosted.org/packages/88/a8/a080593f89b0138b6cba1b28f8df5673b5506f72879322288b031337c0b8/websockets-16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206", size = 185356, upload-time = "2026-01-10T09:23:32.627Z" }, + { url = "https://files.pythonhosted.org/packages/c2/b6/b9afed2afadddaf5ebb2afa801abf4b0868f42f8539bfe4b071b5266c9fe/websockets-16.0-cp314-cp314t-win32.whl", hash = "sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6", size = 178085, upload-time = "2026-01-10T09:23:33.816Z" }, + { url = "https://files.pythonhosted.org/packages/9f/3e/28135a24e384493fa804216b79a6a6759a38cc4ff59118787b9fb693df93/websockets-16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd", size = 178531, upload-time = "2026-01-10T09:23:35.016Z" }, + { url = "https://files.pythonhosted.org/packages/72/07/c98a68571dcf256e74f1f816b8cc5eae6eb2d3d5cfa44d37f801619d9166/websockets-16.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:349f83cd6c9a415428ee1005cadb5c2c56f4389bc06a9af16103c3bc3dcc8b7d", size = 174947, upload-time = "2026-01-10T09:23:36.166Z" }, + { url = "https://files.pythonhosted.org/packages/7e/52/93e166a81e0305b33fe416338be92ae863563fe7bce446b0f687b9df5aea/websockets-16.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:4a1aba3340a8dca8db6eb5a7986157f52eb9e436b74813764241981ca4888f03", size = 175260, upload-time = "2026-01-10T09:23:37.409Z" }, + { url = "https://files.pythonhosted.org/packages/56/0c/2dbf513bafd24889d33de2ff0368190a0e69f37bcfa19009ef819fe4d507/websockets-16.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f4a32d1bd841d4bcbffdcb3d2ce50c09c3909fbead375ab28d0181af89fd04da", size = 176071, upload-time = "2026-01-10T09:23:39.158Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8f/aea9c71cc92bf9b6cc0f7f70df8f0b420636b6c96ef4feee1e16f80f75dd/websockets-16.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0298d07ee155e2e9fda5be8a9042200dd2e3bb0b8a38482156576f863a9d457c", size = 176968, upload-time = "2026-01-10T09:23:41.031Z" }, + { url = "https://files.pythonhosted.org/packages/9a/3f/f70e03f40ffc9a30d817eef7da1be72ee4956ba8d7255c399a01b135902a/websockets-16.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a653aea902e0324b52f1613332ddf50b00c06fdaf7e92624fbf8c77c78fa5767", size = 178735, upload-time = "2026-01-10T09:23:42.259Z" }, + { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" }, +] + [[package]] name = "widgetsnbextension" version = "4.0.14" From aa038e4e3a1c34506ac1c59d5f0e0ebbb50309d2 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 9 Jun 2026 10:53:59 -0600 Subject: [PATCH 140/221] a different color theme --- ben-py/docs/conf.py | 95 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index 7f2ae4f..ddb7f89 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -219,7 +219,7 @@ def _brand(primary, content): # "color-brand-content": "#075985", # "color-brand-content": "#176995", "tangerine": { - "dark_pygments": "fruity", + "dark_pygments": "warm-dark", "light_pygments": "warm-light", "light": { "color-background-primary": "#fbfaf2", @@ -300,6 +300,7 @@ def _brand(primary, content): # the "Dark"/"Light" labels are just hints about which mode a style suits. CODE_THEMES = { "Dark": [ + "warm-dark", "github-dark", "gruvbox-dark", "one-dark", @@ -364,30 +365,30 @@ def _warm_light(): "styles": { Token: "#20180f", Comment: "italic #685c4b", - Comment.Preproc: "noitalic #b0420a", - Keyword: "bold #b0420a", + Comment.Preproc: "noitalic #c2410c", + Keyword: "bold #c2410c", Keyword.Type: "nobold #623c00", - Keyword.Constant: "nobold #623c00", + Keyword.Constant: "nobold #e36397", Operator: "#6a4a2a", - Operator.Word: "bold #b0420a", - Name.Builtin: "bold #623c00", - Name.Function: "#08527d", + Operator.Word: "bold #c2410c", + Name.Builtin: "bold #08527d", + Name.Function: "bold #08527d", Name.Class: "bold #0a5a86", Name.Namespace: "bold #0a5a86", - Name.Exception: "bold #b3261e", + Name.Exception: "bold #d10a46", Name.Variable: "#20180f", Name.Constant: "#623c00", - Name.Decorator: "#b0420a", + Name.Decorator: "#c2410c", Name.Attribute: "#0a5a86", - Name.Tag: "bold #0a544c", - String: "#0a544c", + Name.Tag: "bold #09814a", + String: "bold #09814a", String.Doc: "italic #685c4b", - String.Escape: "bold #b0420a", - Number: "bold #7c2560", + String.Escape: "bold #c2410c", + Number: "bold #861657", Generic.Heading: "bold #20180f", Generic.Subheading: "bold #0a5a86", Generic.Deleted: "#b3261e", - Generic.Inserted: "#0a544c", + Generic.Inserted: "#09814a", Generic.Error: "#b3261e", Generic.Emph: "italic", Generic.Strong: "bold", @@ -398,7 +399,71 @@ def _warm_light(): ) -CUSTOM_STYLES = {"warm-light": _warm_light()} +# "warm-dark" is the dark companion to warm-light: the SAME token roles and bold/italic +# treatment, in bright dark-mode colors. It keeps fruity's blue / green / orange family and +# warm-light's magenta numbers; every token is chosen to clear ~5.5:1+ on the dark canvas. +# (Mirrors warm-light's token set, so the two themes feel consistent across light/dark.) +def _warm_dark(): + from pygments.style import Style + from pygments.token import ( + Comment, + Error, + Generic, + Keyword, + Name, + Number, + Operator, + String, + Token, + ) + + return type( + "WarmDarkStyle", + (Style,), + { + "name": "warm-dark", + "background_color": "#292524", + "highlight_color": "#2a2218", + "line_number_color": "inherit", + "line_number_background_color": "transparent", + "styles": { + Token: "#f4efe6", + Comment: "italic #9a8f7c", + Comment.Preproc: "noitalic #ff750f", + Keyword: "bold #ff750f", + Keyword.Type: "nobold #d8a657", + Keyword.Constant: "nobold #f27da4", + Operator: "#c2b9a8", + Operator.Word: "bold #ff750f", + Name.Builtin: "bold #3a96cf", + Name.Function: "bold #3a96cf", + Name.Class: "bold #3a96cf", + Name.Namespace: "bold #3a96cf", + Name.Exception: "bold #e92063", + Name.Variable: "#f4efe6", + Name.Constant: "#d8a657", + Name.Decorator: "#ff750f", + Name.Attribute: "#3a96cf", + Name.Tag: "bold #3fb950", + String: "bold #79b473", + String.Doc: "italic #9a8f7c", + String.Escape: "bold #ff750f", + Number: "bold #c490d1", + Generic.Heading: "bold #f4efe6", + Generic.Subheading: "bold #3a96cf", + Generic.Deleted: "#ff6b6b", + Generic.Inserted: "#3fb950", + Generic.Error: "#ff6b6b", + Generic.Emph: "italic", + Generic.Strong: "bold", + Generic.Prompt: "bold #9a8f7c", + Error: "border:#ff6b6b", + }, + }, + ) + + +CUSTOM_STYLES = {"warm-light": _warm_light(), "warm-dark": _warm_dark()} def _pygments_theme_css(): From c4b616fc942cdb547c9cac4dfe56879bf3388eed Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 9 Jun 2026 12:21:04 -0600 Subject: [PATCH 141/221] better contrast in color theme --- ben-py/docs/conf.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index ddb7f89..3b49817 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -94,6 +94,21 @@ "networkx": ("https://networkx.org/documentation/stable/", None), } +# -- linkcheck --------------------------------------------------------------- +# +# Keep link checking separate from normal HTML builds because it depends on external +# services. CI runs it as its own step so transient failures are easy to diagnose. +linkcheck_timeout = 10 +linkcheck_retries = 2 +linkcheck_anchors = False +linkcheck_ignore = [ + r"http://localhost:\d+/", + r"http://127\.0\.0\.1:\d+/", + # GitHub source/blob URLs are useful in rendered docs but frequently rate-limit + # unauthenticated CI linkcheck runs. + r"https://github\.com/peterrrock2/binary-ensemble/(blob|tree)/.*", +] + # -- HTML output ------------------------------------------------------------- html_theme = "furo" @@ -368,7 +383,7 @@ def _warm_light(): Comment.Preproc: "noitalic #c2410c", Keyword: "bold #c2410c", Keyword.Type: "nobold #623c00", - Keyword.Constant: "nobold #e36397", + Keyword.Constant: "nobold #b8336a", Operator: "#6a4a2a", Operator.Word: "bold #c2410c", Name.Builtin: "bold #08527d", @@ -380,15 +395,15 @@ def _warm_light(): Name.Constant: "#623c00", Name.Decorator: "#c2410c", Name.Attribute: "#0a5a86", - Name.Tag: "bold #09814a", - String: "bold #09814a", + Name.Tag: "bold #0a6d3f", + String: "bold #0a6d3f", String.Doc: "italic #685c4b", String.Escape: "bold #c2410c", Number: "bold #861657", Generic.Heading: "bold #20180f", Generic.Subheading: "bold #0a5a86", Generic.Deleted: "#b3261e", - Generic.Inserted: "#09814a", + Generic.Inserted: "#0a6d3f", Generic.Error: "#b3261e", Generic.Emph: "italic", Generic.Strong: "bold", @@ -439,12 +454,12 @@ def _warm_dark(): Name.Function: "bold #3a96cf", Name.Class: "bold #3a96cf", Name.Namespace: "bold #3a96cf", - Name.Exception: "bold #e92063", + Name.Exception: "bold #ff5d80", Name.Variable: "#f4efe6", Name.Constant: "#d8a657", Name.Decorator: "#ff750f", Name.Attribute: "#3a96cf", - Name.Tag: "bold #3fb950", + Name.Tag: "bold #79b473", String: "bold #79b473", String.Doc: "italic #9a8f7c", String.Escape: "bold #ff750f", @@ -452,7 +467,7 @@ def _warm_dark(): Generic.Heading: "bold #f4efe6", Generic.Subheading: "bold #3a96cf", Generic.Deleted: "#ff6b6b", - Generic.Inserted: "#3fb950", + Generic.Inserted: "#79b473", Generic.Error: "#ff6b6b", Generic.Emph: "italic", Generic.Strong: "bold", From 9dcaee7e5ac9e9d3b1f368055d014ecffa8a954f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 9 Jun 2026 13:14:07 -0600 Subject: [PATCH 142/221] better doc strings for the api section --- ben-py/src/decode/bundle_decoder.rs | 32 ++++++-- ben-py/src/decode/decoder.rs | 8 +- ben-py/src/encode/bundle_encoder.rs | 120 +++++++++++++++++++++++++--- 3 files changed, 141 insertions(+), 19 deletions(-) diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index b1c302c..013862c 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -162,14 +162,18 @@ impl PyBendlDecoder { Ok(slf.into()) } - /// Restrict iteration to a contiguous, half-open range of samples ``[start, end)``. + /// Restrict iteration to a contiguous, 1-indexed inclusive range of samples. /// /// Args: /// start: First sample number to keep (1-indexed, inclusive). - /// end: One past the last sample number to keep (exclusive). + /// end: Last sample number to keep (1-indexed, inclusive). /// /// Returns: /// BendlDecoder: ``self``, for chaining into a ``for`` loop. + /// + /// Example: + /// >>> list(BendlDecoder("ensemble.bendl").subsample_range(10, 15)) + /// # samples 10, 11, 12, 13, 14, and 15 #[pyo3(text_signature = "(self, start, end, /)")] fn subsample_range<'py>( mut slf: PyRefMut<'py, Self>, @@ -205,13 +209,19 @@ impl PyBendlDecoder { // Bundle inspection surface. // ----------------------------------------------------------------- - /// Return the container format of the embedded assignment stream as `"ben"` or `"xben"`. + /// Return the container format of the embedded assignment stream. + /// + /// Returns: + /// str: ``"ben"`` or ``"xben"``. #[pyo3(text_signature = "(self)")] fn assignment_format(&self) -> &'static str { self.cursor.mode().as_str() } - /// Return the bundle's format version as a `(major, minor)` tuple. + /// Return the bundle's format version as a ``(major, minor)`` tuple. + /// + /// Returns: + /// tuple[int, int]: Bundle format version. #[pyo3(text_signature = "(self)")] fn version(&self) -> (u16, u16) { let h = self.reader.header(); @@ -219,12 +229,18 @@ impl PyBendlDecoder { } /// Whether the bundle was successfully finalized. + /// + /// Returns: + /// bool: ``True`` for a complete bundle, ``False`` for a recoverable partial bundle. #[pyo3(text_signature = "(self)")] fn is_complete(&self) -> bool { self.reader.is_finalized() } /// Names of every entry in the bundle's directory, in directory order. + /// + /// Returns: + /// list[str]: Asset names such as ``"graph.json"`` and ``"metadata.json"``. #[pyo3(text_signature = "(self)")] fn asset_names(&self) -> Vec { self.reader @@ -234,8 +250,12 @@ impl PyBendlDecoder { .collect() } - /// Return the full bundle directory as a list of dicts with keys `name`, `type`, `offset`, - /// `len`, and `flags` (a list of string tags). + /// Return the full bundle directory. + /// + /// Returns: + /// list[dict]: Each dict has ``name``, ``type``, ``offset``, ``len``, and ``flags``. + /// ``flags`` is a list of string tags such as ``"json"``, ``"xz"``, and + /// ``"checksum"``. #[pyo3(text_signature = "(self)")] fn list_assets<'py>(&self, py: Python<'py>) -> PyResult>> { let entries = self.reader.assets(); diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 9e2044b..8351967 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -129,14 +129,18 @@ impl PyBenDecoder { Ok(slf.into()) } - /// Restrict iteration to a contiguous, half-open range of samples ``[start, end)``. + /// Restrict iteration to a contiguous, 1-indexed inclusive range of samples. /// /// Args: /// start: First sample number to keep (1-indexed, inclusive). - /// end: One past the last sample number to keep (exclusive). + /// end: Last sample number to keep (1-indexed, inclusive). /// /// Returns: /// BenDecoder: ``self``, for chaining into a ``for`` loop. + /// + /// Example: + /// >>> list(BenDecoder("plans.ben").subsample_range(10, 15)) + /// # samples 10, 11, 12, 13, 14, and 15 #[pyo3(text_signature = "(self, start, end, /)")] fn subsample_range<'py>( mut slf: PyRefMut<'py, Self>, diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index f66bd19..9a63dc8 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -78,6 +78,10 @@ pub struct PyBendlEncoder { impl PyBendlEncoder { /// Open a new bundle writer in create mode. /// + /// A create-mode encoder writes one `.bendl` bundle. Add graph and metadata assets, then + /// open exactly one assignment stream with :meth:`stream`. The stream context finalizes the + /// bundle on a clean close. + /// /// Args: /// file_path: Output path. Must not exist unless ``overwrite=True``. /// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. @@ -85,6 +89,12 @@ impl PyBendlEncoder { /// Raises: /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be /// created. + /// + /// Example: + /// >>> from binary_ensemble import BendlEncoder + /// >>> encoder = BendlEncoder("ensemble.bendl", overwrite=True) + /// >>> with encoder.stream("ben") as stream: + /// ... stream.write([1, 1, 2, 2]) #[new] #[pyo3(signature = (file_path, overwrite = false))] #[pyo3(text_signature = "(file_path, overwrite=False)")] @@ -101,8 +111,25 @@ impl PyBendlEncoder { }) } - /// Open an existing finalized bundle for append. `stream()` is unavailable; `add_*` commit - /// immediately. + /// Open an existing finalized bundle for append. + /// + /// Append mode is for assets only. ``stream()`` is unavailable because a bundle has exactly + /// one assignment stream. Each ``add_*`` operation commits immediately. + /// + /// Args: + /// file_path: Existing finalized ``.bendl`` bundle. + /// + /// Returns: + /// BendlEncoder: An encoder in append mode. + /// + /// Raises: + /// OSError: If the bundle cannot be opened for append. + /// Exception: If the file is not a finalized bundle. + /// + /// Example: + /// >>> encoder = BendlEncoder.append("ensemble.bendl") + /// >>> encoder.add_asset("notes.txt", "reviewed", content_type="text") + /// >>> encoder.close() #[staticmethod] #[pyo3(signature = (file_path))] #[pyo3(text_signature = "(file_path)")] @@ -128,7 +155,20 @@ impl PyBendlEncoder { }) } - /// Add a custom asset (asset type `CUSTOM`). `content_type` is `"json"` or `"text"`. + /// Add a custom asset (asset type ``CUSTOM``). + /// + /// Args: + /// name: Asset name stored in the bundle directory. + /// payload: UTF-8 text or JSON bytes to store. + /// content_type: Either ``"json"`` or ``"text"``. JSON assets are marked so + /// :meth:`binary_ensemble.bundle.BendlDecoder.read_json_asset` can parse them. + /// + /// Raises: + /// ValueError: If ``content_type`` is not ``"json"`` or ``"text"``. + /// Exception: If the encoder is closed, failed, or currently streaming. + /// + /// Example: + /// >>> encoder.add_asset("scores.json", '{"cut_edges": [10]}', content_type="json") #[pyo3(signature = (name, payload, content_type))] #[pyo3(text_signature = "(self, name, payload, content_type)")] fn add_asset(&mut self, name: &str, payload: Vec, content_type: &str) -> PyResult<()> { @@ -144,8 +184,20 @@ impl PyBendlEncoder { Err(state_error(&self.state, "add_asset")) } - /// Add the canonical `metadata.json` known asset. `metadata` accepts the same inputs as a graph - /// (dict/list, bytes, a file-like with `.read()`, or a path). + /// Add the canonical ``metadata.json`` known asset. + /// + /// ``metadata`` accepts a Python ``dict``/``list``, UTF-8 JSON bytes, a file-like object with + /// ``.read()``, or a path to JSON. The decoder returns it with :meth:`read_metadata`. + /// + /// Args: + /// metadata: JSON-compatible metadata payload. + /// + /// Raises: + /// Exception: If the metadata cannot be converted to JSON bytes, or if the encoder is in + /// an invalid state. + /// + /// Example: + /// >>> encoder.add_metadata({"sampler": "ReCom", "seed": 1234}) #[pyo3(signature = (metadata))] #[pyo3(text_signature = "(self, metadata)")] fn add_metadata(&mut self, py: Python<'_>, metadata: Bound<'_, PyAny>) -> PyResult<()> { @@ -163,7 +215,7 @@ impl PyBendlEncoder { Err(state_error(&self.state, "add_metadata")) } - /// Add the `graph.json` known asset. + /// Add the ``graph.json`` known asset and return the graph to use for assignments. /// /// `sort` defaults to `"mlc"`, so by default the graph is reordered for better compression. /// `sort` is `"mlc"` (multi-level clustering), `"rcm"` (reverse Cuthill-McKee), `"key"` to sort @@ -173,6 +225,22 @@ impl PyBendlEncoder { /// `BendlDecoder.read_graph`) so the chain runs on that ordering. Reordering is pre-stream /// only; a raw graph (`sort=None`) may also be attached post-stream / in append mode. The /// returned graph's node count is recorded for per-write validation. + /// + /// Args: + /// graph: NetworkX adjacency JSON as a dict/list, bytes, file-like object, or path. + /// sort: ``"mlc"``, ``"rcm"``, ``"key"``, or ``None``. + /// key: Node attribute used when ``sort="key"``. Use ``"id"`` for node id ordering. + /// + /// Returns: + /// networkx.Graph: The stored graph, after any reordering. + /// + /// Raises: + /// ValueError: If ``sort``/``key`` is invalid. + /// Exception: If a reordering graph is added after the stream has started. + /// + /// Example: + /// >>> stored_graph = encoder.add_graph("graph.json", sort="mlc") + /// >>> write_order = list(stored_graph.nodes) #[pyo3(signature = (graph, sort = Some("mlc".to_string()), key = None))] #[pyo3(text_signature = "(self, graph, sort='mlc', key=None)")] fn add_graph( @@ -233,8 +301,27 @@ impl PyBendlEncoder { Err(state_error(&self.state, "add_graph")) } - /// Open the single-use assignment stream. Only `"ben"` is accepted today; XBEN comes from - /// `bundle.compress_stream`. `variant` selects the BEN variant (default `"twodelta"`). + /// Open the single-use assignment stream. + /// + /// Only ``"ben"`` is accepted today; XBEN bundles are produced by + /// :func:`binary_ensemble.bundle.compress_stream` after writing. ``variant`` selects the BEN + /// variant and defaults to ``"twodelta"``. + /// + /// Args: + /// format: Stream format, currently only ``"ben"``. + /// variant: BEN variant: ``"standard"``, ``"mkv_chain"``, ``"twodelta"``, or ``None``. + /// + /// Returns: + /// BendlStreamSession: Context manager whose :meth:`write` method accepts assignments. + /// + /// Raises: + /// ValueError: If ``format`` or ``variant`` is invalid. + /// Exception: If a stream has already been written, append mode is active, or the encoder + /// is closed/failed. + /// + /// Example: + /// >>> with encoder.stream("ben", variant="twodelta") as stream: + /// ... stream.write([1, 1, 2, 2]) #[pyo3(signature = (format = "ben", variant = None))] #[pyo3(text_signature = "(self, format='ben', variant=None)")] fn stream( @@ -304,9 +391,12 @@ impl PyBendlEncoder { } } - /// Finalize the bundle. Idempotent. In create mode a normal close (including before any - /// `stream()`) finalizes the bundle; after a failed stream it does not finalize. In append mode - /// it is a no-op after the already-committed appends. + /// Finalize or close the bundle. Idempotent. + /// + /// In create mode, closing before any stream creates a finalized assets-only bundle. The + /// stream context normally finalizes the bundle for you. After a failed stream, ``close()`` + /// does not stamp the partial bundle as complete. In append mode, asset writes have already + /// committed and ``close()`` is a no-op. fn close(&mut self) -> PyResult<()> { match &self.state { // The session owns the writer and finalizes on its own close. @@ -408,10 +498,16 @@ impl PyBendlStreamSession { /// assignment: The plan as a ``list[int]`` of district ids, one per node in /// dual-graph node order. /// + /// Returns: + /// None. + /// /// Raises: /// ValueError: If the bundle carries a pre-stream graph and the assignment length does /// not equal the graph's node count. /// OSError: If the session is already closed, or the write fails. + /// + /// Example: + /// >>> stream.write([1, 1, 2, 2]) #[pyo3(signature = (assignment))] #[pyo3(text_signature = "(self, assignment)")] fn write(&mut self, assignment: Vec) -> PyResult<()> { @@ -433,6 +529,8 @@ impl PyBendlStreamSession { } /// Finalize the bundle and close the stream. Idempotent. + /// + /// You usually do not call this directly; leaving the stream ``with`` block cleanly calls it. fn close(&mut self, py: Python<'_>) -> PyResult<()> { let Some(writer) = self.writer.take() else { return Ok(()); From ec3c1f7947d0f384d1c4b4516588cd05082069fa Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Tue, 9 Jun 2026 16:09:42 -0600 Subject: [PATCH 143/221] more docs!! --- .github/workflows/docs.yml | 3 + README.md | 4 +- Taskfile.yml | 6 ++ ben-py/docs/concepts/cli-parity.md | 2 +- ben-py/docs/concepts/compatibility.md | 6 +- ben-py/docs/concepts/compression.md | 2 +- ben-py/docs/concepts/formats.md | 12 +-- ben-py/docs/concepts/limitations.md | 6 +- ben-py/docs/concepts/ordering-deep-dive.md | 5 +- ben-py/docs/concepts/performance.md | 2 +- ben-py/docs/concepts/release-versioning.md | 72 ++++++++++++++++++ ben-py/docs/concepts/vocabulary.md | 23 +++++- ben-py/docs/getting-started/installation.md | 4 +- ben-py/docs/how-to/anti-patterns.md | 74 +++++++++++++++++++ ben-py/docs/how-to/compress-gerrychain-run.md | 5 +- ben-py/docs/how-to/end-to-end-workflow.md | 4 +- ben-py/docs/how-to/error-reference.md | 3 +- ben-py/docs/how-to/index.md | 9 ++- ben-py/docs/how-to/shrink-for-sharing.md | 6 +- ben-py/docs/index.md | 15 +++- ben-py/docs/user/using_bendl.ipynb | 8 +- 21 files changed, 233 insertions(+), 38 deletions(-) create mode 100644 ben-py/docs/concepts/release-versioning.md create mode 100644 ben-py/docs/how-to/anti-patterns.md diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3836188..99253eb 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -39,6 +39,9 @@ jobs: - name: Test documentation code snippets run: uv run --with pytest pytest tests/test_docs_snippets.py -q + - name: Check documentation links + run: uv run sphinx-build -b linkcheck docs docs/_build/linkcheck + - name: Upload built site uses: actions/upload-artifact@v4 with: diff --git a/README.md b/README.md index 8a55931..dfc9cb8 100755 --- a/README.md +++ b/README.md @@ -14,10 +14,10 @@ and compress them into pure binary files. ## Usage -You may install the binary-ensemble package from the cargo package manager using +From a checkout, install the command-line tools with Cargo using ``` -cargo install binary-ensemble +cargo install --path ben ``` [Here](./example/small_example.jsonl) is a link to a small example file that you can use to see what diff --git a/Taskfile.yml b/Taskfile.yml index df0a8dc..72b2a90 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -362,6 +362,12 @@ tasks: cmds: - uv run pytest tests/test_docs_snippets.py + docs-linkcheck: + desc: Check external links in the Python docs + dir: ben-py + cmds: + - uv run --extra docs sphinx-build -b linkcheck docs docs/_build/linkcheck + clean-linux: &clean-unix desc: Clean build artifacts internal: true diff --git a/ben-py/docs/concepts/cli-parity.md b/ben-py/docs/concepts/cli-parity.md index 4f798e8..ec66881 100644 --- a/ben-py/docs/concepts/cli-parity.md +++ b/ben-py/docs/concepts/cli-parity.md @@ -18,7 +18,7 @@ the CLI split so workflows can move between notebooks, scripts, and shell pipeli | Extract a bundle stream | `BendlDecoder(...).extract_stream(...)` | Copies embedded BEN/XBEN stream bytes | | Append bundle assets | `BendlEncoder.append(...)` | Asset appends only; no stream appends | | Relabel/reorder a bundle | `relabel_bundle(...)` | Requires BEN stream plus graph | -| Recompress bundle stream | `compress_stream(...)` | BEN bundle to XBEN bundle | +| Recompress bundle stream | `compress_stream(...)` | Embedded BEN stream to embedded XBEN stream | | Reorder a graph | `binary_ensemble.graph.reorder(...)` | Same orderings as bundle relabeling | ## Plain stream conversion diff --git a/ben-py/docs/concepts/compatibility.md b/ben-py/docs/concepts/compatibility.md index 7599573..fdb7284 100644 --- a/ben-py/docs/concepts/compatibility.md +++ b/ben-py/docs/concepts/compatibility.md @@ -3,6 +3,9 @@ This page covers what is stable at the Python package boundary and what belongs to the underlying binary format. +For release-boundary guidance and what to include in compatibility reports, see +[Release and versioning](release-versioning.md). + ## Python package compatibility `binary-ensemble` requires Python 3.11 or newer and depends on NetworkX at runtime. @@ -39,7 +42,7 @@ implementation detail behind the public modules. ## File-format stability The byte-level format stability policy lives in the repository-level -[format stability document](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/format-stability.md). +[format stability document](https://github.com/peterrrock2/binary-ensemble/blob/1.0.0/docs/format-stability.md). At the Python level, the important rule is simpler: readers auto-detect stream variants, and bundle readers expose the bundle version through `BendlDecoder.version()`. @@ -96,4 +99,3 @@ encoder.add_metadata( with encoder.stream("ben") as stream: stream.write([1, 1, 2, 2]) ``` - diff --git a/ben-py/docs/concepts/compression.md b/ben-py/docs/concepts/compression.md index 77a73ee..6e63c9b 100644 --- a/ben-py/docs/concepts/compression.md +++ b/ben-py/docs/concepts/compression.md @@ -125,7 +125,7 @@ equivalent plans encode identically and compress better. Run it before encoding The recommended pipeline for a small, shareable archive is: -1. Build a BEN bundle while sampling (ideally on an already-reordered graph). +1. Build a `.bendl` bundle with a BEN stream while sampling (ideally on an already-reordered graph). 2. **Relabel and reorder** the bundle to maximize run length and cross-plan repetition. 3. **Recompress** the bundle's stream to XBEN. diff --git a/ben-py/docs/concepts/formats.md b/ben-py/docs/concepts/formats.md index fc3b077..7ca7e48 100644 --- a/ben-py/docs/concepts/formats.md +++ b/ben-py/docs/concepts/formats.md @@ -41,7 +41,7 @@ plans — no separate graph file to track down, no chance of pairing the wrong o the bundle is the recommended default. A bundle can wrap *either* a BEN stream (the working form) or an XBEN stream (the compressed -form). You typically build a BEN bundle while sampling, then +form). You typically build a `.bendl` bundle with a BEN stream while sampling, then [recompress it to XBEN](../how-to/shrink-for-sharing.md) for distribution. ## Choosing a format @@ -123,8 +123,8 @@ the embedded assignment stream, then a directory table at the end: metadata, and any custom assets — by offset and length, each with its own CRC32C. A reader can pull out just the graph without scanning the file, and verify it before trusting it. - The **assignment stream** is stored opaquely: the bundle never parses BEN/XBEN internals, it just - carries the bytes and notes the format. That's what lets you swap a BEN bundle for an XBEN one by - recompressing only the inner stream. + carries the bytes and notes the format. That's what lets you replace the embedded BEN stream + with an embedded XBEN stream by recompressing only the inner stream. The writer lays the file down in order — a provisional header marked *unfinalized*, then assets, then the stream, then the directory — and **patches the header last** to flip it to finalized and @@ -137,6 +137,6 @@ that final header patch is the single commit point. The exact byte layouts are documented in the format specifications, for readers building interoperating tools: -- [BEN / XBEN stream format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/ben-format-spec.md) -- [TwoDelta variant format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/twodelta-format-spec.md) -- [BENDL bundle format](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/bendl-format-spec.md) +- [BEN / XBEN stream format](https://github.com/peterrrock2/binary-ensemble/blob/1.0.0/docs/ben-format-spec.md) +- [TwoDelta variant format](https://github.com/peterrrock2/binary-ensemble/blob/1.0.0/docs/twodelta-format-spec.md) +- [BENDL bundle format](https://github.com/peterrrock2/binary-ensemble/blob/1.0.0/docs/bendl-format-spec.md) diff --git a/ben-py/docs/concepts/limitations.md b/ben-py/docs/concepts/limitations.md index fa20c35..81210cb 100644 --- a/ben-py/docs/concepts/limitations.md +++ b/ben-py/docs/concepts/limitations.md @@ -4,6 +4,8 @@ This page is intentionally blunt. `binary-ensemble` is designed for a specific d large streams of district-assignment vectors over a fixed dual graph. It is very good at that job, but it does not try to be a general geospatial archive format. +For concrete examples of what not to do, see [Anti-patterns](../how-to/anti-patterns.md). + ## Assignment-only streams Plain `.ben` and `.xben` files store only assignment streams. They do not store: @@ -55,8 +57,8 @@ subsampling; recompress to XBEN once the file is ready to share. ## Relabel before XBEN -`relabel_bundle()` expects a BEN bundle with an embedded graph. Run it before -`compress_stream()`. +`relabel_bundle()` expects a `.bendl` bundle with an embedded BEN stream and graph. Run it +before `compress_stream()`. ```python from binary_ensemble import compress_stream, relabel_bundle diff --git a/ben-py/docs/concepts/ordering-deep-dive.md b/ben-py/docs/concepts/ordering-deep-dive.md index 701047a..2050736 100644 --- a/ben-py/docs/concepts/ordering-deep-dive.md +++ b/ben-py/docs/concepts/ordering-deep-dive.md @@ -67,8 +67,9 @@ assert len(write_order) == 4 ## Use an ordering after a bundle already exists -If you already have a BEN bundle with a graph, use `relabel_bundle()`. It reorders the graph, -rewrites every assignment into that new order, and stores a fresh permutation map. +If you already have a `.bendl` bundle with a BEN stream and a graph, use `relabel_bundle()`. +It reorders the graph, rewrites every assignment into that new order, and stores a fresh +permutation map. ```python from binary_ensemble import relabel_bundle diff --git a/ben-py/docs/concepts/performance.md b/ben-py/docs/concepts/performance.md index 3ae7139..d83e554 100644 --- a/ben-py/docs/concepts/performance.md +++ b/ben-py/docs/concepts/performance.md @@ -104,7 +104,7 @@ can still skip through the decoded stream efficiently. For serious runs: 1. Reorder the graph before or during bundle creation. -2. Write a BEN bundle while sampling. +2. Write a `.bendl` bundle with a BEN stream while sampling. 3. Attach metadata, graph, and provenance assets. 4. Use BEN for quality checks and analysis. 5. Relabel/reorder the final bundle if needed. diff --git a/ben-py/docs/concepts/release-versioning.md b/ben-py/docs/concepts/release-versioning.md new file mode 100644 index 0000000..ecee41e --- /dev/null +++ b/ben-py/docs/concepts/release-versioning.md @@ -0,0 +1,72 @@ +# Release and versioning + +This page describes the promises users should rely on at release boundaries. + +## Python package versions + +The Python package version is the normal package version installed by `pip`. Use it to record +which Python bindings wrote or read a bundle. + +```python +from importlib import metadata + +version = metadata.version("binary-ensemble") +print(version) +``` + +For reproducible runs, store that value in bundle metadata alongside sampler settings, graph +source, and random seed. + +## Public API compatibility + +The supported Python API is: + +- `binary_ensemble.bundle` +- `binary_ensemble.stream` +- `binary_ensemble.codec` +- `binary_ensemble.graph` +- the same symbols re-exported from top-level `binary_ensemble` + +`binary_ensemble._core` is an implementation detail. It may change to support the public +wrappers. + +## File format compatibility + +BEN, XBEN, and BENDL format stability is governed by the repository-level +[format stability document](https://github.com/peterrrock2/binary-ensemble/blob/1.0.0/docs/format-stability.md). +At the Python level, readers expose the bundle version and assignment stream format: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.version()) +print(decoder.assignment_format()) +``` + +## How to report a compatibility problem + +When a file does not open, report: + +- the package version, +- the file type (`.ben`, `.xben`, or `.bendl`), +- the bundle version from `BendlDecoder.version()` if it opens far enough, +- whether `BendlDecoder.is_complete()` is true, +- the exact exception text, +- whether the file was produced by Python API or CLI. + +If the file can be shared, include the smallest reproducing file. If it cannot be shared, +include the output of: + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +print(decoder.version()) +print(decoder.assignment_format()) +print(decoder.is_complete()) +print(decoder.list_assets()) +``` + +Do not include confidential geography or plans in public issues unless they are already +public data. diff --git a/ben-py/docs/concepts/vocabulary.md b/ben-py/docs/concepts/vocabulary.md index 763d481..3a0d290 100644 --- a/ben-py/docs/concepts/vocabulary.md +++ b/ben-py/docs/concepts/vocabulary.md @@ -1,8 +1,7 @@ # Vocabulary -These are the core terms used throughout the docs and the API. They come from the project's -[glossary](https://github.com/peterrrock2/binary-ensemble/blob/main/docs/glossary.md), which -is the source of truth for the whole workspace. +These are the core terms used throughout the docs and the API. They align with the project's +repository-level glossary. ## Dual graph @@ -72,3 +71,21 @@ you never specify a variant when reading. See [Encoding variants](variants.md). - **Chain** — specifically an MCMC method, where the Markov property matters. Use *sampler* unless you specifically mean a Markov chain. + +## Preferred wording + +Use these terms consistently in docs, examples, and user-facing messages. + +| Prefer | Avoid | Reason | +|---|---|---| +| `.bendl` bundle | `xben bundle`, `BEN-DL file` | The container is BENDL; the embedded stream may be BEN or XBEN. | +| assignment stream | plan stream, map stream | The bytes store assignment vectors, not geometries or rendered maps. | +| assignment | encoded plan, vector plan | An assignment is the concrete `list[int]` representation of a plan. | +| sample | step, row | A sample is one decoded assignment in an ensemble. | +| graph order or node order | file order, JSON order | The order is the positional contract between graph and assignments. | +| reorder | relabel, sort labels | Reordering changes node positions. | +| district relabeling | reordering districts | Relabeling changes district ids, not node positions. | +| `run-archive.bendl` | `run.xben.bendl` | Bundle filenames should have one `.bendl` extension. | + +When a page needs to mention both node reordering and district relabeling, name both +explicitly. They are different compression levers. diff --git a/ben-py/docs/getting-started/installation.md b/ben-py/docs/getting-started/installation.md index 7a93043..69f18c2 100644 --- a/ben-py/docs/getting-started/installation.md +++ b/ben-py/docs/getting-started/installation.md @@ -51,10 +51,10 @@ maturin develop --release # builds the extension and installs it editable ## Command-line tools This Python package wraps the same engine as the project's CLI tools (`ben`, `reben`, -`bendl`, `pcben`), which are distributed through Cargo: +`bendl`, `pcben`). From a checkout, build or install those tools with Cargo: ```bash -cargo install binary-ensemble +cargo install --path ben ``` The Python API mirrors the CLI's structure — see [The API map](../concepts/api-map.md). diff --git a/ben-py/docs/how-to/anti-patterns.md b/ben-py/docs/how-to/anti-patterns.md new file mode 100644 index 0000000..eff3488 --- /dev/null +++ b/ben-py/docs/how-to/anti-patterns.md @@ -0,0 +1,74 @@ +# Anti-patterns + +These are valid-looking patterns that produce bad workflows, confusing files, or silently +wrong analysis. + +## Writing assignments in the wrong graph order + +An assignment vector has no geographic meaning by itself. It only means something with +respect to the graph order. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +assignment = next(iter(decoder)) + +assert graph is not None +assert len(assignment) == graph.number_of_nodes() +``` + +That length check is necessary but not sufficient. The sampler still must write assignments +in `list(graph.nodes)` order. + +## Reordering the graph after writing assignments + +Do not sort or relabel a graph file by hand after encoding a stream. If the graph order +changes, every assignment position must be rewritten too. Use `relabel_bundle()` for that. + +```python +from binary_ensemble import relabel_bundle + +relabel_bundle("ensemble.bendl", out_file="ensemble-sorted.bendl", sort="mlc") +``` + +## Using XBEN as the working format + +XBEN is for archive and transfer. It is small, but compression is expensive and repeated +inspection pays decompression startup costs. Work in BEN or a BEN-backed `.bendl` bundle, +then recompress once. + +## Shipping a plain stream without its graph + +Plain `.ben` and `.xben` files do not carry graph or metadata. That is fine for internal +pipelines where the graph is guaranteed, but it is fragile for collaboration. Prefer `.bendl` +for anything shared. + +## Repeated bundle extensions + +Do not name bundles `run.xben.bendl`, `run.sorted.bendl`, or `run.archive.bendl`. A bundle is +a bundle regardless of the embedded stream. Use one `.bendl` extension and put state in the +basename: + +| Avoid | Prefer | +|---|---| +| `run.xben.bendl` | `run-archive.bendl` | +| `run.sorted.bendl` | `run-sorted.bendl` | +| `run.relabeled.bendl` | `run-relabeled.bendl` | + +Plain streams should still use `.ben` and `.xben`. + +## Appending samples to a finalized bundle + +Append mode is for assets only. A `.bendl` bundle has one assignment stream. To add more +samples, write a new bundle. + +```python +from binary_ensemble import BendlEncoder + +encoder = BendlEncoder.append("ensemble.bendl") +encoder.add_asset("review-notes.txt", "Asset append only.", content_type="text") +encoder.close() +``` + diff --git a/ben-py/docs/how-to/compress-gerrychain-run.md b/ben-py/docs/how-to/compress-gerrychain-run.md index ab52915..22a6d6b 100644 --- a/ben-py/docs/how-to/compress-gerrychain-run.md +++ b/ben-py/docs/how-to/compress-gerrychain-run.md @@ -84,8 +84,9 @@ file. To read it back, see [Read and iterate an ensemble](read-and-iterate.md). ## Why this is better than reordering later -You *can* write a raw-order BEN bundle and later call `relabel_bundle()` to reorder the graph -and rewrite the stream. But when you control the sampling code, it is cleaner to reorder first: +You *can* write a raw-order `.bendl` bundle with a BEN stream and later call +`relabel_bundle()` to reorder the graph and rewrite the stream. But when you control the +sampling code, it is cleaner to reorder first: 1. `add_graph(..., sort="mlc")` stores the reordered graph and permutation map. 2. `Graph.from_networkx(mlc_graph)` makes GerryChain run on that exact graph. diff --git a/ben-py/docs/how-to/end-to-end-workflow.md b/ben-py/docs/how-to/end-to-end-workflow.md index 7e25197..ea5c4e4 100644 --- a/ben-py/docs/how-to/end-to-end-workflow.md +++ b/ben-py/docs/how-to/end-to-end-workflow.md @@ -3,7 +3,7 @@ This tutorial follows the recommended lifecycle: 1. prepare a graph, -2. write a BEN bundle while producing assignments, +2. write a `.bendl` bundle with a BEN stream while producing assignments, 3. inspect and analyze the bundle, 4. add provenance, 5. relabel and recompress for sharing. @@ -27,7 +27,7 @@ for node in dual_graph.nodes: adjacency = nx.adjacency_data(dual_graph) ``` -## Write the working BEN bundle +## Write the working bundle `add_graph()` returns the graph in the order assignments should use. In this toy example the assignment generator already uses integer node positions, so we only need the node count. diff --git a/ben-py/docs/how-to/error-reference.md b/ben-py/docs/how-to/error-reference.md index 92949ae..936a7e3 100644 --- a/ben-py/docs/how-to/error-reference.md +++ b/ben-py/docs/how-to/error-reference.md @@ -70,7 +70,8 @@ with encoder.stream("ben") as stream: ## Relabeling fails after XBEN recompression -**Cause:** `relabel_bundle()` works on BEN bundles. XBEN is the final archive step. +**Cause:** `relabel_bundle()` works on `.bendl` bundles with embedded BEN streams. XBEN is the +final archive step. **Fix:** relabel first, then recompress. diff --git a/ben-py/docs/how-to/index.md b/ben-py/docs/how-to/index.md index 3b80808..2ba6092 100644 --- a/ben-py/docs/how-to/index.md +++ b/ben-py/docs/how-to/index.md @@ -61,7 +61,7 @@ encode_ben_to_xben("chain.ben", "chain.xben", overwrite=True) :link: end-to-end-workflow :link-type: doc -Build a working BEN bundle, inspect it, attach provenance, and archive it as XBEN. +Build a working `.bendl` bundle, inspect it, attach provenance, and archive it with XBEN. ::: :::{grid-item-card} API cookbook @@ -78,6 +78,13 @@ Copy focused snippets for the most common Python API tasks. Small standalone patterns for minimal bundles, rich bundles, conversion, subsampling, and archival. ::: +:::{grid-item-card} Anti-patterns +:link: anti-patterns +:link-type: doc + +Avoid node-order mistakes, repeated bundle extensions, wrong working formats, and fragile sharing. +::: + :::{grid-item-card} Compress a GerryChain run :link: compress-gerrychain-run :link-type: doc diff --git a/ben-py/docs/how-to/shrink-for-sharing.md b/ben-py/docs/how-to/shrink-for-sharing.md index 3887bbf..d3fe2b7 100644 --- a/ben-py/docs/how-to/shrink-for-sharing.md +++ b/ben-py/docs/how-to/shrink-for-sharing.md @@ -1,8 +1,8 @@ # Shrink a bundle for sharing -A bundle you build while sampling is usually a BEN bundle in the graph's original node order — -convenient, but not as small as it could be. Before handing it to a collaborator or archiving -it, two steps get it to its smallest form: +A `.bendl` bundle you build while sampling usually has an embedded BEN stream in the graph's +original node order — convenient, but not as small as it could be. Before handing it to a +collaborator or archiving it, two steps get it to its smallest form: 1. **Relabel and reorder** so assignments form long runs and equivalent plans encode identically. diff --git a/ben-py/docs/index.md b/ben-py/docs/index.md index cbd9e1f..6652a10 100644 --- a/ben-py/docs/index.md +++ b/ben-py/docs/index.md @@ -28,7 +28,7 @@ toolkit built for exactly this data: it turns those JSONL mountains into compact files you can store, share, and stream sample-by-sample without unpacking the whole thing. `binary-ensemble` is the Python interface to the -[binary-ensemble](https://crates.io/crates/binary-ensemble) Rust library. +[binary-ensemble Rust crate](https://github.com/peterrrock2/binary-ensemble/tree/1.0.0/ben). ```{admonition} How much smaller? :class: tip @@ -98,6 +98,13 @@ shrink a bundle for sharing, diagnose errors, and copy cookbook patterns. Every public class and function in `binary_ensemble`, organized by module. ::: +:::{grid-item-card} {octicon}`mortar-board` Tutorial notebooks +:link: user/using_bendl +:link-type: doc + +Executable notebooks with rendered outputs. CI runs them end to end against the live API. +::: + :::: ```{toctree} @@ -125,6 +132,7 @@ concepts/api-map concepts/cli-parity concepts/limitations concepts/compatibility +concepts/release-versioning ``` ```{toctree} @@ -135,6 +143,7 @@ how-to/index how-to/end-to-end-workflow how-to/api-cookbook how-to/examples-gallery +how-to/anti-patterns how-to/compress-gerrychain-run how-to/read-and-iterate how-to/subsample @@ -164,7 +173,7 @@ api/index :hidden: :caption: Project -format stability -Rust crate +format stability +Rust crate source GitHub ``` diff --git a/ben-py/docs/user/using_bendl.ipynb b/ben-py/docs/user/using_bendl.ipynb index 2a4a00a..d648bd9 100644 --- a/ben-py/docs/user/using_bendl.ipynb +++ b/ben-py/docs/user/using_bendl.ipynb @@ -1156,7 +1156,7 @@ "metadata": {}, "source": [ "(Passing both `in_place=True` and `out_file=`, or neither, raises — the choice\n", - "is exclusive. Note XBEN bundles emit a one-time startup warning on decode,\n", + "is exclusive. Note `.bendl` bundles with XBEN streams emit a one-time startup warning on decode,\n", "since opening them does real decompression work.)" ] }, @@ -1236,15 +1236,15 @@ " `sort=\"rcm\"`, `sort=\"key\", key=\"GEOID\"`, or `sort=None` for raw), then build\n", " the chain on the returned graph — you get a compression win *and* a write order\n", " that already matches the stored graph.\n", - "- **`relabel_bundle`** to reorder an *existing* BEN bundle and rewrite its stream\n", + "- **`relabel_bundle`** to reorder an existing `.bendl` bundle with a BEN stream and rewrite its stream\n", " to match (in place or to a new file) — e.g. to optimize a bundle you received\n", " raw, before archiving it.\n", "- **`binary_ensemble.graph.reorder*`** when you want the reordering standalone\n", " (e.g. to reuse an ordering across several bundles).\n", "- **`add_metadata` / `add_asset`** to stamp provenance and ship analysis\n", " alongside the plans; **`append`** to add results to a finished bundle.\n", - "- **`compress_stream`** to graduate an active BEN bundle to an archival XBEN\n", - " one without losing any asset.\n", + "- **`compress_stream`** to graduate an active `.bendl` bundle from an embedded BEN stream to an embedded XBEN\n", + " stream without losing any asset.\n", "- Drop to the plain **`binary_ensemble.stream`** API (via `extract_stream`)\n", " only when you specifically need the bare stream and are tracking the graph\n", " and node order yourself.\n", From 3d20a76e71265f28b6c4685f9220a68bebff2f66 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:06:05 -0600 Subject: [PATCH 144/221] reroute old methods through the ben stream reader --- ben/src/codec/decode/ben32.rs | 48 +------ ben/src/codec/decode/jsonl.rs | 111 ++------------- ben/src/codec/decode/mod.rs | 2 +- ben/src/codec/decode/tests/mkvchain.rs | 189 +++++++++++++++---------- ben/src/codec/decode/tests/standard.rs | 136 ++++++++++-------- ben/src/codec/decode/xz.rs | 121 ++++++---------- ben/src/codec/translate/mod.rs | 82 ++++++++--- ben/src/codec/translate/tests.rs | 38 +++++ 8 files changed, 349 insertions(+), 378 deletions(-) diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs index 86f74c5..fd9e13c 100644 --- a/ben/src/codec/decode/ben32.rs +++ b/ben/src/codec/decode/ben32.rs @@ -1,7 +1,6 @@ use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; -use serde_json::json; -use std::io::{self, BufRead, Write}; +use std::io::{self, BufRead}; /// Decode a single ben32 frame into an assignment vector and repetition count. /// @@ -52,48 +51,3 @@ pub(crate) fn decode_ben32_line( Ok((output_vec, count)) } - -/// Decode a ben32 stream into JSONL assignment records. -/// -/// # Arguments -/// -/// * `reader` - The ben32 input stream. -/// * `writer` - The destination for the JSONL output. -/// * `starting_sample` - The 0-based sample offset that should be added to the emitted sample -/// numbers. -/// * `variant` - The BEN variant used to interpret repetition counts. -/// -/// # Returns -/// -/// Returns `Ok(())` after the ben32 stream has been fully decoded. -pub(crate) fn jsonl_decode_ben32( - mut reader: R, - mut writer: W, - starting_sample: usize, - variant: BenVariant, -) -> io::Result<()> { - let mut sample_number = 1; - loop { - let result = decode_ben32_line(&mut reader, variant); - if let Err(e) = result { - if e.kind() == io::ErrorKind::UnexpectedEof { - return Ok(()); - } - return Err(e); - } - - let (output_vec, count) = result.unwrap(); - - for _ in 0..count { - let line = json!({ - "assignment": output_vec, - "sample": sample_number + starting_sample, - }) - .to_string() - + "\n"; - - writer.write_all(line.as_bytes())?; - sample_number += 1; - } - } -} diff --git a/ben/src/codec/decode/jsonl.rs b/ben/src/codec/decode/jsonl.rs index 2a7e25d..84f7ca7 100644 --- a/ben/src/codec/decode/jsonl.rs +++ b/ben/src/codec/decode/jsonl.rs @@ -1,12 +1,5 @@ -use crate::codec::decode::jsonl_decode_ben32; -use crate::format::banners::{variant_from_banner, BANNER_LEN}; -use crate::format::FormatError; use crate::io::reader::BenStreamReader; -use crate::progress::Spinner; -use crate::BenVariant; -use serde_json::json; -use std::io::{self, BufRead, BufReader, Read, Write}; -use xz2::read::XzDecoder; +use std::io::{self, BufRead, Read, Write}; /// Decode a BEN stream into JSONL assignment records. /// @@ -36,97 +29,14 @@ pub fn decode_ben_to_jsonl(reader: R, writer: W) -> io::Resul /// # Returns /// /// Returns `Ok(())` after the XBEN stream has been fully decoded into JSONL. -pub fn decode_xben_to_jsonl(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - - let mut first_buffer = [0u8; BANNER_LEN]; - - decoder.read_exact(&mut first_buffer)?; - - let variant = match variant_from_banner(&first_buffer) { - Some(BenVariant::Standard) => BenVariant::Standard, - Some(BenVariant::MkvChain) => BenVariant::MkvChain, - Some(BenVariant::TwoDelta) => { - let mut xben = BenStreamReader::from_xben_decompressed( - BufReader::new(decoder), - BenVariant::TwoDelta, - ); - let mut sample_number = 1usize; - let spinner = Spinner::new("Decoding sample"); - for record in &mut xben { - let (assignment, count) = record?; - for _ in 0..count { - spinner.set_count(sample_number as u64); - let line = json!({ - "assignment": assignment, - "sample": sample_number, - }) - .to_string() - + "\n"; - writer.write_all(line.as_bytes())?; - sample_number += 1; - } - } - return Ok(()); - } - None => { - return Err(io::Error::from(FormatError::UnknownBanner { - actual: first_buffer.to_vec(), - })); - } - }; - - let mut buffer = [0u8; 1 << 20]; - let mut overflow: Vec = Vec::new(); - - let mut line_count: usize = 0; - let mut starting_sample: usize = 0; - let spinner = Spinner::new("Decoding sample"); - loop { - let count = decoder.read(&mut buffer)?; - if count == 0 { - break; - } - - overflow.extend(&buffer[..count]); - - let mut last_valid_assignment = 0; - - // TwoDelta was dispatched before this loop and returned early. - if variant == BenVariant::Standard { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - spinner.set_count(line_count as u64); - } - } - } else { - for i in (last_valid_assignment + 3..overflow.len().saturating_sub(2)).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - spinner.set_count(line_count as u64); - } - } - } - - if last_valid_assignment == 0 { - continue; - } - - jsonl_decode_ben32( - &overflow[0..last_valid_assignment], - &mut writer, - starting_sample, - variant, - )?; - overflow.drain(..last_valid_assignment); - starting_sample = line_count; - } - Ok(()) +/// +/// # Errors +/// +/// Surfaces an error (rather than a truncated result) if the decompressed stream ends partway +/// through a frame, declares a zero repetition count, or carries an unknown banner. +pub fn decode_xben_to_jsonl(reader: R, writer: W) -> io::Result<()> { + let mut xben_decoder = BenStreamReader::from_xben(reader).map_err(io::Error::from)?; + xben_decoder.write_all_jsonl(writer) } #[cfg(test)] @@ -154,8 +64,7 @@ mod tests { .unwrap(); // Use a read-only File as the writer — writing to it fails with a permission error, which - // propagates through the jsonl_decode_ben32 call at line 128 of this file. No custom Write - // impl needed. + // propagates through the write_all_jsonl call. No custom Write impl needed. let nonce = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap() diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index 4b64f12..1f4536d 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -10,7 +10,7 @@ mod twodelta; mod xz; pub use ben::decode_ben_line; -pub(crate) use ben32::{decode_ben32_line, jsonl_decode_ben32}; +pub(crate) use ben32::decode_ben32_line; pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub use path::{ decode_ben_to_jsonl_path, decode_xben_to_ben_path, decode_xben_to_jsonl_path, diff --git a/ben/src/codec/decode/tests/mkvchain.rs b/ben/src/codec/decode/tests/mkvchain.rs index 881b80e..bd43cb8 100644 --- a/ben/src/codec/decode/tests/mkvchain.rs +++ b/ben/src/codec/decode/tests/mkvchain.rs @@ -3,14 +3,28 @@ // layout under test. #![allow(clippy::unusual_byte_groupings)] -use crate::codec::decode::jsonl_decode_ben32; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use crate::codec::encode::{encode_ben_to_xben, xz_compress}; use crate::util::rle::rle_to_vec; -use crate::BenVariant; use serde_json::{json, Value}; use std::io::{self, BufReader}; +/// Wrap a raw MkvChain ben32 body in its banner and xz-compress it into a complete XBEN stream. +fn mkv_xben_from_ben32_body(body: &[u8]) -> Vec { + let mut inner = b"MKVCHAIN BEN FILE".to_vec(); + inner.extend_from_slice(body); + let mut xz = Vec::new(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); + xz +} + // The bit-packed payload for assignment [(1,4),(2,1),(3,3)] = [1,1,1,1,2,3,3,3]. // max_val_bit_count=2, max_len_bit_count=3, n_bytes=2: // bits 00-04: 01100 → val=01=1, len=100=4 @@ -110,67 +124,56 @@ fn decode_ben_to_jsonl_empty_stream_produces_no_output() { assert!(out.is_empty()); } -// ─── jsonl_decode_ben32 ──────────────────────────────────────────────── +// ─── decode_xben_to_jsonl — byte-level ben32 bodies ─────────────────── #[test] -fn jsonl_decode_ben32_mkvchain_count_one() { +fn decode_xben_to_jsonl_ben32_mkvchain_count_one() { // ben32: [(1,4),(2,1),(3,3)] + terminator + count=1 - let input: Vec = vec![ + let input = mkv_xben_from_ben32_body(&[ 0, 1, 0, 4, // (1, 4) 0, 2, 0, 1, // (2, 1) 0, 3, 0, 3, // (3, 3) 0, 0, 0, 0, // terminator 0, 1, // count = 1 - ]; + ]); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let assign = rle_to_vec(vec![(1u16, 4), (2, 1), (3, 3)]); assert_eq!(out, expected_line(&assign, 1).as_bytes()); } #[test] -fn jsonl_decode_ben32_mkvchain_count_five_expands_correctly() { +fn decode_xben_to_jsonl_ben32_mkvchain_count_five_expands_correctly() { // Single record with count=5 → 5 lines - let mut input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; - input.extend_from_slice(&5u16.to_be_bytes()); + let mut body: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; + body.extend_from_slice(&5u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let expected: String = (1..=5).map(|i| expected_line(&[23], i)).collect(); assert_eq!(out, expected.as_bytes()); } #[test] -fn jsonl_decode_ben32_mkvchain_two_records_correct_sample_numbers() { +fn decode_xben_to_jsonl_ben32_mkvchain_two_records_correct_sample_numbers() { // Record 1: [23] count=2 → samples 1,2 Record 2: [1,2,3,4] count=1 → sample 3 - let mut input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; - input.extend_from_slice(&2u16.to_be_bytes()); - input.extend_from_slice(&[0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 0, 0, 0]); - input.extend_from_slice(&1u16.to_be_bytes()); + let mut body: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; + body.extend_from_slice(&2u16.to_be_bytes()); + body.extend_from_slice(&[0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 0, 0, 0]); + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let expected = expected_line(&[23], 1) + &expected_line(&[23], 2) + &expected_line(&[1, 2, 3, 4], 3); assert_eq!(out, expected.as_bytes()); } -#[test] -fn jsonl_decode_ben32_mkvchain_starting_sample_offset() { - // starting_sample=5 → first output line has sample=6 - let mut input: Vec = vec![0, 7, 0, 1, 0, 0, 0, 0]; - input.extend_from_slice(&2u16.to_be_bytes()); - - let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 5, BenVariant::MkvChain).unwrap(); - - let expected = expected_line(&[7], 6) + &expected_line(&[7], 7); - assert_eq!(out, expected.as_bytes()); -} - // ─── decode_xben_to_ben round-trip ──────────────────────────────────── #[test] @@ -237,24 +240,54 @@ fn decode_ben_to_jsonl_truncated_count_field_errors() { #[test] fn decode_xben_to_jsonl_rejects_mkvchain_partial_overflow() { - // Compress just the banner + 3 garbage bytes → no valid frames - let mut xz = Vec::new(); - let mut inner = b"MKVCHAIN BEN FILE".to_vec(); - inner.extend_from_slice(&[1, 2, 3]); - xz_compress( - BufReader::new(inner.as_slice()), - &mut xz, - Some(1), - Some(0), - None, - ) - .unwrap(); + // Banner + 3 stray bytes: the decompressed body ends partway through a ben32 run. This is a + // truncated stream, not a clean end at a frame boundary, and must be rejected. + let xz = mkv_xben_from_ben32_body(&[1, 2, 3]); let mut out = Vec::new(); - decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().to_lowercase().contains("truncated")); assert!(out.is_empty()); } +#[test] +fn decode_xben_to_jsonl_rejects_mkvchain_frame_missing_count() { + // A full run and the zero sentinel, but no trailing u16 count: the frame is incomplete and the + // stream must be rejected as truncated rather than the frame being silently dropped. + let xz = mkv_xben_from_ben32_body(&[0, 7, 0, 3, 0, 0, 0, 0]); + + let mut out = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().to_lowercase().contains("truncated")); +} + +#[test] +fn decode_xben_to_jsonl_rejects_mkvchain_zero_count_frame() { + // A complete frame whose repetition count is zero. The wire format requires count >= 1, so the + // reader must error instead of silently emitting nothing for the frame. + let xz = mkv_xben_from_ben32_body(&[0, 7, 0, 3, 0, 0, 0, 0, 0, 0]); + + let mut out = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("count")); + assert!(out.is_empty()); +} + +#[test] +fn decode_xben_to_ben_rejects_mkvchain_zero_count_frame() { + // The BEN-translation path must reject a zero count rather than writing a corrupt BEN frame + // that downstream readers would only reject later. + let xz = mkv_xben_from_ben32_body(&[0, 7, 0, 3, 0, 0, 0, 0, 0, 0]); + + let mut out = Vec::new(); + let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("count")); +} + // ─── decode_ben_to_jsonl — byte-level frame encoding counterparts ────── These mirror the Standard // tests in standard.rs exactly, differing only in the MKVCHAIN banner and the trailing u16 BE count // field appended to each frame. @@ -512,17 +545,18 @@ fn decode_ben_to_jsonl_three_frames() { assert_eq!(out, expected.as_bytes()); } -// ─── jsonl_decode_ben32 — byte-level counterparts ───────────────────── Each Standard ben32 record -// has [pairs...][0,0,0,0] terminator. Each MkvChain ben32 record appends a u16 BE count after the -// terminator. +// ─── decode_xben_to_jsonl — more byte-level counterparts ─────────────── Each Standard ben32 +// record has [pairs...][0,0,0,0] terminator. Each MkvChain ben32 record appends a u16 BE count +// after the terminator. #[test] -fn jsonl_decode_ben32_16bit_val() { - let mut input = vec![0, 1, 0, 4, 2, 0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]; - input.extend_from_slice(&1u16.to_be_bytes()); +fn decode_xben_to_jsonl_ben32_16bit_val() { + let mut body = vec![0, 1, 0, 4, 2, 0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]; + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let rle_assign = vec![(1u16, 4), (512, 1), (3, 3)]; let expected = json!({ @@ -535,12 +569,13 @@ fn jsonl_decode_ben32_16bit_val() { } #[test] -fn jsonl_decode_ben32_16bit_len() { - let mut input = vec![0, 1, 0, 4, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0]; - input.extend_from_slice(&1u16.to_be_bytes()); +fn decode_xben_to_jsonl_ben32_16bit_len() { + let mut body = vec![0, 1, 0, 4, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0]; + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let rle_assign = vec![(1u16, 4), (2, 512), (3, 3)]; let expected = json!({ @@ -553,12 +588,13 @@ fn jsonl_decode_ben32_16bit_len() { } #[test] -fn jsonl_decode_ben32_max_val_65535() { - let mut input = vec![0, 23, 0, 4, 255, 255, 0, 15, 0, 8, 0, 3, 0, 0, 0, 0]; - input.extend_from_slice(&1u16.to_be_bytes()); +fn decode_xben_to_jsonl_ben32_max_val_65535() { + let mut body = vec![0, 23, 0, 4, 255, 255, 0, 15, 0, 8, 0, 3, 0, 0, 0, 0]; + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let rle_assign = vec![(23u16, 4), (65535, 15), (8, 3)]; let expected = json!({ @@ -571,12 +607,13 @@ fn jsonl_decode_ben32_max_val_65535() { } #[test] -fn jsonl_decode_ben32_max_len_65535() { - let mut input = vec![0, 23, 0, 4, 0, 60, 255, 255, 0, 8, 0, 3, 0, 0, 0, 0]; - input.extend_from_slice(&1u16.to_be_bytes()); +fn decode_xben_to_jsonl_ben32_max_len_65535() { + let mut body = vec![0, 23, 0, 4, 0, 60, 255, 255, 0, 8, 0, 3, 0, 0, 0, 0]; + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let rle_assign = vec![(23u16, 4), (60, 65535), (8, 3)]; let expected = json!({ @@ -589,38 +626,40 @@ fn jsonl_decode_ben32_max_len_65535() { } #[test] -fn jsonl_decode_ben32_single_element() { - let mut input = vec![0, 23, 0, 1, 0, 0, 0, 0]; - input.extend_from_slice(&1u16.to_be_bytes()); +fn decode_xben_to_jsonl_ben32_single_element() { + let mut body = vec![0, 23, 0, 1, 0, 0, 0, 0]; + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let expected = json!({"assignment": [23u16], "sample": 1}).to_string() + "\n"; assert_eq!(out, expected.as_bytes()); } #[test] -fn jsonl_decode_ben32_three_frames() { +fn decode_xben_to_jsonl_ben32_three_frames() { // Three ben32 records with count=1 each — mirrors test_decode_ben32_multiple_simple_lines. - let mut input: Vec = Vec::new(); + let mut body: Vec = Vec::new(); // Record 1: rle [(1,4),(2,4),(3,4),(4,4)] - input.extend_from_slice(&[0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0]); - input.extend_from_slice(&1u16.to_be_bytes()); + body.extend_from_slice(&[0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0]); + body.extend_from_slice(&1u16.to_be_bytes()); // Record 2: rle [(2,2),(3,7),(1,1),(2,1),(3,1)] - input.extend_from_slice(&[ + body.extend_from_slice(&[ 0, 2, 0, 2, 0, 3, 0, 7, 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 0, 0, 0, ]); - input.extend_from_slice(&1u16.to_be_bytes()); + body.extend_from_slice(&1u16.to_be_bytes()); // Record 3: rle [(1..10, each 1)] - input.extend_from_slice(&[ + body.extend_from_slice(&[ 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 5, 0, 1, 0, 6, 0, 1, 0, 7, 0, 1, 0, 8, 0, 1, 0, 9, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, ]); - input.extend_from_slice(&1u16.to_be_bytes()); + body.extend_from_slice(&1u16.to_be_bytes()); + let input = mkv_xben_from_ben32_body(&body); let mut out = Vec::new(); - jsonl_decode_ben32(input.as_slice(), &mut out, 0, BenVariant::MkvChain).unwrap(); + decode_xben_to_jsonl(BufReader::new(input.as_slice()), &mut out).unwrap(); let rle_lst: Vec> = vec![ vec![(1, 4), (2, 4), (3, 4), (4, 4)], diff --git a/ben/src/codec/decode/tests/standard.rs b/ben/src/codec/decode/tests/standard.rs index 4ea688d..c7a9509 100644 --- a/ben/src/codec/decode/tests/standard.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -3,14 +3,28 @@ // layout under test. #![allow(clippy::unusual_byte_groupings)] -use crate::codec::decode::jsonl_decode_ben32; use crate::codec::decode::{decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl}; use crate::codec::encode::xz_compress; use crate::util::rle::rle_to_vec; -use crate::BenVariant; use serde_json::{json, Value}; use std::io::{self, BufRead, BufReader}; +/// Wrap a raw Standard ben32 body in its banner and xz-compress it into a complete XBEN stream. +fn standard_xben_from_ben32_body(body: &[u8]) -> Vec { + let mut inner = b"STANDARD BEN FILE".to_vec(); + inner.extend_from_slice(body); + let mut xz = Vec::new(); + xz_compress( + BufReader::new(inner.as_slice()), + &mut xz, + Some(1), + Some(0), + None, + ) + .unwrap(); + xz +} + #[test] fn test_jsonl_decode_ben_underflow() { let mut input: Vec = b"STANDARD BEN FILE".to_vec(); @@ -274,7 +288,7 @@ fn test_decode_ben_max_val_and_len_at_65535() { } #[test] -fn test_jsonl_decode_ben32_propagates_non_eof_error() { +fn test_decode_xben_to_jsonl_propagates_non_eof_reader_errors() { struct AlwaysErrBuf; impl io::Read for AlwaysErrBuf { @@ -291,7 +305,7 @@ fn test_jsonl_decode_ben32_propagates_non_eof_error() { fn consume(&mut self, _amt: usize) {} } - let err = jsonl_decode_ben32(AlwaysErrBuf, Vec::new(), 0, BenVariant::Standard).unwrap_err(); + let err = decode_xben_to_jsonl(AlwaysErrBuf, Vec::new()).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::Other); assert_eq!(err.to_string(), "boom"); } @@ -329,18 +343,36 @@ fn test_decode_xben_to_jsonl_rejects_invalid_inner_header() { } #[test] -fn test_decode_xben_to_ben_handles_partial_overflow_without_frame() { - let mut xz = Vec::new(); - let mut inner = b"STANDARD BEN FILE".to_vec(); - inner.extend_from_slice(&[1, 2, 3]); - xz_compress( - BufReader::new(inner.as_slice()), - &mut xz, - Some(1), - Some(0), - None, - ) - .unwrap(); +fn test_decode_xben_to_ben_rejects_truncated_ben32_body() { + // The decompressed body ends three bytes into a ben32 run — a truncated stream, not a clean + // end at a frame boundary. The reader must reject it rather than silently dropping the tail. + let xz = standard_xben_from_ben32_body(&[1, 2, 3]); + + let mut out = Vec::new(); + let err = decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().to_lowercase().contains("truncated")); +} + +#[test] +fn test_decode_xben_to_jsonl_rejects_truncated_ben32_body() { + // Same truncated body as the to_ben case; the JSONL path must reject it identically. + let xz = standard_xben_from_ben32_body(&[1, 2, 3]); + + let mut out = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().to_lowercase().contains("truncated")); + assert!( + out.is_empty(), + "no samples should be emitted for a truncated single-frame body" + ); +} + +#[test] +fn test_decode_xben_to_ben_accepts_empty_ben32_body() { + // A banner with no frames is a clean end at a frame boundary, not a truncation. + let xz = standard_xben_from_ben32_body(&[]); let mut out = Vec::new(); decode_xben_to_ben(BufReader::new(xz.as_slice()), &mut out).unwrap(); @@ -348,18 +380,8 @@ fn test_decode_xben_to_ben_handles_partial_overflow_without_frame() { } #[test] -fn test_decode_xben_to_jsonl_handles_partial_overflow_without_frame() { - let mut xz = Vec::new(); - let mut inner = b"STANDARD BEN FILE".to_vec(); - inner.extend_from_slice(&[1, 2, 3]); - xz_compress( - BufReader::new(inner.as_slice()), - &mut xz, - Some(1), - Some(0), - None, - ) - .unwrap(); +fn test_decode_xben_to_jsonl_accepts_empty_ben32_body() { + let xz = standard_xben_from_ben32_body(&[]); let mut out = Vec::new(); decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap(); @@ -505,15 +527,18 @@ fn test_decode_ben_multiple_simple_lines() { assert_eq!(output, expected_output.concat().as_bytes()); } +// ─── decode_xben_to_jsonl — byte-level ben32 bodies ──────────────────── +// Each test wraps a hand-built Standard ben32 body in a banner + xz so the public decoder is +// exercised against exact wire bytes. + #[test] -fn test_jsonl_decode_ben32_simple() { - let input = vec![0, 1, 0, 4, 0, 2, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_simple() { + let input = standard_xben_from_ben32_body(&[0, 1, 0, 4, 0, 2, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); @@ -530,14 +555,13 @@ fn test_jsonl_decode_ben32_simple() { } #[test] -fn test_jsonl_decode_ben32_16_bit_val() { - let input = vec![0, 1, 0, 4, 2, 0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_16_bit_val() { + let input = standard_xben_from_ben32_body(&[0, 1, 0, 4, 2, 0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } @@ -553,14 +577,13 @@ fn test_jsonl_decode_ben32_16_bit_val() { } #[test] -fn test_jsonl_decode_ben32_16_bit_len() { - let input = vec![0, 1, 0, 4, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_16_bit_len() { + let input = standard_xben_from_ben32_body(&[0, 1, 0, 4, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } @@ -576,14 +599,14 @@ fn test_jsonl_decode_ben32_16_bit_len() { } #[test] -fn test_jsonl_decode_ben32_max_val_65535() { - let input = vec![0, 23, 0, 4, 255, 255, 0, 15, 0, 8, 0, 3, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_max_val_65535() { + let input = + standard_xben_from_ben32_body(&[0, 23, 0, 4, 255, 255, 0, 15, 0, 8, 0, 3, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } @@ -599,14 +622,14 @@ fn test_jsonl_decode_ben32_max_val_65535() { } #[test] -fn test_jsonl_decode_ben32_max_len_65535() { - let input = vec![0, 23, 0, 4, 0, 60, 255, 255, 0, 8, 0, 3, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_max_len_65535() { + let input = + standard_xben_from_ben32_body(&[0, 23, 0, 4, 0, 60, 255, 255, 0, 8, 0, 3, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } @@ -622,15 +645,13 @@ fn test_jsonl_decode_ben32_max_len_65535() { } #[test] -fn test_decode_ben32_single_element() { - let input: Vec = vec![0, 23, 0, 1, 0, 0, 0, 0]; +fn test_decode_xben_to_jsonl_ben32_single_element() { + let input = standard_xben_from_ben32_body(&[0, 23, 0, 1, 0, 0, 0, 0]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); - println!("result {:?}", result); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } @@ -644,18 +665,17 @@ fn test_decode_ben32_single_element() { } #[test] -fn test_decode_ben32_multiple_simple_lines() { - let input = vec![ +fn test_decode_xben_to_jsonl_ben32_multiple_simple_lines() { + let input = standard_xben_from_ben32_body(&[ 0, 1, 0, 4, 0, 2, 0, 4, 0, 3, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, 2, 0, 2, 0, 3, 0, 7, 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 4, 0, 1, 0, 5, 0, 1, 0, 6, 0, 1, 0, 7, 0, 1, 0, 8, 0, 1, 0, 9, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, - ]; + ]); - let mut reader = input.as_slice(); let mut output: Vec = Vec::new(); let writer = &mut output; - let result = jsonl_decode_ben32(&mut reader, writer, 0, BenVariant::Standard); + let result = decode_xben_to_jsonl(BufReader::new(input.as_slice()), writer); if let Err(e) = result { panic!("Error: {}", e); } diff --git a/ben/src/codec/decode/xz.rs b/ben/src/codec/decode/xz.rs index c5f21be..ceb864b 100644 --- a/ben/src/codec/decode/xz.rs +++ b/ben/src/codec/decode/xz.rs @@ -1,17 +1,21 @@ -use crate::codec::translate::ben32_to_ben_lines; -use crate::format::banners::{banner_for_variant, variant_from_banner, BANNER_LEN}; -use crate::format::FormatError; -use crate::io::reader::BenStreamReader; +use crate::codec::translate::ben32_to_ben_line; +use crate::format::banners::banner_for_variant; +use crate::io::reader::{BenStreamReader, DecodeFrame}; use crate::io::writer::BenStreamWriter; use crate::progress::Spinner; use crate::{BenVariant, XBenVariant}; -use std::io::{self, BufRead, BufReader, Read, Write}; +use std::io::{self, BufRead, Read, Write}; use xz2::read::XzDecoder; /// Decode an XBEN stream into an equivalent BEN stream. /// /// The output begins with the normal BEN banner followed by uncompressed BEN frames. /// +/// `Standard` and `MkvChain` streams are translated frame-by-frame at the ben32 layer, which +/// preserves each frame's original run boundaries and never materializes assignment vectors. +/// `TwoDelta` streams use a different compressed layout, so their assignments are materialized and +/// re-encoded through the TwoDelta stream writer. +/// /// # Arguments /// /// * `reader` - The compressed XBEN input stream. @@ -20,87 +24,52 @@ use xz2::read::XzDecoder; /// # Returns /// /// Returns `Ok(())` after the full XBEN stream has been decoded into BEN. +/// +/// # Errors +/// +/// Surfaces an error (rather than a truncated result) if the decompressed stream ends partway +/// through a frame, declares a zero repetition count, or carries an unknown banner. pub fn decode_xben_to_ben(reader: R, mut writer: W) -> io::Result<()> { - let mut decoder = XzDecoder::new(reader); - - let mut first_buffer = [0u8; BANNER_LEN]; - - decoder.read_exact(&mut first_buffer)?; + let xben = BenStreamReader::from_xben(reader).map_err(io::Error::from)?; + let variant = xben.variant(); - let variant: XBenVariant = match variant_from_banner(&first_buffer) { - Some(BenVariant::Standard) => { - writer.write_all(banner_for_variant(BenVariant::Standard))?; - XBenVariant::Standard - } - Some(BenVariant::MkvChain) => { - writer.write_all(banner_for_variant(BenVariant::MkvChain))?; - XBenVariant::MkvChain - } - Some(BenVariant::TwoDelta) => { - let mut xben = BenStreamReader::from_xben_decompressed( - BufReader::new(decoder), - BenVariant::TwoDelta, - ); - let mut ben = BenStreamWriter::for_ben(writer, BenVariant::TwoDelta)?; - for record in &mut xben { - let (assignment, count) = record?; + if variant == BenVariant::TwoDelta { + let mut ben = BenStreamWriter::for_ben(writer, BenVariant::TwoDelta)?; + for record in xben { + let (assignment, count) = record?; + for _ in 0..count { ben.write_assignment(assignment.clone())?; - for _ in 1..count { - ben.write_assignment(assignment.clone())?; - } } - ben.finish()?; - return Ok(()); - } - None => { - return Err(io::Error::from(FormatError::UnknownBanner { - actual: first_buffer.to_vec(), - })); } - }; + ben.finish()?; + return Ok(()); + } - let mut buffer = [0u8; 1048576]; - let mut overflow: Vec = Vec::new(); + let xben_variant = match variant { + BenVariant::Standard => XBenVariant::Standard, + BenVariant::MkvChain => XBenVariant::MkvChain, + BenVariant::TwoDelta => unreachable!("TwoDelta was dispatched above"), + }; + writer.write_all(banner_for_variant(variant))?; - let mut line_count: usize = 0; let spinner = Spinner::new("Decoding sample"); - loop { - let count = decoder.read(&mut buffer)?; - if count == 0 { - break; - } - - overflow.extend(&buffer[..count]); - - let mut last_valid_assignment = 0; - - // TwoDelta was dispatched before this loop and returned early. - if variant == XBenVariant::Standard { - for i in (3..overflow.len()).step_by(4) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 1; - line_count += 1; - spinner.set_count(line_count as u64); - } - } - } else { - for i in (3..overflow.len() - 2).step_by(2) { - if overflow[i - 3..=i] == [0, 0, 0, 0] { - last_valid_assignment = i + 3; - let lines = &overflow[i + 1..i + 3]; - let n_lines = u16::from_be_bytes([lines[0], lines[1]]); - line_count += n_lines as usize; - spinner.set_count(line_count as u64); - } + let mut sample_count = 0usize; + for item in xben.into_frames() { + let (frame, count) = item?; + let mut frame_bytes = match frame { + DecodeFrame::XBen(bytes, _) => bytes, + DecodeFrame::Ben(_) => { + unreachable!("an XBEN stream's frame iterator always yields ben32 frames") } + }; + // A MkvChain ben32 frame carries its repetition count after the zero sentinel; the + // translator takes the count as a separate argument, so drop it from the frame bytes. + if xben_variant == XBenVariant::MkvChain { + frame_bytes.truncate(frame_bytes.len() - 2); } - - if last_valid_assignment == 0 { - continue; - } - - ben32_to_ben_lines(&overflow[0..last_valid_assignment], &mut writer, variant)?; - overflow = overflow[last_valid_assignment..].to_vec(); + writer.write_all(&ben32_to_ben_line(frame_bytes, xben_variant, count)?)?; + sample_count += count as usize; + spinner.set_count(sample_count as u64); } Ok(()) } diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 01a9729..d2e093c 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -20,7 +20,8 @@ use crate::{BenVariant, XBenVariant}; /// /// # Arguments /// -/// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator. +/// * `ben32_vec` - The ben32 frame bytes, including the four-byte terminator but excluding any +/// trailing repetition count. /// * `variant` - The BEN32-supporting variant. Determines whether the resulting BEN frame embeds a /// trailing repetition count. /// * `count` - The repetition count for `MkvChain`. Ignored for `Standard`. @@ -28,7 +29,11 @@ use crate::{BenVariant, XBenVariant}; /// # Returns /// /// Returns the encoded BEN frame payload and header. -fn ben32_to_ben_line(ben32_vec: Vec, variant: XBenVariant, count: u16) -> io::Result> { +pub(crate) fn ben32_to_ben_line( + ben32_vec: Vec, + variant: XBenVariant, + count: u16, +) -> io::Result> { let mut buffer = [0u8; 4]; let mut ben32_rle: Vec<(u16, u16)> = Vec::new(); @@ -62,6 +67,32 @@ fn ben32_to_ben_line(ben32_vec: Vec, variant: XBenVariant, count: u16) -> io Ok(BenEncodeFrame::from_rle(ben32_rle, BenVariant::from(variant), Some(count)).into_bytes()) } +/// Read one 4-byte ben32 word, distinguishing a clean end of input from a truncated word. +/// +/// Returns `Ok(true)` when the word was fully read and `Ok(false)` when the reader was already at +/// EOF before yielding any byte. EOF after one to three bytes is a corrupt-stream signal and +/// surfaces as `UnexpectedEof` rather than a silently short read. +fn read_ben32_word(reader: &mut R, buf: &mut [u8; 4]) -> io::Result { + let mut filled = 0usize; + while filled < buf.len() { + match reader.read(&mut buf[filled..]) { + Ok(0) => { + if filled == 0 { + return Ok(false); + } + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "ben32 stream ends mid-run: input exhausted partway through a 4-byte run", + )); + } + Ok(n) => filled += n, + Err(e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + Ok(true) +} + /// Translate a stream of ben32 frames into BEN frames. /// /// This is primarily used while decoding XBEN, where the compressed payload is stored in ben32 @@ -78,43 +109,54 @@ fn ben32_to_ben_line(ben32_vec: Vec, variant: XBenVariant, count: u16) -> io /// /// # Returns /// -/// Returns `Ok(())` after the input stream has been fully translated. +/// Returns `Ok(())` once the input ends cleanly at a frame boundary. +/// +/// # Errors +/// +/// Returns `UnexpectedEof` if the input ends partway through a frame (mid-run, before the zero +/// sentinel, or before a MkvChain repetition count) and `InvalidData` if a MkvChain frame declares +/// a repetition count of zero. A truncated tail is never silently dropped. pub fn ben32_to_ben_lines( mut reader: R, mut writer: W, variant: XBenVariant, ) -> io::Result<()> { - 'outer: loop { + loop { let mut ben32_vec: Vec = Vec::new(); let mut ben32_read_buff: [u8; 4] = [0u8; 4]; let mut n_reps = 0; - 'inner: loop { - match reader.read_exact(&mut ben32_read_buff) { - Ok(()) => { - ben32_vec.extend(ben32_read_buff); - if ben32_read_buff == [0u8; 4] { - if variant == XBenVariant::MkvChain { - n_reps = reader.read_u16::()?; - } - break 'inner; - } + loop { + if !read_ben32_word(&mut reader, &mut ben32_read_buff)? { + if ben32_vec.is_empty() { + // Clean end of input at a frame boundary. + return Ok(()); } - Err(e) => { - if e.kind() == io::ErrorKind::UnexpectedEof { - break 'outer; + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "ben32 stream ends mid-frame: input exhausted before the 4-byte zero sentinel", + )); + } + + ben32_vec.extend(ben32_read_buff); + if ben32_read_buff == [0u8; 4] { + if variant == XBenVariant::MkvChain { + n_reps = reader.read_u16::()?; + if n_reps == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "ben32 frame count must be greater than zero", + )); } - return Err(e); } + break; } } let ben_vec = ben32_to_ben_line(ben32_vec, variant, n_reps)?; writer.write_all(&ben_vec)?; } - - Ok(()) } /// Convert a single BEN frame payload into its ben32 representation. diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 705f44a..481a22c 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -296,6 +296,44 @@ fn test_ben32_to_ben_line_rejects_missing_terminator() { ); } +#[test] +fn test_ben32_to_ben_lines_rejects_mid_run_eof() { + // Input ends partway through a 4-byte ben32 run. The translator must reject this as a + // truncated stream rather than silently dropping the partial frame. + let input = [0u8, 7, 0]; + let err = ben32_to_ben_lines(&input[..], Vec::new(), XBenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + assert!(err.to_string().contains("mid-run")); +} + +#[test] +fn test_ben32_to_ben_lines_rejects_mid_frame_eof() { + // One complete run but no zero sentinel before EOF: the frame is incomplete and must error + // instead of being silently discarded. + let input = [0u8, 7, 0, 3]; + let err = ben32_to_ben_lines(&input[..], Vec::new(), XBenVariant::Standard).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); + assert!(err.to_string().contains("mid-frame")); +} + +#[test] +fn test_ben32_to_ben_lines_rejects_missing_mkv_count() { + // Sentinel present but the trailing u16 repetition count is missing. + let input = [0u8, 7, 0, 3, 0, 0, 0, 0]; + let err = ben32_to_ben_lines(&input[..], Vec::new(), XBenVariant::MkvChain).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof); +} + +#[test] +fn test_ben32_to_ben_lines_rejects_zero_mkv_count() { + // A complete frame with count == 0 is corrupt: re-emitting it would write a BEN frame that + // every reader downstream rejects, so the translator errors at the source instead. + let input = [0u8, 7, 0, 3, 0, 0, 0, 0, 0, 0]; + let err = ben32_to_ben_lines(&input[..], Vec::new(), XBenVariant::MkvChain).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("count")); +} + #[test] fn test_ben32_to_ben_lines_preserves_mkv_counts() { let input = [ From 6b8adde6ea7a50d5b322b5fba1d4fc4a35d743ab Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:06:08 -0600 Subject: [PATCH 145/221] format --- ben-py/docs/conf.py | 57 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index 3b49817..1418345 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -145,12 +145,30 @@ def _brand(primary, content): PALETTES = { - "ocean": {"light": _brand("#0099cd", "#0066a0"), "dark": _brand("#36c5f0", "#5cc8f5")}, - "indigo": {"light": _brand("#4f46e5", "#4338ca"), "dark": _brand("#818cf8", "#a5b4fc")}, - "forest": {"light": _brand("#047857", "#065f46"), "dark": _brand("#34d399", "#6ee7b7")}, - "sunset": {"light": _brand("#ea580c", "#c2410c"), "dark": _brand("#fb923c", "#fdba74")}, - "plum": {"light": _brand("#7c3aed", "#6d28d9"), "dark": _brand("#a78bfa", "#c4b5fd")}, - "slate": {"light": _brand("#334155", "#1e293b"), "dark": _brand("#94a3b8", "#cbd5e1")}, + "ocean": { + "light": _brand("#0099cd", "#0066a0"), + "dark": _brand("#36c5f0", "#5cc8f5"), + }, + "indigo": { + "light": _brand("#4f46e5", "#4338ca"), + "dark": _brand("#818cf8", "#a5b4fc"), + }, + "forest": { + "light": _brand("#047857", "#065f46"), + "dark": _brand("#34d399", "#6ee7b7"), + }, + "sunset": { + "light": _brand("#ea580c", "#c2410c"), + "dark": _brand("#fb923c", "#fdba74"), + }, + "plum": { + "light": _brand("#7c3aed", "#6d28d9"), + "dark": _brand("#a78bfa", "#c4b5fd"), + }, + "slate": { + "light": _brand("#334155", "#1e293b"), + "dark": _brand("#94a3b8", "#cbd5e1"), + }, # From a Huemint palette: a charcoal dark mode with neon-teal accents, and a # matching light mode that carries the teal as a darker, legible shade on white. "aurora": { @@ -258,7 +276,12 @@ def _brand(primary, content): # Whether to render the in-browser palette/code-theme dropdowns. Off by default so the # published site ships locked to the active palette and its default code themes; set # DOCS_SWITCHER=1 while developing to expose the controls and experiment live. -SHOW_SWITCHER = os.environ.get("DOCS_SWITCHER", "").lower() not in ("", "0", "false", "no") +SHOW_SWITCHER = os.environ.get("DOCS_SWITCHER", "").lower() not in ( + "", + "0", + "false", + "no", +) html_theme_options = { "source_repository": "https://github.com/peterrrock2/binary-ensemble/", @@ -485,8 +508,12 @@ def _pygments_theme_css(): from pygments.formatters import HtmlFormatter menu = [s for group in CODE_THEMES.values() for s in group] - dark_defaults = [p["dark_pygments"] for p in PALETTES.values() if p.get("dark_pygments")] - light_defaults = [p["light_pygments"] for p in PALETTES.values() if p.get("light_pygments")] + dark_defaults = [ + p["dark_pygments"] for p in PALETTES.values() if p.get("dark_pygments") + ] + light_defaults = [ + p["light_pygments"] for p in PALETTES.values() if p.get("light_pygments") + ] # A style name may resolve to a builtin (the string) or a registered custom class. def make_formatter(style): @@ -506,7 +533,9 @@ def rules(formatter, prefix): # Explicit picks (and any palette default, so it resolves even if absent from the # menu) apply in any mode via the order-independent `html body` prefix. for style in dict.fromkeys(menu + dark_defaults + light_defaults): - blocks.append(rules(make_formatter(style), f'html body[data-code-theme="{style}"]')) + blocks.append( + rules(make_formatter(style), f'html body[data-code-theme="{style}"]') + ) # "Auto" applies a palette's dark/light default, each scoped to its own mode so the # other mode keeps the global Pygments style. The auto-mode (`prefers-color-scheme`) # variants mirror Furo's `:not([data-theme=…])` selectors for system readers. @@ -517,8 +546,12 @@ def rules(formatter, prefix): blocks.append("@media (prefers-color-scheme: dark){\n" + auto + "\n}") for style in dict.fromkeys(light_defaults): fmt = make_formatter(style) - blocks.append(rules(fmt, f'body[data-theme="light"][data-code-auto-light="{style}"]')) - auto = rules(fmt, f'body:not([data-theme="dark"])[data-code-auto-light="{style}"]') + blocks.append( + rules(fmt, f'body[data-theme="light"][data-code-auto-light="{style}"]') + ) + auto = rules( + fmt, f'body:not([data-theme="dark"])[data-code-auto-light="{style}"]' + ) blocks.append("@media (prefers-color-scheme: light){\n" + auto + "\n}") return "\n".join(blocks) From a4f5b3952088bbb571824adb007ccabc2ec88340 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:18:11 -0600 Subject: [PATCH 146/221] harden against oversized assets --- Taskfile.yml | 8 +++++++- ben/src/codec/decode/ben.rs | 16 +++++++++------ ben/src/codec/decode/mod.rs | 1 + ben/src/codec/encode/xz.rs | 3 ++- ben/src/codec/frames/decode.rs | 20 ++++++++++++++++++ ben/src/codec/frames/encode.rs | 37 +++++++++++++++++++++++++++------- ben/src/codec/frames/tests.rs | 32 +++++++++++++++++++++++++++++ ben/tests/test_stress_edges.rs | 33 ++++++++++++++++++++++++++++++ 8 files changed, 135 insertions(+), 15 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 72b2a90..6fcf871 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -119,7 +119,13 @@ tasks: env: PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' cmds: - - cargo test -- --ignored + # The fixture regenerators are #[ignore]d so they never run by accident — including here. + # Running them rewrites the committed wire-format fixtures (see docs/format-stability.md), + # so they must only ever be invoked by name in a dedicated fixtures PR. + - >- + cargo test -- --ignored + --skip generate_format_stability_fixtures + --skip regenerate_twodelta_fixtures test-rust: desc: Run Rust tests for the workspace (fast suite plus #[ignore]-gated stress tests) diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 652b561..3c56d9e 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -1,10 +1,11 @@ use std::io::{self, Read}; -/// Upper bound on `n_bytes` accepted by [`decode_ben_line`]. A frame larger than this is rejected -/// without allocating, so malformed or adversarial input cannot OOM the process during fuzzing or -/// stream decoding. The cap is well above any legitimate BEN frame: at 64 MiB of packed RLE data -/// it would hold tens of millions of run pairs. -const MAX_FRAME_PAYLOAD_BYTES: u32 = 1 << 26; +/// Upper bound on `n_bytes` accepted by [`decode_ben_line`] and by the frame readers in +/// [`crate::codec::BenDecodeFrame`]. A frame larger than this is rejected without allocating, so +/// malformed or adversarial input cannot OOM the process during fuzzing or stream decoding. The +/// cap is well above any legitimate BEN frame: at 64 MiB of packed RLE data it would hold tens of +/// millions of run pairs. +pub(crate) const MAX_FRAME_PAYLOAD_BYTES: u32 = 1 << 26; /// Decode a single BEN frame payload into run-length encoded assignments. /// @@ -85,7 +86,10 @@ pub fn decode_ben_line( let mut pending_zero_pairs: usize = 0; for &byte in &assign_bits { - buffer |= (byte as u32).to_be() >> n_bits_in_buff; + // Place the incoming byte at the top of the 32-bit shift register, below any bits already + // buffered. The explicit shift is endian-independent; bit extraction below always reads + // from the register's high end. + buffer |= ((byte as u32) << 24) >> n_bits_in_buff; n_bits_in_buff += 8; if n_bits_in_buff >= max_val_bits as u16 && !val_set { diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index 1f4536d..d60fbc7 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -10,6 +10,7 @@ mod twodelta; mod xz; pub use ben::decode_ben_line; +pub(crate) use ben::MAX_FRAME_PAYLOAD_BYTES; pub(crate) use ben32::decode_ben32_line; pub use jsonl::{decode_ben_to_jsonl, decode_xben_to_jsonl}; pub use path::{ diff --git a/ben/src/codec/encode/xz.rs b/ben/src/codec/encode/xz.rs index 2b22247..c9e6c70 100644 --- a/ben/src/codec/encode/xz.rs +++ b/ben/src/codec/encode/xz.rs @@ -110,7 +110,8 @@ pub fn xz_compress( } encoder.write_all(&buff[..count])?; } - drop(encoder); + + encoder.finish()?; Ok(()) } diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index 84532d6..b6a5022 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -1,8 +1,25 @@ use super::encode::BenEncodeFrame; +use crate::codec::decode::MAX_FRAME_PAYLOAD_BYTES; use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; use std::io::{self, Read}; +/// Reject a declared payload length above [`MAX_FRAME_PAYLOAD_BYTES`] **before** allocating the +/// payload buffer, so a corrupt or adversarial frame header cannot force a multi-gigabyte +/// reservation. Well-formed frames never approach the cap. +fn check_payload_len(n_bytes: u32) -> io::Result<()> { + if n_bytes > MAX_FRAME_PAYLOAD_BYTES { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "BEN frame payload of {n_bytes} bytes exceeds {MAX_FRAME_PAYLOAD_BYTES}; \ + refusing to allocate" + ), + )); + } + Ok(()) +} + /// One sample's encoded bytes at the frame layer, freshly read from a wire stream. /// /// `Standard` and `MkvChain` carry **opaque** bit-packed payload bytes — the runs are not expanded @@ -80,6 +97,7 @@ impl BenDecodeFrame { let max_len_bit_count = reader.read_u8()?; let n_bytes = reader.read_u32::()?; + check_payload_len(n_bytes)?; let mut raw_bytes = vec![0u8; n_bytes as usize]; reader.read_exact(&mut raw_bytes)?; @@ -101,6 +119,7 @@ impl BenDecodeFrame { let max_len_bit_count = reader.read_u8()?; let n_bytes = reader.read_u32::()?; + check_payload_len(n_bytes)?; let mut raw_bytes = vec![0u8; n_bytes as usize]; reader.read_exact(&mut raw_bytes)?; @@ -132,6 +151,7 @@ impl BenDecodeFrame { )); } let n_bytes = reader.read_u32::()?; + check_payload_len(n_bytes)?; let mut payload = vec![0u8; n_bytes as usize]; reader.read_exact(&mut payload)?; diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs index 890126d..19ea787 100644 --- a/ben/src/codec/frames/encode.rs +++ b/ben/src/codec/frames/encode.rs @@ -63,7 +63,9 @@ impl BenEncodeFrame { /// # Panics /// /// Panics if `variant` is [`BenVariant::TwoDelta`]; use [`BenEncodeFrame::from_run_lengths`] - /// for that. + /// for that. Also panics if the packed payload would exceed the `u32` byte length the frame + /// header can carry — that bound sits far beyond any real assignment, so reaching it means + /// the caller's input is corrupt rather than merely large. pub fn from_rle(runs: Vec<(u16, u16)>, variant: BenVariant, count: Option) -> Self { let (max_val, max_len) = runs .iter() @@ -72,9 +74,15 @@ impl BenEncodeFrame { }); let max_val_bit_count = (16 - max_val.leading_zeros() as u8).max(1); let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let assign_bits = (max_val_bit_count + max_len_bit_count) as u32; - let payload_bits = assign_bits * runs.len() as u32; - let n_bytes = payload_bits.div_ceil(8); + let assign_bits = (max_val_bit_count + max_len_bit_count) as u64; + let payload_bits = assign_bits * runs.len() as u64; + let n_bytes = u32::try_from(payload_bits.div_ceil(8)).unwrap_or_else(|_| { + panic!( + "BEN frame payload of {} run(s) at {assign_bits} bit(s)/run overflows the u32 \ + n_bytes field", + runs.len() + ) + }); let mut raw_bytes = compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); @@ -122,6 +130,12 @@ impl BenEncodeFrame { /// Build a `TwoDelta` frame from a pair and pre-computed run lengths. /// /// `count` defaults to `1` if `None`. + /// + /// # Panics + /// + /// Panics if the packed payload would exceed the `u32` byte length the frame header can + /// carry — that bound sits far beyond any real delta, so reaching it means the caller's + /// input is corrupt rather than merely large. pub fn from_run_lengths( pair: (u16, u16), run_length_vector: Vec, @@ -132,8 +146,14 @@ impl BenEncodeFrame { let max_len = run_length_vector.iter().copied().max().unwrap_or(0); let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); - let payload_bits = max_len_bit_count as u32 * run_length_vector.len() as u32; - let n_bytes = payload_bits.div_ceil(8); + let payload_bits = max_len_bit_count as u64 * run_length_vector.len() as u64; + let n_bytes = u32::try_from(payload_bits.div_ceil(8)).unwrap_or_else(|_| { + panic!( + "TwoDelta frame payload of {} run length(s) at {max_len_bit_count} bit(s) each \ + overflows the u32 n_bytes field", + run_length_vector.len() + ) + }); // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) + count (2) let mut raw_bytes = Vec::with_capacity((n_bytes + 11) as usize); @@ -200,7 +220,10 @@ impl BenEncodeFrame { let mut n_bits_in_buff: u16 = 0; for &byte in payload[..n_bytes as usize].iter() { - buffer |= (byte as u32).to_be() >> n_bits_in_buff; + // Place the incoming byte at the top of the 32-bit shift register, below any bits + // already buffered. The explicit shift is endian-independent; extraction below always + // reads from the register's high end. + buffer |= ((byte as u32) << 24) >> n_bits_in_buff; n_bits_in_buff += 8; while n_bits_in_buff >= max_len_bit_count as u16 { diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index c40bee5..7ae08a4 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -162,6 +162,38 @@ fn ben_decode_standard_non_eof_read_error_propagates() { assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } +#[test] +fn ben_decode_oversized_n_bytes_rejected_before_allocating() { + // Headers declaring an absurd payload length must be rejected before the payload buffer is + // allocated — no payload bytes are supplied here, so reaching the allocation would surface as + // an UnexpectedEof (or worse, an OOM under fuzzing) instead of the cap's InvalidData. + let oversized = u32::MAX.to_be_bytes(); + + // Standard: [mvb, mlb, n_bytes]. + let mut data = vec![2u8, 3]; + data.extend_from_slice(&oversized); + let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::Standard) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("refusing to allocate")); + + // MkvChain: same header shape. + let mut data = vec![2u8, 3]; + data.extend_from_slice(&oversized); + let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::MkvChain) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("refusing to allocate")); + + // TwoDelta: [pair_a, pair_b, max_len_bits, n_bytes]. + let mut data = vec![0u8, 1, 0, 2, 4]; + data.extend_from_slice(&oversized); + let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("refusing to allocate")); +} + // ── BenDecodeFrame::from_reader (MkvChain) ────────────────────────────────── #[test] diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 4446c9e..8cee041 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -367,6 +367,39 @@ fn xz_compress_propagates_input_reader_errors() { assert_eq!(err.kind(), std::io::ErrorKind::Other); } +#[test] +fn xz_compress_propagates_output_writer_errors_at_finish() { + // A writer that swallows nothing during streaming but fails once the encoder flushes its + // final block. xz buffers small inputs internally, so the only write the sink ever sees is + // the finish-time flush — exactly the failure a `drop(encoder)` would silently discard. + struct FailingWriter; + impl Write for FailingWriter { + fn write(&mut self, _buf: &[u8]) -> std::io::Result { + Err(std::io::Error::new( + std::io::ErrorKind::StorageFull, + "disk full", + )) + } + fn flush(&mut self) -> std::io::Result<()> { + Err(std::io::Error::new( + std::io::ErrorKind::StorageFull, + "disk full", + )) + } + } + + let payload = b"some bytes that fit inside xz's internal buffer"; + let err = xz_compress( + std::io::BufReader::new(payload.as_slice()), + FailingWriter, + Some(1), + Some(0), + None, + ) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::StorageFull); +} + #[test] fn relabel_map_out_of_range_old_indices_error_cleanly() { let mut ben = Vec::new(); From add9fc3c3e2160842ea7117a1d3fe82f1053577c Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:57:18 -0600 Subject: [PATCH 147/221] harden against malicious assignment lengths and middle zero-byte corruption --- ben/src/codec/decode/ben.rs | 36 ++++++++++++ ben/src/codec/decode/ben32.rs | 29 ++++++++++ ben/src/codec/decode/mod.rs | 9 +++ ben/src/codec/decode/tests/mod.rs | 13 +++++ ben/src/codec/decode/tests/standard.rs | 29 ++++++++++ ben/src/codec/decode/twodelta.rs | 13 ++++- ben/src/codec/frames/decode.rs | 75 +++++++++++++++++++++---- ben/src/codec/frames/tests.rs | 41 ++++++++++++++ ben/src/codec/translate/mod.rs | 12 ++++ ben/src/codec/translate/tests.rs | 10 ++++ ben/src/io/reader/stream_reader/xben.rs | 35 +++++++++++- ben/src/ops/extract/mod.rs | 7 ++- docs/ben-format-spec.md | 11 ++++ 13 files changed, 303 insertions(+), 17 deletions(-) diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 3c56d9e..ca00c73 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -34,6 +34,8 @@ pub(crate) const MAX_FRAME_PAYLOAD_BYTES: u32 = 1 << 26; /// - `n_bytes` not equal to `ceil(real_pairs * (mvb + mlb) / 8)` after decoding (the encoder uses /// `div_ceil` to compute `n_bytes`, so any other value indicates a malformed or maliciously /// crafted frame). +/// - The sum of the run lengths exceeding [`super::MAX_ASSIGNMENT_LEN`], so a small frame cannot +/// demand a multi-gigabyte expansion when the runs are later materialized. pub fn decode_ben_line( mut reader: R, max_val_bits: u8, @@ -162,6 +164,21 @@ pub fn decode_ben_line( )); } + // Expansion sanity bound: callers materialize the runs into a full assignment vector, so the + // sum of the run lengths is the allocation a frame can demand. Reject absurd sums here, before + // any caller pays for the expansion. + let expanded_len: u64 = output_rle.iter().map(|&(_, len)| u64::from(len)).sum(); + if expanded_len > super::MAX_ASSIGNMENT_LEN { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "BEN frame expands to {expanded_len} elements, which exceeds the \ + {} sanity bound", + super::MAX_ASSIGNMENT_LEN + ), + )); + } + Ok(output_rle) } @@ -200,6 +217,25 @@ mod tests { assert!(err.to_string().contains("inconsistent")); } + #[test] + fn decode_ben_line_rejects_oversized_expansion() { + use crate::codec::BenEncodeFrame; + use crate::BenVariant; + // 2049 runs of 65,535 elements expand past the 2^27 sanity bound; each run is + // individually legal, so only the bound on the sum catches this. + let frame = + BenEncodeFrame::from_rle(vec![(1u16, u16::MAX); 2049], BenVariant::Standard, None); + let err = decode_ben_line( + Cursor::new(frame.payload()), + frame.max_val_bit_count().unwrap(), + frame.max_len_bit_count(), + frame.n_bytes(), + ) + .expect_err("must reject"); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("sanity bound")); + } + #[test] fn decode_ben_line_rejects_oversized_n_bytes_without_allocating() { // n_bytes way above the sanity cap must error before allocating. We don't supply any diff --git a/ben/src/codec/decode/ben32.rs b/ben/src/codec/decode/ben32.rs index fd9e13c..e17703d 100644 --- a/ben/src/codec/decode/ben32.rs +++ b/ben/src/codec/decode/ben32.rs @@ -15,6 +15,12 @@ use std::io::{self, BufRead}; /// # Returns /// /// Returns the expanded assignment vector together with its repetition count. +/// +/// # Errors +/// +/// Returns [`io::ErrorKind::InvalidData`] for a run with a zero length (only the all-zero frame +/// sentinel may carry a zero length; the encoder never emits zero-length runs) and for a frame +/// whose expansion would exceed [`super::MAX_ASSIGNMENT_LEN`]. pub(crate) fn decode_ben32_line( mut reader: R, variant: BenVariant, @@ -33,6 +39,29 @@ pub(crate) fn decode_ben32_line( let value = (encoded >> 16) as u16; let count = (encoded & 0xFFFF) as u16; + if count == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "ben32 run for value {value} has zero length; only the frame \ + sentinel may carry a zero length" + ), + )); + } + + // Expansion sanity bound: each 4-byte run can demand up to 65,535 elements, so an + // adversarial frame could otherwise request a multi-gigabyte allocation from a few + // kilobytes of input. + if output_vec.len() as u64 + u64::from(count) > super::MAX_ASSIGNMENT_LEN { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "ben32 frame expands past the {} element sanity bound", + super::MAX_ASSIGNMENT_LEN + ), + )); + } + for _ in 0..count { output_vec.push(value); } diff --git a/ben/src/codec/decode/mod.rs b/ben/src/codec/decode/mod.rs index d60fbc7..e1e2226 100644 --- a/ben/src/codec/decode/mod.rs +++ b/ben/src/codec/decode/mod.rs @@ -1,5 +1,14 @@ //! Decoding routines for BEN and XBEN formats. +/// Upper bound on the *expanded* length of a single decoded assignment (the sum of a frame's run +/// lengths — i.e. the number of dual-graph nodes). Each 4-byte ben32 run can legally demand up to +/// 65,535 elements, so without a bound on the sum a small adversarial frame could request a +/// multi-gigabyte expansion from a few kilobytes of input. The cap is a reader-side sanity bound, +/// not a wire-format limit: at ~134 million nodes it sits more than an order of magnitude above +/// any real dual graph (national census-block graphs run ~10 million nodes) while keeping the +/// worst-case single-assignment allocation at 256 MiB. +pub(crate) const MAX_ASSIGNMENT_LEN: u64 = 1 << 27; + mod ben; mod ben32; pub(crate) mod errors; diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 87ffe64..184f603 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -204,6 +204,19 @@ fn encode_ben_to_xben_mkvchain_roundtrip() { assert_eq!(v3["assignment"], serde_json::json!([2, 2, 1, 1])); } +#[test] +fn decode_twodelta_frame_rejects_zero_run_length() { + use crate::codec::decode::decode_twodelta_frame; + use crate::codec::BenEncodeFrame; + + // The delta paint loop assumes no zero-length runs exist (a zero would underflow its + // per-run countdown and mispaint positions), so a frame carrying one is rejected up front. + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 0, 1], Some(1)); + let err = decode_twodelta_frame(vec![1, 2], &frame).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("zero")); +} + #[test] fn decode_error_remaining_variants() { // Test DecodeError variants we haven't covered diff --git a/ben/src/codec/decode/tests/standard.rs b/ben/src/codec/decode/tests/standard.rs index c7a9509..ecfe9cf 100644 --- a/ben/src/codec/decode/tests/standard.rs +++ b/ben/src/codec/decode/tests/standard.rs @@ -369,6 +369,35 @@ fn test_decode_xben_to_jsonl_rejects_truncated_ben32_body() { ); } +#[test] +fn test_decode_xben_to_jsonl_rejects_zero_length_ben32_run() { + // Run (value=7, len=0): not the frame sentinel (value bytes are non-zero), and the encoder + // never emits zero-length runs — silently skipping it would mask corruption. + let xz = standard_xben_from_ben32_body(&[0, 7, 0, 0, 0, 0, 0, 0]); + + let mut out = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("zero length")); +} + +#[test] +fn test_decode_xben_to_jsonl_rejects_oversized_expansion() { + // 2049 runs of 65,535 elements expand past the 2^27 sanity bound. Each run is individually + // legal, so only the bound on the sum catches this. + let mut body = Vec::new(); + for _ in 0..2049 { + body.extend_from_slice(&[0, 1, 0xFF, 0xFF]); + } + body.extend_from_slice(&[0, 0, 0, 0]); + let xz = standard_xben_from_ben32_body(&body); + + let mut out = Vec::new(); + let err = decode_xben_to_jsonl(BufReader::new(xz.as_slice()), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("sanity bound")); +} + #[test] fn test_decode_xben_to_ben_accepts_empty_ben32_body() { // A banner with no frames is a clean end at a frame boundary, not a truncation. diff --git a/ben/src/codec/decode/twodelta.rs b/ben/src/codec/decode/twodelta.rs index 3b17d85..fa2538e 100644 --- a/ben/src/codec/decode/twodelta.rs +++ b/ben/src/codec/decode/twodelta.rs @@ -16,12 +16,23 @@ use std::io; /// # Returns /// /// Returns the updated assignment vector, or an error if the run lengths are exhausted before all -/// relevant positions are covered. +/// relevant positions are covered or any run length is zero. pub(crate) fn apply_twodelta_runs_to_assignment( mut assignment: Vec, pair: (u16, u16), run_lengths: &[u16], ) -> io::Result> { + // The encoder never emits a zero run length, and the paint loop below assumes none exist: a + // zero reaching it would underflow `remaining_in_run` and silently mispaint positions. Every + // unpacker rejects zeros before this point; this check keeps the invariant local so no future + // caller can reintroduce the hazard. + if run_lengths.contains(&0) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta run lengths contain a zero; the encoder never emits zero-length runs", + )); + } + let (first, second) = pair; let mut run_idx = 0usize; diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index b6a5022..17f755d 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -1,4 +1,3 @@ -use super::encode::BenEncodeFrame; use crate::codec::decode::MAX_FRAME_PAYLOAD_BYTES; use crate::BenVariant; use byteorder::{BigEndian, ReadBytesExt}; @@ -20,6 +19,54 @@ fn check_payload_len(n_bytes: u32) -> io::Result<()> { Ok(()) } +/// Unpack a TwoDelta frame's bit-packed run lengths, rejecting interior zeros. +/// +/// The encoder never emits a zero run length, so a zero slot is legal only as a trailing +/// byte-padding artifact (when `max_len_bits` is narrow, the final byte's zero padding can form +/// one or more complete all-zero slots). A zero followed by a real run length is interior +/// corruption: silently dropping it would shift the alternation parity of every subsequent run +/// and decode to a plausible-but-wrong assignment, so it is rejected instead. This is the +/// TwoDelta analog of the interior-zero discipline in +/// [`decode_ben_line`](crate::codec::decode::decode_ben_line). +pub(crate) fn unpack_twodelta_run_lengths( + payload: &[u8], + max_len_bits: u8, +) -> io::Result> { + let mut run_lengths = Vec::new(); + let mut buffer: u32 = 0; + let mut n_bits_in_buff: u16 = 0; + // Zero slots seen since the last real run length. Accepted as padding if the frame ends, + // rejected as interior corruption if a real run length follows. + let mut pending_zero_slots: usize = 0; + + for &byte in payload { + // Place the incoming byte at the top of the 32-bit shift register, below any bits already + // buffered; extraction always reads from the register's high end. + buffer |= ((byte as u32) << 24) >> n_bits_in_buff; + n_bits_in_buff += 8; + + while n_bits_in_buff >= max_len_bits as u16 { + let item = (buffer >> (32 - max_len_bits)) as u16; + buffer <<= max_len_bits; + n_bits_in_buff -= max_len_bits as u16; + if item == 0 { + pending_zero_slots += 1; + } else { + if pending_zero_slots > 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "TwoDelta frame contains an interior zero run length; the encoder never \ + emits zero-length runs", + )); + } + run_lengths.push(item); + } + } + } + + Ok(run_lengths) +} + /// One sample's encoded bytes at the frame layer, freshly read from a wire stream. /// /// `Standard` and `MkvChain` carry **opaque** bit-packed payload bytes — the runs are not expanded @@ -158,17 +205,23 @@ impl BenDecodeFrame { let count = reader.read_u16::()?; - // Reuse the encode-side bit unpacker so the unpack logic lives in one place; we then drop - // the resulting BenEncodeFrame's raw_bytes since the decode-side TwoDelta arm doesn't keep - // them. let pair = (pair_a, pair_b); - let encode_frame = BenEncodeFrame::from_parts(pair, max_len_bits, payload, count); - let run_lengths = match encode_frame { - BenEncodeFrame::TwoDelta { - run_length_vector, .. - } => run_length_vector, - _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), - }; + let run_lengths = unpack_twodelta_run_lengths(&payload, max_len_bits)?; + + // n_bytes consistency: the encoder writes `n_bytes = ceil(runs * width / 8)`. Any other + // relationship between n_bytes and the recovered run count is a corrupt-frame signal, + // exactly mirroring the Standard/MkvChain payload check in `decode_ben_line`. + let expected_bytes = (run_lengths.len() as u64 * u64::from(max_len_bits)).div_ceil(8); + if u64::from(n_bytes) != expected_bytes { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "inconsistent TwoDelta frame size: n_bytes={n_bytes} but {} run length(s) at \ + {max_len_bits} bit(s) each require {expected_bytes} byte(s)", + run_lengths.len() + ), + )); + } Ok(Some(Self::TwoDelta { pair, diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index 7ae08a4..5d90442 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -162,6 +162,47 @@ fn ben_decode_standard_non_eof_read_error_propagates() { assert_eq!(err.kind(), io::ErrorKind::BrokenPipe); } +// ── unpack_twodelta_run_lengths ────────────────────────────────────────────── + +#[test] +fn twodelta_unpack_accepts_trailing_zero_padding() { + // width=4, payload 0xF0 → slots [15, 0]. The zero slot is the final byte's padding artifact + // and must be dropped without error. + let runs = super::decode::unpack_twodelta_run_lengths(&[0xF0], 4).unwrap(); + assert_eq!(runs, vec![15]); +} + +#[test] +fn twodelta_unpack_rejects_interior_zero_run_length() { + // width=4, payload 0x0F → slots [0, 15]. The zero precedes a real run length, so it cannot be + // padding: silently dropping it would shift the alternation parity of every later run and + // decode to a plausible-but-wrong assignment. + let err = super::decode::unpack_twodelta_run_lengths(&[0x0F], 4).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("interior zero")); +} + +#[test] +fn ben_decode_twodelta_interior_zero_errors_through_frame_reader() { + // Delta frame body: pair (1,2), width 4, n_bytes 1, payload 0x0F (interior zero), count 1. + let data: Vec = vec![0, 1, 0, 2, 4, 0, 0, 0, 1, 0x0F, 0, 1]; + let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("interior zero")); +} + +#[test] +fn ben_decode_twodelta_inconsistent_n_bytes_errors() { + // Delta frame claiming n_bytes=2 whose payload decodes to a single width-4 run length: the + // encoder would have written n_bytes = ceil(1 * 4 / 8) = 1, so 2 is a corrupt-frame signal. + let data: Vec = vec![0, 1, 0, 2, 4, 0, 0, 0, 2, 0xF0, 0x00, 0, 1]; + let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("inconsistent")); +} + #[test] fn ben_decode_oversized_n_bytes_rejected_before_allocating() { // Headers declaring an absurd payload length must be rejected before the payload buffer is diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index d2e093c..00aaa98 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -52,6 +52,18 @@ pub(crate) fn ben32_to_ben_line( let value = (encoded >> 16) as u16; let len = (encoded & 0xFFFF) as u16; + // Only the frame sentinel may carry a zero length. Re-emitting a zero-length run would + // write a BEN frame that every downstream reader rejects, so fail at the source instead. + if len == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "ben32 run for value {value} has zero length; only the frame sentinel may \ + carry a zero length" + ), + )); + } + ben32_rle.push((value, len)); } diff --git a/ben/src/codec/translate/tests.rs b/ben/src/codec/translate/tests.rs index 481a22c..bcb22d0 100644 --- a/ben/src/codec/translate/tests.rs +++ b/ben/src/codec/translate/tests.rs @@ -285,6 +285,16 @@ fn test_ben32_to_ben_line_rejects_invalid_length() { ); } +#[test] +fn test_ben32_to_ben_line_rejects_zero_length_run() { + // Run (7, 0) is not the sentinel, and the encoder never emits zero-length runs. Re-emitting + // it would write a BEN frame that downstream readers reject, so the translator errors here. + let err = + ben32_to_ben_line(vec![0, 7, 0, 0, 0, 0, 0, 0], XBenVariant::Standard, 0).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("zero length")); +} + #[test] fn test_ben32_to_ben_line_rejects_missing_terminator() { let err = diff --git a/ben/src/io/reader/stream_reader/xben.rs b/ben/src/io/reader/stream_reader/xben.rs index 773351b..5c79a55 100644 --- a/ben/src/io/reader/stream_reader/xben.rs +++ b/ben/src/io/reader/stream_reader/xben.rs @@ -70,9 +70,32 @@ fn pop_twodelta_frame_from_overflow(overflow: &[u8]) -> Option crate::codec::decode::MAX_ASSIGNMENT_LEN { + return Some(Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "XBEN TwoDelta full frame expands past the {} element sanity bound", + crate::codec::decode::MAX_ASSIGNMENT_LEN + ), + ))); + } runs.push((value, len)); cursor += 4; } @@ -190,8 +213,16 @@ pub(super) fn next_record_xben( inner.overflow.drain(..consumed); return Some(Err(zero_count_frame_error("XBEN"))); } - let assignment = decode_xben_frame_to_assignment(frame_bytes, variant) - .expect("complete frame from pop_frame_from_overflow"); + // The popped frame is structurally complete (sentinel found), but its runs can + // still be semantically corrupt (zero-length run, oversized expansion), so the + // decode is fallible. + let assignment = match decode_xben_frame_to_assignment(frame_bytes, variant) { + Ok(assignment) => assignment, + Err(e) => { + inner.overflow.drain(..consumed); + return Some(Err(e)); + } + }; inner.previous_assignment = Some(assignment.clone()); inner.overflow.drain(..consumed); return Some(Ok((assignment, count))); diff --git a/ben/src/ops/extract/mod.rs b/ben/src/ops/extract/mod.rs index b178fbc..b549d2b 100644 --- a/ben/src/ops/extract/mod.rs +++ b/ben/src/ops/extract/mod.rs @@ -108,8 +108,9 @@ pub fn extract_assignment_xben( for frame in frame_iterator { let (decode_frame, count) = frame.map_err(SampleError::new_io_error)?; if current_sample == sample_number || current_sample + count as usize > sample_number { - // The frame iterator guarantees complete zero-sentinel ben32 frames in the XBEN arm, so - // decode_ben32_line always succeeds. + // The frame iterator guarantees structurally complete zero-sentinel ben32 frames in + // the XBEN arm, but the runs inside can still be semantically corrupt (zero-length + // run, oversized expansion), so the decode is fallible. let bytes = match &decode_frame { crate::io::reader::DecodeFrame::XBen(b, _) => b, crate::io::reader::DecodeFrame::Ben(_) => { @@ -117,7 +118,7 @@ pub fn extract_assignment_xben( } }; let (assignment, _) = decode_ben32_line(Cursor::new(bytes), variant) - .expect("complete frame from XBEN frame reader"); + .map_err(SampleError::new_io_error)?; return Ok(assignment); } current_sample += count as usize; diff --git a/docs/ben-format-spec.md b/docs/ben-format-spec.md index a3126fa..ca34d88 100644 --- a/docs/ben-format-spec.md +++ b/docs/ben-format-spec.md @@ -227,6 +227,17 @@ A reader MUST: A reader MUST surface an error (not a truncated result) if input ends partway through a frame header, payload, or trailing count. +A reader MUST reject a run with a zero length anywhere it can observe one: in a bit-packed frame +payload (outside the final byte's zero-padding region) and in a BEN32 run that is not the frame +sentinel. The encoder never produces zero-length runs, so any such run is a corruption signal; +tolerating one would either silently drop data or shift later runs out of position. + +A reader MAY impose an implementation-defined sanity bound on the expanded length of a single +assignment (the sum of a frame's run lengths) and reject frames that exceed it. The wire format +places no limit on assignment length, but each run can demand up to 65535 elements, so without a +bound a small malicious frame could request an arbitrarily large allocation. The bound MUST sit +well above any real dual graph (this implementation uses 2^27 ≈ 134 million elements). + Frame-level subsampling does not require unpacking payload bits: a reader can skip a frame by reading its 6-byte header, seeking past `n_bytes` (and, for MkvChain, the 2-byte count), and only unpacking the payloads of frames it keeps. From 1fb97c2e620dd3d720c91eef420f9026b4e97d83 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:30:50 -0600 Subject: [PATCH 148/221] better reader errors (propagate) --- ben/src/cli/ben/modes/decode.rs | 4 ++-- ben/src/cli/ben/modes/encode.rs | 4 ++-- ben/src/cli/ben/modes/xdecode.rs | 4 ++-- ben/src/cli/ben/modes/xencode.rs | 4 ++-- ben/src/cli/ben/paths.rs | 26 ++++++++++++++++++-------- ben/src/cli/ben/tests.rs | 16 +++++++++++++--- 6 files changed, 39 insertions(+), 19 deletions(-) diff --git a/ben/src/cli/ben/modes/decode.rs b/ben/src/cli/ben/modes/decode.rs index e673d6a..b4797d9 100644 --- a/ben/src/cli/ben/modes/decode.rs +++ b/ben/src/cli/ben/modes/decode.rs @@ -21,7 +21,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { } } - let reader = open_reader(args.input_file.as_deref()); + let reader = open_reader(args.input_file.as_deref())?; let writer = match args.input_file.as_ref() { Some(file) if !args.print => { let path = decode_setup( @@ -30,7 +30,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { false, args.overwrite, )?; - open_derived_writer(path) + open_derived_writer(path)? } _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, }; diff --git a/ben/src/cli/ben/modes/encode.rs b/ben/src/cli/ben/modes/encode.rs index 4687f43..b6b48df 100644 --- a/ben/src/cli/ben/modes/encode.rs +++ b/ben/src/cli/ben/modes/encode.rs @@ -32,7 +32,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { return Ok(()); } - let reader = open_reader(args.input_file.as_deref()); + let reader = open_reader(args.input_file.as_deref())?; let writer = match args.input_file.as_ref() { Some(in_file) if !args.print => { let path = encode_setup( @@ -42,7 +42,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { args.overwrite, false, )?; - open_derived_writer(path) + open_derived_writer(path)? } _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, }; diff --git a/ben/src/cli/ben/modes/xdecode.rs b/ben/src/cli/ben/modes/xdecode.rs index 2a3dabe..88336cf 100644 --- a/ben/src/cli/ben/modes/xdecode.rs +++ b/ben/src/cli/ben/modes/xdecode.rs @@ -10,11 +10,11 @@ use crate::codec::decode::decode_xben_to_jsonl; pub(in crate::cli::ben) fn run(args: Args) -> CliResult { tracing::trace!("Running in x-decode mode"); - let reader = open_reader(args.input_file.as_deref()); + let reader = open_reader(args.input_file.as_deref())?; let writer = match args.input_file.as_ref() { Some(file) if !args.print => { let path = decode_setup(file.clone(), args.output_file.clone(), true, args.overwrite)?; - open_derived_writer(path) + open_derived_writer(path)? } _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, }; diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs index ab40173..9113647 100644 --- a/ben/src/cli/ben/modes/xencode.rs +++ b/ben/src/cli/ben/modes/xencode.rs @@ -56,7 +56,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { return Ok(()); } - let reader = open_reader(args.input_file.as_deref()); + let reader = open_reader(args.input_file.as_deref())?; let writer = match args.input_file.as_ref() { Some(in_file) if !args.print => { let path = encode_setup( @@ -66,7 +66,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { args.overwrite, false, )?; - open_derived_writer(path) + open_derived_writer(path)? } _ => open_writer(args.output_file.as_deref(), args.print, args.overwrite)?, }; diff --git a/ben/src/cli/ben/paths.rs b/ben/src/cli/ben/paths.rs index 58249c2..5b404ce 100644 --- a/ben/src/cli/ben/paths.rs +++ b/ben/src/cli/ben/paths.rs @@ -113,11 +113,16 @@ pub(super) fn decode_setup( /// /// # Returns /// -/// Returns a buffered reader for the requested file or stdin. -pub(super) fn open_reader(input_file: Option<&str>) -> DynReader { +/// Returns a buffered reader for the requested file or stdin, or the open error (e.g. a missing +/// input file) for the caller to surface as a normal CLI failure. +pub(super) fn open_reader(input_file: Option<&str>) -> Result { match input_file { - Some(path) => Box::new(BufReader::new(File::open(path).unwrap())), - None => Box::new(BufReader::new(io::stdin())), + Some(path) => { + let file = File::open(path) + .map_err(|e| io::Error::new(e.kind(), format!("cannot open {path}: {e}")))?; + Ok(Box::new(BufReader::new(file))) + } + None => Ok(Box::new(BufReader::new(io::stdin()))), } } @@ -144,7 +149,9 @@ pub(super) fn open_writer( match output_file { Some(path) => { check_overwrite(path, overwrite)?; - Ok(Box::new(BufWriter::new(File::create(path).unwrap()))) + let file = File::create(path) + .map_err(|e| io::Error::new(e.kind(), format!("cannot create {path}: {e}")))?; + Ok(Box::new(BufWriter::new(file))) } None => Ok(Box::new(BufWriter::new(io::stdout()))), } @@ -158,9 +165,12 @@ pub(super) fn open_writer( /// /// # Returns /// -/// Returns a buffered writer for `path`. -pub(super) fn open_derived_writer(path: String) -> DynWriter { - Box::new(BufWriter::new(File::create(path).unwrap())) +/// Returns a buffered writer for `path`, or the create error for the caller to surface as a +/// normal CLI failure. +pub(super) fn open_derived_writer(path: String) -> Result { + let file = File::create(&path) + .map_err(|e| io::Error::new(e.kind(), format!("cannot create {path}: {e}")))?; + Ok(Box::new(BufWriter::new(file))) } /// Count the number of non-empty lines in a JSONL file. Used to populate the bundle header's diff --git a/ben/src/cli/ben/tests.rs b/ben/src/cli/ben/tests.rs index 03bfe8e..bf9721e 100644 --- a/ben/src/cli/ben/tests.rs +++ b/ben/src/cli/ben/tests.rs @@ -240,7 +240,7 @@ fn open_reader_reads_file_contents() { let path = unique_path("reader.txt"); fs::write(&path, "hello\nworld\n").unwrap(); - let mut reader = open_reader(Some(path.to_str().unwrap())); + let mut reader = open_reader(Some(path.to_str().unwrap())).unwrap(); let mut content = String::new(); std::io::Read::read_to_string(&mut reader, &mut content).unwrap(); @@ -250,7 +250,17 @@ fn open_reader_reads_file_contents() { #[test] fn open_reader_accepts_stdin() { - let _reader = open_reader(None); + let _reader = open_reader(None).unwrap(); +} + +#[test] +fn open_reader_missing_file_errors_instead_of_panicking() { + let err = match open_reader(Some("/nonexistent/definitely-missing.jsonl")) { + Ok(_) => panic!("expected open_reader to fail for a missing file"), + Err(e) => e, + }; + assert_eq!(err.kind(), std::io::ErrorKind::NotFound); + assert!(err.to_string().contains("definitely-missing")); } #[test] @@ -278,7 +288,7 @@ fn open_writer_supports_stdout_and_print() { fn open_derived_writer_creates_file() { let path = unique_path("derived.txt"); { - let mut writer = open_derived_writer(path.to_string_lossy().into_owned()); + let mut writer = open_derived_writer(path.to_string_lossy().into_owned()).unwrap(); writer.write_all(b"derived").unwrap(); } From 4123e9a84883dd95ebc63d32ab8c70b359f69f4c Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:32:44 -0600 Subject: [PATCH 149/221] fix some edge case issues in the pcompress translation --- ben/src/cli/pcben/tests.rs | 43 +++++++++++++++++++++++++ ben/src/cli/pcben/translate.rs | 57 +++++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 15 deletions(-) diff --git a/ben/src/cli/pcben/tests.rs b/ben/src/cli/pcben/tests.rs index 7edc867..4cbb298 100644 --- a/ben/src/cli/pcben/tests.rs +++ b/ben/src/cli/pcben/tests.rs @@ -78,6 +78,49 @@ fn assignment_encode_ben_offsets_values_and_writes_ben() { assert!(rendered.contains(r#""assignment":[2,2,3]"#)); } +#[test] +fn assignment_decode_ben_rejects_district_id_zero() { + // BEN ids are one-based in the PCOMPRESS convention; id 0 has no zero-based counterpart. + // Saturating it onto 0 would silently alias districts 0 and 1. + let jsonl = br#"{"assignment":[0,1,1],"sample":1} +"#; + let mut ben = Vec::new(); + encode_jsonl_to_ben(BufReader::new(&jsonl[..]), &mut ben, BenVariant::Standard).unwrap(); + + let mut out = Vec::new(); + let err = assignment_decode_ben(Cursor::new(ben), &mut out).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("district id 0")); +} + +#[test] +fn assignment_encode_ben_rejects_district_id_65535() { + // Zero-based id 65535 has no one-based u16 counterpart; wrapping would silently map it to 0. + let input = b"[0,65535]\n"; + let mut ben = Vec::new(); + let err = assignment_encode_ben(BufReader::new(&input[..]), &mut ben).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("65535")); +} + +#[test] +fn assignment_encode_ben_rejects_malformed_line_without_panicking() { + let input = b"[0,1]\nnot json at all\n"; + let mut ben = Vec::new(); + let err = assignment_encode_ben(BufReader::new(&input[..]), &mut ben).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("malformed")); +} + +#[test] +fn assignment_encode_xben_rejects_malformed_line_without_panicking() { + let input = b"[0,1,oops\n"; + let mut xben = Vec::new(); + let err = assignment_encode_xben(BufReader::new(&input[..]), &mut xben).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("malformed")); +} + #[test] fn resolved_output_path_returns_none_when_both_paths_absent() { // When neither output_file nor input_file is given, stdout mode: Ok(None). diff --git a/ben/src/cli/pcben/translate.rs b/ben/src/cli/pcben/translate.rs index ef09537..aa128e8 100644 --- a/ben/src/cli/pcben/translate.rs +++ b/ben/src/cli/pcben/translate.rs @@ -21,7 +21,7 @@ pub(super) fn assignment_decode_ben( for result in ben_reader { match result { Ok((assignment, count)) => { - render_zero_based_assignment_line(&assignment, &mut line); + render_zero_based_assignment_line(&assignment, &mut line)?; for _ in 0..count { writeln!(writer, "{line}")?; } @@ -34,16 +34,52 @@ pub(super) fn assignment_decode_ben( } /// Render a BEN assignment vector as a zero-based JSON array for PCOMPRESS. -fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) { +/// +/// BEN district ids are one-based in the PCOMPRESS convention; id `0` has no zero-based +/// counterpart, so it is rejected rather than silently aliased onto id `1`. +fn render_zero_based_assignment_line(assignment: &[u16], output: &mut String) -> io::Result<()> { output.clear(); output.push('['); - for (idx, value) in assignment.iter().enumerate() { + for (idx, &value) in assignment.iter().enumerate() { + let Some(zero_based) = value.checked_sub(1) else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "district id 0 cannot be converted to PCOMPRESS's zero-based ids; \ + relabel the BEN stream to one-based ids first", + )); + }; if idx > 0 { output.push(','); } - output.push_str(&value.saturating_sub(1).to_string()); + output.push_str(&zero_based.to_string()); } output.push(']'); + Ok(()) +} + +/// Parse one PCOMPRESS line of zero-based district ids into a one-based BEN assignment. +/// +/// Malformed JSON and the unconvertible id `65535` (whose one-based form overflows `u16`) are +/// surfaced as `InvalidData` errors rather than panics or silent wraparound. +fn parse_one_based_assignment(line: io::Result) -> io::Result> { + let line = line?; + let zero_based: Vec = serde_json::from_str(&line).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("malformed PCOMPRESS assignment line: {e}"), + ) + })?; + zero_based + .into_iter() + .map(|x| { + x.checked_add(1).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "district id 65535 cannot be converted to BEN's one-based ids", + ) + }) + }) + .collect() } /// Read zero-based assignment vectors and encode them as BEN. @@ -54,12 +90,7 @@ pub(super) fn assignment_encode_ben( let mut ben_writer = BenStreamWriter::for_ben(writer, BenVariant::MkvChain)?; for line in reader.lines() { - let assignment: Vec = serde_json::from_str::>(&line.unwrap()) - .unwrap() - .into_iter() - .map(|x| x + 1) - .collect(); - ben_writer.write_assignment(assignment)?; + ben_writer.write_assignment(parse_one_based_assignment(line)?)?; } ben_writer.finish()?; Ok(()) @@ -75,11 +106,7 @@ pub(super) fn assignment_encode_xben( BenStreamWriter::for_xben_with_encoder(encoder, BenVariant::MkvChain, None)?; for line in reader.lines() { - let assignment: Vec = serde_json::from_str::>(&line.unwrap()) - .unwrap() - .into_iter() - .map(|x| x + 1) - .collect(); + let assignment = parse_one_based_assignment(line)?; xben_writer.write_json_value(json!({ "assignment": assignment }))?; } xben_writer.finish()?; From 7f7e9988395faf0524e8987029d19881f6c7a85f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:35:15 -0600 Subject: [PATCH 150/221] handle duplicate index in remap and failed final flush edge cases --- ben/src/io/writer/stream_writer/mod.rs | 10 ++- ben/src/ops/relabel/errors.rs | 6 ++ ben/src/ops/relabel/mod.rs | 12 +++- ben/src/ops/relabel/permutation.rs | 84 +++++++++++++++++++++----- ben/src/ops/relabel/tests.rs | 31 ++++++++-- 5 files changed, 119 insertions(+), 24 deletions(-) diff --git a/ben/src/io/writer/stream_writer/mod.rs b/ben/src/io/writer/stream_writer/mod.rs index c1cea66..9cf34ca 100644 --- a/ben/src/io/writer/stream_writer/mod.rs +++ b/ben/src/io/writer/stream_writer/mod.rs @@ -321,7 +321,15 @@ impl Drop for BenStreamWriter { fn drop(&mut self) { if self.inner.is_some() && matches!(self.state, WriterState::Open | WriterState::BodyClosed) { - let _ = self.finish(); + // Best-effort safety net only: Drop cannot propagate errors, so a failed final flush + // here means the output is incomplete. Callers that care must call `finish()` + // explicitly; the warn makes a forgotten finish diagnosable instead of silent. + if let Err(e) = self.finish() { + tracing::warn!( + "BenStreamWriter dropped without an explicit finish and the final flush \ + failed; output is incomplete: {e}" + ); + } } } } diff --git a/ben/src/ops/relabel/errors.rs b/ben/src/ops/relabel/errors.rs index 081e5a3..f6eaa0c 100644 --- a/ben/src/ops/relabel/errors.rs +++ b/ben/src/ops/relabel/errors.rs @@ -27,6 +27,12 @@ pub enum RelabelError { assignment_len: usize, }, + #[error( + "node permutation map references old index {old_idx} more than once; \ + a permutation must use each old index exactly once" + )] + DuplicateOldIndex { old_idx: usize }, + #[error("IO error: {0}")] Io(#[from] io::Error), } diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 36376a1..4f55802 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -183,7 +183,7 @@ pub fn relabel_ben_file( options.run_policy, |a| match &options.transform { RelabelTransform::Identity => Ok(a.to_vec()), - RelabelTransform::FirstSeen => Ok(first_seen_relabel_assignment(a)), + RelabelTransform::FirstSeen => first_seen_relabel_assignment(a), RelabelTransform::NodePermutation(_) => { permute_assignment(a, permutation.as_ref().expect("set above")) } @@ -312,10 +312,18 @@ fn relabel_first_seen_via_byte_walk( let n_bytes = reader.read_u32::()?; let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?; - first_seen_relabel_rle(&mut ben_line); + first_seen_relabel_rle(&mut ben_line)?; let count_occurrences = if input_variant == BenVariant::MkvChain { let count = reader.read_u16::()?; + // A zero count is a corrupt frame; re-emitting it would write output every reader + // rejects, so fail at the source instead. + if count == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "MkvChain frame count must be greater than zero", + )); + } max_samples .map(|limit| ((limit - sample_number).min(count as usize)) as u16) .unwrap_or(count) diff --git a/ben/src/ops/relabel/permutation.rs b/ben/src/ops/relabel/permutation.rs index dde6bf8..954370a 100644 --- a/ben/src/ops/relabel/permutation.rs +++ b/ben/src/ops/relabel/permutation.rs @@ -3,6 +3,10 @@ use std::collections::HashMap; use std::io; /// Convert a sparse permutation map into a dense index vector. +/// +/// Rejects maps that are not permutations: the new-index side must be contiguous from 0, and the +/// old-index side must use each index at most once. A duplicated old index would silently copy one +/// node's value into several positions while dropping another node entirely. pub(super) fn dense_permutation( new_to_old_node_map: &HashMap, ) -> io::Result> { @@ -23,11 +27,34 @@ pub(super) fn dense_permutation( })); } + // Old-side injectivity. Out-of-range old indices are caught later against the actual + // assignment length (`permute_assignment`), so only duplicates are checked here; duplicates + // would otherwise pass every later check and silently scramble node data. + let mut seen = std::collections::HashSet::with_capacity(permutation.len()); + for &old_idx in &permutation { + if !seen.insert(old_idx) { + return Err(io::Error::from(RelabelError::DuplicateOldIndex { old_idx })); + } + } + Ok(permutation) } +/// Error for an input whose distinct district ids cannot all receive a one-based `u16` label: +/// labels start at 1, so at most `u16::MAX` (65,535) distinct ids are representable. +fn too_many_labels_error() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + "assignment has more than 65535 distinct district ids; \ + one-based u16 labels cannot represent them all", + ) +} + /// Remap an assignment vector's district labels in first-seen order, starting at 1. -pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { +/// +/// Errors if the assignment holds more than `u16::MAX` distinct ids, which one-based `u16` labels +/// cannot represent — wrapping would silently alias two districts. +pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> io::Result> { let mut label_map = HashMap::new(); let mut next_label = 0u16; let mut out = Vec::with_capacity(assignment.len()); @@ -36,7 +63,7 @@ pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { let mapped = match label_map.get(&value) { Some(mapped) => *mapped, None => { - next_label += 1; + next_label = next_label.checked_add(1).ok_or_else(too_many_labels_error)?; label_map.insert(value, next_label); next_label } @@ -44,11 +71,15 @@ pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> Vec { out.push(mapped); } - out + Ok(out) } /// Rewrite the value of each `(val, len)` RLE pair in first-seen order, in place. -pub(super) fn first_seen_relabel_rle(runs: &mut [(u16, u16)]) { +/// +/// Errors if the runs hold more than `u16::MAX` distinct ids; see +/// [`first_seen_relabel_assignment`]. On error, a prefix of `runs` may already be relabeled — +/// callers treat the whole operation as failed and discard. +pub(super) fn first_seen_relabel_rle(runs: &mut [(u16, u16)]) -> io::Result<()> { let mut label_map = HashMap::new(); let mut label = 0u16; label_map.reserve(runs.len()); @@ -56,13 +87,14 @@ pub(super) fn first_seen_relabel_rle(runs: &mut [(u16, u16)]) { let new_val = match label_map.get(val) { Some(v) => *v, None => { - label += 1; + label = label.checked_add(1).ok_or_else(too_many_labels_error)?; label_map.insert(*val, label); label } }; *val = new_val; } + Ok(()) } /// Reorder an assignment vector according to a dense permutation. @@ -126,28 +158,34 @@ mod tests { } #[test] - fn dense_permutation_duplicate_old_indices_allowed_pinning_current_behavior() { - // {0 -> 5, 1 -> 5}: produces a non-bijective dense Vec without error. + fn dense_permutation_duplicate_old_indices_rejected() { + // {0 -> 5, 1 -> 5}: a duplicated old index is not a permutation — applying it would copy + // one node's value into two positions and silently drop another node entirely. let map: HashMap = [(0, 5), (1, 5)].into_iter().collect(); - assert_eq!(dense_permutation(&map).unwrap(), vec![5, 5]); + let err = dense_permutation(&map).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("more than once")); } // ── first_seen_relabel_assignment ────────────────────────────────── #[test] fn first_seen_relabel_assignment_empty() { - assert!(first_seen_relabel_assignment(&[]).is_empty()); + assert!(first_seen_relabel_assignment(&[]).unwrap().is_empty()); } #[test] fn first_seen_relabel_assignment_all_same() { - assert_eq!(first_seen_relabel_assignment(&[7, 7, 7]), vec![1, 1, 1]); + assert_eq!( + first_seen_relabel_assignment(&[7, 7, 7]).unwrap(), + vec![1, 1, 1] + ); } #[test] fn first_seen_relabel_assignment_monotonic() { assert_eq!( - first_seen_relabel_assignment(&[2, 3, 4, 5]), + first_seen_relabel_assignment(&[2, 3, 4, 5]).unwrap(), vec![1, 2, 3, 4] ); } @@ -155,7 +193,7 @@ mod tests { #[test] fn first_seen_relabel_assignment_reversed() { assert_eq!( - first_seen_relabel_assignment(&[5, 4, 3, 2]), + first_seen_relabel_assignment(&[5, 4, 3, 2]).unwrap(), vec![1, 2, 3, 4] ); } @@ -163,17 +201,31 @@ mod tests { #[test] fn first_seen_relabel_assignment_with_gaps() { assert_eq!( - first_seen_relabel_assignment(&[1, 5, 9, 5, 1, 9]), + first_seen_relabel_assignment(&[1, 5, 9, 5, 1, 9]).unwrap(), vec![1, 2, 3, 2, 1, 3] ); } + #[test] + fn first_seen_relabel_rejects_more_distinct_ids_than_labels() { + // All 65,536 distinct u16 ids: one-based labels max out at 65,535, so the 65,536th + // distinct id has no label. Wrapping would silently alias two districts. + let assignment: Vec = (0..=u16::MAX).collect(); + let err = first_seen_relabel_assignment(&assignment).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("65535")); + + let mut runs: Vec<(u16, u16)> = (0..=u16::MAX).map(|v| (v, 1)).collect(); + let err = first_seen_relabel_rle(&mut runs).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + } + // ── first_seen_relabel_rle ───────────────────────────────────────── #[test] fn first_seen_relabel_rle_basic() { let mut runs = vec![(2u16, 3u16), (3, 1), (2, 2), (5, 1)]; - first_seen_relabel_rle(&mut runs); + first_seen_relabel_rle(&mut runs).unwrap(); assert_eq!(runs, vec![(1, 3), (2, 1), (1, 2), (3, 1)]); } @@ -191,10 +243,10 @@ mod tests { vec![3, 3, 1, 1, 2, 2, 3, 3, 4], ]; for input in inputs { - let from_assignment = first_seen_relabel_assignment(&input); + let from_assignment = first_seen_relabel_assignment(&input).unwrap(); let mut runs = assign_to_rle(input.clone()); - first_seen_relabel_rle(&mut runs); + first_seen_relabel_rle(&mut runs).unwrap(); let from_rle = rle_to_vec(runs); assert_eq!( diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 075e164..0f86457 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -28,16 +28,20 @@ impl Read for ErrorAfterOneByte { fn shuffle_with_mapping(vec: &mut [T]) -> HashMap where - T: Clone + std::cmp::PartialEq, + T: Clone, { + // Shuffle *indices* and apply that permutation, so the returned `new -> old` map is a true + // bijection. (Matching shuffled values back to `position(first equal value)` would collapse + // duplicate values onto one old index and produce a non-permutation map.) let mut rng = ChaCha8Rng::seed_from_u64(42); let original_vec = vec.to_vec(); - vec.shuffle(&mut rng); + let mut old_indices: Vec = (0..vec.len()).collect(); + old_indices.shuffle(&mut rng); let mut map = HashMap::new(); - for (new_index, item) in vec.iter().enumerate() { - let original_index = original_vec.iter().position(|i| i == item).unwrap(); - map.insert(new_index, original_index); + for (new_index, &old_index) in old_indices.iter().enumerate() { + vec[new_index] = original_vec[old_index].clone(); + map.insert(new_index, old_index); } map } @@ -282,6 +286,23 @@ fn test_relabel_ben_line_with_map() { assert_eq!(&buf[BANNER_LEN..], expected.as_slice()); } +#[test] +fn first_seen_fast_path_rejects_zero_count_frame() { + // A MkvChain frame with count == 0 is corrupt; the byte-walking fast path must error rather + // than re-emit a frame every downstream reader rejects. + let frame = BenEncodeFrame::from_assignment(vec![1u16, 2, 2], BenVariant::MkvChain, Some(0)); + let with_banner_in = with_banner(BenVariant::MkvChain, frame.as_slice()); + + let err = relabel_ben_file( + with_banner_in.as_slice(), + Vec::new(), + RelabelOptions::first_seen(), + ) + .unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + assert!(err.to_string().contains("count")); +} + #[test] fn test_relabel_ben_line_with_shuffle() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; From cb41eec3feaba08f5d5155a0ae98007469070e94 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Wed, 10 Jun 2026 12:59:37 -0600 Subject: [PATCH 151/221] better coverage harnass --- Taskfile.yml | 13 +- ben/src/codec/decode/ben.rs | 28 ++- ben/src/codec/frames/tests.rs | 20 +- ben/src/ops/relabel/permutation.rs | 4 +- ben/tests/test_fixture_mutations.rs | 368 ++++++++++++++++++++++++++++ docs/format-stability.md | 5 + 6 files changed, 420 insertions(+), 18 deletions(-) create mode 100644 ben/tests/test_fixture_mutations.rs diff --git a/Taskfile.yml b/Taskfile.yml index 6fcf871..755a8eb 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -3,11 +3,11 @@ version: "3" vars: - CARGO_BIN: '{{.HOME}}/.cargo/bin' - LOCAL_BIN: '{{.HOME}}/.local/bin' - LLVM_BIN: '{{.HOME}}/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/bin' - COV_TARGET_DIR: '{{.ROOT_DIR}}/target/llvm-cov-target' - BEN_PY_TEST_PATHS: 'tests/' + CARGO_BIN: "{{.HOME}}/.cargo/bin" + LOCAL_BIN: "{{.HOME}}/.local/bin" + LLVM_BIN: "{{.HOME}}/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/bin" + COV_TARGET_DIR: "{{.ROOT_DIR}}/target/llvm-cov-target" + BEN_PY_TEST_PATHS: "tests/" tasks: default: @@ -19,7 +19,7 @@ tasks: desc: List available tasks silent: true cmds: - - 'command -v go-task >/dev/null 2>&1 && go-task --list-all || task --list-all' + - "command -v go-task >/dev/null 2>&1 && go-task --list-all || task --list-all" ensure-rust-linux: &ensure-rust-unix desc: Install Rust if it is not already available @@ -151,6 +151,7 @@ tasks: cmds: - task: test-rust - task: test-python + - task: docs-test format-rust: desc: Format Rust code diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index ca00c73..9f4cd9f 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -7,6 +7,13 @@ use std::io::{self, Read}; /// millions of run pairs. pub(crate) const MAX_FRAME_PAYLOAD_BYTES: u32 = 1 << 26; +/// Upper bound on the *speculative* run-pair reservation made before decoding a frame payload. +/// The header-derived pair count is attacker-controlled: a minimum-width frame at the payload cap +/// implies ~268 million pairs (≈1 GiB) before a single payload byte has been read. Legitimate +/// frames rarely exceed a few hundred thousand runs, and `Vec` growth covers any that do, so the +/// reservation is clamped and a hostile header costs kilobytes instead of a gigabyte. +const MAX_RLE_PREALLOC_PAIRS: usize = 1 << 16; + /// Decode a single BEN frame payload into run-length encoded assignments. /// /// This function expects only the packed payload bytes for one BEN frame, not the leading per-frame @@ -70,7 +77,8 @@ pub fn decode_ben_line( let bit_width = u64::from(max_val_bits) + u64::from(max_len_bits); let total_bits = u64::from(n_bytes) * 8; let n_assignments_upper_bound = (total_bits / bit_width) as usize; - let mut output_rle: Vec<(u16, u16)> = Vec::with_capacity(n_assignments_upper_bound); + let mut output_rle: Vec<(u16, u16)> = + Vec::with_capacity(n_assignments_upper_bound.min(MAX_RLE_PREALLOC_PAIRS)); let mut buffer: u32 = 0; let mut n_bits_in_buff: u16 = 0; @@ -217,6 +225,24 @@ mod tests { assert!(err.to_string().contains("inconsistent")); } + #[test] + fn decode_ben_line_grows_past_the_clamped_preallocation() { + use crate::codec::BenEncodeFrame; + use crate::BenVariant; + // 100,000 pairs sit above MAX_RLE_PREALLOC_PAIRS, so the output vector must grow past its + // clamped initial reservation without losing or reordering pairs. + let runs = vec![(1u16, 1u16); 100_000]; + let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let decoded = decode_ben_line( + Cursor::new(frame.payload()), + frame.max_val_bit_count().unwrap(), + frame.max_len_bit_count(), + frame.n_bytes(), + ) + .unwrap(); + assert_eq!(decoded, runs); + } + #[test] fn decode_ben_line_rejects_oversized_expansion() { use crate::codec::BenEncodeFrame; diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index 5d90442..b62ea68 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -186,8 +186,8 @@ fn twodelta_unpack_rejects_interior_zero_run_length() { fn ben_decode_twodelta_interior_zero_errors_through_frame_reader() { // Delta frame body: pair (1,2), width 4, n_bytes 1, payload 0x0F (interior zero), count 1. let data: Vec = vec![0, 1, 0, 2, 4, 0, 0, 0, 1, 0x0F, 0, 1]; - let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) - .unwrap_err(); + let err = + BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("interior zero")); } @@ -197,8 +197,8 @@ fn ben_decode_twodelta_inconsistent_n_bytes_errors() { // Delta frame claiming n_bytes=2 whose payload decodes to a single width-4 run length: the // encoder would have written n_bytes = ceil(1 * 4 / 8) = 1, so 2 is a corrupt-frame signal. let data: Vec = vec![0, 1, 0, 2, 4, 0, 0, 0, 2, 0xF0, 0x00, 0, 1]; - let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) - .unwrap_err(); + let err = + BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("inconsistent")); } @@ -213,24 +213,24 @@ fn ben_decode_oversized_n_bytes_rejected_before_allocating() { // Standard: [mvb, mlb, n_bytes]. let mut data = vec![2u8, 3]; data.extend_from_slice(&oversized); - let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::Standard) - .unwrap_err(); + let err = + BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::Standard).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("refusing to allocate")); // MkvChain: same header shape. let mut data = vec![2u8, 3]; data.extend_from_slice(&oversized); - let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::MkvChain) - .unwrap_err(); + let err = + BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::MkvChain).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("refusing to allocate")); // TwoDelta: [pair_a, pair_b, max_len_bits, n_bytes]. let mut data = vec![0u8, 1, 0, 2, 4]; data.extend_from_slice(&oversized); - let err = BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta) - .unwrap_err(); + let err = + BenDecodeFrame::from_reader(&mut io::Cursor::new(data), BenVariant::TwoDelta).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("refusing to allocate")); } diff --git a/ben/src/ops/relabel/permutation.rs b/ben/src/ops/relabel/permutation.rs index 954370a..da21c9b 100644 --- a/ben/src/ops/relabel/permutation.rs +++ b/ben/src/ops/relabel/permutation.rs @@ -63,7 +63,9 @@ pub(super) fn first_seen_relabel_assignment(assignment: &[u16]) -> io::Result *mapped, None => { - next_label = next_label.checked_add(1).ok_or_else(too_many_labels_error)?; + next_label = next_label + .checked_add(1) + .ok_or_else(too_many_labels_error)?; label_map.insert(value, next_label); next_label } diff --git a/ben/tests/test_fixture_mutations.rs b/ben/tests/test_fixture_mutations.rs new file mode 100644 index 0000000..f9620cb --- /dev/null +++ b/ben/tests/test_fixture_mutations.rs @@ -0,0 +1,368 @@ +//! Exhaustive single-byte mutation fuzzing of the committed wire-format fixtures. +//! +//! For every byte position of every committed `tests/fixtures/v1.0.0/` binary fixture, three +//! mutants are produced (bit-flip, increment, zero) and each mutant is driven through **every +//! public read entry point** for its wire format. The contract under test is *panic freedom*: +//! +//! - A mutant may fail to parse — any `io::Result` error is acceptable. +//! - A mutant may even decode successfully (plain BEN and XBEN carry no integrity bytes, so a +//! payload mutation can produce a different-but-structurally-valid stream). That is acceptable +//! here too; whole-stream integrity is the `.bendl` layer's job and is covered by its own +//! checksum tests. +//! - What a mutant must never do, at any byte position, is panic, abort, or hang an entry point. +//! +//! When a new public read API is added, register it in the matching `drive_*` function below — +//! that one registration extends the exhaustive corruption coverage to the new surface. + +use std::io::{self, Cursor, Read}; +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::path::Path; + +use binary_ensemble::codec::decode::{ + decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, +}; +use binary_ensemble::codec::encode::xz_compress; +use binary_ensemble::io::bundle::reader::BendlReader; +use binary_ensemble::io::bundle::writer::BendlAppender; +use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; +use binary_ensemble::ops::extract::{extract_assignment_ben, extract_assignment_xben}; +use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; +use binary_ensemble::BenVariant; + +/// Upper bound on records pulled from any iterator-style entry point. A corrupt stream may yield +/// an error from `next()` without ending the iterator, so iteration is bounded rather than driven +/// to `None`; the fixtures hold five samples, so the bound is far above any legitimate yield +/// count. +const MAX_PULLS: usize = 1_000; + +fn fixture(name: &str) -> Vec { + let path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("v1.0.0") + .join(name); + std::fs::read(&path).unwrap_or_else(|e| panic!("missing fixture {path:?}: {e}")) +} + +/// The three mutation patterns applied at each byte position, deduplicated against the original +/// byte so every driven mutant really differs from the committed fixture. +fn mutations_of(original: u8) -> Vec { + let mut out = vec![original ^ 0xFF, original.wrapping_add(1), 0x00]; + out.dedup(); + out.retain(|&b| b != original); + out +} + +/// Run one labeled entry point over one mutant under `catch_unwind`, converting any panic into a +/// test failure that names the fixture, byte position, mutated value, and entry point. +fn assert_no_panic(fixture_name: &str, pos: usize, byte: u8, label: &str, f: impl FnOnce()) { + let outcome = catch_unwind(AssertUnwindSafe(f)); + assert!( + outcome.is_ok(), + "{label} panicked on {fixture_name} with byte {pos} set to {byte:#04x}" + ); +} + +/// Drive every public plain-BEN read entry point over `bytes`. +fn drive_ben_entry_points(fixture_name: &str, pos: usize, byte: u8, bytes: &[u8]) { + let run = |label: &str, f: &dyn Fn()| assert_no_panic(fixture_name, pos, byte, label, f); + + run("decode_ben_to_jsonl", &|| { + let _ = decode_ben_to_jsonl(bytes, io::sink()); + }); + run("BenStreamReader::from_ben iterate", &|| { + if let Ok(reader) = BenStreamReader::from_ben(bytes) { + for record in reader.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + }); + run("BenStreamReader::from_ben count_samples", &|| { + if let Ok(reader) = BenStreamReader::from_ben(bytes) { + let _ = reader.silent(true).count_samples(); + } + }); + run("BenStreamFrameReader::from_ben iterate", &|| { + if let Ok(frames) = BenStreamFrameReader::from_ben(bytes) { + for frame in frames.take(MAX_PULLS) { + let _ = frame; + } + } + }); + run("into_subsample_by_indices", &|| { + if let Ok(reader) = BenStreamReader::from_ben(bytes) { + for record in reader + .silent(true) + .into_subsample_by_indices(vec![1, 3]) + .take(MAX_PULLS) + { + let _ = record; + } + } + }); + run("into_subsample_by_range", &|| { + if let Ok(reader) = BenStreamReader::from_ben(bytes) { + for record in reader + .silent(true) + .into_subsample_by_range(1, 2) + .take(MAX_PULLS) + { + let _ = record; + } + } + }); + run("into_subsample_every", &|| { + if let Ok(reader) = BenStreamReader::from_ben(bytes) { + for record in reader.silent(true).into_subsample_every(2, 1).take(MAX_PULLS) { + let _ = record; + } + } + }); + run("relabel_ben_file first_seen", &|| { + let _ = relabel_ben_file(bytes, io::sink(), RelabelOptions::first_seen()); + }); + run("relabel_ben_file convert_to MkvChain", &|| { + let _ = relabel_ben_file( + bytes, + io::sink(), + RelabelOptions::convert_to(BenVariant::MkvChain), + ); + }); + run("relabel_ben_file convert_to TwoDelta", &|| { + let _ = relabel_ben_file( + bytes, + io::sink(), + RelabelOptions::convert_to(BenVariant::TwoDelta), + ); + }); + run("relabel_ben_file node_permutation", &|| { + let map = (0..4usize).map(|i| (i, 3 - i)).collect(); + let _ = relabel_ben_file(bytes, io::sink(), RelabelOptions::node_permutation(map)); + }); + run("extract_assignment_ben", &|| { + let _ = extract_assignment_ben(bytes, 1); + let _ = extract_assignment_ben(bytes, 3); + }); +} + +/// Drive every public XBEN read entry point over `bytes`. +fn drive_xben_entry_points(fixture_name: &str, pos: usize, byte: u8, bytes: &[u8]) { + let run = |label: &str, f: &dyn Fn()| assert_no_panic(fixture_name, pos, byte, label, f); + + run("decode_xben_to_jsonl", &|| { + let _ = decode_xben_to_jsonl(io::BufReader::new(bytes), io::sink()); + }); + run("decode_xben_to_ben", &|| { + let _ = decode_xben_to_ben(io::BufReader::new(bytes), io::sink()); + }); + run("xz_decompress", &|| { + let _ = xz_decompress(io::BufReader::new(bytes), io::sink()); + }); + run("BenStreamReader::from_xben iterate", &|| { + if let Ok(reader) = BenStreamReader::from_xben(bytes) { + for record in reader.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + }); + run("BenStreamReader::from_xben count_samples", &|| { + if let Ok(reader) = BenStreamReader::from_xben(bytes) { + let _ = reader.silent(true).count_samples(); + } + }); + run("BenStreamFrameReader::from_xben iterate", &|| { + if let Ok(frames) = BenStreamFrameReader::from_xben(bytes) { + for frame in frames.take(MAX_PULLS) { + let _ = frame; + } + } + }); + run("from_xben into_subsample_by_range", &|| { + if let Ok(reader) = BenStreamReader::from_xben(bytes) { + for record in reader + .silent(true) + .into_subsample_by_range(1, 2) + .take(MAX_PULLS) + { + let _ = record; + } + } + }); + run("extract_assignment_xben", &|| { + let _ = extract_assignment_xben(bytes, 1); + }); +} + +/// Drive every public `.bendl` read entry point over `bytes`. +/// +/// Mutants split into two classes: open-rejected (constructor errors — nothing else reachable) +/// and openable (every accessor must then hold the no-panic contract). +fn drive_bendl_entry_points(fixture_name: &str, pos: usize, byte: u8, bytes: &[u8]) { + let run = |label: &str, f: &dyn Fn()| assert_no_panic(fixture_name, pos, byte, label, f); + + run("BendlReader full surface", &|| { + let Ok(mut reader) = BendlReader::open(Cursor::new(bytes.to_vec())) else { + return; // Open-rejected mutant: the constructor is the whole reachable surface. + }; + + let _ = reader.is_finalized(); + let _ = reader.sample_count(); + let _ = reader.assignment_format(); + let _ = reader.validate_directory(); + + for entry in reader.assets().to_vec() { + let _ = reader.asset_bytes(&entry); + let _ = reader.asset_bytes_unverified(&entry); + if let Ok(mut payload) = reader.asset_payload_reader_unverified(&entry) { + let _ = payload.read_to_end(&mut Vec::new()); + } + let _ = reader.verify_asset_checksum(&entry); + } + let _ = reader.verify_all_asset_checksums(); + let _ = reader.verify_stream_checksum(); + + if let Ok(mut stream) = reader.assignment_stream_reader() { + let _ = stream.read_to_end(&mut Vec::new()); + } + if let Ok(mut stream) = reader.assignment_stream_reader_unverified() { + let _ = stream.read_to_end(&mut Vec::new()); + } + if let Ok(verified) = reader.open_assignment_reader() { + for record in verified.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + if let Ok(verified) = reader.open_assignment_reader() { + let _ = verified.count_samples(); + } + if let Ok(unverified) = reader.open_assignment_reader_unverified() { + for record in unverified.silent(true).take(MAX_PULLS) { + let _ = record; + } + }; + }); + run("BendlAppender::open", &|| { + let _ = BendlAppender::open(Cursor::new(bytes.to_vec())); + }); +} + +/// Apply every single-byte mutation to `original` and hand each mutant to `drive`. +fn mutate_and_drive(fixture_name: &str, original: &[u8], drive: impl Fn(&str, usize, u8, &[u8])) { + let mut mutant = original.to_vec(); + for pos in 0..original.len() { + for byte in mutations_of(original[pos]) { + mutant[pos] = byte; + drive(fixture_name, pos, byte, &mutant); + } + mutant[pos] = original[pos]; + } +} + +fn ben_fixture_names() -> [&'static str; 3] { + ["standard.ben", "mkvchain.ben", "twodelta.ben"] +} + +fn xben_fixture_names() -> [&'static str; 3] { + ["standard.xben", "mkvchain.xben", "twodelta.xben"] +} + +fn bendl_fixture_names() -> [&'static str; 2] { + ["flags_set.bendl", "unknown_flags.bendl"] +} + +#[test] +fn mutated_ben_fixtures_never_panic_any_entry_point() { + for name in ben_fixture_names() { + let original = fixture(name); + mutate_and_drive(name, &original, drive_ben_entry_points); + } +} + +#[test] +fn mutated_xben_fixtures_never_panic_any_entry_point() { + for name in xben_fixture_names() { + let original = fixture(name); + mutate_and_drive(name, &original, drive_xben_entry_points); + } +} + +#[test] +fn mutated_bendl_fixtures_never_panic_any_entry_point() { + for name in bendl_fixture_names() { + let original = fixture(name); + mutate_and_drive(name, &original, drive_bendl_entry_points); + } +} + +/// Wrap a (possibly corrupt) decompressed XBEN body in a fresh, *valid* xz container. +/// +/// Mutating the compressed fixture bytes mostly exercises the xz layer, whose own integrity +/// checks reject the mutant before the BEN32/TwoDelta parsers run. Re-compressing a mutated +/// body delivers the corruption past the xz wrapper, straight to the parsers under test. +fn recompress(body: &[u8]) -> Vec { + let mut out = Vec::new(); + xz_compress(io::BufReader::new(body), &mut out, Some(1), Some(0), None) + .expect("compressing a small in-memory body cannot fail"); + out +} + +/// Single-byte mutation of the *decompressed* XBEN bodies, re-wrapped in valid xz so the inner +/// BEN32/TwoDelta parsers (not the xz layer) face the corruption. +#[test] +fn mutated_xben_bodies_never_panic_any_entry_point() { + for name in xben_fixture_names() { + let compressed = fixture(name); + let mut body = Vec::new(); + xz_decompress(io::BufReader::new(compressed.as_slice()), &mut body) + .expect("committed fixture must decompress"); + + let mut mutant = body.clone(); + for pos in 0..body.len() { + for byte in mutations_of(body[pos]) { + mutant[pos] = byte; + drive_xben_entry_points(name, pos, byte, &recompress(&mutant)); + } + mutant[pos] = body[pos]; + } + } +} + +/// Truncation of the *decompressed* XBEN bodies, re-wrapped in valid xz: a clean container whose +/// inner stream ends mid-frame — the corruption class a damaged-but-recompressed file presents. +#[test] +fn truncated_xben_bodies_never_panic_any_entry_point() { + for name in xben_fixture_names() { + let compressed = fixture(name); + let mut body = Vec::new(); + xz_decompress(io::BufReader::new(compressed.as_slice()), &mut body) + .expect("committed fixture must decompress"); + + for end in 0..body.len() { + drive_xben_entry_points(name, end, 0, &recompress(&body[..end])); + } + } +} + +/// Truncation sweep: every prefix of every fixture, through the same entry points. Single-byte +/// mutation preserves length, so this covers the orthogonal corruption axis (short files). +#[test] +fn truncated_fixtures_never_panic_any_entry_point() { + for name in ben_fixture_names() { + let original = fixture(name); + for end in 0..original.len() { + drive_ben_entry_points(name, end, 0, &original[..end]); + } + } + for name in xben_fixture_names() { + let original = fixture(name); + for end in 0..original.len() { + drive_xben_entry_points(name, end, 0, &original[..end]); + } + } + for name in bendl_fixture_names() { + let original = fixture(name); + for end in 0..original.len() { + drive_bendl_entry_points(name, end, 0, &original[..end]); + } + } +} diff --git a/docs/format-stability.md b/docs/format-stability.md index 4138af5..62b8cf7 100644 --- a/docs/format-stability.md +++ b/docs/format-stability.md @@ -4,6 +4,11 @@ This crate ships committed binary fixtures under `ben/tests/fixtures/v/` and `ben/tests/test_format_stability.rs` that decodes each one. The fixtures are the v1.0.0 wire-format stability contract. +The fixtures have a second consumer: `ben/tests/test_fixture_mutations.rs` drives every +single-byte mutation and truncation prefix of each binary fixture through every public read entry +point, asserting panic freedom on corrupt input. Both suites must keep passing against the same +committed bytes. + ## Contract **Once a fixture directory is committed for a stable major version, every file inside it MUST From c557b113a4c7b9c5c7f9faa6d85c46d172599db6 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 00:53:15 -0600 Subject: [PATCH 152/221] get rid of the try_from_parts symantics and fix potential zero drop --- ben/src/codec/frames/decode.rs | 60 ++++++++++++------- ben/src/codec/frames/encode.rs | 77 ++++++++++++++++++++++--- ben/src/codec/frames/mod.rs | 1 + ben/src/codec/frames/tests.rs | 51 ++++++++++++++-- ben/src/io/writer/stream_writer/xben.rs | 15 +++-- ben/src/io/writer/tests.rs | 12 ++-- ben/tests/test_coverage.rs | 7 ++- ben/tests/test_fixture_mutations.rs | 15 ++++- 8 files changed, 188 insertions(+), 50 deletions(-) diff --git a/ben/src/codec/frames/decode.rs b/ben/src/codec/frames/decode.rs index 17f755d..fb72e8f 100644 --- a/ben/src/codec/frames/decode.rs +++ b/ben/src/codec/frames/decode.rs @@ -6,7 +6,7 @@ use std::io::{self, Read}; /// Reject a declared payload length above [`MAX_FRAME_PAYLOAD_BYTES`] **before** allocating the /// payload buffer, so a corrupt or adversarial frame header cannot force a multi-gigabyte /// reservation. Well-formed frames never approach the cap. -fn check_payload_len(n_bytes: u32) -> io::Result<()> { +pub(crate) fn check_payload_len(n_bytes: u32) -> io::Result<()> { if n_bytes > MAX_FRAME_PAYLOAD_BYTES { return Err(io::Error::new( io::ErrorKind::InvalidData, @@ -19,6 +19,41 @@ fn check_payload_len(n_bytes: u32) -> io::Result<()> { Ok(()) } +/// Reject a TwoDelta run-length bit width outside `1..=16`. The bit unpackers shift a 32-bit +/// register by `32 - width` and decrement a counter by `width`, so a zero or oversized width +/// is not merely corrupt — it would shift out of range or never terminate. +pub(crate) fn check_twodelta_run_width(max_len_bits: u8) -> io::Result<()> { + if max_len_bits == 0 || max_len_bits > 16 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("invalid TwoDelta run-length bit width: {max_len_bits}"), + )); + } + Ok(()) +} + +/// n_bytes consistency for a TwoDelta frame: the encoder writes `n_bytes = ceil(runs * width / 8)`. +/// Any other relationship between the payload length and the recovered run count is a +/// corrupt-frame signal, exactly mirroring the Standard/MkvChain payload check in +/// [`decode_ben_line`](crate::codec::decode::decode_ben_line). +pub(crate) fn check_twodelta_frame_consistency( + n_bytes: u32, + run_count: usize, + max_len_bits: u8, +) -> io::Result<()> { + let expected_bytes = (run_count as u64 * u64::from(max_len_bits)).div_ceil(8); + if u64::from(n_bytes) != expected_bytes { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "inconsistent TwoDelta frame size: n_bytes={n_bytes} but {run_count} run \ + length(s) at {max_len_bits} bit(s) each require {expected_bytes} byte(s)" + ), + )); + } + Ok(()) +} + /// Unpack a TwoDelta frame's bit-packed run lengths, rejecting interior zeros. /// /// The encoder never emits a zero run length, so a zero slot is legal only as a trailing @@ -191,12 +226,7 @@ impl BenDecodeFrame { let pair_b = reader.read_u16::()?; let max_len_bits = reader.read_u8()?; - if max_len_bits == 0 || max_len_bits > 16 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("invalid TwoDelta run-length bit width: {max_len_bits}"), - )); - } + check_twodelta_run_width(max_len_bits)?; let n_bytes = reader.read_u32::()?; check_payload_len(n_bytes)?; @@ -207,21 +237,7 @@ impl BenDecodeFrame { let pair = (pair_a, pair_b); let run_lengths = unpack_twodelta_run_lengths(&payload, max_len_bits)?; - - // n_bytes consistency: the encoder writes `n_bytes = ceil(runs * width / 8)`. Any other - // relationship between n_bytes and the recovered run count is a corrupt-frame signal, - // exactly mirroring the Standard/MkvChain payload check in `decode_ben_line`. - let expected_bytes = (run_lengths.len() as u64 * u64::from(max_len_bits)).div_ceil(8); - if u64::from(n_bytes) != expected_bytes { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!( - "inconsistent TwoDelta frame size: n_bytes={n_bytes} but {} run length(s) at \ - {max_len_bits} bit(s) each require {expected_bytes} byte(s)", - run_lengths.len() - ), - )); - } + check_twodelta_frame_consistency(n_bytes, run_lengths.len(), max_len_bits)?; Ok(Some(Self::TwoDelta { pair, diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs index 19ea787..fccfa67 100644 --- a/ben/src/codec/frames/encode.rs +++ b/ben/src/codec/frames/encode.rs @@ -1,6 +1,25 @@ use super::compress_rle_to_ben_bytes; use crate::util::rle::assign_to_rle; use crate::BenVariant; +use std::io; + +/// Serialize a TwoDelta frame's wire bytes from its parsed parts: +/// `[pair.0 u16][pair.1 u16][width u8][n_bytes u32][payload][count u16]`, all big-endian. +fn assemble_twodelta_raw_bytes( + pair: (u16, u16), + max_len_bit_count: u8, + payload: &[u8], + count: u16, +) -> Vec { + let mut raw_bytes = Vec::with_capacity(9 + payload.len() + 2); + raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); + raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); + raw_bytes.push(max_len_bit_count); + raw_bytes.extend_from_slice(&(payload.len() as u32).to_be_bytes()); + raw_bytes.extend_from_slice(payload); + raw_bytes.extend_from_slice(&count.to_be_bytes()); + raw_bytes +} /// One sample's encoded bytes at the frame layer. /// @@ -195,11 +214,59 @@ impl BenEncodeFrame { } } - /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload. + /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload, + /// validating the payload as it is unpacked. /// /// This is the inverse of [`BenEncodeFrame::from_run_lengths`]: it re-assembles the serialized /// bytes and decodes the bit-packed payload back into the run-length vector so that both /// representations are available on the resulting frame. + /// + /// # Errors + /// + /// Returns [`io::ErrorKind::InvalidData`] when: + /// + /// - `max_len_bit_count` is outside `1..=16`; + /// - the payload contains an interior zero run length (only the final byte's zero padding may + /// form zero slots — the encoder never emits zero-length runs, and silently dropping one + /// would shift the alternation parity of every later run); + /// - the payload length is not `ceil(runs * width / 8)` for the recovered run count. + pub fn try_from_parts( + pair: (u16, u16), + max_len_bit_count: u8, + payload: Vec, + count: u16, + ) -> io::Result { + use super::decode::{ + check_twodelta_frame_consistency, check_twodelta_run_width, + unpack_twodelta_run_lengths, + }; + + check_twodelta_run_width(max_len_bit_count)?; + let n_bytes = payload.len() as u32; + let run_length_vector = unpack_twodelta_run_lengths(&payload, max_len_bit_count)?; + check_twodelta_frame_consistency(n_bytes, run_length_vector.len(), max_len_bit_count)?; + + let raw_bytes = assemble_twodelta_raw_bytes(pair, max_len_bit_count, &payload, count); + Ok(Self::TwoDelta { + pair, + max_len_bit_count, + n_bytes, + run_length_vector, + raw_bytes, + count, + }) + } + + /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload. + /// + /// Unlike [`BenEncodeFrame::try_from_parts`], this performs no validation: zero run-length + /// slots anywhere in the payload are silently dropped, the bit width is trusted, and the + /// payload length is not checked against the recovered run count. On a corrupt payload that + /// can silently shift the run alternation and decode to a plausible-but-wrong delta. + #[deprecated( + note = "performs no payload validation and silently drops zero run-length slots; \ + use try_from_parts" + )] pub fn from_parts( pair: (u16, u16), max_len_bit_count: u8, @@ -207,13 +274,7 @@ impl BenEncodeFrame { count: u16, ) -> Self { let n_bytes = payload.len() as u32; - let mut raw_bytes = Vec::with_capacity(9 + payload.len() + 2); - raw_bytes.extend_from_slice(&pair.0.to_be_bytes()); - raw_bytes.extend_from_slice(&pair.1.to_be_bytes()); - raw_bytes.push(max_len_bit_count); - raw_bytes.extend_from_slice(&n_bytes.to_be_bytes()); - raw_bytes.extend_from_slice(&payload); - raw_bytes.extend_from_slice(&count.to_be_bytes()); + let raw_bytes = assemble_twodelta_raw_bytes(pair, max_len_bit_count, &payload, count); let mut run_length_vector = Vec::new(); let mut buffer: u32 = 0; diff --git a/ben/src/codec/frames/mod.rs b/ben/src/codec/frames/mod.rs index 4276680..942b926 100644 --- a/ben/src/codec/frames/mod.rs +++ b/ben/src/codec/frames/mod.rs @@ -15,6 +15,7 @@ mod encode; mod tests; pub use decode::BenDecodeFrame; +pub(crate) use decode::{check_payload_len, check_twodelta_run_width}; pub use encode::BenEncodeFrame; /// Bit-pack an RLE run vector into a serialized BEN frame payload. diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index b62ea68..c6736fe 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -421,7 +421,7 @@ fn encode_from_assignment_mkv_carries_count() { assert_eq!(count, 9); } -// ── BenEncodeFrame::from_run_lengths / from_parts (TwoDelta) ──────────────── +// ── BenEncodeFrame::from_run_lengths / try_from_parts (TwoDelta) ──────────── #[test] fn twodelta_from_run_lengths_count_none_defaults_to_one() { @@ -433,29 +433,68 @@ fn twodelta_from_run_lengths_count_none_defaults_to_one() { } #[test] -fn twodelta_from_run_lengths_then_from_parts_roundtrip() { +fn twodelta_from_run_lengths_then_try_from_parts_roundtrip() { let original = BenEncodeFrame::from_run_lengths((3, 4), vec![5, 5, 5], Some(2)); let bytes = original.as_slice().to_vec(); let (pair, max_len_bits, n_bytes, _, _, count) = unwrap_encode_twodelta(original.clone()); let payload_slice = &bytes[9..9 + n_bytes as usize]; - let rebuilt = BenEncodeFrame::from_parts(pair, max_len_bits, payload_slice.to_vec(), count); - let (rb_pair, _, _, rb_runs, _, rb_count) = unwrap_encode_twodelta(rebuilt); + let rebuilt = + BenEncodeFrame::try_from_parts(pair, max_len_bits, payload_slice.to_vec(), count).unwrap(); + let (rb_pair, _, _, rb_runs, rb_raw, rb_count) = unwrap_encode_twodelta(rebuilt); assert_eq!(rb_pair, pair); assert_eq!(rb_runs, vec![5, 5, 5]); assert_eq!(rb_count, count); + assert_eq!(rb_raw, bytes, "rebuilt frame must serialize byte-identically"); } #[test] -fn twodelta_from_parts_preserves_nontrivial_count() { +fn twodelta_try_from_parts_preserves_nontrivial_count() { let original = BenEncodeFrame::from_run_lengths((1, 9), vec![3, 3], Some(42)); let bytes = original.as_slice().to_vec(); let (_, max_len_bits, n_bytes, _, _, _) = unwrap_encode_twodelta(original); let payload = bytes[9..9 + n_bytes as usize].to_vec(); - let rebuilt = BenEncodeFrame::from_parts((1, 9), max_len_bits, payload, 42); + let rebuilt = BenEncodeFrame::try_from_parts((1, 9), max_len_bits, payload, 42).unwrap(); let (_, _, _, _, _, count) = unwrap_encode_twodelta(rebuilt); assert_eq!(count, 42); } +#[test] +fn twodelta_try_from_parts_rejects_invalid_width() { + for width in [0u8, 17] { + let err = BenEncodeFrame::try_from_parts((1, 2), width, vec![0xF0], 1).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("bit width")); + } +} + +#[test] +fn twodelta_try_from_parts_rejects_interior_zero_run_length() { + // width=4, payload 0x0F → slots [0, 15]: an interior zero, not trailing padding. + let err = BenEncodeFrame::try_from_parts((1, 2), 4, vec![0x0F], 1).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("interior zero")); +} + +#[test] +fn twodelta_try_from_parts_rejects_inconsistent_payload_len() { + // width=4, payload [0xF0, 0x00] recovers one run; the encoder writes ceil(1*4/8) = 1 byte, + // so a 2-byte payload is a corrupt-frame signal. + let err = BenEncodeFrame::try_from_parts((1, 2), 4, vec![0xF0, 0x00], 1).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + assert!(err.to_string().contains("inconsistent")); +} + +#[test] +#[allow(deprecated)] +fn twodelta_from_parts_legacy_zero_drop_pinned_until_removal() { + // The deprecated constructor's documented behavior: zero slots are dropped wherever they + // appear. Pinned so the behavior cannot drift while the API survives its deprecation window; + // new code uses try_from_parts, which rejects this same payload. + let frame = BenEncodeFrame::from_parts((1, 2), 4, vec![0x0F], 1); + let (_, _, _, runs, _, _) = unwrap_encode_twodelta(frame); + assert_eq!(runs, vec![15], "interior zero slot is silently dropped"); +} + #[test] fn twodelta_from_run_lengths_single_run() { let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![5], Some(3)); diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index 95bdd7f..6bb87ca 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -9,6 +9,7 @@ use xz2::write::XzEncoder; use crate::codec::decode::decode_ben_line; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; +use crate::codec::frames::{check_payload_len, check_twodelta_run_width}; use crate::codec::BenEncodeFrame; use crate::format::banners::{has_known_banner_prefix, BANNER_LEN}; use crate::progress::Spinner; @@ -309,29 +310,35 @@ impl XBenInner { sample_count += count as usize; spinner.set_count(sample_count as u64); } - // Delta: unpack the bit-packed run lengths and buffer into the current chunk. + // Delta: unpack the bit-packed run lengths and buffer into the current chunk. The + // input stream is untrusted, so the header fields are validated (bit width, + // payload cap) before the payload buffer is allocated, and the strict constructor + // rejects corrupt payloads instead of silently dropping zero run lengths. BEN_TWODELTA_DELTA_TAG => { let pair_a = reader.read_u16::()?; let pair_b = reader.read_u16::()?; let delta_max_len_bits = reader.read_u8()?; + check_twodelta_run_width(delta_max_len_bits)?; let delta_n_bytes = reader.read_u32::()?; + check_payload_len(delta_n_bytes)?; let mut payload = vec![0u8; delta_n_bytes as usize]; reader.read_exact(&mut payload)?; let count = reader.read_u16::()?; - let (pair, run_lengths) = match BenEncodeFrame::from_parts( + let frame = BenEncodeFrame::try_from_parts( (pair_a, pair_b), delta_max_len_bits, payload, count, - ) { + )?; + let (pair, run_lengths) = match frame { BenEncodeFrame::TwoDelta { pair, run_length_vector, .. } => (pair, run_length_vector), - _ => unreachable!("BenEncodeFrame::from_parts always returns TwoDelta"), + _ => unreachable!("try_from_parts always returns TwoDelta"), }; chunk_buffer.push(BufferedDeltaFrame { diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 9a35baa..70ddb33 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -764,16 +764,16 @@ fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { fn translate_twodelta_non_eof_read_error_propagates() { use std::io::{self, Read}; - // ingest_ben_stream in TwoDelta mode calls translate_ben_twodelta_to_xben. After reading the - // anchor frame it loops reading delta frames; a non-EOF error on pair_a (first u16 read in the - // loop) must propagate. + // ingest_ben_stream in TwoDelta mode calls translate_ben_twodelta_to_xben. After consuming a + // complete snapshot frame it loops reading the next frame's tag byte; a non-EOF error there + // must propagate. let mut xben = Vec::new(); let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); - // Banner (17 bytes) + minimal anchor frame: - // max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1 + // Banner (17 bytes) + a complete snapshot frame: + // snapshot tag, then max_val_bits=1, max_len_bits=1, n_bytes=0 (no payload), count=1. let mut input: Vec = b"TWODELTA BEN FILE".to_vec(); - input.extend_from_slice(&[0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01]); + input.extend_from_slice(&[0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01]); struct ErrorAfterEof; impl Read for ErrorAfterEof { diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index 625b349..ae463a2 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -1525,16 +1525,17 @@ fn encode_twodelta_frame_single_value_swap() { // ────────────────────────────────────────────────────────────────────────────── #[test] -fn twodelta_frame_from_parts_round_trip() { +fn twodelta_frame_try_from_parts_round_trip() { let pair = (10u16, 20u16); let run_lengths = vec![2u16, 5, 1]; let original = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); - let reconstructed = BenEncodeFrame::from_parts( + let reconstructed = BenEncodeFrame::try_from_parts( pair, original.max_len_bit_count(), original.payload().to_vec(), original.count(), - ); + ) + .expect("encoder-produced parts must reconstruct"); assert_eq!(original.as_slice(), reconstructed.as_slice()); assert_eq!(original.pair().unwrap(), reconstructed.pair().unwrap()); assert_eq!( diff --git a/ben/tests/test_fixture_mutations.rs b/ben/tests/test_fixture_mutations.rs index f9620cb..d9c5cca 100644 --- a/ben/tests/test_fixture_mutations.rs +++ b/ben/tests/test_fixture_mutations.rs @@ -21,7 +21,7 @@ use std::path::Path; use binary_ensemble::codec::decode::{ decode_ben_to_jsonl, decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress, }; -use binary_ensemble::codec::encode::xz_compress; +use binary_ensemble::codec::encode::{encode_ben_to_xben, xz_compress}; use binary_ensemble::io::bundle::reader::BendlReader; use binary_ensemble::io::bundle::writer::BendlAppender; use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; @@ -143,6 +143,19 @@ fn drive_ben_entry_points(fixture_name: &str, pos: usize, byte: u8, bytes: &[u8] let _ = extract_assignment_ben(bytes, 1); let _ = extract_assignment_ben(bytes, 3); }); + // Encode-side entry point that *reads* untrusted BEN: the BEN→XBEN converter re-parses every + // frame (including the TwoDelta ingest path), so it faces the same corruption surface as the + // decoders. + run("encode_ben_to_xben", &|| { + let _ = encode_ben_to_xben( + io::BufReader::new(bytes), + io::sink(), + Some(1), + Some(0), + None, + None, + ); + }); } /// Drive every public XBEN read entry point over `bytes`. From 87cf6e6e06e1de3a6d39ee8a3e132c5f25a6e3d5 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:01:49 -0600 Subject: [PATCH 153/221] fix issue with twodelta silently failing on a pathological 2 distrcit TX --- ben/src/codec/encode/errors.rs | 15 ++ ben/src/codec/encode/twodelta.rs | 4 +- ben/src/io/writer/stream_writer/ben.rs | 38 +++- ben/src/io/writer/stream_writer/xben.rs | 105 +++++++--- ben/src/io/writer/tests.rs | 186 ++++++++++++++++++ ben/src/io/writer/twodelta.rs | 9 +- ...est_boundary_proptest.proptest-regressions | 7 + ben/tests/test_boundary_proptest.rs | 182 +++++++++++++++++ ben/tests/test_extreme_plans.rs | 124 ++++++++++++ docs/twodelta-format-spec.md | 9 + 10 files changed, 635 insertions(+), 44 deletions(-) create mode 100644 ben/tests/test_boundary_proptest.proptest-regressions create mode 100644 ben/tests/test_boundary_proptest.rs create mode 100644 ben/tests/test_extreme_plans.rs diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index 7a6e512..0251251 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -19,6 +19,12 @@ pub enum EncodeError { #[error("TwoDelta transition involves more than two distinct district ids")] TwoDeltaTooManyIds, + #[error( + "TwoDelta run length exceeds u16::MAX, which the wire format cannot represent in a \ + delta frame" + )] + TwoDeltaRunTooLong, + #[error("TwoDelta received identical assignment to previous frame")] TwoDeltaIdentical, @@ -60,3 +66,12 @@ impl From for io::Error { } } } + +/// Whether `err` is the TwoDelta run-length representability error +/// ([`EncodeError::TwoDeltaRunTooLong`]). Writers recover from this one error by emitting a +/// snapshot frame, which splits long runs natively, instead of a delta or repeat frame. +pub(crate) fn is_twodelta_run_too_long(err: &io::Error) -> bool { + err.get_ref() + .and_then(|inner| inner.downcast_ref::()) + .is_some_and(|e| matches!(e, EncodeError::TwoDeltaRunTooLong)) +} diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index e1cfb3a..bbd9789 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -314,7 +314,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( if active_run_length == u16::MAX { return Err(Error::new( ErrorKind::InvalidInput, - "TwoDelta run length exceeds u16::MAX", + EncodeError::TwoDeltaRunTooLong, )); } active_run_length += 1; @@ -439,7 +439,7 @@ fn construct_twodelta_frame_from_scratch( if active_run_length == u16::MAX { return Err(Error::new( ErrorKind::InvalidInput, - "TwoDelta run length exceeds u16::MAX", + EncodeError::TwoDeltaRunTooLong, )); } active_run_length += 1; diff --git a/ben/src/io/writer/stream_writer/ben.rs b/ben/src/io/writer/stream_writer/ben.rs index 2616e3b..fb0de0f 100644 --- a/ben/src/io/writer/stream_writer/ben.rs +++ b/ben/src/io/writer/stream_writer/ben.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::io::{self, Write}; +use crate::codec::encode::errors::is_twodelta_run_too_long; use crate::codec::encode::encode_twodelta_frame_with_hint; use crate::codec::BenEncodeFrame; use crate::BenVariant; @@ -69,25 +70,44 @@ impl BenState { self.write_twodelta_snapshot(assignment, count)?; } else { match classify_transition(&self.previous_assignment, assignment)? { - TransitionKind::Repeat => { - let frame = twodelta_repeat_frame(assignment, count)?; - self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; - self.writer.write_all(frame.as_slice())?; - } + TransitionKind::Repeat => match twodelta_repeat_frame(assignment, count) { + Ok(frame) => { + self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; + self.writer.write_all(frame.as_slice())?; + } + // A pair-projected run longer than u16::MAX cannot be expressed in a + // delta-shaped frame (splitting it would require zero-length runs, + // which readers reject as corruption); a snapshot splits long runs + // natively. + Err(e) if is_twodelta_run_too_long(&e) => { + self.write_twodelta_snapshot(assignment, count)?; + } + Err(e) => return Err(e), + }, // Clean 2-swap where both districts already exist: cheap delta against the // maintained masks. TransitionKind::Delta(a, b) if pair_has_masks(&self.previous_masks, a, b) => { - let frame = encode_twodelta_frame_with_hint( + match encode_twodelta_frame_with_hint( &self.previous_assignment, assignment, Some((a, b)), Some(&mut self.previous_masks), Some(count), - )?; - self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; - self.writer.write_all(frame.as_slice())?; + ) { + Ok(frame) => { + self.writer.write_all(&[BEN_TWODELTA_DELTA_TAG])?; + self.writer.write_all(frame.as_slice())?; + } + // Same representability limit as the repeat arm. The failed + // encode leaves `previous_masks` untouched, and the snapshot + // reseeds them from `assignment`. + Err(e) if is_twodelta_run_too_long(&e) => { + self.write_twodelta_snapshot(assignment, count)?; + } + Err(e) => return Err(e), + } } // A >2-district transition, or a 2-id transition that introduces a district // absent from the previous assignment (no mask to delta against): full diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index 6bb87ca..092b99b 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -7,6 +7,7 @@ use byteorder::{BigEndian, ReadBytesExt}; use xz2::write::XzEncoder; use crate::codec::decode::decode_ben_line; +use crate::codec::encode::errors::is_twodelta_run_too_long; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; use crate::codec::translate::ben_to_ben32_lines; use crate::codec::frames::{check_payload_len, check_twodelta_run_width}; @@ -141,9 +142,20 @@ impl XBenInner { previous_assignment, previous_masks, )?; - let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; - chunk_buffer.push(repeat); - *previous_assignment = assign_vec; + match twodelta_repeat_buffered_frame(&assign_vec, 1) { + Ok(repeat) => { + chunk_buffer.push(repeat); + *previous_assignment = assign_vec; + } + // The pair-projected run exceeds u16::MAX, so the repeat cannot be a + // delta-shaped frame; re-buffer it as a fresh pending full frame, + // which splits long runs natively and keeps merging later repeats. + Err(e) if is_twodelta_run_too_long(&e) => { + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } + Err(e) => return Err(e), + } return Ok(()); } *pending_full_count += 1; @@ -155,8 +167,17 @@ impl XBenInner { { if chunk_buffer.last().unwrap().count == u16::MAX { flush_chunk_inner(&mut self.encoder, chunk_buffer)?; - let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; - chunk_buffer.push(repeat); + match twodelta_repeat_buffered_frame(&assign_vec, 1) { + Ok(repeat) => chunk_buffer.push(repeat), + // Same representability limit as the pending-full repeat path: defer + // as a pending full frame (the chunk was just flushed, so the full + // frame correctly follows the chunk's deltas in the stream). + Err(e) if is_twodelta_run_too_long(&e) => { + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } + Err(e) => return Err(e), + } } else { chunk_buffer.last_mut().unwrap().count += 1; } @@ -177,38 +198,62 @@ impl XBenInner { // `previous == assign_vec` only reaches here when the chunk was just flushed // (so the repeat-of-last-delta fast path above was skipped). Encode it as a // repeat delta against the previous frame. - TransitionKind::Repeat => { - let repeat = twodelta_repeat_buffered_frame(&assign_vec, 1)?; - chunk_buffer.push(repeat); - *previous_assignment = assign_vec; - } + TransitionKind::Repeat => match twodelta_repeat_buffered_frame(&assign_vec, 1) + { + Ok(repeat) => { + chunk_buffer.push(repeat); + *previous_assignment = assign_vec; + } + // Same representability limit as the saturation paths: defer as a pending + // full frame. `previous_assignment` already equals the repeated value. + Err(e) if is_twodelta_run_too_long(&e) => { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } + Err(e) => return Err(e), + }, // Clean 2-swap where both districts already exist: cheap delta. TransitionKind::Delta(a, b) if pair_has_masks(previous_masks, a, b) => { - let frame = encode_twodelta_frame_with_hint( + match encode_twodelta_frame_with_hint( &*previous_assignment, &assign_vec, Some((a, b)), Some(previous_masks), None, - )?; - let (pair, run_lengths) = match frame { - BenEncodeFrame::TwoDelta { - pair, - run_length_vector, - .. - } => (pair, run_length_vector), - _ => unreachable!( - "encode_twodelta_frame_with_hint always returns the TwoDelta arm" - ), - }; - chunk_buffer.push(BufferedDeltaFrame { - pair, - run_lengths, - count: 1, - }); - *previous_assignment = assign_vec; - if chunk_buffer.len() >= *twodelta_chunk_size { - flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + ) { + Ok(frame) => { + let (pair, run_lengths) = match frame { + BenEncodeFrame::TwoDelta { + pair, + run_length_vector, + .. + } => (pair, run_length_vector), + _ => unreachable!( + "encode_twodelta_frame_with_hint always returns the \ + TwoDelta arm" + ), + }; + chunk_buffer.push(BufferedDeltaFrame { + pair, + run_lengths, + count: 1, + }); + *previous_assignment = assign_vec; + if chunk_buffer.len() >= *twodelta_chunk_size { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + } + } + // The delta's pair-projected run exceeds u16::MAX: defer as a pending + // full frame, exactly like the Snapshot arm below. The failed encode + // leaves `previous_masks` untouched, and `flush_twodelta_full` + // reseeds them when the full frame is emitted. + Err(e) if is_twodelta_run_too_long(&e) => { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } + Err(e) => return Err(e), } } // A >2-district transition, or a 2-id transition introducing a district absent diff --git a/ben/src/io/writer/tests.rs b/ben/src/io/writer/tests.rs index 70ddb33..9be27fe 100644 --- a/ben/src/io/writer/tests.rs +++ b/ben/src/io/writer/tests.rs @@ -760,6 +760,192 @@ fn twodelta_repeat_frame_run_exceeds_u16_max_errors() { assert!(err.to_string().contains("u16::MAX")); } +// ── TwoDelta long-run snapshot fallback ────────────────────────────── +// +// A pair-projected run longer than u16::MAX cannot be expressed in a delta-shaped frame (splitting +// it would require zero-length runs, which readers reject as corruption). The writers fall back to +// snapshot/full frames, whose RLE splits long runs natively. One test per fallback site. + +/// Smallest assignment whose single-district body exceeds the u16::MAX run limit when projected +/// onto a repeat/delta pair. +fn long_run_assignment() -> Vec { + vec![1u16; u16::MAX as usize + 1] +} + +/// Drain a plain-BEN TwoDelta stream, asserting every sample equals `expected` and returning the +/// expanded sample total. +fn drain_ben_expecting(ben: &[u8], expected: &[u16]) -> usize { + let mut total = 0usize; + BenStreamReader::from_ben(ben) + .unwrap() + .silent(true) + .for_each_assignment(|a, count| { + assert_eq!(a, expected, "decoded assignment diverged"); + total += count as usize; + Ok(true) + }) + .unwrap(); + total +} + +#[test] +fn ben_twodelta_long_run_repeat_falls_back_to_snapshot() { + let a = long_run_assignment(); + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + writer.write_frame(a.clone(), 3).unwrap(); + // Repeat of the previous frame: unrepresentable as a repeat delta → snapshot fallback. + writer.write_frame(a.clone(), 4).unwrap(); + writer.finish().unwrap(); + } + assert_eq!(drain_ben_expecting(&ben, &a), 7); +} + +#[test] +fn ben_twodelta_long_run_delta_falls_back_to_snapshot() { + // A → B is a clean 2-swap at position 0, but the delta's pair covers every position and B's + // leading run exceeds u16::MAX → snapshot fallback. + let mut a = vec![1u16; u16::MAX as usize + 2]; + *a.last_mut().unwrap() = 2; + let mut b = a.clone(); + b[0] = 2; + + let mut ben = Vec::new(); + { + let mut writer = BenStreamWriter::for_ben(&mut ben, BenVariant::TwoDelta).unwrap(); + writer.write_frame(a.clone(), 1).unwrap(); + writer.write_frame(b.clone(), 2).unwrap(); + writer.finish().unwrap(); + } + + let mut seen = Vec::new(); + BenStreamReader::from_ben(ben.as_slice()) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + seen.push((assignment.to_vec(), count)); + Ok(true) + }) + .unwrap(); + assert_eq!(seen, vec![(a, 1), (b, 2)]); +} + +#[test] +fn xben_twodelta_long_run_delta_falls_back_to_full_frame() { + // Same construction as the plain-BEN delta test, through the XBEN columnar writer. + let mut a = vec![1u16; u16::MAX as usize + 2]; + *a.last_mut().unwrap() = 2; + let mut b = a.clone(); + b[0] = 2; + + assert_eq!( + roundtrip_xben_counts(&[a.clone(), b.clone()], BenVariant::TwoDelta), + vec![(a, 1), (b, 1)] + ); +} + +#[test] +fn xben_twodelta_long_run_repeat_after_chunk_flush_falls_back_to_full_frame() { + // chunk_size = 1 forces a chunk flush after the A→B delta, so the next repeat of B reaches + // the classify-Repeat arm with an empty chunk. B's repeat pair (1, 4) projects onto a run + // longer than u16::MAX → pending-full fallback. The A→B delta itself stays representable + // because its pair (3, 4) covers only the two tail positions. + let mut a = vec![1u16; u16::MAX as usize + 3]; + a[u16::MAX as usize + 1] = 3; + a[u16::MAX as usize + 2] = 4; + let mut b = a.clone(); + b[u16::MAX as usize + 1] = 4; + + let mut xben = Vec::new(); + { + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, Some(1)); + writer.write_assignment(a.clone()).unwrap(); + writer.write_assignment(b.clone()).unwrap(); + writer.write_assignment(b.clone()).unwrap(); + writer.finish().unwrap(); + } + + let reader = BenStreamReader::from_xben(Cursor::new(xben)).unwrap(); + let decoded: Vec<(Vec, u16)> = reader.map(|r| r.unwrap()).collect(); + let total: usize = decoded.iter().map(|&(_, c)| c as usize).sum(); + assert_eq!(total, 3); + assert_eq!(decoded[0].0, a); + for (assignment, _) in &decoded[1..] { + assert_eq!(assignment, &b); + } +} + +#[test] +fn xben_twodelta_long_run_repeat_saturation_falls_back_to_full_frame() { + // u16::MAX identical samples saturate the pending full frame's count; the next repeat cannot + // be a delta-shaped frame (single-district body → pair-projected run beyond u16::MAX), so the + // writer re-buffers it as a fresh full frame and keeps merging later repeats into it. + let a = long_run_assignment(); + let n = u16::MAX as usize + 2; + + let mut xben = Vec::new(); + { + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); + for _ in 0..n { + writer.write_assignment(a.clone()).unwrap(); + } + writer.finish().unwrap(); + } + + let mut total = 0usize; + BenStreamReader::from_xben(Cursor::new(xben)) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + assert_eq!(assignment, a.as_slice()); + total += count as usize; + Ok(true) + }) + .unwrap(); + assert_eq!(total, n); +} + +#[test] +fn xben_twodelta_long_run_chunk_repeat_saturation_falls_back_to_full_frame() { + // An A→B delta seeds the chunk, u16::MAX repeats of B saturate that delta's count, and the + // next repeat trips the chunk-saturation path: B's repeat pair projects onto a run beyond + // u16::MAX → pending-full fallback. + let mut a = vec![1u16; u16::MAX as usize + 3]; + a[u16::MAX as usize + 1] = 3; + a[u16::MAX as usize + 2] = 4; + let mut b = a.clone(); + b[u16::MAX as usize + 1] = 4; + let n_b = u16::MAX as usize + 2; + + let mut xben = Vec::new(); + { + let mut writer = build_xben_writer(&mut xben, BenVariant::TwoDelta, None); + writer.write_assignment(a.clone()).unwrap(); + for _ in 0..n_b { + writer.write_assignment(b.clone()).unwrap(); + } + writer.finish().unwrap(); + } + + let mut a_total = 0usize; + let mut b_total = 0usize; + BenStreamReader::from_xben(Cursor::new(xben)) + .unwrap() + .silent(true) + .for_each_assignment(|assignment, count| { + if assignment == a.as_slice() { + a_total += count as usize; + } else { + assert_eq!(assignment, b.as_slice()); + b_total += count as usize; + } + Ok(true) + }) + .unwrap(); + assert_eq!((a_total, b_total), (1, n_b)); +} + #[test] fn translate_twodelta_non_eof_read_error_propagates() { use std::io::{self, Read}; diff --git a/ben/src/io/writer/twodelta.rs b/ben/src/io/writer/twodelta.rs index 172ada1..7e07a8d 100644 --- a/ben/src/io/writer/twodelta.rs +++ b/ben/src/io/writer/twodelta.rs @@ -1,3 +1,4 @@ +use crate::codec::encode::errors::EncodeError; use std::collections::HashMap; use std::io; @@ -17,8 +18,10 @@ pub const DEFAULT_TWODELTA_CHUNK_SIZE: usize = 10_000; /// Walk a TwoDelta repeat-eligible assignment and emit the `(pair, run_lengths)` describing it. /// /// Used by both the BEN and XBEN writers to construct the body of a TwoDelta "repeat" frame: each -/// writer wraps the result in its own frame type. Returns an `InvalidInput` error if any run -/// exceeds `u16::MAX` in length. +/// writer wraps the result in its own frame type. Returns an `InvalidInput` error carrying +/// [`EncodeError::TwoDeltaRunTooLong`] if any run exceeds `u16::MAX` in length — the wire format +/// cannot represent such a run in a delta-shaped frame, and the writers recover by emitting a +/// snapshot frame instead. pub(crate) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16), Vec)> { let first = assignment.first().copied().unwrap_or(0); let second = assignment @@ -39,7 +42,7 @@ pub(crate) fn twodelta_repeat_runs(assignment: &[u16]) -> io::Result<((u16, u16) if run_len == u16::MAX { return Err(io::Error::new( io::ErrorKind::InvalidInput, - "TwoDelta repeat frame contains a run longer than u16::MAX", + EncodeError::TwoDeltaRunTooLong, )); } run_len += 1; diff --git a/ben/tests/test_boundary_proptest.proptest-regressions b/ben/tests/test_boundary_proptest.proptest-regressions new file mode 100644 index 0000000..1f1bd02 --- /dev/null +++ b/ben/tests/test_boundary_proptest.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc c88174804786761220d343183e6248f73efd26f62420b6901fc069f0b5f7cef6 # shrinks to seq = [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]] diff --git a/ben/tests/test_boundary_proptest.rs b/ben/tests/test_boundary_proptest.rs new file mode 100644 index 0000000..956f34e --- /dev/null +++ b/ben/tests/test_boundary_proptest.rs @@ -0,0 +1,182 @@ +//! Boundary-biased round-trip property tests. +//! +//! The strategies in `test_impls_pipeline.rs` deliberately stay small (ids ≤ 2000, runs ≤ 300, +//! length ≤ 1500) to keep runtime bounded — which means they structurally cannot reach any `u16` +//! wire-format boundary. This suite is the complement: its generators are *biased toward* the +//! representability edges where data-dependent encode bugs live: +//! +//! - run lengths straddling `u16::MAX` (64-bit-clean splitting in layer-1 RLE and the BEN32 body, +//! and the TwoDelta long-run snapshot fallback); +//! - district ids `0` and `u16::MAX` (sentinel-adjacent values, TwoDelta pair synthesis); +//! - transition shapes mixing repeats, 2-id repaints (delta-eligible), and many-id changes +//! (snapshot transitions). +//! +//! Every generated sequence must round-trip *exactly* through every `(variant × wire format)` +//! cell — there is no input a writer may reject, because every BEN-stack writer is total over +//! arbitrary `Vec` sequences (delta-shaped frames fall back to snapshots when a pair +//! projection exceeds the `u16` run limit). + +use binary_ensemble::io::reader::BenStreamReader; +use binary_ensemble::io::writer::{BenStreamWriter, XzEncodeOptions}; +use binary_ensemble::BenVariant; + +use proptest::prelude::*; +use std::io::Cursor; + +// ===================================================================== +// Boundary-biased strategies +// ===================================================================== + +/// Run lengths weighted toward the `u16::MAX` boundary: mostly tiny (so values interleave), with +/// a real chance of runs just below, at, and above the 65,535 split/representability limit. +fn boundary_run_len() -> impl Strategy { + prop_oneof![ + 4 => 1usize..=3, + 1 => 65_534usize..=65_535, + 2 => 65_536usize..=65_600, + ] +} + +/// District ids weighted toward the sentinel-adjacent edges: small ids, id `0`, and `u16::MAX`. +fn boundary_value() -> impl Strategy { + prop_oneof![ + 4 => 1u16..=4, + 1 => Just(0u16), + 1 => Just(u16::MAX), + ] +} + +/// One assignment built from 1–4 boundary-biased runs (worst case ≈ 262k nodes). +fn boundary_assignment() -> impl Strategy> { + prop::collection::vec((boundary_value(), boundary_run_len()), 1..=4).prop_map(|runs| { + let mut out = Vec::new(); + for (value, len) in runs { + out.extend(std::iter::repeat_n(value, len)); + } + out + }) +} + +/// Repaint the positions occupied by two distinct values of `prev` with seed-derived alternating +/// stretches of those same two values: a valid 2-id transition (delta-eligible when both ids have +/// masks), whose pair projection inherits `prev`'s long runs — exactly the shape that forces the +/// TwoDelta long-run fallback. +fn repaint_pair(prev: &[u16], seed: u64) -> Vec { + let mut distinct: Vec = prev.to_vec(); + distinct.sort_unstable(); + distinct.dedup(); + if distinct.len() < 2 { + return prev.to_vec(); // single-district plan: degrade to a repeat + } + let a = distinct[(seed as usize) % distinct.len()]; + let mut b = distinct[((seed >> 8) as usize) % distinct.len()]; + if a == b { + b = distinct[(distinct.iter().position(|&x| x == a).unwrap() + 1) % distinct.len()]; + } + + let mut next = prev.to_vec(); + let mut value = if seed & 1 == 0 { a } else { b }; + let mut stretch = 1 + (seed >> 16) as usize % 80_000; + let mut placed = 0usize; + for idx in 0..next.len() { + if next[idx] != a && next[idx] != b { + continue; + } + next[idx] = value; + placed += 1; + if placed == stretch { + value = if value == a { b } else { a }; + stretch = 1 + (stretch.rotate_left(9) ^ 0x5bd1) % 80_000; + placed = 0; + } + } + next +} + +/// Shift every district id by one (wrapping): a many-id transition that forces a snapshot frame +/// in TwoDelta while keeping the assignment's run structure (and its boundary runs) intact. +fn rotate_values(prev: &[u16]) -> Vec { + prev.iter().map(|v| v.wrapping_add(1)).collect() +} + +/// A short sample sequence over boundary-shaped assignments. Each step is a repeat (count +/// merging, repeat frames), a 2-id repaint (delta paths + long-run fallback), or a value +/// rotation (snapshot transitions). +fn boundary_sequence() -> impl Strategy>> { + ( + boundary_assignment(), + prop::collection::vec(any::(), 0..=3), + ) + .prop_map(|(base, ops)| { + let mut seq = vec![base]; + for op in ops { + let prev = seq.last().expect("sequence starts non-empty"); + let next = match op % 3 { + 0 => prev.clone(), + 1 => repaint_pair(prev, op), + _ => rotate_values(prev), + }; + seq.push(next); + } + seq + }) +} + +// ===================================================================== +// Round-trip cells +// ===================================================================== + +fn encode_ben(samples: &[Vec], variant: BenVariant) -> Vec { + let mut ben = Vec::new(); + let mut writer = BenStreamWriter::for_ben(&mut ben, variant).unwrap(); + for s in samples { + writer.write_assignment(s.clone()).unwrap(); + } + writer.finish().unwrap(); + drop(writer); + ben +} + +fn encode_xben(samples: &[Vec], variant: BenVariant) -> Vec { + let mut xben = Vec::new(); + let mut writer = + BenStreamWriter::for_xben(&mut xben, variant, XzEncodeOptions::default()).unwrap(); + for s in samples { + writer.write_assignment(s.clone()).unwrap(); + } + writer.finish().unwrap(); + drop(writer); + xben +} + +fn expand(reader: BenStreamReader) -> Vec> { + reader + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat_n(a, c as usize) + }) + .collect() +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 32, + ..ProptestConfig::default() + })] + + /// Every boundary-shaped sequence round-trips exactly through every variant on both wire + /// formats. Writers are total: no generated input may be rejected. + #[test] + fn boundary_sequences_round_trip_every_variant_and_wire_format(seq in boundary_sequence()) { + for variant in [BenVariant::Standard, BenVariant::MkvChain, BenVariant::TwoDelta] { + let ben = encode_ben(&seq, variant); + let decoded = expand(BenStreamReader::from_ben(ben.as_slice()).unwrap()); + prop_assert_eq!(&decoded, &seq, "{:?} plain BEN diverged", variant); + + let xben = encode_xben(&seq, variant); + let decoded = expand(BenStreamReader::from_xben(Cursor::new(xben)).unwrap()); + prop_assert_eq!(&decoded, &seq, "{:?} XBEN diverged", variant); + } + } +} diff --git a/ben/tests/test_extreme_plans.rs b/ben/tests/test_extreme_plans.rs new file mode 100644 index 0000000..729ce1f --- /dev/null +++ b/ben/tests/test_extreme_plans.rs @@ -0,0 +1,124 @@ +//! Round-trip coverage for an extreme-but-legal plan shape at realistic scale: a 770,000-node +//! dual graph split into two districts, one of which holds a single node. +//! +//! This geometry stresses every run-length limit at once — the lone node leaves monochrome +//! stretches of ~385k+ nodes, far beyond the `u16` run limit, so layer-1 RLE must split runs in +//! every BEN frame, the BEN32 body must split its 4-byte runs, and every TwoDelta transition's +//! pair projection spans all positions, forcing the long-run snapshot fallback on each move. +//! The sample sequence mixes accepted moves of the lone node with repeats (rejected proposals) +//! so MkvChain/TwoDelta count merging is exercised too. + +use binary_ensemble::io::bundle::format::AssignmentFormat; +use binary_ensemble::io::bundle::reader::BendlReader; +use binary_ensemble::io::bundle::writer::BendlWriter; +use binary_ensemble::io::reader::BenStreamReader; +use binary_ensemble::io::writer::{BenStreamWriter, XzEncodeOptions}; +use binary_ensemble::BenVariant; +use std::io::Cursor; + +const N: usize = 770_000; + +/// District 2 everywhere except a single district-1 node at `lone`. +fn plan(lone: usize) -> Vec { + let mut a = vec![2u16; N]; + a[lone] = 1; + a +} + +/// Accepted moves of the lone node interleaved with repeats. +fn samples() -> Vec> { + vec![ + plan(0), + plan(0), // repeat + plan(1), // 2-swap move + plan(1), + plan(2), // 2-swap move + plan(385_000), // jump into the middle + plan(385_000), + ] +} + +fn encode_ben(samples: &[Vec], variant: BenVariant) -> Vec { + let mut ben = Vec::new(); + let mut writer = BenStreamWriter::for_ben(&mut ben, variant).unwrap(); + for s in samples { + writer.write_assignment(s.clone()).unwrap(); + } + writer.finish().unwrap(); + drop(writer); + ben +} + +fn encode_xben(samples: &[Vec], variant: BenVariant) -> Vec { + let mut xben = Vec::new(); + let mut writer = + BenStreamWriter::for_xben(&mut xben, variant, XzEncodeOptions::default()).unwrap(); + for s in samples { + writer.write_assignment(s.clone()).unwrap(); + } + writer.finish().unwrap(); + drop(writer); + xben +} + +fn expand(reader: BenStreamReader) -> Vec> { + reader + .silent(true) + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat_n(a, c as usize) + }) + .collect() +} + +#[test] +fn extreme_two_district_plan_round_trips_every_variant_and_wire_format() { + let samples = samples(); + for variant in [ + BenVariant::Standard, + BenVariant::MkvChain, + BenVariant::TwoDelta, + ] { + let ben = encode_ben(&samples, variant); + assert_eq!( + expand(BenStreamReader::from_ben(ben.as_slice()).unwrap()), + samples, + "{variant:?} plain BEN diverged" + ); + + let xben = encode_xben(&samples, variant); + assert_eq!( + expand(BenStreamReader::from_xben(Cursor::new(xben)).unwrap()), + samples, + "{variant:?} XBEN diverged" + ); + } +} + +#[test] +fn extreme_two_district_plan_round_trips_through_a_bendl_bundle() { + let samples = samples(); + let xben = encode_xben(&samples, BenVariant::TwoDelta); + + let mut backing = Cursor::new(Vec::::new()); + { + let writer = BendlWriter::new(&mut backing, AssignmentFormat::Xben).unwrap(); + let mut session = writer.into_stream_session().unwrap(); + std::io::Write::write_all(&mut session, &xben).unwrap(); + let writer = session.finish_into_writer(samples.len() as i64); + writer.finish().unwrap(); + } + + let mut reader = BendlReader::open(Cursor::new(backing.into_inner())).unwrap(); + assert_eq!(reader.sample_count(), Some(samples.len() as i64)); + reader.verify_stream_checksum().unwrap(); + + let verified = reader.open_assignment_reader().unwrap().silent(true); + let decoded: Vec> = verified + .flat_map(|r| { + let (a, c) = r.unwrap(); + std::iter::repeat_n(a, c as usize) + }) + .collect(); + assert_eq!(decoded, samples); +} diff --git a/docs/twodelta-format-spec.md b/docs/twodelta-format-spec.md index 23831a2..749db70 100644 --- a/docs/twodelta-format-spec.md +++ b/docs/twodelta-format-spec.md @@ -229,6 +229,15 @@ preserve the *expanded* sample count: As with the other variants, a frame `count` of `0` is invalid and MUST be rejected by readers. +## Run-Length Representability + +Run lengths in delta-shaped frames (deltas, chunks, and no-op repeat deltas) are `u16` and MUST be +greater than zero, so a pair-projected run longer than `65535` positions cannot be expressed in a +delta-shaped frame: splitting it would require interleaving zero-length runs, which readers reject +as corruption. A writer that encounters such a run MUST fall back to a snapshot (plain `.ben`) or +full (`.xben`) frame, whose RLE layer splits long runs into consecutive maximal runs natively. +Readers need no special handling — the fallback arrives as an ordinary snapshot/full frame. + ## Reader Rules A reader MUST: From 5593e37f5f812ecd2b3ba883b89a99dca736a34b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:50:52 -0600 Subject: [PATCH 154/221] Add in cross architecture tests --- Taskfile.yml | 73 ++++++++++++++++++++++++++++++++++++++++++- ben/tests/test_cli.rs | 32 +++++++++++++++---- 2 files changed, 98 insertions(+), 7 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 755a8eb..eac043c 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -151,7 +151,78 @@ tasks: cmds: - task: test-rust - task: test-python - - task: docs-test + + ensure-be-target: + desc: Install the big-endian (s390x) std target if it is not already available + internal: true + silent: true + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + status: + - rustup target list --installed | grep -q '^s390x-unknown-linux-gnu$' + cmds: + - rustup target add s390x-unknown-linux-gnu + + ensure-cross: + desc: Install the `cross` cross-compilation runner if it is not already available + internal: true + silent: true + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + status: + - command -v cross + cmds: + - cargo install cross --locked + + check-endian: + desc: >- + Type-check the ben crate (including tests) for a big-endian target. No Docker needed; + catches compile-level endianness regressions only — `task test-endian` is the real gate. + silent: true + deps: + - ensure-toolchain + - ensure-be-target + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo check -p binary-ensemble --all-targets --target s390x-unknown-linux-gnu + + test-big-endian: + desc: >- + Run the full ben test suite on a big-endian target (s390x) under QEMU via `cross`. + Requires Docker (or Podman). The format-stability fixtures decoding here is what proves + the wire codecs are endian-correct. + silent: true + deps: + - ensure-toolchain + - ensure-cross + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cross test -p binary-ensemble --target s390x-unknown-linux-gnu + + test-little-endian: + desc: >- + Run the full ben test suite on a little-endian target (x86_64) via `cross`. Redundant on + an x86/ARM host (the native suite already covers it) but keeps `task test-endian` + host-independent. Requires Docker (or Podman). + silent: true + deps: + - ensure-toolchain + - ensure-cross + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cross test -p binary-ensemble --target x86_64-unknown-linux-gnu + + test-endian: + desc: >- + Run the ben test suite on one big-endian and one little-endian target, so wire-format + endianness is proven regardless of the development machine + silent: true + cmds: + - task: test-big-endian + - task: test-little-endian format-rust: desc: Format Rust code diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 1e6f556..8c02fbd 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -44,16 +44,36 @@ fn bin_path(name: &str) -> &'static str { } } +/// Build a `Command` for one of the workspace CLIs, honoring any cross-compilation runner cargo +/// was configured with (e.g. `CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER` inside a `cross` +/// container running the suite under QEMU). Cargo routes the *test binary itself* through that +/// runner automatically, but subprocesses spawned by tests exec directly — without this shim, a +/// foreign-architecture CLI binary is handed straight to the host kernel, which rejects it (or +/// hands it to a shell that mangles the ELF as a script). The variable is only ever set in +/// cross-compilation environments, so native runs take the plain-exec path. +fn cli_command(bin: &str) -> Command { + let runner = std::env::vars().find_map(|(key, value)| { + (key.starts_with("CARGO_TARGET_") && key.ends_with("_RUNNER") && !value.trim().is_empty()) + .then_some(value) + }); + match runner { + Some(runner) => { + let mut parts = runner.split_whitespace(); + let mut cmd = Command::new(parts.next().expect("runner value is non-empty")); + cmd.args(parts); + cmd.arg(bin_path(bin)); + cmd + } + None => Command::new(bin_path(bin)), + } +} + fn run(bin: &str, args: &[&str], cwd: &Path) -> Output { - Command::new(bin_path(bin)) - .current_dir(cwd) - .args(args) - .output() - .unwrap() + cli_command(bin).current_dir(cwd).args(args).output().unwrap() } fn run_with_stdin(bin: &str, args: &[&str], cwd: &Path, stdin: &[u8]) -> Output { - let mut child = Command::new(bin_path(bin)) + let mut child = cli_command(bin) .current_dir(cwd) .args(args) .stdin(Stdio::piped()) From 7f6b0f841ef98be6c597809fc08145b3339aefd9 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:39:26 -0600 Subject: [PATCH 155/221] remove the from_parts from twodelta --- ben/src/codec/frames/encode.rs | 50 ---------------------------------- ben/src/codec/frames/tests.rs | 10 ------- 2 files changed, 60 deletions(-) diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs index fccfa67..e006e6b 100644 --- a/ben/src/codec/frames/encode.rs +++ b/ben/src/codec/frames/encode.rs @@ -257,56 +257,6 @@ impl BenEncodeFrame { }) } - /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload. - /// - /// Unlike [`BenEncodeFrame::try_from_parts`], this performs no validation: zero run-length - /// slots anywhere in the payload are silently dropped, the bit width is trusted, and the - /// payload length is not checked against the recovered run count. On a corrupt payload that - /// can silently shift the run alternation and decode to a plausible-but-wrong delta. - #[deprecated( - note = "performs no payload validation and silently drops zero run-length slots; \ - use try_from_parts" - )] - pub fn from_parts( - pair: (u16, u16), - max_len_bit_count: u8, - payload: Vec, - count: u16, - ) -> Self { - let n_bytes = payload.len() as u32; - let raw_bytes = assemble_twodelta_raw_bytes(pair, max_len_bit_count, &payload, count); - - let mut run_length_vector = Vec::new(); - let mut buffer: u32 = 0; - let mut n_bits_in_buff: u16 = 0; - - for &byte in payload[..n_bytes as usize].iter() { - // Place the incoming byte at the top of the 32-bit shift register, below any bits - // already buffered. The explicit shift is endian-independent; extraction below always - // reads from the register's high end. - buffer |= ((byte as u32) << 24) >> n_bits_in_buff; - n_bits_in_buff += 8; - - while n_bits_in_buff >= max_len_bit_count as u16 { - let item = (buffer >> (32 - max_len_bit_count)) as u16; - buffer <<= max_len_bit_count; - n_bits_in_buff -= max_len_bit_count as u16; - if item > 0 { - run_length_vector.push(item); - } - } - } - - Self::TwoDelta { - pair, - max_len_bit_count, - n_bytes, - run_length_vector, - raw_bytes, - count, - } - } - /// Borrow the serialized frame bytes. pub fn as_slice(&self) -> &[u8] { match self { diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index c6736fe..400544c 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -484,16 +484,6 @@ fn twodelta_try_from_parts_rejects_inconsistent_payload_len() { assert!(err.to_string().contains("inconsistent")); } -#[test] -#[allow(deprecated)] -fn twodelta_from_parts_legacy_zero_drop_pinned_until_removal() { - // The deprecated constructor's documented behavior: zero slots are dropped wherever they - // appear. Pinned so the behavior cannot drift while the API survives its deprecation window; - // new code uses try_from_parts, which rejects this same payload. - let frame = BenEncodeFrame::from_parts((1, 2), 4, vec![0x0F], 1); - let (_, _, _, runs, _, _) = unwrap_encode_twodelta(frame); - assert_eq!(runs, vec![15], "interior zero slot is silently dropped"); -} #[test] fn twodelta_from_run_lengths_single_run() { From 38f9f43554ec46c3438fdd8c9eab83dde02e61b0 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:05:39 -0600 Subject: [PATCH 156/221] convert panics into errors for better handling --- ben/src/codec/decode/ben.rs | 5 +- ben/src/codec/decode/tests/mod.rs | 2 +- ben/src/codec/decode/tests/twodelta.rs | 8 +- ben/src/codec/encode/errors.rs | 6 ++ ben/src/codec/encode/jsonl.rs | 8 +- ben/src/codec/encode/tests.rs | 14 +-- ben/src/codec/encode/twodelta.rs | 12 +-- ben/src/codec/frames/encode.rs | 84 ++++++++++-------- ben/src/codec/frames/tests.rs | 101 ++++++++++++---------- ben/src/codec/translate/mod.rs | 2 +- ben/src/io/reader/stream_reader/frames.rs | 7 +- ben/src/io/writer/stream_writer/ben.rs | 15 ++-- ben/src/io/writer/stream_writer/xben.rs | 32 +++---- ben/src/ops/relabel/mod.rs | 2 +- ben/src/ops/relabel/tests.rs | 31 ++++--- ben/tests/test_boundary_proptest.rs | 6 +- ben/tests/test_cli.rs | 6 +- ben/tests/test_coverage.rs | 14 +-- ben/tests/test_fixture_mutations.rs | 6 +- ben/tests/test_impls_pipeline.rs | 5 +- ben/tests/test_stress_edges.rs | 7 +- 21 files changed, 202 insertions(+), 171 deletions(-) diff --git a/ben/src/codec/decode/ben.rs b/ben/src/codec/decode/ben.rs index 9f4cd9f..042838a 100644 --- a/ben/src/codec/decode/ben.rs +++ b/ben/src/codec/decode/ben.rs @@ -232,7 +232,7 @@ mod tests { // 100,000 pairs sit above MAX_RLE_PREALLOC_PAIRS, so the output vector must grow past its // clamped initial reservation without losing or reordering pairs. let runs = vec![(1u16, 1u16); 100_000]; - let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None).unwrap(); let decoded = decode_ben_line( Cursor::new(frame.payload()), frame.max_val_bit_count().unwrap(), @@ -250,7 +250,8 @@ mod tests { // 2049 runs of 65,535 elements expand past the 2^27 sanity bound; each run is // individually legal, so only the bound on the sum catches this. let frame = - BenEncodeFrame::from_rle(vec![(1u16, u16::MAX); 2049], BenVariant::Standard, None); + BenEncodeFrame::from_rle(vec![(1u16, u16::MAX); 2049], BenVariant::Standard, None) + .unwrap(); let err = decode_ben_line( Cursor::new(frame.payload()), frame.max_val_bit_count().unwrap(), diff --git a/ben/src/codec/decode/tests/mod.rs b/ben/src/codec/decode/tests/mod.rs index 184f603..f714cf4 100644 --- a/ben/src/codec/decode/tests/mod.rs +++ b/ben/src/codec/decode/tests/mod.rs @@ -211,7 +211,7 @@ fn decode_twodelta_frame_rejects_zero_run_length() { // The delta paint loop assumes no zero-length runs exist (a zero would underflow its // per-run countdown and mispaint positions), so a frame carrying one is rejected up front. - let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 0, 1], Some(1)); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 0, 1], Some(1)).unwrap(); let err = decode_twodelta_frame(vec![1, 2], &frame).unwrap_err(); assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("zero")); diff --git a/ben/src/codec/decode/tests/twodelta.rs b/ben/src/codec/decode/tests/twodelta.rs index 9c6921d..006f6ea 100644 --- a/ben/src/codec/decode/tests/twodelta.rs +++ b/ben/src/codec/decode/tests/twodelta.rs @@ -85,7 +85,7 @@ fn apply_runs_alternating_single_positions() { #[test] fn decode_twodelta_frame_basic() { - let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None).unwrap(); let prev = vec![1u16, 2, 1, 2]; let result = decode_twodelta_frame(prev, &frame).unwrap(); assert_eq!(result, vec![1, 1, 2, 2]); @@ -95,7 +95,7 @@ fn decode_twodelta_frame_basic() { fn decode_twodelta_frame_full_swap() { // pair=(2,1) means run starts with value 2; run_lengths=[2,2] prev [1,2,1,2]: pair positions // 0,1,2,3 → [2,2,1,1] - let frame = BenEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None); + let frame = BenEncodeFrame::from_run_lengths((2, 1), vec![2, 2], None).unwrap(); let prev = vec![1u16, 2, 1, 2]; let result = decode_twodelta_frame(prev, &frame).unwrap(); assert_eq!(result, vec![2, 2, 1, 1]); @@ -105,8 +105,8 @@ fn decode_twodelta_frame_full_swap() { fn decode_twodelta_frame_chain_returns_to_original() { // Frame 1: (1,2) run=[2,2] applied to [1,2,1,2] → [1,1,2,2] Frame 2: (1,2) run=[1,1,1,1] // applied to [1,1,2,2] → [1,2,1,2] - let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); - let f2 = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None); + let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None).unwrap(); + let f2 = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1, 1, 1], None).unwrap(); let initial = vec![1u16, 2, 1, 2]; let after_f1 = decode_twodelta_frame(initial.clone(), &f1).unwrap(); assert_eq!(after_f1, vec![1, 1, 2, 2]); diff --git a/ben/src/codec/encode/errors.rs b/ben/src/codec/encode/errors.rs index 0251251..6fca364 100644 --- a/ben/src/codec/encode/errors.rs +++ b/ben/src/codec/encode/errors.rs @@ -25,6 +25,12 @@ pub enum EncodeError { )] TwoDeltaRunTooLong, + #[error( + "frame payload of {runs} run(s) at {bits_per_run} bit(s)/run overflows the u32 \ + n_bytes field" + )] + FramePayloadOverflow { runs: usize, bits_per_run: u64 }, + #[error("TwoDelta received identical assignment to previous frame")] TwoDeltaIdentical, diff --git a/ben/src/codec/encode/jsonl.rs b/ben/src/codec/encode/jsonl.rs index 59f55c4..9a92eb4 100644 --- a/ben/src/codec/encode/jsonl.rs +++ b/ben/src/codec/encode/jsonl.rs @@ -45,12 +45,10 @@ pub fn encode_jsonl_to_xben( let mut ben_encoder = BenStreamWriter::for_xben_with_encoder(encoder, variant, chunk_size)?; - let mut line_num = 1u64; let spinner = Spinner::new("Encoding line"); - for line_result in reader.lines() { + for (line_num, line_result) in (1u64..).zip(reader.lines()) { spinner.set_count(line_num); - line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).map_err(|e| { io::Error::new( @@ -85,12 +83,10 @@ pub fn encode_jsonl_to_ben( writer: W, variant: BenVariant, ) -> Result<()> { - let mut line_num = 1u64; let spinner = Spinner::new("Encoding line"); let mut ben_encoder = BenStreamWriter::for_ben(writer, variant)?; - for line_result in reader.lines() { + for (line_num, line_result) in (1u64..).zip(reader.lines()) { spinner.set_count(line_num); - line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).map_err(|e| { io::Error::new( diff --git a/ben/src/codec/encode/tests.rs b/ben/src/codec/encode/tests.rs index 7d56d4b..4e61c73 100644 --- a/ben/src/codec/encode/tests.rs +++ b/ben/src/codec/encode/tests.rs @@ -267,12 +267,14 @@ fn test_encode_jsonl_to_ben_len_65535() { #[test] fn test_encode_ben_vec_from_assign_matches_rle_entrypoint() { let assign_vec = vec![4u16, 4, 4, 1, 1, 3, 3, 3, 2]; - let direct = BenEncodeFrame::from_assignment(assign_vec.clone(), BenVariant::Standard, None); + let direct = + BenEncodeFrame::from_assignment(assign_vec.clone(), BenVariant::Standard, None).unwrap(); let via_rle = BenEncodeFrame::from_rle( crate::util::rle::assign_to_rle(assign_vec), BenVariant::Standard, None, - ); + ) + .unwrap(); assert_eq!(direct, via_rle); } @@ -474,12 +476,9 @@ fn encode_jsonl_to_ben_multiple_simple_lines() { } fn encode_jsonl_to_ben32(reader: R, mut writer: W) -> std::io::Result<()> { - let mut line_num = 1; - writer.write_all("STANDARD BEN FILE".as_bytes())?; - for line_result in reader.lines() { + for (line_num, line_result) in (1..).zip(reader.lines()) { eprint!("Encoding line: {}\r", line_num); - line_num += 1; let line = line_result?; let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line"); @@ -1561,7 +1560,8 @@ fn bit_packing_boundary_widths_round_trip() { #[test] fn bit_packing_boundary_widths_pin_encoder_choice() { fn standard_widths(assignment: Vec) -> (u8, u8) { - let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None); + let frame = + BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None).unwrap(); if let BenEncodeFrame::Standard { max_val_bit_count, max_len_bit_count, diff --git a/ben/src/codec/encode/twodelta.rs b/ben/src/codec/encode/twodelta.rs index bbd9789..f219a0c 100644 --- a/ben/src/codec/encode/twodelta.rs +++ b/ben/src/codec/encode/twodelta.rs @@ -339,11 +339,7 @@ fn construct_twodelta_frame_from_pair_and_mask_hints( previous_masks.insert(pair.first_run_district(), new_mask_a); previous_masks.insert(pair.second_run_district(), new_mask_b); - Ok(BenEncodeFrame::from_run_lengths( - pair.as_ordered_pair(), - run_lengths, - count, - )) + BenEncodeFrame::from_run_lengths(pair.as_ordered_pair(), run_lengths, count) } /// Build a TwoDelta frame using only pre-computed position masks, inferring the pair from the first @@ -454,11 +450,7 @@ fn construct_twodelta_frame_from_scratch( } run_lengths.push(active_run_length); - Ok(BenEncodeFrame::from_run_lengths( - enc_pair, - run_lengths, - count, - )) + BenEncodeFrame::from_run_lengths(enc_pair, run_lengths, count) } /// Encode a transition between two assignment vectors as a TwoDelta frame. diff --git a/ben/src/codec/frames/encode.rs b/ben/src/codec/frames/encode.rs index e006e6b..26f93d8 100644 --- a/ben/src/codec/frames/encode.rs +++ b/ben/src/codec/frames/encode.rs @@ -1,4 +1,5 @@ use super::compress_rle_to_ben_bytes; +use crate::codec::encode::errors::EncodeError; use crate::util::rle::assign_to_rle; use crate::BenVariant; use std::io; @@ -79,13 +80,17 @@ impl BenEncodeFrame { /// /// `count` is ignored for `Standard` and defaults to `1` for `MkvChain`. /// - /// # Panics + /// # Errors /// - /// Panics if `variant` is [`BenVariant::TwoDelta`]; use [`BenEncodeFrame::from_run_lengths`] - /// for that. Also panics if the packed payload would exceed the `u32` byte length the frame - /// header can carry — that bound sits far beyond any real assignment, so reaching it means - /// the caller's input is corrupt rather than merely large. - pub fn from_rle(runs: Vec<(u16, u16)>, variant: BenVariant, count: Option) -> Self { + /// Returns `InvalidInput` if `variant` is [`BenVariant::TwoDelta`] (use + /// [`BenEncodeFrame::from_run_lengths`] for that) or if the packed payload would exceed the + /// `u32` byte length the frame header can carry — a bound far beyond any real assignment, + /// so reaching it means the input is corrupt rather than merely large. + pub fn from_rle( + runs: Vec<(u16, u16)>, + variant: BenVariant, + count: Option, + ) -> io::Result { let (max_val, max_len) = runs .iter() .fold((0u16, 0u16), |(max_val, max_len), &(val, len)| { @@ -95,54 +100,58 @@ impl BenEncodeFrame { let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); let assign_bits = (max_val_bit_count + max_len_bit_count) as u64; let payload_bits = assign_bits * runs.len() as u64; - let n_bytes = u32::try_from(payload_bits.div_ceil(8)).unwrap_or_else(|_| { - panic!( - "BEN frame payload of {} run(s) at {assign_bits} bit(s)/run overflows the u32 \ - n_bytes field", - runs.len() + let n_bytes = u32::try_from(payload_bits.div_ceil(8)).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + EncodeError::FramePayloadOverflow { + runs: runs.len(), + bits_per_run: assign_bits, + }, ) - }); + })?; let mut raw_bytes = compress_rle_to_ben_bytes(max_val_bit_count, max_len_bit_count, n_bytes, &runs); match variant { - BenVariant::Standard => Self::Standard { + BenVariant::Standard => Ok(Self::Standard { runs, max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes, - }, + }), BenVariant::MkvChain => { let count = count.unwrap_or(1); raw_bytes.extend(count.to_be_bytes()); - Self::MkvChain { + Ok(Self::MkvChain { runs, max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes, count, - } + }) } - BenVariant::TwoDelta => panic!( + BenVariant::TwoDelta => Err(io::Error::new( + io::ErrorKind::InvalidInput, "BenEncodeFrame::from_rle does not support TwoDelta; \ use BenEncodeFrame::from_run_lengths instead", - ), + )), } } /// Build a `Standard` or `MkvChain` frame from an assignment vector. /// - /// # Panics + /// # Errors /// - /// Panics if `variant` is [`BenVariant::TwoDelta`]; TwoDelta frames cannot be derived from a - /// single assignment vector. + /// Returns `InvalidInput` if `variant` is [`BenVariant::TwoDelta`] (TwoDelta frames cannot be + /// derived from a single assignment vector) or if the packed payload would overflow the + /// frame header's `u32` byte length; see [`BenEncodeFrame::from_rle`]. pub fn from_assignment( assignment: impl AsRef<[u16]>, variant: BenVariant, count: Option, - ) -> Self { + ) -> io::Result { Self::from_rle(assign_to_rle(assignment), variant, count) } @@ -150,29 +159,31 @@ impl BenEncodeFrame { /// /// `count` defaults to `1` if `None`. /// - /// # Panics + /// # Errors /// - /// Panics if the packed payload would exceed the `u32` byte length the frame header can - /// carry — that bound sits far beyond any real delta, so reaching it means the caller's - /// input is corrupt rather than merely large. + /// Returns `InvalidInput` if the packed payload would exceed the `u32` byte length the frame + /// header can carry — a bound far beyond any real delta, so reaching it means the input is + /// corrupt rather than merely large. pub fn from_run_lengths( pair: (u16, u16), run_length_vector: Vec, count: Option, - ) -> Self { + ) -> io::Result { let count = count.unwrap_or(1); let max_len = run_length_vector.iter().copied().max().unwrap_or(0); let max_len_bit_count = (16 - max_len.leading_zeros() as u8).max(1); let payload_bits = max_len_bit_count as u64 * run_length_vector.len() as u64; - let n_bytes = u32::try_from(payload_bits.div_ceil(8)).unwrap_or_else(|_| { - panic!( - "TwoDelta frame payload of {} run length(s) at {max_len_bit_count} bit(s) each \ - overflows the u32 n_bytes field", - run_length_vector.len() + let n_bytes = u32::try_from(payload_bits.div_ceil(8)).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + EncodeError::FramePayloadOverflow { + runs: run_length_vector.len(), + bits_per_run: u64::from(max_len_bit_count), + }, ) - }); + })?; // pair_bytes (4) + max_len_bit_count (1) + n_bytes (4) + payload (n_bytes) + count (2) let mut raw_bytes = Vec::with_capacity((n_bytes + 11) as usize); @@ -204,14 +215,14 @@ impl BenEncodeFrame { raw_bytes.extend(count.to_be_bytes()); - Self::TwoDelta { + Ok(Self::TwoDelta { pair, max_len_bit_count, n_bytes, run_length_vector, raw_bytes, count, - } + }) } /// Reconstruct a `TwoDelta` frame from already-parsed header fields and a raw payload, @@ -237,8 +248,7 @@ impl BenEncodeFrame { count: u16, ) -> io::Result { use super::decode::{ - check_twodelta_frame_consistency, check_twodelta_run_width, - unpack_twodelta_run_lengths, + check_twodelta_frame_consistency, check_twodelta_run_width, unpack_twodelta_run_lengths, }; check_twodelta_run_width(max_len_bit_count)?; diff --git a/ben/src/codec/frames/tests.rs b/ben/src/codec/frames/tests.rs index 400544c..b282825 100644 --- a/ben/src/codec/frames/tests.rs +++ b/ben/src/codec/frames/tests.rs @@ -292,7 +292,7 @@ fn ben_decode_mkv_non_eof_read_error_propagates() { #[test] fn ben_decode_twodelta_from_reader() { // Build a TwoDelta encode frame, then read it back as a decode frame. - let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(5)); + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(5)).unwrap(); let bytes = encoded.into_bytes(); let mut cursor = io::Cursor::new(bytes); @@ -332,7 +332,7 @@ fn ben_decode_twodelta_invalid_max_len_bits_zero_errors() { #[test] fn ben_decode_twodelta_count_max_u16() { - let encoded = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1], Some(u16::MAX)); + let encoded = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1], Some(u16::MAX)).unwrap(); let bytes = encoded.into_bytes(); let mut cursor = io::Cursor::new(bytes); let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) @@ -347,7 +347,7 @@ fn ben_decode_twodelta_count_max_u16() { #[test] fn encode_from_rle_standard_carries_runs_and_bytes() { let runs = vec![(1u16, 2u16), (2, 3), (3, 1)]; - let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None).unwrap(); let (got_runs, mvb, mlb, n, raw) = unwrap_encode_standard(frame); assert_eq!(got_runs, runs); assert_eq!(mvb, 2); // max value 3 fits in 2 bits @@ -361,7 +361,7 @@ fn encode_from_rle_standard_carries_runs_and_bytes() { #[test] fn encode_from_rle_mkv_count_none_defaults_to_one() { let runs = vec![(1u16, 2u16), (2, 3)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, None).unwrap(); let (_, _, _, _, raw, count) = unwrap_encode_mkv(frame); assert_eq!(count, 1); let trailing = &raw[raw.len() - 2..]; @@ -371,7 +371,7 @@ fn encode_from_rle_mkv_count_none_defaults_to_one() { #[test] fn encode_from_rle_mkv_with_count() { let runs = vec![(1u16, 2u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(7)); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(7)).unwrap(); let (_, _, _, _, raw, count) = unwrap_encode_mkv(frame); assert_eq!(count, 7); let trailing = &raw[raw.len() - 2..]; @@ -379,16 +379,19 @@ fn encode_from_rle_mkv_with_count() { } #[test] -#[should_panic(expected = "TwoDelta")] -fn encode_from_rle_twodelta_panics() { +fn encode_from_rle_twodelta_returns_invalid_input() { + // TwoDelta frames cannot be built from a bare RLE vector; the misuse surfaces as a typed + // error (not a panic) so bindings can present it cleanly. let runs = vec![(1u16, 2u16)]; - let _ = BenEncodeFrame::from_rle(runs, BenVariant::TwoDelta, None); + let err = BenEncodeFrame::from_rle(runs, BenVariant::TwoDelta, None).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("from_run_lengths")); } #[test] fn encode_single_run_frame() { let runs = vec![(5u16, 1u16)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None).unwrap(); let (_, mvb, mlb, _, _) = unwrap_encode_standard(frame); assert_eq!(mvb, 3); // 5 fits in 3 bits assert_eq!(mlb, 1); // 1 fits in 1 bit @@ -397,7 +400,7 @@ fn encode_single_run_frame() { #[test] fn encode_large_values_near_u16_max() { let runs = vec![(u16::MAX, u16::MAX)]; - let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(runs, BenVariant::Standard, None).unwrap(); let (_, mvb, mlb, _, _) = unwrap_encode_standard(frame); assert_eq!(mvb, 16); assert_eq!(mlb, 16); @@ -408,7 +411,7 @@ fn encode_large_values_near_u16_max() { #[test] fn encode_from_assignment_standard() { let assignment = vec![1u16, 1, 2, 2, 3]; - let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); + let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None).unwrap(); let (runs, _, _, _, _) = unwrap_encode_standard(frame); assert_eq!(runs, vec![(1, 2), (2, 2), (3, 1)]); } @@ -416,7 +419,8 @@ fn encode_from_assignment_standard() { #[test] fn encode_from_assignment_mkv_carries_count() { let assignment = vec![1u16, 1, 2, 2]; - let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::MkvChain, Some(9)); + let frame = + BenEncodeFrame::from_assignment(&assignment, BenVariant::MkvChain, Some(9)).unwrap(); let (_, _, _, _, _, count) = unwrap_encode_mkv(frame); assert_eq!(count, 9); } @@ -425,7 +429,7 @@ fn encode_from_assignment_mkv_carries_count() { #[test] fn twodelta_from_run_lengths_count_none_defaults_to_one() { - let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], None).unwrap(); let (pair, _, _, runs, _, count) = unwrap_encode_twodelta(frame); assert_eq!(pair, (1, 2)); assert_eq!(runs, vec![2, 2]); @@ -434,7 +438,7 @@ fn twodelta_from_run_lengths_count_none_defaults_to_one() { #[test] fn twodelta_from_run_lengths_then_try_from_parts_roundtrip() { - let original = BenEncodeFrame::from_run_lengths((3, 4), vec![5, 5, 5], Some(2)); + let original = BenEncodeFrame::from_run_lengths((3, 4), vec![5, 5, 5], Some(2)).unwrap(); let bytes = original.as_slice().to_vec(); let (pair, max_len_bits, n_bytes, _, _, count) = unwrap_encode_twodelta(original.clone()); let payload_slice = &bytes[9..9 + n_bytes as usize]; @@ -444,12 +448,15 @@ fn twodelta_from_run_lengths_then_try_from_parts_roundtrip() { assert_eq!(rb_pair, pair); assert_eq!(rb_runs, vec![5, 5, 5]); assert_eq!(rb_count, count); - assert_eq!(rb_raw, bytes, "rebuilt frame must serialize byte-identically"); + assert_eq!( + rb_raw, bytes, + "rebuilt frame must serialize byte-identically" + ); } #[test] fn twodelta_try_from_parts_preserves_nontrivial_count() { - let original = BenEncodeFrame::from_run_lengths((1, 9), vec![3, 3], Some(42)); + let original = BenEncodeFrame::from_run_lengths((1, 9), vec![3, 3], Some(42)).unwrap(); let bytes = original.as_slice().to_vec(); let (_, max_len_bits, n_bytes, _, _, _) = unwrap_encode_twodelta(original); let payload = bytes[9..9 + n_bytes as usize].to_vec(); @@ -484,10 +491,9 @@ fn twodelta_try_from_parts_rejects_inconsistent_payload_len() { assert!(err.to_string().contains("inconsistent")); } - #[test] fn twodelta_from_run_lengths_single_run() { - let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![5], Some(3)); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![5], Some(3)).unwrap(); let (pair, _, _, runs, _, count) = unwrap_encode_twodelta(frame); assert_eq!(pair, (1, 2)); assert_eq!(runs, vec![5]); @@ -499,7 +505,7 @@ fn twodelta_from_run_lengths_single_run() { #[test] fn standard_encode_decode_roundtrip() { let runs = vec![(1u16, 4u16), (2, 3), (3, 1)]; - let encoded = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None); + let encoded = BenEncodeFrame::from_rle(runs.clone(), BenVariant::Standard, None).unwrap(); let bytes = encoded.into_bytes(); let mut cursor = io::Cursor::new(bytes); @@ -516,7 +522,7 @@ fn standard_encode_decode_roundtrip() { #[test] fn mkv_encode_decode_roundtrip() { let runs = vec![(1u16, 4u16), (2, 3)]; - let encoded = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(11)); + let encoded = BenEncodeFrame::from_rle(runs, BenVariant::MkvChain, Some(11)).unwrap(); let bytes = encoded.into_bytes(); let mut cursor = io::Cursor::new(bytes); @@ -529,7 +535,7 @@ fn mkv_encode_decode_roundtrip() { #[test] fn twodelta_encode_decode_roundtrip() { - let encoded = BenEncodeFrame::from_run_lengths((4, 7), vec![3, 3, 3], Some(8)); + let encoded = BenEncodeFrame::from_run_lengths((4, 7), vec![3, 3, 3], Some(8)).unwrap(); let bytes = encoded.into_bytes(); let mut cursor = io::Cursor::new(bytes); @@ -546,8 +552,8 @@ fn twodelta_encode_decode_roundtrip() { #[test] fn standard_decode_two_frames_back_to_back() { - let f1 = BenEncodeFrame::from_rle(vec![(1, 2), (2, 1)], BenVariant::Standard, None); - let f2 = BenEncodeFrame::from_rle(vec![(3, 1), (4, 2)], BenVariant::Standard, None); + let f1 = BenEncodeFrame::from_rle(vec![(1, 2), (2, 1)], BenVariant::Standard, None).unwrap(); + let f2 = BenEncodeFrame::from_rle(vec![(3, 1), (4, 2)], BenVariant::Standard, None).unwrap(); let mut bytes = f1.into_bytes(); bytes.extend(f2.into_bytes()); @@ -564,8 +570,8 @@ fn standard_decode_two_frames_back_to_back() { #[test] fn mkv_decode_two_frames_back_to_back() { - let f1 = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::MkvChain, Some(3)); - let f2 = BenEncodeFrame::from_rle(vec![(2, 4)], BenVariant::MkvChain, Some(5)); + let f1 = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::MkvChain, Some(3)).unwrap(); + let f2 = BenEncodeFrame::from_rle(vec![(2, 4)], BenVariant::MkvChain, Some(5)).unwrap(); let mut bytes = f1.into_bytes(); bytes.extend(f2.into_bytes()); @@ -582,8 +588,8 @@ fn mkv_decode_two_frames_back_to_back() { #[test] fn twodelta_decode_two_frames_back_to_back() { - let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)); - let f2 = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1, 1, 1], Some(1)); + let f1 = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)).unwrap(); + let f2 = BenEncodeFrame::from_run_lengths((3, 4), vec![1, 1, 1, 1], Some(1)).unwrap(); let mut bytes = f1.into_bytes(); bytes.extend(f2.into_bytes()); @@ -602,7 +608,7 @@ fn twodelta_decode_two_frames_back_to_back() { #[test] fn decode_count_returns_one_for_standard() { - let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() @@ -612,21 +618,21 @@ fn decode_count_returns_one_for_standard() { #[test] fn decode_variant_method() { - let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() .unwrap(); assert_eq!(frame.variant(), BenVariant::Standard); - let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(2)); + let encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(2)).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); assert_eq!(frame.variant(), BenVariant::MkvChain); - let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() @@ -636,21 +642,22 @@ fn decode_variant_method() { #[test] fn decode_raw_bytes_returns_some_for_snapshot_arms_none_for_twodelta() { - let std_encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let std_encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None).unwrap(); let mut cursor = io::Cursor::new(std_encoded.into_bytes()); let std_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() .unwrap(); assert!(std_frame.raw_bytes().is_some()); - let mkv_encoded = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(1)); + let mkv_encoded = + BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(1)).unwrap(); let mut cursor = io::Cursor::new(mkv_encoded.into_bytes()); let mkv_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::MkvChain) .unwrap() .unwrap(); assert!(mkv_frame.raw_bytes().is_some()); - let td_encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let td_encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)).unwrap(); let mut cursor = io::Cursor::new(td_encoded.into_bytes()); let td_frame = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() @@ -662,7 +669,8 @@ fn decode_raw_bytes_returns_some_for_snapshot_arms_none_for_twodelta() { #[test] fn encode_as_slice_to_bytes_into_bytes_agree() { - let encoded = BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None); + let encoded = + BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None).unwrap(); let s = encoded.as_slice().to_vec(); let t = encoded.to_bytes(); let i = encoded.into_bytes(); @@ -672,31 +680,31 @@ fn encode_as_slice_to_bytes_into_bytes_agree() { #[test] fn encode_count_method() { - let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None).unwrap(); assert_eq!(std_frame.count(), 1); - let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(7)); + let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, Some(7)).unwrap(); assert_eq!(mkv_frame.count(), 7); - let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(13)); + let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(13)).unwrap(); assert_eq!(td_frame.count(), 13); } #[test] fn encode_variant_method() { - let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None); + let std_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::Standard, None).unwrap(); assert_eq!(std_frame.variant(), BenVariant::Standard); - let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, None); + let mkv_frame = BenEncodeFrame::from_rle(vec![(1, 1)], BenVariant::MkvChain, None).unwrap(); assert_eq!(mkv_frame.variant(), BenVariant::MkvChain); - let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], None); + let td_frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], None).unwrap(); assert_eq!(td_frame.variant(), BenVariant::TwoDelta); } #[test] fn encode_payload_returns_packed_payload_region() { - let frame = BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(vec![(1, 2), (3, 4)], BenVariant::Standard, None).unwrap(); let bytes = frame.as_slice().to_vec(); let payload = frame.payload().to_vec(); // For Standard, payload is bytes[6..6+n_bytes]. @@ -706,7 +714,7 @@ fn encode_payload_returns_packed_payload_region() { #[test] fn encode_as_ref_and_deref_match_as_slice() { - let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None).unwrap(); let s = frame.as_slice(); let r: &[u8] = frame.as_ref(); assert_eq!(s, r); @@ -716,7 +724,7 @@ fn encode_as_ref_and_deref_match_as_slice() { #[test] fn encode_partial_eq_vec_both_directions() { - let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(vec![(1, 2)], BenVariant::Standard, None).unwrap(); let bytes: Vec = frame.as_slice().to_vec(); assert_eq!(frame, bytes); assert_eq!(bytes, frame); @@ -727,7 +735,8 @@ fn encode_partial_eq_vec_both_directions() { #[test] fn decode_expand_standard_assignment() { // An assignment of [1, 1, 2, 2, 3] becomes RLE [(1,2),(2,2),(3,1)]. - let encoded = BenEncodeFrame::from_assignment([1u16, 1, 2, 2, 3], BenVariant::Standard, None); + let encoded = + BenEncodeFrame::from_assignment([1u16, 1, 2, 2, 3], BenVariant::Standard, None).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::Standard) .unwrap() @@ -738,7 +747,7 @@ fn decode_expand_standard_assignment() { #[test] fn decode_expand_twodelta_requires_prev() { - let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)); + let encoded = BenEncodeFrame::from_run_lengths((1, 2), vec![2, 2], Some(1)).unwrap(); let mut cursor = io::Cursor::new(encoded.into_bytes()); let decoded = BenDecodeFrame::from_reader(&mut cursor, BenVariant::TwoDelta) .unwrap() diff --git a/ben/src/codec/translate/mod.rs b/ben/src/codec/translate/mod.rs index 00aaa98..58669b5 100644 --- a/ben/src/codec/translate/mod.rs +++ b/ben/src/codec/translate/mod.rs @@ -76,7 +76,7 @@ pub(crate) fn ben32_to_ben_line( })); } - Ok(BenEncodeFrame::from_rle(ben32_rle, BenVariant::from(variant), Some(count)).into_bytes()) + Ok(BenEncodeFrame::from_rle(ben32_rle, BenVariant::from(variant), Some(count))?.into_bytes()) } /// Read one 4-byte ben32 word, distinguishing a clean end of input from a truncated word. diff --git a/ben/src/io/reader/stream_reader/frames.rs b/ben/src/io/reader/stream_reader/frames.rs index a2958dd..99c3cee 100644 --- a/ben/src/io/reader/stream_reader/frames.rs +++ b/ben/src/io/reader/stream_reader/frames.rs @@ -90,11 +90,14 @@ impl Iterator for BenStreamFrameReader { silent, ) { Some(Ok((assignment, count))) => { - let encoded = BenEncodeFrame::from_assignment( + let encoded = match BenEncodeFrame::from_assignment( &assignment, BenVariant::Standard, None, - ); + ) { + Ok(encoded) => encoded, + Err(e) => return Some(Err(e)), + }; let (max_val_bit_count, max_len_bit_count, n_bytes, raw_bytes) = match encoded { BenEncodeFrame::Standard { diff --git a/ben/src/io/writer/stream_writer/ben.rs b/ben/src/io/writer/stream_writer/ben.rs index fb0de0f..e3e2c3e 100644 --- a/ben/src/io/writer/stream_writer/ben.rs +++ b/ben/src/io/writer/stream_writer/ben.rs @@ -3,8 +3,8 @@ use std::collections::HashMap; use std::io::{self, Write}; -use crate::codec::encode::errors::is_twodelta_run_too_long; use crate::codec::encode::encode_twodelta_frame_with_hint; +use crate::codec::encode::errors::is_twodelta_run_too_long; use crate::codec::BenEncodeFrame; use crate::BenVariant; @@ -54,14 +54,15 @@ impl BenState { fn encode_and_write_frame(&mut self, assignment: &[u16], count: u16) -> io::Result<()> { match self.variant { BenVariant::Standard => { - let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None); + let frame = + BenEncodeFrame::from_assignment(assignment, BenVariant::Standard, None)?; for _ in 0..count { self.writer.write_all(frame.as_slice())?; } } BenVariant::MkvChain => { let frame = - BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count)); + BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count))?; self.writer.write_all(frame.as_slice())?; } BenVariant::TwoDelta => { @@ -129,7 +130,7 @@ impl BenState { for (idx, &val) in assignment.iter().enumerate() { self.previous_masks.entry(val).or_default().push(idx); } - let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count)); + let frame = BenEncodeFrame::from_assignment(assignment, BenVariant::MkvChain, Some(count))?; self.writer.write_all(&[BEN_TWODELTA_SNAPSHOT_TAG])?; self.writer.write_all(frame.as_slice())?; Ok(()) @@ -168,9 +169,5 @@ impl BenState { pub(crate) fn twodelta_repeat_frame(assignment: &[u16], count: u16) -> io::Result { let (pair, run_lengths) = twodelta_repeat_runs(assignment)?; - Ok(BenEncodeFrame::from_run_lengths( - pair, - run_lengths, - Some(count), - )) + BenEncodeFrame::from_run_lengths(pair, run_lengths, Some(count)) } diff --git a/ben/src/io/writer/stream_writer/xben.rs b/ben/src/io/writer/stream_writer/xben.rs index 092b99b..0f0b4d3 100644 --- a/ben/src/io/writer/stream_writer/xben.rs +++ b/ben/src/io/writer/stream_writer/xben.rs @@ -9,8 +9,8 @@ use xz2::write::XzEncoder; use crate::codec::decode::decode_ben_line; use crate::codec::encode::errors::is_twodelta_run_too_long; use crate::codec::encode::{encode_ben32_assignments, encode_twodelta_frame_with_hint}; -use crate::codec::translate::ben_to_ben32_lines; use crate::codec::frames::{check_payload_len, check_twodelta_run_width}; +use crate::codec::translate::ben_to_ben32_lines; use crate::codec::BenEncodeFrame; use crate::format::banners::{has_known_banner_prefix, BANNER_LEN}; use crate::progress::Spinner; @@ -198,21 +198,23 @@ impl XBenInner { // `previous == assign_vec` only reaches here when the chunk was just flushed // (so the repeat-of-last-delta fast path above was skipped). Encode it as a // repeat delta against the previous frame. - TransitionKind::Repeat => match twodelta_repeat_buffered_frame(&assign_vec, 1) - { - Ok(repeat) => { - chunk_buffer.push(repeat); - *previous_assignment = assign_vec; - } - // Same representability limit as the saturation paths: defer as a pending - // full frame. `previous_assignment` already equals the repeated value. - Err(e) if is_twodelta_run_too_long(&e) => { - flush_chunk_inner(&mut self.encoder, chunk_buffer)?; - *pending_full_assignment = Some(assign_vec); - *pending_full_count = 1; + TransitionKind::Repeat => { + match twodelta_repeat_buffered_frame(&assign_vec, 1) { + Ok(repeat) => { + chunk_buffer.push(repeat); + *previous_assignment = assign_vec; + } + // Same representability limit as the saturation paths: defer as a + // pending full frame. `previous_assignment` + // already equals the repeated value. + Err(e) if is_twodelta_run_too_long(&e) => { + flush_chunk_inner(&mut self.encoder, chunk_buffer)?; + *pending_full_assignment = Some(assign_vec); + *pending_full_count = 1; + } + Err(e) => return Err(e), } - Err(e) => return Err(e), - }, + } // Clean 2-swap where both districts already exist: cheap delta. TransitionKind::Delta(a, b) if pair_has_masks(previous_masks, a, b) => { match encode_twodelta_frame_with_hint( diff --git a/ben/src/ops/relabel/mod.rs b/ben/src/ops/relabel/mod.rs index 4f55802..d849b1a 100644 --- a/ben/src/ops/relabel/mod.rs +++ b/ben/src/ops/relabel/mod.rs @@ -331,7 +331,7 @@ fn relabel_first_seen_via_byte_walk( 1 }; - let frame = BenEncodeFrame::from_rle(ben_line, input_variant, Some(count_occurrences)); + let frame = BenEncodeFrame::from_rle(ben_line, input_variant, Some(count_occurrences))?; writer.write_all(frame.as_slice())?; sample_number += count_occurrences as usize; diff --git a/ben/src/ops/relabel/tests.rs b/ben/src/ops/relabel/tests.rs index 0f86457..00d7dfe 100644 --- a/ben/src/ops/relabel/tests.rs +++ b/ben/src/ops/relabel/tests.rs @@ -59,10 +59,10 @@ fn with_banner(variant: BenVariant, payload: &[u8]) -> Vec { fn test_relabel_ben_line_simple() { let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)]; - let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None).unwrap(); let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)]; - let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None).unwrap(); let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); @@ -253,11 +253,11 @@ fn test_relabel_ben_line_with_map() { let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2]; let in_rle = assign_to_rle(in_assign); - let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None).unwrap(); let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5]; let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None).unwrap(); let mut new_to_old_map = HashMap::new(); new_to_old_map.insert(0, 2); @@ -290,7 +290,8 @@ fn test_relabel_ben_line_with_map() { fn first_seen_fast_path_rejects_zero_count_frame() { // A MkvChain frame with count == 0 is corrupt; the byte-walking fast path must error rather // than re-emit a frame every downstream reader rejects. - let frame = BenEncodeFrame::from_assignment(vec![1u16, 2, 2], BenVariant::MkvChain, Some(0)); + let frame = + BenEncodeFrame::from_assignment(vec![1u16, 2, 2], BenVariant::MkvChain, Some(0)).unwrap(); let with_banner_in = with_banner(BenVariant::MkvChain, frame.as_slice()); let err = relabel_ben_file( @@ -309,11 +310,11 @@ fn test_relabel_ben_line_with_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(in_assign); - let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None).unwrap(); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None).unwrap(); let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); @@ -340,11 +341,11 @@ fn test_relabel_ben_line_with_large_shuffle() { let mut out_assign = in_assign.clone(); let in_rle = assign_to_rle(&in_assign); - let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None); + let input = BenEncodeFrame::from_rle(in_rle, BenVariant::Standard, None).unwrap(); let new_to_old_map = shuffle_with_mapping(&mut out_assign); let out_rle = assign_to_rle(out_assign); - let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None); + let expected = BenEncodeFrame::from_rle(out_rle, BenVariant::Standard, None).unwrap(); let with_banner_in = with_banner(BenVariant::Standard, input.as_slice()); let mut buf = Vec::new(); @@ -1258,8 +1259,10 @@ fn run_policy_pins_frame_preservation_and_collapse() { { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = + BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)).unwrap(); + let frame_b = + BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)).unwrap(); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } @@ -1334,8 +1337,10 @@ fn standard_target_cross_policy_byte_identity() { { let banner = crate::format::banners::MKVCHAIN_BEN_BANNER; input.extend_from_slice(banner); - let frame_a = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)); - let frame_b = BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)); + let frame_a = + BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(5)).unwrap(); + let frame_b = + BenEncodeFrame::from_assignment([1u16, 2, 3], BenVariant::MkvChain, Some(7)).unwrap(); input.extend_from_slice(frame_a.as_slice()); input.extend_from_slice(frame_b.as_slice()); } diff --git a/ben/tests/test_boundary_proptest.rs b/ben/tests/test_boundary_proptest.rs index 956f34e..9864f58 100644 --- a/ben/tests/test_boundary_proptest.rs +++ b/ben/tests/test_boundary_proptest.rs @@ -78,11 +78,11 @@ fn repaint_pair(prev: &[u16], seed: u64) -> Vec { let mut value = if seed & 1 == 0 { a } else { b }; let mut stretch = 1 + (seed >> 16) as usize % 80_000; let mut placed = 0usize; - for idx in 0..next.len() { - if next[idx] != a && next[idx] != b { + for slot in next.iter_mut() { + if *slot != a && *slot != b { continue; } - next[idx] = value; + *slot = value; placed += 1; if placed == stretch { value = if value == a { b } else { a }; diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 8c02fbd..853cf57 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -69,7 +69,11 @@ fn cli_command(bin: &str) -> Command { } fn run(bin: &str, args: &[&str], cwd: &Path) -> Output { - cli_command(bin).current_dir(cwd).args(args).output().unwrap() + cli_command(bin) + .current_dir(cwd) + .args(args) + .output() + .unwrap() } fn run_with_stdin(bin: &str, args: &[&str], cwd: &Path, stdin: &[u8]) -> Output { diff --git a/ben/tests/test_coverage.rs b/ben/tests/test_coverage.rs index ae463a2..2bd2142 100644 --- a/ben/tests/test_coverage.rs +++ b/ben/tests/test_coverage.rs @@ -621,7 +621,7 @@ fn ben_encoder_twodelta_base_frame_then_delta_round_trip() { #[test] fn encode_ben_vec_from_rle_empty_rle() { // Empty RLE produces a minimal frame with zero payload bytes. - let frame = BenEncodeFrame::from_rle(vec![], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_rle(vec![], BenVariant::Standard, None).unwrap(); // 1 byte max_val_bits + 1 byte max_len_bits + 4 bytes n_bytes = 6 bytes assert_eq!(frame.as_slice().len(), 6); } @@ -630,21 +630,21 @@ fn encode_ben_vec_from_rle_empty_rle() { fn encode_ben_vec_from_assign_and_rle_are_equivalent() { let assign = vec![3u16, 3, 3, 1, 2, 2]; let rle = assign_to_rle(&assign); - let via_assign = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None); - let via_rle = BenEncodeFrame::from_rle(rle, BenVariant::Standard, None); + let via_assign = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None).unwrap(); + let via_rle = BenEncodeFrame::from_rle(rle, BenVariant::Standard, None).unwrap(); assert_eq!(via_assign.as_slice(), via_rle.as_slice()); } #[test] fn encode_ben_vec_from_assign_single_element() { - let frame = BenEncodeFrame::from_assignment([42u16], BenVariant::Standard, None); + let frame = BenEncodeFrame::from_assignment([42u16], BenVariant::Standard, None).unwrap(); assert!(!frame.as_slice().is_empty()); } #[test] fn encode_ben_vec_from_assign_all_same() { let assign = vec![7u16; 500]; - let frame = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None); + let frame = BenEncodeFrame::from_assignment(&assign, BenVariant::Standard, None).unwrap(); // Should encode efficiently — the payload compresses a single run. assert!(!frame.as_slice().is_empty()); } @@ -1528,7 +1528,7 @@ fn encode_twodelta_frame_single_value_swap() { fn twodelta_frame_try_from_parts_round_trip() { let pair = (10u16, 20u16); let run_lengths = vec![2u16, 5, 1]; - let original = BenEncodeFrame::from_run_lengths(pair, run_lengths, None); + let original = BenEncodeFrame::from_run_lengths(pair, run_lengths, None).unwrap(); let reconstructed = BenEncodeFrame::try_from_parts( pair, original.max_len_bit_count(), @@ -1553,7 +1553,7 @@ fn twodelta_frame_try_from_parts_round_trip() { #[test] fn encode_ben_frame_from_assignment() { let assignment = vec![1u16, 1, 2, 2, 3]; - let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None); + let frame = BenEncodeFrame::from_assignment(&assignment, BenVariant::Standard, None).unwrap(); // Frame from assignment should produce runs let runs = frame.runs().unwrap(); assert_eq!(runs, &[(1u16, 2u16), (2u16, 2u16), (3u16, 1u16)]); diff --git a/ben/tests/test_fixture_mutations.rs b/ben/tests/test_fixture_mutations.rs index d9c5cca..dd3a854 100644 --- a/ben/tests/test_fixture_mutations.rs +++ b/ben/tests/test_fixture_mutations.rs @@ -113,7 +113,11 @@ fn drive_ben_entry_points(fixture_name: &str, pos: usize, byte: u8, bytes: &[u8] }); run("into_subsample_every", &|| { if let Ok(reader) = BenStreamReader::from_ben(bytes) { - for record in reader.silent(true).into_subsample_every(2, 1).take(MAX_PULLS) { + for record in reader + .silent(true) + .into_subsample_every(2, 1) + .take(MAX_PULLS) + { let _ = record; } } diff --git a/ben/tests/test_impls_pipeline.rs b/ben/tests/test_impls_pipeline.rs index 5f2eda1..1278d2d 100644 --- a/ben/tests/test_impls_pipeline.rs +++ b/ben/tests/test_impls_pipeline.rs @@ -853,7 +853,7 @@ fn xben_truncated_frame_reports_unexpected_eof() { fn encode_decode_ben32_odd_bit_packing_roundtrip() { // values up to 3 (2 bits), lengths big to make non-byte boundary let rle = vec![(1u16, 3u16), (2, 5), (3, 7)]; - let ben_frame = BenEncodeFrame::from_rle(rle.clone(), BenVariant::Standard, None); + let ben_frame = BenEncodeFrame::from_rle(rle.clone(), BenVariant::Standard, None).unwrap(); let ben = ben_frame.as_slice(); // ben layout: [max_val_bits, max_len_bits, n_bytes, payload...] let max_val_bits = ben[0]; @@ -1472,7 +1472,8 @@ fn twodelta_first_frame_carries_repeat_trailer() { // The first frame is a snapshot: a 1-byte snapshot tag (0x00) precedes the MkvChain-formatted // body, which is the Standard frame bytes plus a trailing u16 repetition count. - let expected_first = BenEncodeFrame::from_assignment(&first, BenVariant::Standard, None); + let expected_first = + BenEncodeFrame::from_assignment(&first, BenVariant::Standard, None).unwrap(); assert_eq!(&ben[..17], b"TWODELTA BEN FILE"); assert_eq!(ben[17], 0x00, "first frame should carry the snapshot tag"); let body_start = 18; diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index 8cee041..e907afb 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -308,7 +308,8 @@ fn malformed_ben_bit_widths_return_invalid_data() { #[test] fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { - let anchor = BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)); + let anchor = + BenEncodeFrame::from_assignment(vec![1u16, 2], BenVariant::MkvChain, Some(1)).unwrap(); let mut ben = TWODELTA_BEN_BANNER.to_vec(); ben.push(0x00); // snapshot tag for the anchor (MkvChain-formatted body) ben.extend_from_slice(anchor.as_slice()); @@ -322,7 +323,7 @@ fn malformed_twodelta_bit_width_and_extra_runs_return_errors() { let err = reader.next().unwrap().unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); - let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)); + let frame = BenEncodeFrame::from_run_lengths((1, 2), vec![1, 1], Some(1)).unwrap(); let err = decode_twodelta_frame(vec![1u16], &frame).unwrap_err(); assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); } @@ -502,7 +503,7 @@ fn xben_twodelta_huge_incomplete_chunk_errors_without_panicking() { #[test] fn zero_count_frames_are_rejected() { - let frame = BenEncodeFrame::from_assignment(vec![1u16], BenVariant::MkvChain, Some(0)); + let frame = BenEncodeFrame::from_assignment(vec![1u16], BenVariant::MkvChain, Some(0)).unwrap(); let mut ben = MKVCHAIN_BEN_BANNER.to_vec(); ben.extend_from_slice(frame.as_slice()); let err = BenStreamReader::from_ben(ben.as_slice()) From a33b35cde6ee0c600829f676f80e7e68116f8297 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:09:13 -0600 Subject: [PATCH 157/221] add rust linter into workflow --- Taskfile.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Taskfile.yml b/Taskfile.yml index eac043c..0e847d9 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -252,6 +252,16 @@ tasks: - task: format-rust - task: format-python + lint-rust: + desc: Lint Rust code (clippy over every workspace target, warnings denied) + silent: true + deps: + - ensure-toolchain + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + cmds: + - cargo clippy --workspace --all-targets -- -D warnings + lint-python: desc: Lint ben-py Python code silent: true @@ -267,6 +277,7 @@ tasks: desc: Run linters silent: true cmds: + - task: lint-rust - task: lint-python coverage-ben: From 5e76363c18d98d3f7dfba59e2e1151610263b3e1 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:30:46 -0600 Subject: [PATCH 158/221] make sure interop with Pcompress still lives --- Taskfile.yml | 1 + ben/tests/fixtures/v1.0.0/interop.pcompress | Bin 0 -> 41 bytes ben/tests/test_cli.rs | 38 ++++++++++++++++++++ ben/tests/test_format_stability.rs | 29 +++++++++++++++ docs/format-stability.md | 6 ++++ 5 files changed, 74 insertions(+) create mode 100644 ben/tests/fixtures/v1.0.0/interop.pcompress diff --git a/Taskfile.yml b/Taskfile.yml index 0e847d9..a6aab4f 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -126,6 +126,7 @@ tasks: cargo test -- --ignored --skip generate_format_stability_fixtures --skip regenerate_twodelta_fixtures + --skip generate_pcompress_interop_fixture test-rust: desc: Run Rust tests for the workspace (fast suite plus #[ignore]-gated stress tests) diff --git a/ben/tests/fixtures/v1.0.0/interop.pcompress b/ben/tests/fixtures/v1.0.0/interop.pcompress new file mode 100644 index 0000000000000000000000000000000000000000..7125e8f6a42801053f518efdd8882c2b41dfa7f8 GIT binary patch literal 41 kcmWlPK@I>g41%!z|Ci_JK?6o0ePP&^IgDWJJ@w$;4>nN{0RR91 literal 0 HcmV?d00001 diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index 853cf57..af41af8 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1523,6 +1523,44 @@ fn reben_cli_supports_multi_level_cluster_ordering() { assert!(!mlc_json["nodes"].as_array().unwrap().is_empty()); } +#[test] +fn pcben_decodes_committed_foreign_pcompress_fixture() { + // `interop.pcompress` was minted by the real PCompress implementation (the `pcompress` + // crates.io dependency), so this pins the foreign-format interop contract: bytes produced by + // genuine PCompress must keep converting to BEN that decodes back to the canonical JSONL. + // The expected output is the committed `source.jsonl`, whose one-based ids are the fixture's + // zero-based ids shifted by the bridge. + let fixtures = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("v1.0.0"); + let expected = fs::read_to_string(fixtures.join("source.jsonl")).unwrap(); + + let temp = TempDir::new("pcben-interop"); + let ben_path = temp.path().join("interop.ben"); + let pc_to_ben = run( + "pcben", + &[ + "--mode", + "pc-to-ben", + "--input-file", + fixtures.join("interop.pcompress").to_str().unwrap(), + "--output-file", + ben_path.to_str().unwrap(), + ], + temp.path(), + ); + assert_success(&pc_to_ben); + + let mut jsonl = Vec::new(); + decode_ben_to_jsonl(fs::File::open(&ben_path).unwrap(), &mut jsonl).unwrap(); + assert_eq!( + String::from_utf8(jsonl).unwrap(), + expected, + "foreign pcompress fixture no longer converts to the canonical ensemble" + ); +} + #[test] fn pben_cli_converts_between_formats() { let temp = TempDir::new("pcben"); diff --git a/ben/tests/test_format_stability.rs b/ben/tests/test_format_stability.rs index a33ee28..4520cfa 100644 --- a/ben/tests/test_format_stability.rs +++ b/ben/tests/test_format_stability.rs @@ -437,6 +437,35 @@ fn generate_format_stability_fixtures() { eprintln!("Wrote v1.0.0 fixtures to {:?}", fixtures_dir()); } +/// The canonical assignments rendered in PCompress's zero-based line format (one JSON array per +/// line) — the input the foreign `pcompress` encoder consumes. District ids are CANONICAL_JSONL's +/// minus one, so the pcben bridge's one-based shift converts the fixture back to CANONICAL_JSONL +/// exactly. +const CANONICAL_PCOMPRESS_INPUT: &str = "\ +[0,0,1,1] +[0,1,0,1] +[0,0,0,1] +[0,0,0,1] +[1,1,1,0] +"; + +#[test] +#[ignore = "mints only the foreign-format pcompress interop fixture; never run as part of normal CI"] +fn generate_pcompress_interop_fixture() { + // Minted by the *foreign implementation*: the `pcompress` crates.io dependency is mggg's real + // encoder, so these bytes pin interop with genuine PCompress output rather than with this + // workspace's own rendering of the format. Re-minting is legitimate only if the pinned + // `pcompress` dependency version changes its wire format, which would itself be an interop + // event worth a dedicated PR. + let mut reader = BufReader::new(CANONICAL_PCOMPRESS_INPUT.as_bytes()); + let mut writer = std::io::BufWriter::new(Vec::new()); + pcompress::encode::encode(&mut reader, &mut writer, false); + let out = writer.into_inner().expect("flush pcompress fixture bytes"); + write_fixture("interop.pcompress", &out); + + eprintln!("Wrote interop.pcompress to {:?}", fixtures_dir()); +} + #[test] #[ignore = "regenerates only the (unreleased) TwoDelta fixtures; never run as part of normal CI"] fn regenerate_twodelta_fixtures() { diff --git a/docs/format-stability.md b/docs/format-stability.md index 62b8cf7..c427cec 100644 --- a/docs/format-stability.md +++ b/docs/format-stability.md @@ -82,6 +82,12 @@ The current `v1.0.0` set covers: - `unknown_flags.bendl` — a derivative of `flags_set.bendl` with reserved bits set in the header `flags` and in a custom asset's `asset_flags`. Pins forward-compatible reader behavior: unknown bits must be ignored, all known operations still succeed. +- `interop.pcompress` — the canonical ensemble encoded by the **foreign PCompress + implementation** (the `pcompress` crates.io dependency, mggg's real encoder). Pins the pcben + interop contract: genuine PCompress bytes must keep converting to BEN that decodes back to + `source.jsonl`. Minted by the focused `generate_pcompress_interop_fixture` regenerator; + re-minting is legitimate only when the pinned `pcompress` dependency version changes its wire + format, in a dedicated PR. - `source.jsonl`, `source_twodelta.jsonl`, `source_graph.json`, `source_metadata.json` — human-readable sources committed alongside the binary fixtures so the contents can be inspected without running the codec. `source.jsonl` mints the Standard/MkvChain/BENDL fixtures; From 0fe0e4c921a51766e82de13c4445e247074f053c Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:32:48 -0600 Subject: [PATCH 159/221] better ci --- .github/workflows/ci.yml | 47 +++++++++++ .github/workflows/full-tests.yml | 133 +++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/full-tests.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b1a1814 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +# Lightweight quality gates on every PR: formatting and lints for both languages. The heavier +# gates (full test suite, big-endian emulation) live in full-tests.yml and run on demand — either +# from the Actions tab or via a `/ci-full` / `/ci-endian` PR comment. +# +# These mirror `task format` / `task lint`; keep the two in sync. + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + format: + name: format + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - name: cargo fmt + run: cargo fmt --all -- --check + - uses: astral-sh/setup-uv@v5 + - name: ruff format + working-directory: ben-py + run: uvx ruff format --check . + + lint: + name: lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - uses: Swatinem/rust-cache@v2 + - name: cargo clippy + run: cargo clippy --workspace --all-targets -- -D warnings + - uses: astral-sh/setup-uv@v5 + - name: ruff check + working-directory: ben-py + run: uvx ruff check . diff --git a/.github/workflows/full-tests.yml b/.github/workflows/full-tests.yml new file mode 100644 index 0000000..2ecc7c8 --- /dev/null +++ b/.github/workflows/full-tests.yml @@ -0,0 +1,133 @@ +name: Full tests (on demand) + +# The heavy gates, run only when asked: +# +# - From the Actions tab via "Run workflow" (workflow_dispatch), picking any branch. +# - From a PR by commenting `/ci-full` (Rust + Python suites + big-endian) or `/ci-endian` +# (big-endian only). Comment triggers are restricted to the repo owner, members, and +# collaborators, because the jobs check out and execute the PR head. +# +# Note: GitHub only evaluates `issue_comment` triggers against the workflow file on the default +# branch, so the comment commands start working once this file lands on main. + +on: + workflow_dispatch: + issue_comment: + types: [created] + +permissions: + contents: read + issues: write # eyes-reaction acknowledgment on the triggering comment + +jobs: + decide: + name: decide what to run + runs-on: ubuntu-latest + outputs: + run_full: ${{ steps.flags.outputs.run_full }} + run_endian: ${{ steps.flags.outputs.run_endian }} + ref: ${{ steps.flags.outputs.ref }} + steps: + - name: compute trigger flags + id: flags + # The comment body is passed through env rather than interpolated into the script, so a + # hostile comment cannot inject shell. + env: + EVENT_NAME: ${{ github.event_name }} + IS_PR_COMMENT: ${{ github.event.issue.pull_request != null }} + COMMENT_BODY: ${{ github.event.comment.body }} + AUTHOR_ASSOCIATION: ${{ github.event.comment.author_association }} + PR_NUMBER: ${{ github.event.issue.number }} + DISPATCH_REF: ${{ github.ref }} + run: | + run_full=false + run_endian=false + ref="$DISPATCH_REF" + if [ "$EVENT_NAME" = "workflow_dispatch" ]; then + run_full=true + run_endian=true + elif [ "$IS_PR_COMMENT" = "true" ]; then + case "$AUTHOR_ASSOCIATION" in + OWNER|MEMBER|COLLABORATOR) + case "$COMMENT_BODY" in + *"/ci-full"*) run_full=true; run_endian=true ;; + esac + case "$COMMENT_BODY" in + *"/ci-endian"*) run_endian=true ;; + esac + ;; + esac + ref="refs/pull/$PR_NUMBER/head" + fi + { + echo "run_full=$run_full" + echo "run_endian=$run_endian" + echo "ref=$ref" + } >> "$GITHUB_OUTPUT" + - name: acknowledge the triggering comment + if: github.event_name == 'issue_comment' && (steps.flags.outputs.run_full == 'true' || steps.flags.outputs.run_endian == 'true') + env: + GH_TOKEN: ${{ github.token }} + run: >- + gh api + repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions + -f content=eyes + + rust-tests: + name: rust tests + needs: decide + if: needs.decide.outputs.run_full == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.decide.outputs.ref }} + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: fast suite + run: cargo test + - name: ignored-gated stress suite + run: >- + cargo test -- --ignored + --skip generate_format_stability_fixtures + --skip regenerate_twodelta_fixtures + --skip generate_pcompress_interop_fixture + + python-tests: + name: python tests + needs: decide + if: needs.decide.outputs.run_full == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.decide.outputs.ref }} + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - uses: astral-sh/setup-uv@v5 + - name: sync environment + working-directory: ben-py + run: uv sync --all-groups + - name: build extension + working-directory: ben-py + run: uv run maturin develop + - name: pytest + working-directory: ben-py + run: uv run pytest tests/ + + big-endian: + name: big-endian suite (s390x under QEMU) + needs: decide + if: needs.decide.outputs.run_endian == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.decide.outputs.ref }} + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - uses: taiki-e/install-action@v2 + with: + tool: cross + - name: cross test on s390x + run: cross test -p binary-ensemble --target s390x-unknown-linux-gnu From b83148ac7157d820ea580a6367147632f2b0fbc6 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:52:03 -0600 Subject: [PATCH 160/221] more fuzzing --- Cargo.toml | 1 + Taskfile.yml | 37 +++++++++++++++++++ fuzz/.gitignore | 5 +++ fuzz/Cargo.toml | 46 ++++++++++++++++++++++++ fuzz/fuzz_targets/ben_reader.rs | 50 ++++++++++++++++++++++++++ fuzz/fuzz_targets/bendl_reader.rs | 56 +++++++++++++++++++++++++++++ fuzz/fuzz_targets/xben_body.rs | 47 ++++++++++++++++++++++++ fuzz/fuzz_targets/xben_container.rs | 25 +++++++++++++ 8 files changed, 267 insertions(+) create mode 100644 fuzz/.gitignore create mode 100644 fuzz/Cargo.toml create mode 100644 fuzz/fuzz_targets/ben_reader.rs create mode 100644 fuzz/fuzz_targets/bendl_reader.rs create mode 100644 fuzz/fuzz_targets/xben_body.rs create mode 100644 fuzz/fuzz_targets/xben_container.rs diff --git a/Cargo.toml b/Cargo.toml index bf51c76..8d0c26f 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ exclude = [ "dev_files/*", "example/*", + "fuzz", "tests/*", "TODO.md", ] diff --git a/Taskfile.yml b/Taskfile.yml index a6aab4f..03bb6d8 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -153,6 +153,43 @@ tasks: - task: test-rust - task: test-python + ensure-cargo-fuzz: + desc: Install the `cargo-fuzz` runner if it is not already available + internal: true + silent: true + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + status: + - command -v cargo-fuzz + cmds: + - cargo install cargo-fuzz --locked + + fuzz: + desc: >- + Time-boxed coverage-guided fuzzing of the read surfaces (libFuzzer via cargo-fuzz; requires + the nightly toolchain). Explores compound multi-byte corruptions that the exhaustive + single-byte mutation harness cannot reach. FUZZ_SECONDS bounds each target (default 60). + silent: true + deps: + - ensure-toolchain + - ensure-cargo-fuzz + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + vars: + FUZZ_SECONDS: '{{.FUZZ_SECONDS | default "60"}}' + cmds: + # Seed every corpus from the committed wire-format fixtures so the fuzzer starts from + # valid structures instead of rediscovering the banners byte by byte. + - mkdir -p fuzz/corpus/ben_reader fuzz/corpus/xben_body fuzz/corpus/xben_container fuzz/corpus/bendl_reader + - cp ben/tests/fixtures/v1.0.0/*.ben fuzz/corpus/ben_reader/ + - cp ben/tests/fixtures/v1.0.0/*.ben fuzz/corpus/xben_body/ + - cp ben/tests/fixtures/v1.0.0/*.xben fuzz/corpus/xben_container/ + - cp ben/tests/fixtures/v1.0.0/*.bendl fuzz/corpus/bendl_reader/ + - cargo +nightly fuzz run ben_reader -- -max_total_time={{.FUZZ_SECONDS}} + - cargo +nightly fuzz run xben_body -- -max_total_time={{.FUZZ_SECONDS}} + - cargo +nightly fuzz run xben_container -- -max_total_time={{.FUZZ_SECONDS}} + - cargo +nightly fuzz run bendl_reader -- -max_total_time={{.FUZZ_SECONDS}} + ensure-be-target: desc: Install the big-endian (s390x) std target if it is not already available internal: true diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..ab0eaa1 --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..16a364e --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "binary-ensemble-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.binary-ensemble] +path = "../ben" + +[[bin]] +name = "ben_reader" +path = "fuzz_targets/ben_reader.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "xben_body" +path = "fuzz_targets/xben_body.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "xben_container" +path = "fuzz_targets/xben_container.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "bendl_reader" +path = "fuzz_targets/bendl_reader.rs" +test = false +doc = false +bench = false + +# Standalone: deliberately not a member of the parent workspace (cargo-fuzz convention; the +# parent Cargo.toml excludes `fuzz/`). +[workspace] diff --git a/fuzz/fuzz_targets/ben_reader.rs b/fuzz/fuzz_targets/ben_reader.rs new file mode 100644 index 0000000..d511bb0 --- /dev/null +++ b/fuzz/fuzz_targets/ben_reader.rs @@ -0,0 +1,50 @@ +//! Coverage-guided fuzzing of the plain-BEN read surface. +//! +//! The deterministic mutation harness (`ben/tests/test_fixture_mutations.rs`) covers every +//! single-byte corruption of the committed fixtures exhaustively; this target explores the +//! compound, multi-byte corruptions that enumeration cannot reach. The contract is the same: +//! arbitrary bytes may error anywhere, but must never panic, hang, or exhaust memory. + +#![no_main] + +use binary_ensemble::codec::decode::decode_ben_to_jsonl; +use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; +use binary_ensemble::ops::extract::extract_assignment_ben; +use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; +use binary_ensemble::BenVariant; +use libfuzzer_sys::fuzz_target; + +/// Bound on records pulled from iterator-style entry points: corrupt streams may yield errors +/// indefinitely without ending the iterator. +const MAX_PULLS: usize = 64; + +fuzz_target!(|data: &[u8]| { + let _ = decode_ben_to_jsonl(data, std::io::sink()); + + if let Ok(reader) = BenStreamReader::from_ben(data) { + for record in reader.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + if let Ok(reader) = BenStreamReader::from_ben(data) { + let _ = reader.silent(true).count_samples(); + } + if let Ok(frames) = BenStreamFrameReader::from_ben(data) { + for frame in frames.take(MAX_PULLS) { + let _ = frame; + } + } + if let Ok(reader) = BenStreamReader::from_ben(data) { + for record in reader.silent(true).into_subsample_by_range(1, 3).take(MAX_PULLS) { + let _ = record; + } + } + + let _ = relabel_ben_file(data, std::io::sink(), RelabelOptions::first_seen()); + let _ = relabel_ben_file( + data, + std::io::sink(), + RelabelOptions::convert_to(BenVariant::TwoDelta), + ); + let _ = extract_assignment_ben(data, 2); +}); diff --git a/fuzz/fuzz_targets/bendl_reader.rs b/fuzz/fuzz_targets/bendl_reader.rs new file mode 100644 index 0000000..42952e1 --- /dev/null +++ b/fuzz/fuzz_targets/bendl_reader.rs @@ -0,0 +1,56 @@ +//! Coverage-guided fuzzing of the `.bendl` bundle read surface. +//! +//! Mutants split into the same two classes as the deterministic harness: open-rejected (the +//! constructor is the whole reachable surface) and openable (every accessor must then hold the +//! no-panic contract, including the verified and unverified asset/stream readers). + +#![no_main] + +use binary_ensemble::io::bundle::reader::BendlReader; +use binary_ensemble::io::bundle::writer::BendlAppender; +use libfuzzer_sys::fuzz_target; +use std::io::{Cursor, Read}; + +const MAX_PULLS: usize = 64; + +fuzz_target!(|data: &[u8]| { + if let Ok(mut reader) = BendlReader::open(Cursor::new(data.to_vec())) { + let _ = reader.is_finalized(); + let _ = reader.sample_count(); + let _ = reader.assignment_format(); + let _ = reader.validate_directory(); + + for entry in reader.assets().to_vec() { + let _ = reader.asset_bytes(&entry); + let _ = reader.asset_bytes_unverified(&entry); + if let Ok(mut payload) = reader.asset_payload_reader_unverified(&entry) { + let _ = payload.read_to_end(&mut Vec::new()); + } + let _ = reader.verify_asset_checksum(&entry); + } + let _ = reader.verify_all_asset_checksums(); + let _ = reader.verify_stream_checksum(); + + if let Ok(mut stream) = reader.assignment_stream_reader() { + let _ = stream.read_to_end(&mut Vec::new()); + } + if let Ok(mut stream) = reader.assignment_stream_reader_unverified() { + let _ = stream.read_to_end(&mut Vec::new()); + } + if let Ok(verified) = reader.open_assignment_reader() { + for record in verified.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + if let Ok(verified) = reader.open_assignment_reader() { + let _ = verified.count_samples(); + } + if let Ok(unverified) = reader.open_assignment_reader_unverified() { + for record in unverified.silent(true).take(MAX_PULLS) { + let _ = record; + } + }; + } + + let _ = BendlAppender::open(Cursor::new(data.to_vec())); +}); diff --git a/fuzz/fuzz_targets/xben_body.rs b/fuzz/fuzz_targets/xben_body.rs new file mode 100644 index 0000000..cbf873d --- /dev/null +++ b/fuzz/fuzz_targets/xben_body.rs @@ -0,0 +1,47 @@ +//! Coverage-guided fuzzing of the decompressed XBEN body parsers. +//! +//! Fuzzing the raw `.xben` container mostly exercises the xz layer, whose integrity checks +//! reject mutants before the BEN32/TwoDelta parsers run. This target re-wraps the fuzz input in +//! a fresh, valid xz container so corruption lands directly on the inner parsers — the same +//! trick as the deterministic harness's recompressed-body sweeps, but coverage-guided. + +#![no_main] + +use binary_ensemble::codec::decode::{decode_xben_to_ben, decode_xben_to_jsonl}; +use binary_ensemble::codec::encode::xz_compress; +use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; +use binary_ensemble::ops::extract::extract_assignment_xben; +use libfuzzer_sys::fuzz_target; +use std::io::BufReader; + +const MAX_PULLS: usize = 64; + +fuzz_target!(|data: &[u8]| { + let mut container = Vec::new(); + xz_compress( + BufReader::new(data), + &mut container, + Some(1), + Some(0), + None, + ) + .expect("compressing an in-memory body cannot fail"); + + let _ = decode_xben_to_jsonl(BufReader::new(container.as_slice()), std::io::sink()); + let _ = decode_xben_to_ben(BufReader::new(container.as_slice()), std::io::sink()); + + if let Ok(reader) = BenStreamReader::from_xben(container.as_slice()) { + for record in reader.silent(true).take(MAX_PULLS) { + let _ = record; + } + } + if let Ok(reader) = BenStreamReader::from_xben(container.as_slice()) { + let _ = reader.silent(true).count_samples(); + } + if let Ok(frames) = BenStreamFrameReader::from_xben(container.as_slice()) { + for frame in frames.take(MAX_PULLS) { + let _ = frame; + } + } + let _ = extract_assignment_xben(container.as_slice(), 2); +}); diff --git a/fuzz/fuzz_targets/xben_container.rs b/fuzz/fuzz_targets/xben_container.rs new file mode 100644 index 0000000..3489ef4 --- /dev/null +++ b/fuzz/fuzz_targets/xben_container.rs @@ -0,0 +1,25 @@ +//! Coverage-guided fuzzing of the raw `.xben` container surface (xz framing + dispatch). +//! +//! Complement of `xben_body`: here the fuzz input is the container itself, so the xz layer, the +//! banner dispatch, and the error paths between them face the corruption. + +#![no_main] + +use binary_ensemble::codec::decode::{decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress}; +use binary_ensemble::io::reader::BenStreamReader; +use libfuzzer_sys::fuzz_target; +use std::io::BufReader; + +const MAX_PULLS: usize = 64; + +fuzz_target!(|data: &[u8]| { + let _ = decode_xben_to_jsonl(BufReader::new(data), std::io::sink()); + let _ = decode_xben_to_ben(BufReader::new(data), std::io::sink()); + let _ = xz_decompress(BufReader::new(data), std::io::sink()); + + if let Ok(reader) = BenStreamReader::from_xben(data) { + for record in reader.silent(true).take(MAX_PULLS) { + let _ = record; + } + } +}); From 929ecc4d4f11e4433ab831264417b60c60edde07 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:21:19 -0600 Subject: [PATCH 161/221] add soak test --- ben/tests/test_streaming_soak.rs | 145 +++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 ben/tests/test_streaming_soak.rs diff --git a/ben/tests/test_streaming_soak.rs b/ben/tests/test_streaming_soak.rs new file mode 100644 index 0000000..592b331 --- /dev/null +++ b/ben/tests/test_streaming_soak.rs @@ -0,0 +1,145 @@ +//! Streaming-invariant soak tests. +//! +//! "Streaming, not slurping" is a core invariant of this workspace: ensembles are too large to +//! hold in memory, so every encode/decode path must process samples incrementally. Nothing else +//! in the suite *measures* that — every other harness uses small inputs, so an accidental +//! buffer-everything regression would pass the entire suite and only surface as an OOM on a real +//! multi-gigabyte ensemble. +//! +//! These tests pin the invariant directly: an encoder thread streams a multi-gigabyte *logical* +//! ensemble through an OS pipe (64 KiB of kernel backpressure) into a decoder, and the process's +//! peak RSS (`VmHWM`) must stay bounded. A slurping regression on either side would buffer +//! gigabytes and blow the bound unmissably. +//! +//! Linux-only (peak RSS is read from `/proc/self/status`) and `#[ignore]`-gated into the +//! slow/stress suite: multi-gigabyte logical streams take a few seconds. + +#![cfg(target_os = "linux")] + +use binary_ensemble::io::reader::BenStreamReader; +use binary_ensemble::io::writer::{BenStreamWriter, XzEncodeOptions}; +use binary_ensemble::BenVariant; +use std::io::{BufReader, BufWriter}; + +/// Samples streamed per test. +const N_SAMPLES: usize = 200_000; +/// Nodes per assignment. 200k samples x 5k nodes x 2 bytes = 2 GB of logical assignment data. +const ASSIGNMENT_LEN: usize = 5_000; +/// Peak-RSS budget. True streaming peaks well under 100 MB; a slurping regression buffers the +/// 2 GB logical stream and exceeds this bound by an order of magnitude. +const MAX_PEAK_RSS_KB: u64 = 256 * 1024; + +/// The process's lifetime peak resident set size in kilobytes, from `/proc/self/status` `VmHWM`. +fn peak_rss_kb() -> u64 { + let status = std::fs::read_to_string("/proc/self/status").expect("read /proc/self/status"); + let line = status + .lines() + .find(|l| l.starts_with("VmHWM:")) + .expect("VmHWM present in /proc/self/status"); + line.split_whitespace() + .nth(1) + .expect("VmHWM value field") + .parse() + .expect("VmHWM parses as kB") +} + +/// A few distinct assignment templates so consecutive samples differ (no count-merging shortcut) +/// while staying cheap to produce. Runs of 50 keep each frame's RLE small, so the on-wire stream +/// is modest even though the decoded stream is gigabytes. +fn templates() -> Vec> { + (0..4u16) + .map(|k| { + (0..ASSIGNMENT_LEN) + .map(|j| ((j / 50) as u16 + k) % 40 + 1) + .collect() + }) + .collect() +} + +/// Drive `N_SAMPLES` through an encoder thread, an OS pipe, and `decode`, asserting the decoded +/// totals and the peak-RSS bound. +fn assert_streaming_round_trip( + encode: impl FnOnce(std::io::PipeWriter) + Send + 'static, + decode: impl FnOnce(std::io::PipeReader) -> (usize, u64), +) { + let (reader, writer) = std::io::pipe().expect("create pipe"); + + let encoder_thread = std::thread::spawn(move || encode(writer)); + let (total_samples, total_nodes) = decode(reader); + encoder_thread.join().expect("encoder thread"); + + assert_eq!(total_samples, N_SAMPLES); + assert_eq!(total_nodes, (N_SAMPLES * ASSIGNMENT_LEN) as u64); + + let peak = peak_rss_kb(); + assert!( + peak < MAX_PEAK_RSS_KB, + "peak RSS {peak} kB breaches the {MAX_PEAK_RSS_KB} kB streaming bound; \ + some encode/decode path is buffering the stream instead of streaming it" + ); +} + +fn decode_counting(reader: R, from_xben: bool) -> (usize, u64) { + let mut decoder = if from_xben { + BenStreamReader::from_xben(reader).expect("open xben stream") + } else { + BenStreamReader::from_ben(reader).expect("open ben stream") + } + .silent(true); + + let mut total_samples = 0usize; + let mut total_nodes = 0u64; + decoder + .for_each_assignment(|assignment, count| { + total_samples += count as usize; + total_nodes += assignment.len() as u64 * u64::from(count); + Ok(true) + }) + .expect("decode stream"); + (total_samples, total_nodes) +} + +#[test] +#[ignore = "streaming soak: multi-gigabyte logical stream; run via the slow/stress gate"] +fn plain_ben_round_trip_streams_without_slurping() { + let templates = templates(); + assert_streaming_round_trip( + move |writer| { + let mut encoder = + BenStreamWriter::for_ben(BufWriter::new(writer), BenVariant::Standard) + .expect("open ben writer"); + for i in 0..N_SAMPLES { + encoder + .write_assignment(templates[i % templates.len()].clone()) + .expect("write assignment"); + } + encoder.finish().expect("finish ben stream"); + }, + |reader| decode_counting(BufReader::new(reader), false), + ); +} + +#[test] +#[ignore = "streaming soak: multi-gigabyte logical stream; run via the slow/stress gate"] +fn xben_round_trip_streams_without_slurping() { + let templates = templates(); + assert_streaming_round_trip( + move |writer| { + // Compression level 1 keeps the xz dictionary near 1 MiB; the default level's 64 MiB + // dictionary would dominate the RSS measurement and mask a slurping regression. + let options = XzEncodeOptions::new() + .with_n_threads(1) + .with_compression_level(1); + let mut encoder = + BenStreamWriter::for_xben(BufWriter::new(writer), BenVariant::Standard, options) + .expect("open xben writer"); + for i in 0..N_SAMPLES { + encoder + .write_assignment(templates[i % templates.len()].clone()) + .expect("write assignment"); + } + encoder.finish().expect("finish xben stream"); + }, + |reader| decode_counting(BufReader::new(reader), true), + ); +} From 0611dc4630397aa3288c8aa98f15fd75e425bb4a Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:21:47 -0600 Subject: [PATCH 162/221] better ci / cd with a smoke test on wheels --- .github/workflows/ci_cd.yml | 28 +++++++++++++++++++++ .github/workflows/full-tests.yml | 42 ++++++++++++++++++++++++++++--- .github/workflows/wheel_smoke.py | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/wheel_smoke.py diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index c4b24a7..a4bc6fb 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -42,6 +42,14 @@ jobs: --release --locked --out ${{ env.OUT_DIR }} + # Smoke-test the built wheel in a clean venv before it can be published. aarch64 wheels are + # cross-built and cannot execute on this runner, so only the native target is smoked. + - name: Smoke test wheel + if: matrix.target == 'x86_64' + run: | + python3 -m venv /tmp/wheel-smoke + /tmp/wheel-smoke/bin/pip install ${{ env.PKG_DIR }}/${{ env.OUT_DIR }}/*.whl + /tmp/wheel-smoke/bin/python .github/workflows/wheel_smoke.py - uses: actions/upload-artifact@v4 with: name: wheels-linux-${{ matrix.target }} @@ -75,6 +83,12 @@ jobs: --target universal2-apple-darwin --out ${{ env.OUT_DIR }} + - name: Smoke test wheel + run: | + python3 -m venv /tmp/wheel-smoke + /tmp/wheel-smoke/bin/pip install ${{ env.PKG_DIR }}/${{ env.OUT_DIR }}/*.whl + /tmp/wheel-smoke/bin/python .github/workflows/wheel_smoke.py + - uses: actions/upload-artifact@v4 with: name: wheels-macos-universal2 @@ -103,6 +117,13 @@ jobs: --release --locked --out ${{ env.OUT_DIR }} + - name: Smoke test wheel + shell: pwsh + run: | + python -m venv $env:TEMP\wheel-smoke + $wheel = Get-ChildItem "${{ env.PKG_DIR }}\${{ env.OUT_DIR }}\*.whl" | Select-Object -First 1 + & "$env:TEMP\wheel-smoke\Scripts\pip" install $wheel.FullName + & "$env:TEMP\wheel-smoke\Scripts\python" .github\workflows\wheel_smoke.py - uses: actions/upload-artifact@v4 with: name: wheels-windows @@ -131,6 +152,13 @@ jobs: --locked --out ${{ env.OUT_DIR }} --target aarch64-pc-windows-msvc + - name: Smoke test wheel + shell: pwsh + run: | + python -m venv $env:TEMP\wheel-smoke + $wheel = Get-ChildItem "${{ env.PKG_DIR }}\${{ env.OUT_DIR }}\*.whl" | Select-Object -First 1 + & "$env:TEMP\wheel-smoke\Scripts\pip" install $wheel.FullName + & "$env:TEMP\wheel-smoke\Scripts\python" .github\workflows\wheel_smoke.py - uses: actions/upload-artifact@v4 with: name: wheels-windows-arm64 diff --git a/.github/workflows/full-tests.yml b/.github/workflows/full-tests.yml index 2ecc7c8..d72f217 100644 --- a/.github/workflows/full-tests.yml +++ b/.github/workflows/full-tests.yml @@ -3,9 +3,10 @@ name: Full tests (on demand) # The heavy gates, run only when asked: # # - From the Actions tab via "Run workflow" (workflow_dispatch), picking any branch. -# - From a PR by commenting `/ci-full` (Rust + Python suites + big-endian) or `/ci-endian` -# (big-endian only). Comment triggers are restricted to the repo owner, members, and -# collaborators, because the jobs check out and execute the PR head. +# - From a PR by commenting `/ci-full` (Rust + Python suites + big-endian), `/ci-endian` +# (big-endian only), or `/ci-fuzz` (time-boxed coverage-guided fuzzing). Comment triggers +# are restricted to the repo owner, members, and collaborators, because the jobs check out +# and execute the PR head. # # Note: GitHub only evaluates `issue_comment` triggers against the workflow file on the default # branch, so the comment commands start working once this file lands on main. @@ -26,6 +27,7 @@ jobs: outputs: run_full: ${{ steps.flags.outputs.run_full }} run_endian: ${{ steps.flags.outputs.run_endian }} + run_fuzz: ${{ steps.flags.outputs.run_fuzz }} ref: ${{ steps.flags.outputs.ref }} steps: - name: compute trigger flags @@ -42,10 +44,12 @@ jobs: run: | run_full=false run_endian=false + run_fuzz=false ref="$DISPATCH_REF" if [ "$EVENT_NAME" = "workflow_dispatch" ]; then run_full=true run_endian=true + run_fuzz=true elif [ "$IS_PR_COMMENT" = "true" ]; then case "$AUTHOR_ASSOCIATION" in OWNER|MEMBER|COLLABORATOR) @@ -55,6 +59,9 @@ jobs: case "$COMMENT_BODY" in *"/ci-endian"*) run_endian=true ;; esac + case "$COMMENT_BODY" in + *"/ci-fuzz"*) run_fuzz=true ;; + esac ;; esac ref="refs/pull/$PR_NUMBER/head" @@ -62,10 +69,11 @@ jobs: { echo "run_full=$run_full" echo "run_endian=$run_endian" + echo "run_fuzz=$run_fuzz" echo "ref=$ref" } >> "$GITHUB_OUTPUT" - name: acknowledge the triggering comment - if: github.event_name == 'issue_comment' && (steps.flags.outputs.run_full == 'true' || steps.flags.outputs.run_endian == 'true') + if: github.event_name == 'issue_comment' && (steps.flags.outputs.run_full == 'true' || steps.flags.outputs.run_endian == 'true' || steps.flags.outputs.run_fuzz == 'true') env: GH_TOKEN: ${{ github.token }} run: >- @@ -131,3 +139,29 @@ jobs: tool: cross - name: cross test on s390x run: cross test -p binary-ensemble --target s390x-unknown-linux-gnu + + fuzz: + name: coverage-guided fuzzing (time-boxed) + needs: decide + if: needs.decide.outputs.run_fuzz == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.decide.outputs.ref }} + - uses: dtolnay/rust-toolchain@nightly + - uses: Swatinem/rust-cache@v2 + - uses: taiki-e/install-action@v2 + with: + tool: cargo-fuzz + - uses: arduino/setup-task@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: fuzz every read surface (120s per target) + run: task fuzz FUZZ_SECONDS=120 + - name: upload crash artifacts on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: fuzz-artifacts + path: fuzz/artifacts/ diff --git a/.github/workflows/wheel_smoke.py b/.github/workflows/wheel_smoke.py new file mode 100644 index 0000000..b822de8 --- /dev/null +++ b/.github/workflows/wheel_smoke.py @@ -0,0 +1,43 @@ +"""Smoke test for a freshly built binary_ensemble wheel. + +Run inside a clean venv that has the wheel (and nothing else from this repo) installed: imports +the extension module and round-trips a tiny ensemble through encode/decode. Catches wheels that +build but cannot load (bad abi3 configuration, missing symbols) or that load but cannot reach the +Rust core, before they are published. +""" + +import json +import pathlib +import tempfile + +import binary_ensemble as be + +LINES = [ + {"assignment": [1, 1, 2, 2], "sample": 1}, + {"assignment": [2, 2, 1, 1], "sample": 2}, + {"assignment": [2, 2, 1, 1], "sample": 3}, +] + + +def main() -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = pathlib.Path(tmp) + src = tmp_path / "src.jsonl" + ben = tmp_path / "out.ben" + back = tmp_path / "round.jsonl" + + src.write_text( + "".join(json.dumps(line, separators=(",", ":")) + "\n" for line in LINES) + ) + be.encode_jsonl_to_ben(src, ben, overwrite=True, variant="standard") + be.decode_ben_to_jsonl(ben, back, overwrite=True) + + assert src.read_bytes() == back.read_bytes(), ( + f"wheel round-trip mismatch:\n{src.read_text()!r}\n!=\n{back.read_text()!r}" + ) + + print("wheel smoke test passed") + + +if __name__ == "__main__": + main() From cedd613ec76217afa2861a4b181c417c231b1bcb Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:11:20 -0600 Subject: [PATCH 163/221] autocompress larger assets in bundle --- ben-py/binary_ensemble/__init__.py | 7 + ben-py/binary_ensemble/bundle.py | 29 +++- ben-py/binary_ensemble/bundle.pyi | 10 +- ben-py/docs/api/bundle.md | 2 +- ben-py/docs/changelog.md | 28 ++++ ben-py/docs/concepts/compatibility.md | 3 +- ben-py/docs/concepts/formats.md | 4 +- ben-py/docs/concepts/limitations.md | 29 +++- ben-py/docs/concepts/performance.md | 6 + ben-py/docs/concepts/release-versioning.md | 10 +- ben-py/docs/conf.py | 2 + ben-py/docs/getting-started/installation.md | 6 +- .../docs/how-to/analyze-with-numpy-pandas.md | 132 ++++++++++++++++++ ben-py/docs/how-to/error-reference.md | 74 +++++++++- ben-py/docs/how-to/index.md | 7 + ben-py/docs/how-to/subsample.md | 27 +++- ben-py/docs/how-to/troubleshooting.md | 46 ++++++ ben-py/docs/index.md | 2 + ben-py/docs/user/using_ben_py.ipynb | 2 +- ben-py/pyproject.toml | 3 + ben-py/src/decode/bundle_decoder.rs | 38 ++++- ben-py/src/decode/decoder.rs | 34 ++++- ben-py/src/encode/bundle_encoder.rs | 21 ++- ben-py/src/encode/encoder.rs | 11 ++ ben-py/tests/test_bundle_api.py | 31 +++- ben-py/tests/test_surface.py | 2 + ben-py/uv.lock | 4 + ben/src/io/bundle/format.rs | 22 ++- ben/src/io/bundle/tests/format.rs | 24 +++- ben/src/io/bundle/tests/writer.rs | 62 ++++++++ ben/src/io/bundle/writer.rs | 12 +- 31 files changed, 626 insertions(+), 64 deletions(-) create mode 100644 ben-py/docs/changelog.md create mode 100644 ben-py/docs/how-to/analyze-with-numpy-pandas.md diff --git a/ben-py/binary_ensemble/__init__.py b/ben-py/binary_ensemble/__init__.py index 36caafd..221a2b5 100644 --- a/ben-py/binary_ensemble/__init__.py +++ b/ben-py/binary_ensemble/__init__.py @@ -15,6 +15,8 @@ All public symbols are re-exported here for convenience. """ +from importlib import metadata as _metadata + from binary_ensemble import bundle, codec, graph, stream from binary_ensemble.bundle import ( BendlDecoder, @@ -32,6 +34,11 @@ ) from binary_ensemble.stream import BenDecoder, BenEncoder +try: + __version__ = _metadata.version("binary-ensemble") +except _metadata.PackageNotFoundError: # source tree imported without an installed dist + __version__ = "0.0.0+unknown" + __all__ = [ # Submodules "stream", diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index a59a41d..6dc2a3e 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -29,12 +29,18 @@ import tempfile from typing import Any, Optional, Union -from binary_ensemble._core import BendlDecoder +from binary_ensemble._core import BendlDecoder, BendlStreamSession from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder from binary_ensemble._core import recompress_bundle as _recompress_bundle from binary_ensemble._core import relabel_bundle as _relabel_bundle -__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream", "relabel_bundle"] +__all__ = [ + "BendlEncoder", + "BendlDecoder", + "BendlStreamSession", + "compress_stream", + "relabel_bundle", +] def _atomic_or_out(transform, path, out_file, in_place, suffix=".bendl"): @@ -92,6 +98,15 @@ class BendlEncoder: ``stream()``): either ``with BendlEncoder(...) as enc: ...`` or an explicit :meth:`close`. In append mode (:meth:`append`), an existing finalized bundle is grown with new assets and ``stream()`` is unavailable. + + Args: + file_path: Output path for the new bundle. Must not exist unless + ``overwrite=True``. + overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. + + Raises: + OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it + cannot be created. """ def __init__(self, file_path, overwrite: bool = False) -> None: @@ -147,7 +162,11 @@ def add_asset( """Embed a custom asset under ``name``. ``content_type`` is ``"json"`` (payload must be valid UTF-8 JSON; the - decoder will auto-parse it) or ``"text"`` (payload must be valid UTF-8). + decoder will auto-parse it), ``"text"`` (payload must be valid UTF-8), + or ``"binary"`` (arbitrary bytes, stored verbatim — e.g. a zipped + shapefile or a GeoPackage). Every asset carries a CRC32C integrity + checksum, and payloads of 1 KiB or more are xz-compressed on disk by + default (transparent on read). """ data = _coerce_bytes(payload) if content_type == "json": @@ -164,9 +183,9 @@ def add_asset( raise ValueError( f"content_type='text' requires valid UTF-8: {exc}" ) from exc - else: + elif content_type != "binary": raise ValueError( - f"content_type must be 'json' or 'text', got {content_type!r}" + f"content_type must be 'json', 'text', or 'binary', got {content_type!r}" ) self._enc.add_asset(name, data, content_type) diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi index 3cdaeed..8201c66 100644 --- a/ben-py/binary_ensemble/bundle.pyi +++ b/ben-py/binary_ensemble/bundle.pyi @@ -1,9 +1,15 @@ from typing import Any, Optional, Union from binary_ensemble._core import BendlDecoder as BendlDecoder -from binary_ensemble._core import BendlStreamSession +from binary_ensemble._core import BendlStreamSession as BendlStreamSession -__all__ = ["BendlEncoder", "BendlDecoder", "compress_stream", "relabel_bundle"] +__all__ = [ + "BendlEncoder", + "BendlDecoder", + "BendlStreamSession", + "compress_stream", + "relabel_bundle", +] class BendlEncoder: def __init__(self, file_path, overwrite: bool = False) -> None: ... diff --git a/ben-py/docs/api/bundle.md b/ben-py/docs/api/bundle.md index 250d964..0b3ea47 100644 --- a/ben-py/docs/api/bundle.md +++ b/ben-py/docs/api/bundle.md @@ -106,7 +106,7 @@ with encoder.stream("ben", variant="twodelta") as stream: ``` ```{eval-rst} -.. autoclass:: binary_ensemble._core.BendlStreamSession +.. autoclass:: binary_ensemble.bundle.BendlStreamSession :members: ``` diff --git a/ben-py/docs/changelog.md b/ben-py/docs/changelog.md new file mode 100644 index 0000000..eef3331 --- /dev/null +++ b/ben-py/docs/changelog.md @@ -0,0 +1,28 @@ +# Changelog + +Notable changes to the `binary-ensemble` Python package, newest first. The byte-level +stability promises for the BEN/XBEN/BENDL formats themselves are covered separately in +[Compatibility and stability](concepts/compatibility.md). + +## 1.0.0 + +First stable release of the rewritten Python API. + +- **`.bendl` bundles** — `BendlEncoder` / `BendlDecoder` read and write the single-file + bundle format: an assignment stream plus the dual graph, node permutation map, metadata, + and custom assets. `compress_stream` recompresses a bundle's stream to XBEN and + `relabel_bundle` reorders a bundle's graph and rewrites its stream to match, both + preserving every asset. +- **Plain streams** — `BenEncoder` / `BenDecoder` write and iterate plain `.ben`/`.xben` + streams, with frame-skipping subsampling (`subsample_indices`, `subsample_range`, + `subsample_every`) shared with the bundle decoder. +- **Whole-file codecs** — `encode_jsonl_to_ben`, `encode_jsonl_to_xben`, + `encode_ben_to_xben`, and the matching `decode_*` helpers convert complete files + between JSONL, BEN, and XBEN. +- **Graph reordering** — `binary_ensemble.graph` exposes the MLC, RCM, and key-based + orderings used by `add_graph` and `relabel_bundle`. +- **Encoding variants** — `standard`, `mkv_chain`, and `twodelta` (the default), with + automatic variant detection on read. +- `binary_ensemble.__version__` reports the installed package version. + +Requires Python 3.11+; NetworkX is the only runtime dependency. diff --git a/ben-py/docs/concepts/compatibility.md b/ben-py/docs/concepts/compatibility.md index fdb7284..c8b1943 100644 --- a/ben-py/docs/concepts/compatibility.md +++ b/ben-py/docs/concepts/compatibility.md @@ -85,6 +85,7 @@ context around the stream. For serious runs, store at least: - creation date and operator notes. ```python +import binary_ensemble from binary_ensemble import BendlEncoder encoder = BendlEncoder("compatibility.bendl", overwrite=True) @@ -93,7 +94,7 @@ encoder.add_metadata( "sampler": "ReCom", "seed": 1234, "node_order": "GEOID20", - "binary_ensemble": "record the package version here", + "binary_ensemble": binary_ensemble.__version__, } ) with encoder.stream("ben") as stream: diff --git a/ben-py/docs/concepts/formats.md b/ben-py/docs/concepts/formats.md index 7ca7e48..f322bb1 100644 --- a/ben-py/docs/concepts/formats.md +++ b/ben-py/docs/concepts/formats.md @@ -129,7 +129,9 @@ the embedded assignment stream, then a directory table at the end: The writer lays the file down in order — a provisional header marked *unfinalized*, then assets, then the stream, then the directory — and **patches the header last** to flip it to finalized and fill in the final lengths, checksum, and sample count. So if the process dies mid-write, the -partial file is still recoverable (assignments read to end-of-file) and clearly flagged incomplete; +partial file is clearly flagged incomplete and the stream bytes that reached disk are still +salvageable — see +[Recovering samples from a crashed run](../how-to/troubleshooting.md#recovering-samples-from-a-crashed-run); that final header patch is the single commit point. ## Going deeper diff --git a/ben-py/docs/concepts/limitations.md b/ben-py/docs/concepts/limitations.md index 81210cb..0deda06 100644 --- a/ben-py/docs/concepts/limitations.md +++ b/ben-py/docs/concepts/limitations.md @@ -73,8 +73,29 @@ Assignments store integer district ids. The practical limit is 16-bit positive d which is far above normal statewide redistricting use. Non-integer labels should be mapped to integers before encoding. -## No geospatial geometry +## Geospatial data travels as opaque blobs -Bundles can store graph JSON and custom text or JSON assets, but they do not embed arbitrary -geospatial file trees by default. Store geometry paths, hashes, and provenance in metadata, or -ship the geometry separately when readers need it. +Bundles can carry geospatial data — a zipped shapefile, a GeoPackage, a GeoJSON file — as +custom binary assets. The payload is stored verbatim with a CRC32C integrity checksum +(xz-compressed on disk when it is 1 KiB or larger, transparently decompressed on read): + +```python +from binary_ensemble import BendlDecoder, BendlEncoder + +# Stand-in for real geometry bytes, e.g. open("tracts.gpkg", "rb").read(). +gpkg_bytes = b"GPKG\x00\x01" + bytes(range(256)) + +encoder = BendlEncoder("with_geometry.bendl", overwrite=True) +encoder.add_asset("tracts.gpkg", gpkg_bytes, content_type="binary") +encoder.close() + +decoder = BendlDecoder("with_geometry.bendl") +assert decoder.read_asset_bytes("tracts.gpkg") == gpkg_bytes +``` + +What the bundle does **not** do is interpret the geometry: there is no spatial indexing, no +geometry validation, and — most importantly — no enforcement that the geometry's feature order +matches the dual graph's node order. That correspondence is the caller's responsibility, exactly +as it is for the graph itself. For large geometry collections that several bundles share, storing +paths, hashes, and provenance in metadata and shipping the geometry separately is still often the +better layout — embedding is a convenience for self-contained archives, not a requirement. diff --git a/ben-py/docs/concepts/performance.md b/ben-py/docs/concepts/performance.md index d83e554..f979eb4 100644 --- a/ben-py/docs/concepts/performance.md +++ b/ben-py/docs/concepts/performance.md @@ -99,6 +99,12 @@ for assignment in BendlDecoder("ensemble.bendl").subsample_every(25): BEN streams are cheapest to subsample. XBEN streams pay a decompression startup cost, then can still skip through the decoded stream efficiently. +How much work a skipped sample costs depends on the [encoding variant](variants.md): +`standard` and `mkv_chain` frames are skipped wholesale without unpacking, while `twodelta` +(the default) replays the deltas between snapshots, so skipped samples are cheaper but not +free. Choose `variant="standard"` or `variant="mkv_chain"` at encode time if repeated random +access is the dominant workload. + ## Practical workflow For serious runs: diff --git a/ben-py/docs/concepts/release-versioning.md b/ben-py/docs/concepts/release-versioning.md index ecee41e..2fd68cb 100644 --- a/ben-py/docs/concepts/release-versioning.md +++ b/ben-py/docs/concepts/release-versioning.md @@ -4,14 +4,14 @@ This page describes the promises users should rely on at release boundaries. ## Python package versions -The Python package version is the normal package version installed by `pip`. Use it to record -which Python bindings wrote or read a bundle. +The Python package version is the normal package version installed by `pip`, exposed as +`binary_ensemble.__version__`. Use it to record which Python bindings wrote or read a +bundle. ```python -from importlib import metadata +import binary_ensemble -version = metadata.version("binary-ensemble") -print(version) +print(binary_ensemble.__version__) ``` For reproducible runs, store that value in bundle metadata alongside sampler settings, graph diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index 1418345..ee72230 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -92,6 +92,8 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "networkx": ("https://networkx.org/documentation/stable/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), } # -- linkcheck --------------------------------------------------------------- diff --git a/ben-py/docs/getting-started/installation.md b/ben-py/docs/getting-started/installation.md index 69f18c2..44417b4 100644 --- a/ben-py/docs/getting-started/installation.md +++ b/ben-py/docs/getting-started/installation.md @@ -29,6 +29,7 @@ pre-existing JSONL files. ```python import binary_ensemble +print(binary_ensemble.__version__) print(binary_ensemble.__all__) ``` @@ -51,10 +52,11 @@ maturin develop --release # builds the extension and installs it editable ## Command-line tools This Python package wraps the same engine as the project's CLI tools (`ben`, `reben`, -`bendl`, `pcben`). From a checkout, build or install those tools with Cargo: +`bendl`, `pcben`). Install them with Cargo, either from crates.io or from a checkout: ```bash -cargo install --path ben +cargo install binary-ensemble # from crates.io +cargo install --path ben # from a repository checkout ``` The Python API mirrors the CLI's structure — see [The API map](../concepts/api-map.md). diff --git a/ben-py/docs/how-to/analyze-with-numpy-pandas.md b/ben-py/docs/how-to/analyze-with-numpy-pandas.md new file mode 100644 index 0000000..22fcce3 --- /dev/null +++ b/ben-py/docs/how-to/analyze-with-numpy-pandas.md @@ -0,0 +1,132 @@ +# Analyze an ensemble with NumPy and pandas + +The decoders yield each plan as a plain `list[int]`, which makes them sampler-agnostic — +but for real analysis you usually want the whole ensemble (or a slice of it) as one +array, so scores can be computed vectorized instead of plan by plan. This guide shows the +patterns; it assumes the sample files from [How-to guides](index.md). + +## Load an ensemble into a NumPy array + +Stack the decoded plans into an `(n_samples, n_nodes)` array. District ids fit in 16 bits +(see [Limitations](../concepts/limitations.md)), so `dtype=np.uint16` keeps the array +compact: + +```python +import numpy as np + +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +ensemble = np.array(list(decoder), dtype=np.uint16) + +print(ensemble.shape) # (120, 64): one row per sample, one column per node +``` + +```{admonition} Will it fit? +:class: warning +The array costs `n_samples × n_nodes × 2` bytes at `uint16`. A 10,000-plan ensemble on a +10,000-node graph is a comfortable 200 MB — but 100k plans on 140k census blocks is 28 GB. +For ensembles that big, [subsample](subsample.md) before stacking, or use the streaming +pattern at the end of this page. +``` + +To stack only a slice, chain a subsample call into the same expression: + +```python +thinned = np.array(list(BendlDecoder("ensemble.bendl").subsample_every(10)), dtype=np.uint16) +print(thinned.shape) # (12, 64) +``` + +## Score plans vectorized + +With the ensemble as an array, per-plan statistics become one-liners over `axis=1`: + +```python +district_one_share = (ensemble == 1).mean(axis=1) # fraction of nodes in district 1 +districts_used = ensemble.max(axis=1) # highest district id per plan + +print(district_one_share[:4]) +print(districts_used[:4]) +``` + +Per-district node counts for every plan come from {func}`numpy.bincount` row by row: + +```python +n_districts = int(ensemble.max()) +sizes = np.stack([np.bincount(plan, minlength=n_districts + 1)[1:] for plan in ensemble]) + +print(sizes.shape) # (120, 4): district sizes, one row per plan +``` + +If node populations live on the graph, weight the counts to get district populations: + +```python +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +populations = np.array([graph.nodes[node]["TOTPOP"] for node in graph.nodes]) + +district_pops = np.stack( + [np.bincount(plan, weights=populations, minlength=n_districts + 1)[1:] for plan in ensemble] +) +print(district_pops[0]) # population of districts 1..4 under the first plan +``` + +The order of `populations` matches the order of the assignment columns because both +follow the embedded graph's node order — that positional contract is the whole point of +bundling the graph (see [The data contract](../concepts/data-model.md)). + +## Label columns with a DataFrame + +A {class}`pandas.DataFrame` makes the node order explicit by naming each column after its +graph node. Samples are 1-indexed throughout `binary-ensemble`, so index the rows to +match: + +```python +import pandas as pd + +decoder = BendlDecoder("ensemble.bendl") +graph = decoder.read_graph() +node_labels = [graph.nodes[node]["GEOID20"] for node in graph.nodes] + +frame = pd.DataFrame(ensemble, columns=node_labels) +frame.index = pd.RangeIndex(1, len(frame) + 1, name="sample") + +print(frame.iloc[:3, :4]) +``` + +(Use `list(graph.nodes)` as the labels when the graph has no geographic key.) + +From here, everything in the pandas toolbox applies — `frame.eq(1).mean(axis=1)`, +groupbys over melted long form, joins against node-level covariates keyed by `GEOID20`, +and so on. + +## Stream when the ensemble doesn't fit in memory + +For ensembles too large to stack, iterate once and accumulate. The decoder never holds +more than one plan in memory, so this scales to millions of samples: + +```python +import numpy as np + +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") + +totals = np.zeros(decoder.read_graph().number_of_nodes()) +for assignment in decoder: + plan = np.asarray(assignment, dtype=np.uint16) + totals += plan == 1 + +frequency = totals / len(decoder) +print(frequency[:8]) # how often each node lands in district 1 +``` + +`len(decoder)` is cheap (on a finalized bundle it is read from the header), so it is safe +to use for the denominator or a progress bar. + +## Any sampler, any source + +Nothing on this page is GerryChain-specific. The decoders return plain integer lists no +matter what produced the stream — a ReCom chain, ForestReCom, Sequential Monte Carlo, or +a JSONL file converted with the [codec helpers](convert-formats.md) — so the same NumPy +and pandas patterns apply to all of them. diff --git a/ben-py/docs/how-to/error-reference.md b/ben-py/docs/how-to/error-reference.md index 936a7e3..cf86877 100644 --- a/ben-py/docs/how-to/error-reference.md +++ b/ben-py/docs/how-to/error-reference.md @@ -1,6 +1,12 @@ # Error reference -This page maps common symptoms to likely causes and fixes. +This page maps common symptoms to likely causes and fixes, naming the exception class each +one raises so you can catch it deliberately. As a rule of thumb: file-system problems +(missing files, refusing to overwrite, closed encoders) raise `OSError`; invalid argument +values (bad `sort`, bad `content_type`, conflicting output modes, mismatched assignment +lengths) raise `ValueError`; and container/format mismatches detected inside the Rust +engine (wrong reader for the file, invalid subsample positions, truncated streams) raise a +plain `Exception` whose message describes the problem. ## Output file already exists @@ -18,9 +24,12 @@ encode_jsonl_to_ben("plans.jsonl", "error-reference.ben", overwrite=True) ## Wrong reader for the file type -**Symptom:** opening a file raises an error that points you at another decoder. +**Symptom:** opening a file raises an `Exception` whose message names the decoder to use — +for example *"…is a .bendl bundle, not a plain BEN/XBEN stream. Open it with +binary_ensemble.bundle.BendlDecoder instead."* -**Cause:** `.bendl`, `.ben`, and `.xben` are different containers. +**Cause:** `.bendl`, `.ben`, and `.xben` are different containers. (A missing or unreadable +file raises `OSError` instead.) **Fix:** use the matching reader. @@ -51,6 +60,8 @@ print(decoder.asset_names()) ## Relabeling fails because the bundle has no graph +**Symptom:** `ValueError: bundle has no graph.json to reorder`. + **Cause:** `relabel_bundle()` must know the graph order to rewrite assignment positions. **Fix:** create bundles with `add_graph()`, or relabel before discarding the graph context. @@ -70,6 +81,9 @@ with encoder.stream("ben") as stream: ## Relabeling fails after XBEN recompression +**Symptom:** `ValueError: relabel_bundle only supports BEN bundles; relabel before +compressing to XBEN`. + **Cause:** `relabel_bundle()` works on `.bendl` bundles with embedded BEN streams. XBEN is the final archive step. @@ -84,6 +98,9 @@ compress_stream("error-sorted.bendl", out_file="error-archive.bendl") ## `content_type` is rejected +**Symptom:** `ValueError: content_type must be 'json' or 'text'`, or a `ValueError` about +invalid UTF-8 / invalid JSON. + **Cause:** `add_asset()` accepts only `content_type="json"` or `content_type="text"` from the Python wrapper. JSON payloads must be valid UTF-8 JSON; text payloads must be valid UTF-8. @@ -102,6 +119,9 @@ with encoder.stream("ben") as stream: ## `sort="key"` fails +**Symptom:** a `ValueError` such as `key=... is only valid with sort='key'` or +`unknown sort`. + **Cause:** key ordering requires a `key=` argument, and every node must have the relevant attribute unless you use `key="id"`. @@ -125,6 +145,54 @@ ordered_graph, _ = graph.reorder( assert ordered_graph.number_of_nodes() == 4 ``` +## A subsample call rejects its arguments + +**Symptom:** an `Exception` such as `indices must be 1-based`, `indices must be <= number +of samples in base data`, `range must be 1-based and end >= start`, or `step and offset +must be >= 1`. + +**Cause:** sample positions are 1-based everywhere, and out-of-range positions raise +rather than being silently dropped. (An unsorted or duplicated index list does not raise — +it is sorted and deduplicated with a `UserWarning`.) + +**Fix:** clamp the request to `len(decoder)` first. + +```python +from binary_ensemble import BendlDecoder + +decoder = BendlDecoder("ensemble.bendl") +window = list(decoder.subsample_range(1, min(50, len(decoder)))) +assert len(window) == 50 +``` + +## `stream.write()` rejects an assignment + +**Symptom:** `ValueError: assignment length N does not match graph node count M`. + +**Cause:** after `add_graph()`, every assignment written to the bundle's stream is checked +against the stored graph's node count. This catches node-order bugs at write time instead +of at analysis time. + +**Fix:** write assignments in the returned graph's node order — one entry per node. See +[The data contract](../concepts/data-model.md). + +## A bundle opens, but `len()` or iteration raises + +**Symptom:** `BendlDecoder(path)` succeeds, but `len()` or the first `for` loop raises an +`Exception` such as `truncated TwoDelta frame` or `failed to fill whole buffer`. + +**Cause:** the bundle was never finalized — typically a crashed run. The header is intact +but the stream's final frame was cut off mid-write. + +**Fix:** confirm with `is_complete()`, then follow +[Recovering samples from a crashed run](troubleshooting.md#recovering-samples-from-a-crashed-run). + +```python +from binary_ensemble import BendlDecoder + +print(BendlDecoder("ensemble.bendl").is_complete()) +``` + ## Assignments decode but downstream maps are wrong **Cause:** graph order and assignment order do not match. diff --git a/ben-py/docs/how-to/index.md b/ben-py/docs/how-to/index.md index 2ba6092..7d57818 100644 --- a/ben-py/docs/how-to/index.md +++ b/ben-py/docs/how-to/index.md @@ -99,6 +99,13 @@ Stream a ReCom chain straight into a self-describing `.bendl` bundle. Open a bundle, recover its graph and metadata, and walk its assignments. ::: +:::{grid-item-card} Analyze with NumPy and pandas +:link: analyze-with-numpy-pandas +:link-type: doc + +Stack an ensemble into an array or DataFrame and score every plan vectorized. +::: + :::{grid-item-card} Subsample a large ensemble :link: subsample :link-type: doc diff --git a/ben-py/docs/how-to/subsample.md b/ben-py/docs/how-to/subsample.md index 81205fc..00b1b1c 100644 --- a/ben-py/docs/how-to/subsample.md +++ b/ben-py/docs/how-to/subsample.md @@ -1,11 +1,24 @@ # Subsample a large ensemble When an ensemble has millions of plans, you often want only a slice — every 1000th plan, a -contiguous range, or a handful of specific indices. The decoders support this directly, and -they do it by **skipping** frames rather than decoding everything, so it stays fast. +contiguous range, or a handful of specific indices. The decoders support this directly: they +never materialize the samples you skip, and where the stream allows it they skip whole +frames without unpacking them. All three methods are available on both `BendlDecoder` (for bundles) and `BenDecoder` (for -plain streams). Each returns a decoder you iterate. +plain streams). Each returns a decoder you iterate. Indices are **1-based**; out-of-range +indices raise rather than being silently dropped, and an unsorted or duplicated index list +is sorted and deduplicated (with a warning). + +```{note} +How cheap skipping is depends on the stream's [encoding variant](../concepts/variants.md). +`standard` and `mkv_chain` frames state their byte length up front, so the reader hops +straight over unwanted samples. `twodelta` — the default — stores most samples as deltas, so +the reader still has to replay the deltas between snapshots to reconstruct the samples you +keep; skipped samples are cheaper (they're never built into Python lists) but not free. If +heavy random access or repeated subsampling is your primary workload, encode with +`variant="standard"` or `variant="mkv_chain"`. +``` ## By specific indices @@ -52,8 +65,8 @@ for assignment in BenDecoder("chain.xben", mode="xben").subsample_range(10, 15): ``` ```{tip} -Subsampling a BEN stream is fastest because frames can be skipped without decompressing. An -XBEN stream pays a one-time startup cost to begin reading, after which skipping is cheap -again. If you'll subsample an XBEN file repeatedly, extract it to BEN first with -[`decode_xben_to_ben`](convert-formats.md). +A BEN stream is the cheapest container to subsample. An XBEN stream pays a one-time +decompression startup cost to begin reading, after which skipping costs the same as in the +equivalent BEN stream. If you'll subsample an XBEN file repeatedly, extract it to BEN first +with [`decode_xben_to_ben`](convert-formats.md). ``` diff --git a/ben-py/docs/how-to/troubleshooting.md b/ben-py/docs/how-to/troubleshooting.md index 1ea4061..641aa02 100644 --- a/ben-py/docs/how-to/troubleshooting.md +++ b/ben-py/docs/how-to/troubleshooting.md @@ -65,6 +65,52 @@ with BendlEncoder("assets-only.bendl", overwrite=True) as encoder: encoder.add_metadata({"kind": "asset index"}) ``` +### Recovering samples from a crashed run + +An unfinalized bundle is not a write-off. The stream bytes that reached disk before the +crash are still there; only the last frame may be cut off mid-write, and the asset +directory (written at finalization) is lost. `extract_stream(allow_unfinalized=True)` +copies the partial stream out, and a salvage loop keeps every sample up to the truncated +tail: + +```python +from binary_ensemble import BendlDecoder, BenDecoder, BendlEncoder + +# allow_unfinalized=True permits extraction even though the stream checksum +# was never written. (On a finalized bundle the flag is harmless.) +BendlDecoder("ensemble.bendl").extract_stream( + "recovered.ben", overwrite=True, allow_unfinalized=True +) + +# Keep every intact sample; stop at the truncated tail frame, if any. +recovered = [] +stream = iter(BenDecoder("recovered.ben")) +while True: + try: + recovered.append(next(stream)) + except StopIteration: + break # clean end of stream + except Exception: + break # truncated tail frame from the crash + +# Re-encode the salvaged samples into a fresh, finalized bundle. +encoder = BendlEncoder("recovered.bendl", overwrite=True) +with encoder.stream("ben") as out: + for assignment in recovered: + out.write(assignment) +``` + +Two things to know about what survives a crash: + +- **Assets do not.** The bundle's directory is committed at finalization, so + `asset_names()` on a crashed bundle is empty even if you called `add_graph()` or + `add_metadata()` before streaming. Re-attach the graph and metadata to the recovered + bundle from their original sources. +- **`len()` and iteration on the crashed bundle itself raise** (the truncated tail frame + breaks the sample count), which is why the recipe extracts first and salvages from the + plain stream. For ensembles too large to buffer in a list, open the output stream first + and write each salvaged sample as it is decoded. + ## The assignments decode, but the maps look wrong This is almost always a node-order problem. Decoding can only recover the integer vectors diff --git a/ben-py/docs/index.md b/ben-py/docs/index.md index 6652a10..64febb9 100644 --- a/ben-py/docs/index.md +++ b/ben-py/docs/index.md @@ -146,6 +146,7 @@ how-to/examples-gallery how-to/anti-patterns how-to/compress-gerrychain-run how-to/read-and-iterate +how-to/analyze-with-numpy-pandas how-to/subsample how-to/convert-formats how-to/shrink-for-sharing @@ -173,6 +174,7 @@ api/index :hidden: :caption: Project +changelog format stability Rust crate source GitHub diff --git a/ben-py/docs/user/using_ben_py.ipynb b/ben-py/docs/user/using_ben_py.ipynb index 853e96e..91aab0d 100644 --- a/ben-py/docs/user/using_ben_py.ipynb +++ b/ben-py/docs/user/using_ben_py.ipynb @@ -466,7 +466,7 @@ "id": "e6bc1ecd", "metadata": {}, "source": [ - "## Subsampling\\n\\nFor winnowing a large ensemble you rarely want every plan. `BenDecoder` can yield just a\\nsubset by **skipping** frames, so it stays fast. Indices are 1-based.\\n\\nA decoder is reusable: call the `subsample_*` methods on the **same** `BenDecoder` as many\\ntimes as you like. Each call rewinds to the start of the stream and applies the new\\nselection, so there's no need to open a fresh decoder per subsample." + "## Subsampling\\n\\nFor winnowing a large ensemble you rarely want every plan. `BenDecoder` can yield just a\\nsubset without materializing the rest. Indices are 1-based. (How cheap a skipped sample is\\ndepends on the variant: `standard` and `mkv_chain` frames are skipped wholesale, while\\n`twodelta` — the default — replays the deltas between snapshots.)\\n\\nA decoder is reusable: call the `subsample_*` methods on the **same** `BenDecoder` as many\\ntimes as you like. Each call rewinds to the start of the stream and applies the new\\nselection, so there's no need to open a fresh decoder per subsample." ] }, { diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index 818a54b..f89ea29 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -54,6 +54,9 @@ dev = [ "ipykernel>=7.0.1", "ipywidgets>=8.1.7", "maturin>=1.9.6", + # Used directly by the docs how-to snippets, which run under pytest. + "numpy>=1.26", + "pandas>=2.0", "pytest>=8.4.2", "ruff>=0.11.0", "tqdm>=4.67.1", diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index 013862c..c41d22a 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -27,6 +27,17 @@ use std::path::PathBuf; /// points the caller at :class:`~binary_ensemble.stream.BenDecoder`. A finalized assets-only /// bundle (one written with no assignment stream) iterates to nothing with ``len() == 0``. /// +/// Args: +/// file_path: Path to the input ``.bendl`` file. Whether the embedded stream is BEN or +/// XBEN is read from the bundle header; an XBEN stream warns about a one-time +/// decompression startup cost. +/// +/// Raises: +/// Exception: If ``file_path`` is not a bundle (use +/// :class:`~binary_ensemble.stream.BenDecoder` for plain streams), or its header +/// cannot be parsed. +/// OSError: If the file cannot be opened. +/// /// Example: /// >>> from binary_ensemble import BendlDecoder /// >>> dec = BendlDecoder("ensemble.bendl") @@ -133,8 +144,9 @@ impl PyBendlDecoder { /// Count the samples in the embedded stream. /// /// The result is the *expanded* sample count (a frame repeating five identical samples - /// contributes five). It is computed lazily and cached, so repeated calls and ``len()`` - /// are cheap. + /// contributes five). On a finalized bundle the count is read from the bundle header, + /// so it never requires scanning the stream; it is cached either way, so repeated + /// calls and ``len()`` are cheap. /// /// Returns: /// int: The number of samples in the bundle's stream. @@ -145,13 +157,20 @@ impl PyBendlDecoder { /// Restrict iteration to the samples at the given 1-indexed positions. /// - /// Selected samples are reached by skipping frames rather than decoding the whole stream. + /// Skipped samples are never materialized as Python lists, and where the encoding + /// variant allows it (``standard``, ``mkv_chain``) whole frames are skipped without + /// being unpacked. /// /// Args: - /// indices: The 1-indexed sample numbers to keep. + /// indices: The 1-indexed sample numbers to keep. An unsorted or duplicated list + /// is sorted and deduplicated, with a ``UserWarning``. /// /// Returns: /// BendlDecoder: ``self``, so the call can be chained into a ``for`` loop. + /// + /// Raises: + /// Exception: If any index is ``0`` (indices are 1-based) or greater than the + /// number of samples in the stream. #[pyo3(text_signature = "(self, indices, /)")] fn subsample_indices<'py>( mut slf: PyRefMut<'py, Self>, @@ -171,6 +190,10 @@ impl PyBendlDecoder { /// Returns: /// BendlDecoder: ``self``, for chaining into a ``for`` loop. /// + /// Raises: + /// Exception: If ``start`` is ``0``, ``end`` is less than ``start``, or ``end`` + /// is greater than the number of samples in the stream. + /// /// Example: /// >>> list(BendlDecoder("ensemble.bendl").subsample_range(10, 15)) /// # samples 10, 11, 12, 13, 14, and 15 @@ -193,6 +216,9 @@ impl PyBendlDecoder { /// /// Returns: /// BendlDecoder: ``self``, for chaining into a ``for`` loop. + /// + /// Raises: + /// Exception: If ``step`` or ``offset`` is ``0`` (both are 1-based). #[pyo3(signature = (step, offset=1))] #[pyo3(text_signature = "(self, step, offset=1)")] fn subsample_every<'py>( @@ -326,7 +352,9 @@ impl PyBendlDecoder { /// The stored adjacency-format JSON is rebuilt into a live graph via /// `networkx.readwrite.json_graph.adjacency_graph`, so its node order matches the order /// assignments were written in and it can be handed straight to consumers like GerryChain's - /// `Partition`. The raw JSON is still available through `read_json_asset("graph.json")`. + /// `Partition`. The result is a :class:`networkx.Graph` — or a + /// :class:`networkx.MultiGraph` if the stored adjacency declares itself a multigraph. + /// The raw JSON is still available through `read_json_asset("graph.json")`. #[pyo3(text_signature = "(self)")] fn read_graph<'py>(&mut self, py: Python<'py>) -> PyResult>> { let Some(data) = self.read_known_json(py, ASSET_TYPE_GRAPH, "graph.json")? else { diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 8351967..327835b 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -17,6 +17,17 @@ use std::path::PathBuf; /// inspection surface (assets, embedded graph, metadata). This mirrors the ``ben`` vs /// ``bendl`` split of the command-line tools. /// +/// Args: +/// file_path: Path to the input ``.ben`` or ``.xben`` file. +/// mode: Which reader to use — ``"ben"`` or ``"xben"``. Defaults to ``"ben"``. +/// Opening an XBEN stream warns about a one-time decompression startup cost. +/// +/// Raises: +/// Exception: If ``file_path`` is a ``.bendl`` bundle (use +/// :class:`~binary_ensemble.bundle.BendlDecoder` instead), or ``mode`` does not +/// match the file's actual format. +/// OSError: If the file cannot be opened or its banner is malformed. +/// /// Example: /// >>> from binary_ensemble import BenDecoder /// >>> for assignment in BenDecoder("plans.ben"): @@ -95,8 +106,8 @@ impl PyBenDecoder { /// Count the samples in the stream. /// /// The result is the *expanded* sample count: a frame that repeats five identical - /// samples contributes five. The count is computed lazily and cached, so repeated calls - /// (and ``len()``) are cheap. + /// samples contributes five. The first call walks the stream to count; the result is + /// cached, so repeated calls (and ``len()``) are cheap afterwards. /// /// Returns: /// int: The number of samples in the stream. @@ -107,15 +118,21 @@ impl PyBenDecoder { /// Restrict iteration to the samples at the given 1-indexed positions. /// - /// Selected samples are reached by skipping frames rather than decoding the whole - /// stream, so this stays fast on large ensembles. + /// Skipped samples are never materialized as Python lists, and where the encoding + /// variant allows it (``standard``, ``mkv_chain``) whole frames are skipped without + /// being unpacked, so this stays fast on large ensembles. /// /// Args: - /// indices: The 1-indexed sample numbers to keep. + /// indices: The 1-indexed sample numbers to keep. An unsorted or duplicated list + /// is sorted and deduplicated, with a ``UserWarning``. /// /// Returns: /// BenDecoder: ``self``, so the call can be chained directly into a ``for`` loop. /// + /// Raises: + /// Exception: If any index is ``0`` (indices are 1-based) or greater than the + /// number of samples in the stream. + /// /// Example: /// >>> for plan in BenDecoder("plans.ben").subsample_indices([1, 500, 9999]): /// ... ... @@ -138,6 +155,10 @@ impl PyBenDecoder { /// Returns: /// BenDecoder: ``self``, for chaining into a ``for`` loop. /// + /// Raises: + /// Exception: If ``start`` is ``0``, ``end`` is less than ``start``, or ``end`` + /// is greater than the number of samples in the stream. + /// /// Example: /// >>> list(BenDecoder("plans.ben").subsample_range(10, 15)) /// # samples 10, 11, 12, 13, 14, and 15 @@ -161,6 +182,9 @@ impl PyBenDecoder { /// Returns: /// BenDecoder: ``self``, for chaining into a ``for`` loop. /// + /// Raises: + /// Exception: If ``step`` or ``offset`` is ``0`` (both are 1-based). + /// /// Example: /// >>> for plan in BenDecoder("plans.ben").subsample_every(1000): /// ... ... diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index 9a63dc8..dc91ac2 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -38,9 +38,11 @@ fn map_io_err(err: io::Error) -> PyErr { fn opts_for(content_type: &str) -> PyResult { match content_type { "json" => Ok(AddAssetOptions::defaults().json()), - "text" => Ok(AddAssetOptions::defaults()), + // "text" and "binary" carry the same wire options (raw bytes, default compression + // policy); the two names exist so call sites document their payloads honestly. + "text" | "binary" => Ok(AddAssetOptions::defaults()), other => Err(PyValueError::new_err(format!( - "content_type must be 'json' or 'text', got {other:?}" + "content_type must be 'json', 'text', or 'binary', got {other:?}" ))), } } @@ -157,18 +159,25 @@ impl PyBendlEncoder { /// Add a custom asset (asset type ``CUSTOM``). /// + /// Payloads are stored verbatim with a CRC32C integrity checksum, so any bytes round-trip — + /// including binary blobs such as zipped shapefiles or GeoPackages. Payloads at or above 1 + /// KiB are xz-compressed on disk by default (transparent on read); already-compressed blobs + /// gain little from this but are not harmed by it. + /// /// Args: /// name: Asset name stored in the bundle directory. - /// payload: UTF-8 text or JSON bytes to store. - /// content_type: Either ``"json"`` or ``"text"``. JSON assets are marked so - /// :meth:`binary_ensemble.bundle.BendlDecoder.read_json_asset` can parse them. + /// payload: The bytes to store. + /// content_type: ``"json"``, ``"text"``, or ``"binary"``. JSON assets are marked so + /// :meth:`binary_ensemble.bundle.BendlDecoder.read_json_asset` can parse them; + /// ``"text"`` and ``"binary"`` store the bytes unmarked. /// /// Raises: - /// ValueError: If ``content_type`` is not ``"json"`` or ``"text"``. + /// ValueError: If ``content_type`` is not ``"json"``, ``"text"``, or ``"binary"``. /// Exception: If the encoder is closed, failed, or currently streaming. /// /// Example: /// >>> encoder.add_asset("scores.json", '{"cut_edges": [10]}', content_type="json") + /// >>> encoder.add_asset("tracts.gpkg", gpkg_bytes, content_type="binary") #[pyo3(signature = (name, payload, content_type))] #[pyo3(text_signature = "(self, name, payload, content_type)")] fn add_asset(&mut self, name: &str, payload: Vec, content_type: &str) -> PyResult<()> { diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index 83acbbf..c4de457 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -18,6 +18,17 @@ use std::path::PathBuf; /// This produces a plain BEN stream with no bundle framing. To produce a self-describing /// ``.bendl`` bundle (with an embedded graph, metadata, or other assets) use /// :class:`~binary_ensemble.bundle.BendlEncoder` instead. +/// +/// Args: +/// file_path: Output path. Must not exist unless ``overwrite=True``. +/// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. +/// variant: BEN encoding variant for the stream — ``"standard"``, ``"mkv_chain"``, +/// or ``"twodelta"``. ``None`` (the default) means ``"twodelta"``. +/// +/// Raises: +/// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be +/// created. +/// ValueError: If ``variant`` is not a recognized variant name. #[pyclass(module = "binary_ensemble", name = "BenEncoder", unsendable)] pub struct PyBenEncoder { writer: Option>>, diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index 5d83c77..3a32ca8 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -155,8 +155,8 @@ def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: def test_add_asset_content_type_validation(tmp_path: Path) -> None: enc = BendlEncoder(tmp_path / "v.bendl", overwrite=True) - with pytest.raises(ValueError, match="must be 'json' or 'text'"): - enc.add_asset("x", b"data", content_type="binary") + with pytest.raises(ValueError, match="must be 'json', 'text', or 'binary'"): + enc.add_asset("x", b"data", content_type="parquet") with pytest.raises(ValueError, match="valid UTF-8 JSON"): enc.add_asset("bad.json", "not json", content_type="json") with pytest.raises(ValueError, match="valid UTF-8"): @@ -164,12 +164,39 @@ def test_add_asset_content_type_validation(tmp_path: Path) -> None: # Valid forms succeed. enc.add_asset("ok.json", '{"a":1}', content_type="json") enc.add_asset("ok.txt", "fine", content_type="text") + enc.add_asset("ok.bin", b"\xff\xfe\x00\x01", content_type="binary") enc.close() dec = BendlDecoder(tmp_path / "v.bendl") assert dec.read_json_asset("ok.json") == {"a": 1} flags = {a["name"]: a["flags"] for a in dec.list_assets()} assert "json" in flags["ok.json"] assert "json" not in flags["ok.txt"] + assert "json" not in flags["ok.bin"] + + +def test_binary_asset_round_trips_arbitrary_bytes(tmp_path: Path) -> None: + # A blob that is deliberately not valid UTF-8 and not JSON — the shape of a zipped + # shapefile or GeoPackage — must round-trip byte-exactly under CRC protection. + blob = bytes(range(256)) * 5 + enc = BendlEncoder(tmp_path / "blob.bendl", overwrite=True) + enc.add_asset("tracts.gpkg", blob, content_type="binary") + enc.close() + + dec = BendlDecoder(tmp_path / "blob.bendl") + assert dec.read_asset_bytes("tracts.gpkg") == blob + + +def test_large_assets_compress_transparently(tmp_path: Path) -> None: + # Payloads at or above the writer's 1 KiB threshold are xz-compressed on disk by default; + # the read side decompresses transparently, so round-trips are unaffected. + big_json = json.dumps({"scores": list(range(2000))}) + assert len(big_json) >= 1024 + enc = BendlEncoder(tmp_path / "big.bendl", overwrite=True) + enc.add_asset("scores.json", big_json, content_type="json") + enc.close() + + dec = BendlDecoder(tmp_path / "big.bendl") + assert dec.read_json_asset("scores.json") == {"scores": list(range(2000))} # --------------------------------------------------------------------------- diff --git a/ben-py/tests/test_surface.py b/ben-py/tests/test_surface.py index e0a0dc5..9f158f6 100644 --- a/ben-py/tests/test_surface.py +++ b/ben-py/tests/test_surface.py @@ -60,10 +60,12 @@ def test_bundle_module_exports() -> None: assert set(bundle.__all__) == { "BendlEncoder", "BendlDecoder", + "BendlStreamSession", "compress_stream", "relabel_bundle", } assert bundle.BendlDecoder is _core.BendlDecoder + assert bundle.BendlStreamSession is _core.BendlStreamSession def test_codec_module_exports() -> None: diff --git a/ben-py/uv.lock b/ben-py/uv.lock index a380723..6d8db84 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -119,6 +119,8 @@ dev = [ { name = "ipykernel" }, { name = "ipywidgets" }, { name = "maturin" }, + { name = "numpy" }, + { name = "pandas" }, { name = "pytest" }, { name = "ruff" }, { name = "tqdm" }, @@ -147,6 +149,8 @@ dev = [ { name = "ipykernel", specifier = ">=7.0.1" }, { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "maturin", specifier = ">=1.9.6" }, + { name = "numpy", specifier = ">=1.26" }, + { name = "pandas", specifier = ">=2.0" }, { name = "pytest", specifier = ">=8.4.2" }, { name = "ruff", specifier = ">=0.11.0" }, { name = "tqdm", specifier = ">=4.67.1" }, diff --git a/ben/src/io/bundle/format.rs b/ben/src/io/bundle/format.rs index 749240a..d534181 100644 --- a/ben/src/io/bundle/format.rs +++ b/ben/src/io/bundle/format.rs @@ -142,10 +142,24 @@ impl KnownAssetKind { } } -/// Return whether a given asset type should default to xz compression when the writer is not given -/// an explicit compression option. -pub fn default_compresses_by_type(asset_type: u16) -> bool { - matches!(asset_type, ASSET_TYPE_GRAPH) +/// Payload size at and above which the writer compresses an asset by default. +/// +/// Below this, the xz container overhead (~60–90 bytes) can exceed the savings — a ~100-byte +/// `metadata.json` would *grow* under compression — so small payloads stay raw. At or above it, +/// the JSON/text payloads bundles typically carry (per-plan scores, node maps, provenance) +/// compress well for negligible CPU. An explicit [`AddAssetOptions::raw`] or +/// [`AddAssetOptions::compress`] always overrides the default. +/// +/// [`AddAssetOptions::raw`]: super::writer::AddAssetOptions::raw +/// [`AddAssetOptions::compress`]: super::writer::AddAssetOptions::compress +pub const DEFAULT_ASSET_COMPRESSION_THRESHOLD: usize = 1024; + +/// Return whether an asset should default to xz compression when the writer is not given an +/// explicit compression option: graphs always compress (they are the bundle's bulkiest JSON and +/// compress extremely well), and any other asset compresses once its payload reaches +/// [`DEFAULT_ASSET_COMPRESSION_THRESHOLD`]. +pub fn default_compresses(asset_type: u16, payload_len: usize) -> bool { + asset_type == ASSET_TYPE_GRAPH || payload_len >= DEFAULT_ASSET_COMPRESSION_THRESHOLD } /// Asset flag bit: the decoded payload is UTF-8 JSON. diff --git a/ben/src/io/bundle/tests/format.rs b/ben/src/io/bundle/tests/format.rs index ea1cafd..a05d7c1 100644 --- a/ben/src/io/bundle/tests/format.rs +++ b/ben/src/io/bundle/tests/format.rs @@ -25,10 +25,26 @@ fn standardized_name_lookup() { #[test] fn default_compression_policy() { - assert!(default_compresses_by_type(ASSET_TYPE_GRAPH)); - assert!(!default_compresses_by_type(ASSET_TYPE_METADATA)); - assert!(!default_compresses_by_type(ASSET_TYPE_NODE_PERMUTATION_MAP)); - assert!(!default_compresses_by_type(ASSET_TYPE_CUSTOM)); + // Graphs always compress, regardless of size. + assert!(default_compresses(ASSET_TYPE_GRAPH, 0)); + assert!(default_compresses(ASSET_TYPE_GRAPH, 10)); + + // Everything else is size-gated: below the threshold the xz container overhead can exceed + // the savings, so small payloads stay raw. + for asset_type in [ + ASSET_TYPE_METADATA, + ASSET_TYPE_NODE_PERMUTATION_MAP, + ASSET_TYPE_CUSTOM, + ] { + assert!(!default_compresses( + asset_type, + DEFAULT_ASSET_COMPRESSION_THRESHOLD - 1 + )); + assert!(default_compresses( + asset_type, + DEFAULT_ASSET_COMPRESSION_THRESHOLD + )); + } } #[test] diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 3488808..839fcce 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -723,6 +723,68 @@ fn stress_many_custom_assets_round_trip() { } } +#[test] +fn default_compression_is_size_gated_end_to_end() { + use crate::io::bundle::format::DEFAULT_ASSET_COMPRESSION_THRESHOLD; + + let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); + + // Small custom asset: stays raw by default (compressing it would grow it). + let small = vec![7u8; 16]; + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "small.bin", + &small, + AddAssetOptions::defaults(), + ) + .unwrap(); + + // Threshold-sized custom asset: compresses by default. Highly repetitive payload so the + // on-disk size visibly shrinks. + let large = vec![7u8; DEFAULT_ASSET_COMPRESSION_THRESHOLD]; + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "large.bin", + &large, + AddAssetOptions::defaults(), + ) + .unwrap(); + + // Explicit raw() on a large payload overrides the default. + writer + .add_asset( + ASSET_TYPE_CUSTOM, + "large_raw.bin", + &large, + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + + let writer = write_stream_bytes_via_session(writer, b"STANDARD BEN FILE\x00fake", 1); + let buf = writer.finish().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(buf)).unwrap(); + + let small_entry = reader.find_asset_by_name("small.bin").cloned().unwrap(); + assert_eq!(small_entry.asset_flags & ASSET_FLAG_XZ, 0); + assert_eq!(reader.asset_bytes(&small_entry).unwrap(), small); + + let large_entry = reader.find_asset_by_name("large.bin").cloned().unwrap(); + assert_ne!(large_entry.asset_flags & ASSET_FLAG_XZ, 0); + assert!( + (large_entry.payload_len as usize) < DEFAULT_ASSET_COMPRESSION_THRESHOLD, + "compressed on-disk payload should be smaller than the raw payload" + ); + assert_eq!(reader.asset_bytes(&large_entry).unwrap(), large); + + let raw_entry = reader.find_asset_by_name("large_raw.bin").cloned().unwrap(); + assert_eq!(raw_entry.asset_flags & ASSET_FLAG_XZ, 0); + assert_eq!(raw_entry.payload_len as usize, large.len()); + assert_eq!(reader.asset_bytes(&raw_entry).unwrap(), large); +} + #[test] fn append_empty_commit_is_noop() { let (bundle, _) = build_base_bundle(); diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index a714242..98fa2b7 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -30,10 +30,10 @@ use thiserror::Error; use xz2::write::XzEncoder; use super::format::{ - default_compresses_by_type, encode_directory, read_directory, standardized_name_for, - AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, - ASSET_FLAG_CHECKSUM, ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, DEFAULT_XZ_PRESET, - FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, + default_compresses, encode_directory, read_directory, standardized_name_for, AssignmentFormat, + BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, ASSET_FLAG_CHECKSUM, + ASSET_FLAG_JSON, ASSET_FLAG_XZ, ASSET_TYPE_CUSTOM, DEFAULT_XZ_PRESET, FINALIZED_YES, + HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, }; /// Options passed alongside each [`BendlWriter::add_asset`] call. @@ -253,7 +253,7 @@ impl BendlWriter { let compress = options .compress - .unwrap_or_else(|| default_compresses_by_type(asset_type)); + .unwrap_or_else(|| default_compresses(asset_type, payload.len())); let encoded = encode_asset_payload(payload.to_vec(), compress, options.is_json)?; // Write at current file position. @@ -655,7 +655,7 @@ impl BendlAppender { let compress = options .compress - .unwrap_or_else(|| default_compresses_by_type(asset_type)); + .unwrap_or_else(|| default_compresses(asset_type, payload.len())); self.pending.push(PendingAsset { asset_type, From 72d0018e6840185ef2f94cd167d666cb4c27dda3 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:12:26 -0600 Subject: [PATCH 164/221] Note compression bomb risk (consequence of format and nothing to do here) --- docs/ben-format-spec.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/ben-format-spec.md b/docs/ben-format-spec.md index ca34d88..bf5e4ab 100644 --- a/docs/ben-format-spec.md +++ b/docs/ben-format-spec.md @@ -238,6 +238,13 @@ places no limit on assignment length, but each run can demand up to 65535 elemen bound a small malicious frame could request an arbitrarily large allocation. The bound MUST sit well above any real dual graph (this implementation uses 2^27 ≈ 134 million elements). +Consumers decoding untrusted streams should also be aware of the format's decompression-bomb +characteristic: a frame's expanded output is `assignment length × count`, so a few dozen wire +bytes can legally represent terabytes of expanded samples. Memory stays bounded in a streaming +reader (one assignment is materialized at a time), but total output volume and CPU are +proportional to the expanded size — callers writing expanded output (e.g. JSONL) from untrusted +input should impose their own output or time budgets. + Frame-level subsampling does not require unpacking payload bits: a reader can skip a frame by reading its 6-byte header, seeking past `n_bytes` (and, for MkvChain, the 2-byte count), and only unpacking the payloads of frames it keeps. From a3b3400200d1d507af2ee654f7b51468378859a8 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:12:48 -0600 Subject: [PATCH 165/221] tuning for fuzz tests --- fuzz/fuzz_targets/ben_reader.rs | 23 +++++++++++------------ fuzz/fuzz_targets/xben_body.rs | 10 ++++++---- fuzz/fuzz_targets/xben_container.rs | 14 ++++++++++---- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fuzz/fuzz_targets/ben_reader.rs b/fuzz/fuzz_targets/ben_reader.rs index d511bb0..f9c504e 100644 --- a/fuzz/fuzz_targets/ben_reader.rs +++ b/fuzz/fuzz_targets/ben_reader.rs @@ -5,13 +5,16 @@ //! compound, multi-byte corruptions that enumeration cannot reach. The contract is the same: //! arbitrary bytes may error anywhere, but must never panic, hang, or exhaust memory. +//! Full-drain entry points (`decode_ben_to_jsonl`, `relabel_ben_file`) are deliberately absent: +//! their parse coverage is identical to the bounded iterators below, but a frame's expanded +//! output is `assignment length × count`, so the fuzzer inevitably discovers tiny inputs that +//! legally demand minutes of serialization work (the format's documented decompression-bomb +//! characteristic), drowning exploration in slow units. + #![no_main] -use binary_ensemble::codec::decode::decode_ben_to_jsonl; use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; use binary_ensemble::ops::extract::extract_assignment_ben; -use binary_ensemble::ops::relabel::{relabel_ben_file, RelabelOptions}; -use binary_ensemble::BenVariant; use libfuzzer_sys::fuzz_target; /// Bound on records pulled from iterator-style entry points: corrupt streams may yield errors @@ -19,8 +22,6 @@ use libfuzzer_sys::fuzz_target; const MAX_PULLS: usize = 64; fuzz_target!(|data: &[u8]| { - let _ = decode_ben_to_jsonl(data, std::io::sink()); - if let Ok(reader) = BenStreamReader::from_ben(data) { for record in reader.silent(true).take(MAX_PULLS) { let _ = record; @@ -35,16 +36,14 @@ fuzz_target!(|data: &[u8]| { } } if let Ok(reader) = BenStreamReader::from_ben(data) { - for record in reader.silent(true).into_subsample_by_range(1, 3).take(MAX_PULLS) { + for record in reader + .silent(true) + .into_subsample_by_range(1, 3) + .take(MAX_PULLS) + { let _ = record; } } - let _ = relabel_ben_file(data, std::io::sink(), RelabelOptions::first_seen()); - let _ = relabel_ben_file( - data, - std::io::sink(), - RelabelOptions::convert_to(BenVariant::TwoDelta), - ); let _ = extract_assignment_ben(data, 2); }); diff --git a/fuzz/fuzz_targets/xben_body.rs b/fuzz/fuzz_targets/xben_body.rs index cbf873d..c11bf0e 100644 --- a/fuzz/fuzz_targets/xben_body.rs +++ b/fuzz/fuzz_targets/xben_body.rs @@ -5,9 +5,14 @@ //! a fresh, valid xz container so corruption lands directly on the inner parsers — the same //! trick as the deterministic harness's recompressed-body sweeps, but coverage-guided. +//! Full-drain entry points (`decode_xben_to_jsonl`/`decode_xben_to_ben`) are deliberately +//! absent: a frame's expanded output is `assignment length × count`, so the fuzzer inevitably +//! discovers tiny bodies that legally demand minutes of serialization work (the format's +//! documented decompression-bomb characteristic). The bounded iterators below cover the same +//! parsers without the unbounded output cost. + #![no_main] -use binary_ensemble::codec::decode::{decode_xben_to_ben, decode_xben_to_jsonl}; use binary_ensemble::codec::encode::xz_compress; use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; use binary_ensemble::ops::extract::extract_assignment_xben; @@ -27,9 +32,6 @@ fuzz_target!(|data: &[u8]| { ) .expect("compressing an in-memory body cannot fail"); - let _ = decode_xben_to_jsonl(BufReader::new(container.as_slice()), std::io::sink()); - let _ = decode_xben_to_ben(BufReader::new(container.as_slice()), std::io::sink()); - if let Ok(reader) = BenStreamReader::from_xben(container.as_slice()) { for record in reader.silent(true).take(MAX_PULLS) { let _ = record; diff --git a/fuzz/fuzz_targets/xben_container.rs b/fuzz/fuzz_targets/xben_container.rs index 3489ef4..dcb3e8b 100644 --- a/fuzz/fuzz_targets/xben_container.rs +++ b/fuzz/fuzz_targets/xben_container.rs @@ -3,18 +3,19 @@ //! Complement of `xben_body`: here the fuzz input is the container itself, so the xz layer, the //! banner dispatch, and the error paths between them face the corruption. +//! Full-drain entry points are deliberately absent here too (see `xben_body.rs`): bounded +//! iteration covers the same dispatch and xz plumbing without the `length × count` output cost. + #![no_main] -use binary_ensemble::codec::decode::{decode_xben_to_ben, decode_xben_to_jsonl, xz_decompress}; -use binary_ensemble::io::reader::BenStreamReader; +use binary_ensemble::codec::decode::xz_decompress; +use binary_ensemble::io::reader::{BenStreamFrameReader, BenStreamReader}; use libfuzzer_sys::fuzz_target; use std::io::BufReader; const MAX_PULLS: usize = 64; fuzz_target!(|data: &[u8]| { - let _ = decode_xben_to_jsonl(BufReader::new(data), std::io::sink()); - let _ = decode_xben_to_ben(BufReader::new(data), std::io::sink()); let _ = xz_decompress(BufReader::new(data), std::io::sink()); if let Ok(reader) = BenStreamReader::from_xben(data) { @@ -22,4 +23,9 @@ fuzz_target!(|data: &[u8]| { let _ = record; } } + if let Ok(frames) = BenStreamFrameReader::from_xben(data) { + for frame in frames.take(MAX_PULLS) { + let _ = frame; + } + } }); From 1361ace067463592659740e1df46b6156ef30784 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:25:13 -0600 Subject: [PATCH 166/221] better api for asset payloads --- ben-py/binary_ensemble/bundle.py | 81 +++++++++++++++++++++++++---- ben-py/binary_ensemble/bundle.pyi | 4 +- ben-py/docs/concepts/limitations.md | 7 ++- ben-py/tests/test_bundle_api.py | 59 ++++++++++++++++++++- 4 files changed, 135 insertions(+), 16 deletions(-) diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 6dc2a3e..4b53b92 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -27,7 +27,7 @@ import json import os import tempfile -from typing import Any, Optional, Union +from typing import Any, Optional from binary_ensemble._core import BendlDecoder, BendlStreamSession from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder @@ -69,13 +69,48 @@ def _atomic_or_out(transform, path, out_file, in_place, suffix=".bendl"): transform(path, out_file, False) -def _coerce_bytes(payload: Union[bytes, bytearray, memoryview, str]) -> bytes: - """Coerce an ``add_asset`` payload to bytes (``str`` is UTF-8 encoded).""" +def _coerce_asset_payload(payload: Any, content_type: str) -> bytes: + """Coerce an ``add_asset`` payload to bytes. + + Accepted forms: + + - ``dict`` / ``list`` — serialized via ``json.dumps`` (requires + ``content_type="json"``). + - ``str`` — UTF-8 encoded **content** (not a path; pass a ``pathlib.Path`` + to read a file — this deliberately differs from :meth:`BendlEncoder.add_metadata`, + whose payloads are never plain text, so there a ``str`` is a path). + - ``bytes`` / ``bytearray`` / ``memoryview`` — used verbatim. + - any object with a ``.read()`` method (open files, ``io.BytesIO``) — read, + with ``str`` results UTF-8 encoded. + - ``os.PathLike`` (e.g. ``pathlib.Path``) — the file at that path is read. + """ + if isinstance(payload, (dict, list)): + if content_type != "json": + raise ValueError( + "dict/list payloads are serialized as JSON and require " + f"content_type='json', got {content_type!r}" + ) + return json.dumps(payload).encode("utf-8") if isinstance(payload, str): return payload.encode("utf-8") if isinstance(payload, (bytes, bytearray, memoryview)): return bytes(payload) - raise TypeError(f"asset payload must be bytes or str, got {type(payload).__name__}") + if hasattr(payload, "read"): + data = payload.read() + if isinstance(data, str): + return data.encode("utf-8") + if isinstance(data, (bytes, bytearray, memoryview)): + return bytes(data) + raise TypeError( + f"asset payload .read() must return bytes or str, got {type(data).__name__}" + ) + if isinstance(payload, os.PathLike): + with open(os.fspath(payload), "rb") as f: + return f.read() + raise TypeError( + "asset payload must be bytes, str, dict/list (JSON), a file-like with " + f".read(), or a path, got {type(payload).__name__}" + ) class BendlEncoder: @@ -156,19 +191,42 @@ def add_metadata(self, metadata: Any) -> None: def add_asset( self, name: str, - payload: Union[bytes, bytearray, memoryview, str], + payload: Any, content_type: str, ) -> None: """Embed a custom asset under ``name``. + ``payload`` may be bytes-like or a ``str`` (stored as UTF-8 content), a + ``dict``/``list`` (serialized as JSON; requires ``content_type="json"``), + an open file or other object with ``.read()``, or a ``pathlib.Path`` + whose file contents are read. A plain ``str`` is always *content*, never + a path — pass a ``Path`` to read from disk. + ``content_type`` is ``"json"`` (payload must be valid UTF-8 JSON; the decoder will auto-parse it), ``"text"`` (payload must be valid UTF-8), - or ``"binary"`` (arbitrary bytes, stored verbatim — e.g. a zipped - shapefile or a GeoPackage). Every asset carries a CRC32C integrity - checksum, and payloads of 1 KiB or more are xz-compressed on disk by - default (transparent on read). + ``"binary"`` (arbitrary bytes, stored verbatim — e.g. a zipped + shapefile or a GeoPackage), or ``"file"`` (the payload is a ``str`` or + ``pathlib.Path`` naming a file whose contents are read and stored as + binary). Every asset carries a CRC32C integrity checksum, and payloads + of 1 KiB or more are xz-compressed on disk by default (transparent on + read). + + ``"file"`` is the one content type under which a plain ``str`` payload + is a *path*; to store a typed file (e.g. JSON the decoder should + auto-parse), pass a ``pathlib.Path`` with ``content_type="json"`` + instead. """ - data = _coerce_bytes(payload) + if content_type == "file": + if not isinstance(payload, (str, os.PathLike)): + raise TypeError( + "content_type='file' requires a str or os.PathLike payload, " + f"got {type(payload).__name__}" + ) + with open(os.fspath(payload), "rb") as f: + data = f.read() + self._enc.add_asset(name, data, "binary") + return + data = _coerce_asset_payload(payload, content_type) if content_type == "json": try: json.loads(data.decode("utf-8")) @@ -185,7 +243,8 @@ def add_asset( ) from exc elif content_type != "binary": raise ValueError( - f"content_type must be 'json', 'text', or 'binary', got {content_type!r}" + f"content_type must be 'json', 'text', 'binary', or 'file', " + f"got {content_type!r}" ) self._enc.add_asset(name, data, content_type) diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi index 8201c66..a37e280 100644 --- a/ben-py/binary_ensemble/bundle.pyi +++ b/ben-py/binary_ensemble/bundle.pyi @@ -1,4 +1,4 @@ -from typing import Any, Optional, Union +from typing import Any, Optional from binary_ensemble._core import BendlDecoder as BendlDecoder from binary_ensemble._core import BendlStreamSession as BendlStreamSession @@ -22,7 +22,7 @@ class BendlEncoder: def add_asset( self, name: str, - payload: Union[bytes, bytearray, memoryview, str], + payload: Any, content_type: str, ) -> None: ... def stream( diff --git a/ben-py/docs/concepts/limitations.md b/ben-py/docs/concepts/limitations.md index 0deda06..1d9807e 100644 --- a/ben-py/docs/concepts/limitations.md +++ b/ben-py/docs/concepts/limitations.md @@ -80,13 +80,16 @@ custom binary assets. The payload is stored verbatim with a CRC32C integrity che (xz-compressed on disk when it is 1 KiB or larger, transparently decompressed on read): ```python +from pathlib import Path + from binary_ensemble import BendlDecoder, BendlEncoder -# Stand-in for real geometry bytes, e.g. open("tracts.gpkg", "rb").read(). +# Stand-in for a real geometry file, e.g. one produced by geopandas. gpkg_bytes = b"GPKG\x00\x01" + bytes(range(256)) +Path("tracts.gpkg").write_bytes(gpkg_bytes) encoder = BendlEncoder("with_geometry.bendl", overwrite=True) -encoder.add_asset("tracts.gpkg", gpkg_bytes, content_type="binary") +encoder.add_asset("tracts.gpkg", Path("tracts.gpkg"), content_type="file") encoder.close() decoder = BendlDecoder("with_geometry.bendl") diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index 3a32ca8..98899bf 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -155,7 +155,7 @@ def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: def test_add_asset_content_type_validation(tmp_path: Path) -> None: enc = BendlEncoder(tmp_path / "v.bendl", overwrite=True) - with pytest.raises(ValueError, match="must be 'json', 'text', or 'binary'"): + with pytest.raises(ValueError, match="must be 'json', 'text', 'binary', or 'file'"): enc.add_asset("x", b"data", content_type="parquet") with pytest.raises(ValueError, match="valid UTF-8 JSON"): enc.add_asset("bad.json", "not json", content_type="json") @@ -174,6 +174,63 @@ def test_add_asset_content_type_validation(tmp_path: Path) -> None: assert "json" not in flags["ok.bin"] +def test_add_asset_accepts_dict_and_list_payloads(tmp_path: Path) -> None: + enc = BendlEncoder(tmp_path / "d.bendl", overwrite=True) + enc.add_asset("scores.json", {"cut_edges": [10, 12]}, content_type="json") + enc.add_asset("steps.json", [1, 2, 3], content_type="json") + # dict/list payloads are JSON by definition; other content types are a caller mistake. + with pytest.raises(ValueError, match="require content_type='json'"): + enc.add_asset("bad.bin", {"a": 1}, content_type="binary") + enc.close() + + dec = BendlDecoder(tmp_path / "d.bendl") + assert dec.read_json_asset("scores.json") == {"cut_edges": [10, 12]} + assert dec.read_json_asset("steps.json") == [1, 2, 3] + + +def test_add_asset_accepts_paths_and_file_likes(tmp_path: Path) -> None: + import io + + blob = bytes(range(256)) + src = tmp_path / "geometry.gpkg" + src.write_bytes(blob) + + enc = BendlEncoder(tmp_path / "f.bendl", overwrite=True) + # pathlib.Path payload: the file at that path is read. (A plain str would be stored as + # UTF-8 *content*, never treated as a path.) + enc.add_asset("from_path.gpkg", src, content_type="binary") + # File-like payloads are read; binary and text handles both work. + enc.add_asset("from_filelike.gpkg", io.BytesIO(blob), content_type="binary") + enc.add_asset("from_text_handle.txt", io.StringIO("hello"), content_type="text") + enc.close() + + dec = BendlDecoder(tmp_path / "f.bendl") + assert dec.read_asset_bytes("from_path.gpkg") == blob + assert dec.read_asset_bytes("from_filelike.gpkg") == blob + assert dec.read_asset_bytes("from_text_handle.txt") == b"hello" + + +def test_add_asset_file_content_type_reads_paths(tmp_path: Path) -> None: + blob = bytes(range(256)) + src = tmp_path / "geometry.gpkg" + src.write_bytes(blob) + + enc = BendlEncoder(tmp_path / "p.bendl", overwrite=True) + # Under content_type="file", a plain str payload *is* a path — the explicit opt-in that + # resolves the str-is-content default of every other content type. + enc.add_asset("from_str_path.gpkg", str(src), content_type="file") + enc.add_asset("from_pathlib.gpkg", src, content_type="file") + with pytest.raises(TypeError, match="requires a str or os.PathLike"): + enc.add_asset("bad", b"raw bytes are not a path", content_type="file") + with pytest.raises(FileNotFoundError): + enc.add_asset("missing", tmp_path / "nope.gpkg", content_type="file") + enc.close() + + dec = BendlDecoder(tmp_path / "p.bendl") + assert dec.read_asset_bytes("from_str_path.gpkg") == blob + assert dec.read_asset_bytes("from_pathlib.gpkg") == blob + + def test_binary_asset_round_trips_arbitrary_bytes(tmp_path: Path) -> None: # A blob that is deliberately not valid UTF-8 and not JSON — the shape of a zipped # shapefile or GeoPackage — must round-trip byte-exactly under CRC protection. From bf4179c4453916a2e42cd833bee34c5b892ccd7b Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Thu, 11 Jun 2026 21:55:06 -0600 Subject: [PATCH 167/221] make sure python assets verify --- ben-py/binary_ensemble/_core.pyi | 4 ++ ben-py/src/decode/bundle_decoder.rs | 27 +++++++++++ ben-py/tests/test_bundle.py | 68 ++++++++++++++++++++++++++- ben-py/tests/test_python_pipelines.py | 22 +++++++++ ben-py/tests/test_relabel.py | 31 ++++++++++++ 5 files changed, 151 insertions(+), 1 deletion(-) diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index 7e57e30..212d3c7 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -72,6 +72,10 @@ class BendlDecoder: def is_complete(self) -> bool: ... def asset_names(self) -> list[str]: ... def list_assets(self) -> list[dict[str, Any]]: ... + # Verifies every asset checksum and the stream checksum against the raw on-disk bytes + # (no decoding). Iteration/subsampling do not check checksums; call this when integrity + # matters. Raises on any mismatch or on an unfinalized bundle. + def verify(self) -> None: ... def read_asset_bytes(self, name: str) -> bytes: ... def read_json_asset(self, name: str) -> Any: ... # Returns a NetworkX graph (``networkx.Graph``/``MultiGraph``) rebuilt from the diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index c41d22a..ac924d1 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -308,6 +308,33 @@ impl PyBendlDecoder { Ok(out) } + /// Verify the bundle's integrity checksums without decoding anything. + /// + /// Scans the raw on-disk bytes of every asset and of the assignment stream and compares + /// them against the CRC32C checksums recorded when the bundle was written. Iterating or + /// subsampling a decoder reads the stream *without* checking these checksums (partial + /// reads cannot prove a whole-stream checksum), so call this when integrity matters — + /// e.g. after downloading a bundle or before an important run. + /// + /// Raises: + /// Exception: If any asset checksum or the stream checksum does not match the on-disk + /// bytes, or if the bundle is unfinalized (an unfinalized bundle's stream checksum + /// is not authoritative). + /// + /// Example: + /// >>> dec = BendlDecoder("ensemble.bendl") + /// >>> dec.verify() # raises on any corruption + #[pyo3(text_signature = "(self)")] + fn verify(&mut self) -> PyResult<()> { + self.reader + .verify_all_asset_checksums() + .map_err(|e| PyException::new_err(format!("Bundle asset verification failed: {e}")))?; + self.reader + .verify_stream_checksum() + .map_err(|e| PyException::new_err(format!("Bundle stream verification failed: {e}")))?; + Ok(()) + } + /// Read the (decoded) bytes of a named asset as a Python ``bytes`` object. /// /// Args: diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index 42d1286..900c603 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -22,7 +22,7 @@ import pytest from binary_ensemble import BenDecoder, BenEncoder, encode_jsonl_to_xben -from binary_ensemble.bundle import BendlDecoder +from binary_ensemble.bundle import BendlDecoder, BendlEncoder # --------------------------------------------------------------------------- # Format constants (mirror ben/src/io/bundle/format.rs) @@ -1005,3 +1005,69 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: extracted = tmp_path / f"fuzz-{trial}.ben" dec.extract_stream(extracted) assert list(BenDecoder(extracted, mode="ben")) == samples + + +# --------------------------------------------------------------------------- +# verify(): explicit integrity checking +# --------------------------------------------------------------------------- + + +def _checksummed_bundle(path: Path) -> None: + """A small finalized bundle written by the real encoder (checksums populated).""" + with BendlEncoder(path, overwrite=True) as enc: + enc.add_asset("notes.txt", "integrity matters", content_type="text") + with enc.stream("ben", variant="standard") as s: + for a in ([1, 1, 2, 2], [2, 2, 1, 1]): + s.write(a) + + +def _flip_byte_at_marker(path: Path, marker: bytes) -> None: + """XOR one byte at the first occurrence of ``marker`` in the file.""" + data = bytearray(path.read_bytes()) + idx = data.index(marker) + data[idx] ^= 0xFF + path.write_bytes(bytes(data)) + + +def test_verify_passes_on_pristine_bundle(tmp_path: Path) -> None: + path = tmp_path / "ok.bendl" + _checksummed_bundle(path) + BendlDecoder(path).verify() # must not raise + + +def test_verify_catches_stream_corruption(tmp_path: Path) -> None: + # Iteration and subsampling read the stream without checksum verification (partial reads + # cannot prove a whole-stream CRC); verify() is the explicit integrity gate and must catch + # any byte flip in the stream region. + path = tmp_path / "stream-corrupt.bendl" + _checksummed_bundle(path) + _flip_byte_at_marker(path, b"STANDARD BEN FILE") + + dec = BendlDecoder(path) # directory is intact, so the bundle still opens + with pytest.raises(Exception, match="stream verification failed"): + dec.verify() + + +def test_verify_catches_asset_corruption(tmp_path: Path) -> None: + path = tmp_path / "asset-corrupt.bendl" + _checksummed_bundle(path) + _flip_byte_at_marker(path, b"integrity matters") + + dec = BendlDecoder(path) + with pytest.raises(Exception, match="asset verification failed"): + dec.verify() + + +def test_verify_rejects_unfinalized_bundle(tmp_path: Path) -> None: + # An unfinalized bundle's stream checksum is not authoritative, so verify() must refuse + # rather than report a meaningless pass/fail. + path = tmp_path / "unfinalized.bendl" + with pytest.raises(RuntimeError, match="boom"): + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream("ben") as s: + s.write([1, 2, 3]) + raise RuntimeError("boom") + + dec = BendlDecoder(path) + with pytest.raises(Exception, match="stream verification failed"): + dec.verify() diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index f81216f..7338459 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -458,3 +458,25 @@ def test_decode_helpers_error_paths(tmp_path: Path) -> None: out.write_text("exists\n", encoding="utf-8") with pytest.raises(OSError, match="already exists"): decode_ben_to_jsonl(ben, out, overwrite=False) + + +def test_decoder_surfaces_truncated_streams_as_clean_exceptions(tmp_path: Path) -> None: + # The Rust core guarantees corrupt input errors rather than panics; this pins the Python + # half of that contract — a truncated stream raises an ordinary exception from iteration. + samples = [[1, 1, 2, 2], [2, 2, 1, 1], [1, 2, 1, 2]] + + ben = tmp_path / "trunc.ben" + with BenEncoder(ben, overwrite=True, variant="standard") as enc: + for a in samples: + enc.write(a) + ben.write_bytes(ben.read_bytes()[:-3]) + with pytest.raises(Exception, match="."): + list(BenDecoder(ben, mode="ben")) + + xben = tmp_path / "trunc.xben" + src = tmp_path / "src.jsonl" + write_jsonl(samples, src) + encode_jsonl_to_xben(src, xben, overwrite=True, variant="standard") + xben.write_bytes(xben.read_bytes()[:-3]) + with pytest.raises(Exception, match="."): + list(BenDecoder(xben, mode="xben")) diff --git a/ben-py/tests/test_relabel.py b/ben-py/tests/test_relabel.py index 36e4b58..91cd6bc 100644 --- a/ben-py/tests/test_relabel.py +++ b/ben-py/tests/test_relabel.py @@ -136,3 +136,34 @@ def test_relabel_rejects_xben_bundle(tmp_path: Path) -> None: compress_stream(src, out_file=xben) with pytest.raises(ValueError, match="only supports BEN"): relabel_bundle(xben, out_file=tmp_path / "o.bendl") + + +def test_relabel_rejects_unfinalized_bundle(tmp_path: Path) -> None: + src = tmp_path / "unfinalized.bendl" + with pytest.raises(RuntimeError, match="boom"): + with BendlEncoder(src, overwrite=True) as enc: + enc.add_graph(_graph(), sort=None) + with enc.stream("ben") as s: + s.write([1] * _n()) + raise RuntimeError("boom") + + with pytest.raises(Exception, match="finalized"): + relabel_bundle(src, out_file=tmp_path / "out.bendl") + + +def test_relabel_rejects_empty_stream_bundle(tmp_path: Path) -> None: + src = tmp_path / "assets-only.bendl" + with BendlEncoder(src, overwrite=True) as enc: + enc.add_graph(_graph(), sort=None) + + with pytest.raises(Exception, match="non-empty assignment stream"): + relabel_bundle(src, out_file=tmp_path / "out.bendl") + + +def test_relabel_out_file_refuses_existing(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + _build_ben_bundle(src) + out = tmp_path / "exists.bendl" + out.write_bytes(b"existing") + with pytest.raises(OSError, match="already exists"): + relabel_bundle(src, out_file=out) From 14304b808f0acb2e32a99de28f155f7916d6ca81 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:04:08 -0600 Subject: [PATCH 168/221] enabling remove asset --- Taskfile.yml | 28 ++ ben-py/README.md | 69 +++- ben-py/binary_ensemble/__init__.py | 14 +- ben-py/binary_ensemble/_core.pyi | 191 ++++++----- ben-py/binary_ensemble/bundle.py | 439 ++++++++++++++++-------- ben-py/binary_ensemble/bundle.pyi | 76 ++-- ben-py/binary_ensemble/codec.py | 7 +- ben-py/binary_ensemble/graph.py | 117 +++++-- ben-py/binary_ensemble/graph.pyi | 22 +- ben-py/binary_ensemble/stream.py | 9 +- ben-py/binary_ensemble/types.py | 111 ++++++ ben-py/pyproject.toml | 13 + ben-py/src/common.rs | 88 ++++- ben-py/src/compact.rs | 100 ++++++ ben-py/src/decode/bundle_decoder.rs | 85 ++++- ben-py/src/decode/cursor.rs | 2 +- ben-py/src/decode/decoder.rs | 30 +- ben-py/src/decode/mod.rs | 2 +- ben-py/src/decode/py_funcs.rs | 18 +- ben-py/src/decode/types.rs | 2 +- ben-py/src/encode/bundle_encoder.rs | 121 ++++--- ben-py/src/encode/encoder.rs | 32 +- ben-py/src/encode/mod.rs | 2 +- ben-py/src/encode/py_funcs.rs | 49 +-- ben-py/src/graph/py_funcs.rs | 26 +- ben-py/src/lib.rs | 6 + ben-py/src/recompress.rs | 16 +- ben-py/src/relabel.rs | 22 +- ben-py/tests/test_bundle.py | 74 +--- ben-py/tests/test_bundle_api.py | 237 +++++++++++-- ben-py/tests/test_compact.py | 250 ++++++++++++++ ben-py/tests/test_docs_snippets.py | 4 +- ben-py/tests/test_graph.py | 9 + ben-py/tests/test_python_pipelines.py | 20 +- ben-py/tests/test_recompress.py | 19 +- ben-py/tests/test_relabel.py | 23 +- ben-py/tests/test_surface.py | 15 +- ben-py/tests/typing_assertions.py | 114 ++++++ ben-py/uv.lock | 51 +++ ben/src/cli/ben/args.rs | 2 +- ben/src/cli/ben/bundle.rs | 4 +- ben/src/cli/ben/modes/encode.rs | 2 +- ben/src/cli/ben/modes/xencode.rs | 2 +- ben/src/cli/ben/paths.rs | 2 +- ben/src/cli/bendl/args.rs | 25 +- ben/src/cli/bendl/mod.rs | 13 +- ben/src/cli/bendl/remove.rs | 55 +++ ben/src/io/bundle/compact.rs | 320 +++++++++++++++++ ben/src/io/bundle/error.rs | 2 +- ben/src/io/bundle/mod.rs | 2 + ben/src/io/bundle/writer.rs | 42 ++- ben/tests/test_bendl_append_proptest.rs | 278 ++++++++++++++- ben/tests/test_cli.rs | 121 +++++++ fuzz/fuzz_targets/bendl_reader.rs | 2 +- 54 files changed, 2790 insertions(+), 595 deletions(-) create mode 100644 ben-py/binary_ensemble/types.py create mode 100644 ben-py/src/compact.rs create mode 100644 ben-py/tests/test_compact.py create mode 100644 ben-py/tests/typing_assertions.py create mode 100644 ben/src/cli/bendl/remove.rs create mode 100644 ben/src/io/bundle/compact.rs diff --git a/Taskfile.yml b/Taskfile.yml index 03bb6d8..82e5079 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -311,12 +311,30 @@ tasks: cmds: - uv run ruff check . + typecheck-python: + desc: >- + Two-stage type check of the ben-py package, stubs, and type aliases: ty for the + fast first pass, then pyright for the thorough one. tests/typing_assertions.py + pins the public signatures from the consumer side — assert_type for positives, + type-ignore comments for calls that must NOT type-check (kept honest by + pyright's reportUnnecessaryTypeIgnoreComment). + silent: true + deps: + - ben-py-sync + env: + PATH: '{{.CARGO_BIN}}:{{.LOCAL_BIN}}:{{env "PATH"}}' + dir: ben-py + cmds: + - uv run ty check binary_ensemble tests/typing_assertions.py + - uv run pyright binary_ensemble tests/typing_assertions.py + lint: desc: Run linters silent: true cmds: - task: lint-rust - task: lint-python + - task: typecheck-python coverage-ben: desc: Run Rust coverage for the ben crate @@ -462,6 +480,16 @@ tasks: - uv run --extra docs sphinx-build -E -a -W -b dirhtml docs docs/_build - 'echo "Docs built -> ben-py/docs/_build/index.html"' + docs-refresh-notebooks: + desc: >- + Re-execute the tutorial notebooks in place, refreshing the committed outputs the + docs site renders. Run after editing any notebook code cell. + dir: ben-py + cmds: + - >- + uv run --extra docs --extra docs-exec python docs/_refresh_notebooks.py + docs/user/using_bendl.ipynb docs/user/using_ben_py.ipynb + docs-exec: desc: Build the docs and execute every tutorial notebook (CI-equivalent) dir: ben-py diff --git a/ben-py/README.md b/ben-py/README.md index 6848796..03f4195 100755 --- a/ben-py/README.md +++ b/ben-py/README.md @@ -3,7 +3,7 @@ [![PyPI](https://img.shields.io/pypi/v/binary-ensemble.svg)](https://pypi.org/project/binary-ensemble/) [![Python versions](https://img.shields.io/pypi/pyversions/binary-ensemble.svg)](https://pypi.org/project/binary-ensemble/) [![Documentation](https://img.shields.io/readthedocs/binary-ensemble.svg)](https://binary-ensemble.readthedocs.io/) -[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE) +[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE.md) **Compress, store, and stream massive ensembles of districting plans.** @@ -27,11 +27,12 @@ the whole thing. pip install binary-ensemble ``` -Requires Python 3.11+. Pre-built wheels are available for Linux, macOS, and Windows. +Requires Python 3.11+. Pre-built wheels are available for Linux, macOS, and Windows. The +only runtime dependency is NetworkX, and the API is fully type-annotated (`py.typed`). ## Quick example -Write an ensemble into one self-describing `.bendl` bundle, then read it back: +Write an ensemble into one self-describing `.bendl` file, then read it back: ```python from binary_ensemble import BendlEncoder, BendlDecoder @@ -41,7 +42,7 @@ plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] # The stream context finalizes the bundle when it closes. encoder = BendlEncoder("ensemble.bendl", overwrite=True) encoder.add_metadata({"sampler": "demo", "seed": 1234}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for assignment in plans: stream.write(assignment) @@ -50,23 +51,71 @@ for assignment in BendlDecoder("ensemble.bendl"): print(assignment) ``` -Already have JSONL files? Convert whole files in one call: +## The graph travels with the data + +An assignment is just integers — it only means something in a dual graph's node order. A +`.bendl` file embeds the graph, so a collaborator can open one file and reconstruct plans +with no risk of pairing the wrong graph: ```python -from binary_ensemble import encode_jsonl_to_ben, encode_ben_to_xben +import networkx as nx +from binary_ensemble import BendlEncoder, BendlDecoder + +dual_graph = nx.convert_node_labels_to_integers(nx.grid_2d_graph(4, 4)) + +encoder = BendlEncoder("run.bendl", overwrite=True) +ordered = encoder.add_graph(nx.adjacency_data(dual_graph)) # reordered for compression +with encoder.stream() as stream: + for step in range(1000): + stream.write([(node + step) % 4 + 1 for node in range(16)]) -encode_jsonl_to_ben("plans.jsonl", "plans.ben") # fast working format -encode_ben_to_xben("plans.ben", "plans.xben") # smallest, for storage +decoder = BendlDecoder("run.bendl") +graph = decoder.read_graph() # back as a live networkx.Graph, in assignment order +print(len(decoder)) # 1000 — read from the header, no scan + +for assignment in decoder.subsample_every(100): + ... # every 100th plan, without decoding the rest ``` +## More than the basics + +- **Whole-file converters** for existing JSONL ensembles: + + ```python + from binary_ensemble import encode_jsonl_to_ben, encode_ben_to_xben + + encode_jsonl_to_ben("plans.jsonl", "plans.ben") # fast working format + encode_ben_to_xben("plans.ben", "plans.xben") # smallest, for storage + ``` + +- **Shrink for sharing** — reorder a finished file and recompress its stream to XBEN, + keeping every asset: + + ```python + from binary_ensemble import relabel_bundle, compress_stream + + relabel_bundle("run.bendl", out_file="run-sorted.bendl", sort="mlc") + compress_stream("run-sorted.bendl", out_file="run-archive.bendl") + ``` + +- **Subsampling** by stride, range, or explicit indices on both bundles and plain + `.ben`/`.xben` streams — skipped samples are never materialized. +- **Custom assets**: attach scores, notes, run manifests, or arbitrary binary blobs (a + zipped shapefile, a GeoPackage) alongside the stream. Every asset is checksummed + (CRC32C), large payloads are xz-compressed transparently, and + `BendlDecoder.verify()` validates a whole file in one call. +- **Sampler-agnostic**: encoders take plain `list[int]` assignments, so the same API works + for GerryChain, ForestReCom, SMC, or your own code. + ## Documentation Full docs are at **[binary-ensemble.readthedocs.io](https://binary-ensemble.readthedocs.io/)**: - [Quickstart](https://binary-ensemble.readthedocs.io/getting-started/quickstart/) — your first ensemble in a few lines. - [Concepts](https://binary-ensemble.readthedocs.io/concepts/overview/) — dual graphs, the BEN/XBEN/BENDL formats, encoding variants, and the compression levers. -- [How-to guides](https://binary-ensemble.readthedocs.io/how-to/) — compress a GerryChain run, subsample, convert formats, shrink a bundle for sharing. +- [How-to guides](https://binary-ensemble.readthedocs.io/how-to/) — compress a GerryChain run, analyze with NumPy/pandas, subsample, convert formats, shrink a file for sharing, recover a crashed run. - [API reference](https://binary-ensemble.readthedocs.io/api/) — every public class and function. +- [Tutorial notebooks](https://binary-ensemble.readthedocs.io/user/using_bendl/) — executed end to end in CI against the live API, as is every code snippet in the docs. ## Command-line tools @@ -78,4 +127,4 @@ cargo install binary-ensemble ## License -MIT — see [LICENSE](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE). +MIT — see [LICENSE](https://github.com/peterrrock2/binary-ensemble/blob/main/LICENSE.md). diff --git a/ben-py/binary_ensemble/__init__.py b/ben-py/binary_ensemble/__init__.py index 221a2b5..0a7b83c 100644 --- a/ben-py/binary_ensemble/__init__.py +++ b/ben-py/binary_ensemble/__init__.py @@ -3,21 +3,20 @@ The public surface mirrors the CLI's ``ben`` vs ``bendl`` split: - :mod:`binary_ensemble.bundle` — the recommended single-file ``.bendl`` format: - :class:`~binary_ensemble.bundle.BendlEncoder`, - :class:`~binary_ensemble.bundle.BendlDecoder`, and - :func:`~binary_ensemble.bundle.compress_stream`. + :class:`~binary_ensemble.bundle.BendlEncoder`, :class:`~binary_ensemble.bundle.BendlDecoder`, + and :func:`~binary_ensemble.bundle.compress_stream`. - :mod:`binary_ensemble.stream` — plain BEN/XBEN streams: - :class:`~binary_ensemble.stream.BenEncoder`, - :class:`~binary_ensemble.stream.BenDecoder`. + :class:`~binary_ensemble.stream.BenEncoder`, :class:`~binary_ensemble.stream.BenDecoder`. - :mod:`binary_ensemble.codec` — whole-file JSONL ↔ BEN ↔ XBEN transforms. - :mod:`binary_ensemble.graph` — graph reordering utilities. +- :mod:`binary_ensemble.types` — shared type aliases for annotating user code. All public symbols are re-exported here for convenience. """ from importlib import metadata as _metadata -from binary_ensemble import bundle, codec, graph, stream +from binary_ensemble import bundle, codec, graph, stream, types from binary_ensemble.bundle import ( BendlDecoder, BendlEncoder, @@ -36,7 +35,7 @@ try: __version__ = _metadata.version("binary-ensemble") -except _metadata.PackageNotFoundError: # source tree imported without an installed dist +except _metadata.PackageNotFoundError: # source tree imported without an installed distribution __version__ = "0.0.0+unknown" __all__ = [ @@ -45,6 +44,7 @@ "bundle", "codec", "graph", + "types", # Bundle (recommended) "BendlEncoder", "BendlDecoder", diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index 212d3c7..c22be45 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -1,12 +1,26 @@ """Type stubs for the compiled ``binary_ensemble._core`` extension. -These describe the raw PyO3 surface. End users should import the ergonomic -facades from :mod:`binary_ensemble.stream`, :mod:`binary_ensemble.bundle`, -:mod:`binary_ensemble.codec`, and :mod:`binary_ensemble.graph` instead. +These describe the raw PyO3 surface. End users should import the ergonomic facades from +:mod:`binary_ensemble.stream`, :mod:`binary_ensemble.bundle`, :mod:`binary_ensemble.codec`, and +:mod:`binary_ensemble.graph` instead. """ -from pathlib import Path -from typing import Any, Iterable, Iterator, Literal +from collections.abc import Iterator, Sequence +from types import TracebackType +from typing import Any, Literal + +import networkx as nx + +from binary_ensemble.types import ( + AssetEntry, + AssignmentFormat, + GraphInput, + MetadataInput, + NodePermutationMap, + SortMethod, + StrPath, + Variant, +) # --------------------------------------------------------------------------- # Stream decoder / encoder (plain .ben / .xben) @@ -15,35 +29,38 @@ from typing import Any, Iterable, Iterator, Literal class BenDecoder: """Iterator over assignments in a plain BEN or XBEN stream. - Stream-only: opening this on a ``.bendl`` bundle raises and points at - :class:`BendlDecoder`. Sample counting is lazy and cached. + Stream-only: opening this on a ``.bendl`` bundle raises and points at :class:`BendlDecoder`. + Sample counting is lazy and cached. """ - def __init__( - self, file_path: str | Path, mode: Literal["ben", "xben"] = "ben" - ) -> None: ... + def __init__(self, file_path: StrPath, mode: AssignmentFormat = "ben") -> None: ... def __iter__(self) -> Iterator[list[int]]: ... def __next__(self) -> list[int]: ... def __len__(self) -> int: ... def count_samples(self) -> int: ... - def subsample_indices(self, indices: Iterable[int]) -> "BenDecoder": ... + def subsample_indices(self, indices: Sequence[int]) -> "BenDecoder": ... def subsample_range(self, start: int, end: int) -> "BenDecoder": ... def subsample_every(self, step: int, offset: int = 1) -> "BenDecoder": ... - def assignment_format(self) -> Literal["ben", "xben"]: ... + def assignment_format(self) -> AssignmentFormat: ... class BenEncoder: """Encoder for plain Binary Ensemble (`.ben`) streams.""" def __init__( self, - file_path: str | Path, + file_path: StrPath, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, + variant: Variant = "twodelta", ) -> None: ... - def write(self, assignment: list[int]) -> None: ... + def write(self, assignment: Sequence[int]) -> None: ... def close(self) -> None: ... def __enter__(self) -> "BenEncoder": ... - def __exit__(self, exc_type, exc, tb) -> bool: ... + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> bool: ... # --------------------------------------------------------------------------- # Bundle decoder / encoder (.bendl) @@ -52,41 +69,46 @@ class BenEncoder: class BendlDecoder: """Reader and iterator for a ``.bendl`` bundle. - Bundle-only: opening this on a plain ``.ben``/``.xben`` stream raises and - points at :class:`BenDecoder`. Iteration walks the embedded assignment - stream; the bundle directory and asset payloads are exposed through the - inspection methods. A finalized assets-only bundle (empty stream) iterates to - nothing with ``len == 0``. + Bundle-only: opening this on a plain ``.ben``/``.xben`` stream raises and points at + :class:`BenDecoder`. Iteration walks the embedded assignment stream; the bundle directory and + asset payloads are exposed through the inspection methods. A finalized assets-only bundle + (empty stream) iterates to nothing with ``len == 0``. """ - def __init__(self, file_path: str | Path) -> None: ... + def __init__(self, file_path: StrPath) -> None: ... def __iter__(self) -> Iterator[list[int]]: ... def __next__(self) -> list[int]: ... def __len__(self) -> int: ... def count_samples(self) -> int: ... - def subsample_indices(self, indices: Iterable[int]) -> "BendlDecoder": ... + def subsample_indices(self, indices: Sequence[int]) -> "BendlDecoder": ... def subsample_range(self, start: int, end: int) -> "BendlDecoder": ... def subsample_every(self, step: int, offset: int = 1) -> "BendlDecoder": ... - def assignment_format(self) -> Literal["ben", "xben"]: ... + def assignment_format(self) -> AssignmentFormat: ... def version(self) -> tuple[int, int]: ... + # On-disk byte length of the embedded stream region, straight from the header (no decoding; + # the same bytes extract_stream copies out). 0 for an assets-only bundle. + def stream_size(self) -> int: ... + # On-disk byte length of a named asset's stored payload, straight from the directory. For + # xz-flagged assets this is the compressed size; len(read_asset_bytes(name)) is the decoded + # size. Raises KeyError for an unknown name. + def asset_size(self, name: str) -> int: ... def is_complete(self) -> bool: ... def asset_names(self) -> list[str]: ... - def list_assets(self) -> list[dict[str, Any]]: ... - # Verifies every asset checksum and the stream checksum against the raw on-disk bytes - # (no decoding). Iteration/subsampling do not check checksums; call this when integrity - # matters. Raises on any mismatch or on an unfinalized bundle. + def list_assets(self) -> list[AssetEntry]: ... + # Verifies every asset checksum and the stream checksum against the raw on-disk bytes (no + # decoding). Iteration/subsampling do not check checksums; call this when integrity matters. + # Raises on any mismatch or on an unfinalized bundle. def verify(self) -> None: ... def read_asset_bytes(self, name: str) -> bytes: ... def read_json_asset(self, name: str) -> Any: ... - # Returns a NetworkX graph (``networkx.Graph``/``MultiGraph``) rebuilt from the - # stored adjacency JSON, or ``None`` if absent. Use ``read_json_asset("graph.json")`` - # for the raw parsed dict. - def read_graph(self) -> Any | None: ... + # Returns a NetworkX graph rebuilt from the stored adjacency JSON, or ``None`` if absent. + # Use ``read_json_asset("graph.json")`` for the raw parsed dict. + def read_graph(self) -> nx.Graph | None: ... def read_metadata(self) -> Any | None: ... - def read_node_permutation_map(self) -> Any | None: ... + def read_node_permutation_map(self) -> NodePermutationMap | None: ... def extract_stream( self, - out_path: str | Path, + out_path: StrPath, overwrite: bool = False, allow_unfinalized: bool = False, ) -> None: ... @@ -94,91 +116,106 @@ class BendlDecoder: class BendlStreamSession: """Single-use context manager over a bundle's assignment stream. - Obtained from :meth:`BendlEncoder.stream`; finalizes the bundle on a clean - close and leaves it unfinalized if the context exits via an exception. + Obtained from :meth:`BendlEncoder.stream`; finalizes the bundle on a clean close and leaves + it unfinalized if the context exits via an exception. """ - def write(self, assignment: list[int]) -> None: ... + def write(self, assignment: Sequence[int]) -> None: ... def close(self) -> None: ... def __enter__(self) -> "BendlStreamSession": ... - def __exit__(self, exc_type, exc, tb) -> bool: ... + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> bool: ... class BendlEncoder: """Writer for a ``.bendl`` bundle (create mode) or appender (append mode).""" - def __init__(self, file_path: str | Path, overwrite: bool = False) -> None: ... + def __init__(self, file_path: StrPath, overwrite: bool = False) -> None: ... @staticmethod - def append(file_path: str | Path) -> "BendlEncoder": ... + def append(file_path: StrPath) -> "BendlEncoder": ... + # The raw core surface takes the payload as already-coerced bytes; the bundle facade accepts + # richer payload shapes (and content_type="file"). def add_asset( - self, name: str, payload: bytes, content_type: Literal["json", "text"] + self, name: str, payload: bytes, content_type: Literal["json", "text", "binary"] ) -> None: ... - def add_metadata(self, metadata: Any) -> None: ... + # Drops the directory entry only (payload bytes become dead space until the next + # whole-bundle rewrite compacts them); frees the name for re-add. KeyError if absent. + def remove_asset(self, name: str) -> None: ... + def add_metadata(self, metadata: MetadataInput) -> None: ... # Returns the (possibly reordered) graph as a NetworkX graph, matching - # BendlDecoder.read_graph. sort defaults to "mlc"; sort="key" sorts by `key`; - # sort=None stores raw. + # BendlDecoder.read_graph. sort defaults to "mlc"; sort="key" sorts by `key`; sort=None + # stores raw. def add_graph( - self, graph: Any, sort: str | None = "mlc", key: str | None = None - ) -> Any: ... - def stream( - self, - format: Literal["ben"] = "ben", - variant: Literal["standard", "mkv_chain", "twodelta"] | None = None, - ) -> BendlStreamSession: ... + self, graph: GraphInput, sort: SortMethod | None = "mlc", key: str | None = None + ) -> nx.Graph: ... + # The embedded stream is always BEN at write time; XBEN bundles are produced by recompressing + # a finished bundle (see binary_ensemble.bundle.compress_stream). + def stream(self, *, variant: Variant = "twodelta") -> BendlStreamSession: ... def close(self) -> None: ... def __enter__(self) -> "BendlEncoder": ... - def __exit__(self, exc_type, exc, tb) -> bool: ... + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> bool: ... # --------------------------------------------------------------------------- # Whole-file stream / JSONL transforms # --------------------------------------------------------------------------- def encode_jsonl_to_ben( - in_file: str | Path, - out_file: str | Path, + in_file: StrPath, + out_file: StrPath, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] = "twodelta", + variant: Variant = "twodelta", ) -> None: ... def encode_jsonl_to_xben( - in_file: str | Path, - out_file: str | Path, + in_file: StrPath, + out_file: StrPath, overwrite: bool = False, - variant: Literal["standard", "mkv_chain", "twodelta"] = "twodelta", + variant: Variant = "twodelta", n_threads: int | None = None, compression_level: int | None = None, xz_block_size: int | None = None, ) -> None: ... def encode_ben_to_xben( - in_file: str | Path, - out_file: str | Path, + in_file: StrPath, + out_file: StrPath, overwrite: bool = False, n_threads: int | None = None, compression_level: int | None = None, xz_block_size: int | None = None, ) -> None: ... -def decode_ben_to_jsonl( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: ... -def decode_xben_to_jsonl( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: ... -def decode_xben_to_ben( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: ... +def decode_ben_to_jsonl(in_file: StrPath, out_file: StrPath, overwrite: bool = False) -> None: ... +def decode_xben_to_jsonl(in_file: StrPath, out_file: StrPath, overwrite: bool = False) -> None: ... +def decode_xben_to_ben(in_file: StrPath, out_file: StrPath, overwrite: bool = False) -> None: ... # --------------------------------------------------------------------------- # Graph reordering and bundle recompression # --------------------------------------------------------------------------- def graph_reorder( - graph: Any, sort: str | None = "mlc", key: str | None = None -) -> tuple[Any, Any]: ... -def recompress_bundle( - in_file: str | Path, out_file: str | Path, overwrite: bool = False -) -> None: ... + graph: GraphInput, sort: SortMethod = "mlc", key: str | None = None +) -> tuple[nx.Graph, NodePermutationMap]: ... + +# Rewrites the bundle without unreferenced byte ranges (dead space from remove_asset and +# superseded directories). Assets carried by decoded payload; stream bytes copied verbatim +# (checksum-verified); wire format preserved. +def compact_bundle(in_file: StrPath, out_file: StrPath, overwrite: bool = False) -> None: ... + +# In-place compaction choosing the cheapest strategy. Returns "none" (already compact), +# "tail" (post-stream tail rebuilt; stream untouched and not verified), or "full" +# (whole-bundle verified rewrite via temp file + atomic swap). +def compact_bundle_in_place(path: StrPath) -> Literal["none", "tail", "full"]: ... +def recompress_bundle(in_file: StrPath, out_file: StrPath, overwrite: bool = False) -> None: ... def relabel_bundle( - in_file: str | Path, - out_file: str | Path, - sort: str | None = "mlc", + in_file: StrPath, + out_file: StrPath, + sort: SortMethod = "mlc", key: str | None = None, overwrite: bool = False, ) -> None: ... diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 4b53b92..1293f45 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -1,16 +1,15 @@ """The ``.bendl`` bundle format — the recommended single-file container. -A bundle wraps a BEN/XBEN assignment stream together with front-loaded assets: a -dual ``graph.json``, a ``node_permutation_map.json``, a ``metadata.json``, and -arbitrary custom blobs. :class:`BendlEncoder` writes one; :class:`BendlDecoder` -reads and iterates one. +A bundle wraps a BEN/XBEN assignment stream together with front-loaded assets: a dual +``graph.json``, a ``node_permutation_map.json``, a ``metadata.json``, and arbitrary custom blobs. +:class:`BendlEncoder` writes one; :class:`BendlDecoder` reads and iterates one. Typical write:: with BendlEncoder(path, overwrite=True) as enc: enc.add_graph(graph, sort="rcm") # sort=None => store raw enc.add_metadata({"seed": 1234}) - with enc.stream("ben") as stream: + with enc.stream() as stream: for assignment in chain: stream.write(assignment) @@ -27,12 +26,29 @@ import json import os import tempfile -from typing import Any, Optional +from collections.abc import Callable +from typing import TYPE_CHECKING, Literal, cast, overload from binary_ensemble._core import BendlDecoder, BendlStreamSession from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder +from binary_ensemble._core import compact_bundle_in_place as _compact_bundle_in_place from binary_ensemble._core import recompress_bundle as _recompress_bundle from binary_ensemble._core import relabel_bundle as _relabel_bundle +from binary_ensemble.types import ( + BinaryAssetPayload, + GraphInput, + JsonAssetPayload, + MetadataInput, + SortMethod, + StrPath, + TextAssetPayload, + Variant, +) + +if TYPE_CHECKING: + from types import TracebackType + + import networkx as nx __all__ = [ "BendlEncoder", @@ -43,45 +59,47 @@ ] -def _atomic_or_out(transform, path, out_file, in_place, suffix=".bendl"): - """Shared in_place-swap / out_file dispatch for whole-bundle transforms. +def _atomic_or_out( + transform: Callable[[StrPath, StrPath, bool], None], + path: StrPath, + out_file: StrPath | None, + overwrite: bool, + suffix: str = ".bendl", +) -> None: + """Shared in-place-swap / out_file dispatch for whole-bundle transforms. - ``transform(src, dst, overwrite)`` writes the result. Exactly one of - ``in_place`` / ``out_file`` must be given. + ``transform(src, dst, overwrite)`` writes the result. ``out_file=None`` means in place: the + result is written to a temp file and atomically swapped over ``path``. ``overwrite`` governs + an existing ``out_file`` (the in-place swap always replaces ``path``). """ - if in_place and out_file is not None: - raise ValueError("pass either in_place=True or out_file, not both") - if not in_place and out_file is None: - raise ValueError("pass either in_place=True or out_file") - - if in_place: - directory = os.path.dirname(os.path.abspath(os.fspath(path))) - fd, tmp = tempfile.mkstemp(suffix=suffix, dir=directory) - os.close(fd) - try: - transform(path, tmp, True) - os.replace(tmp, path) - except BaseException: - if os.path.exists(tmp): - os.remove(tmp) - raise - else: - transform(path, out_file, False) - - -def _coerce_asset_payload(payload: Any, content_type: str) -> bytes: + if out_file is not None: + transform(path, out_file, overwrite) + return + + directory = os.path.dirname(os.path.abspath(os.fspath(path))) + fd, tmp = tempfile.mkstemp(suffix=suffix, dir=directory) + os.close(fd) + try: + transform(path, tmp, True) + os.replace(tmp, path) + except BaseException: + if os.path.exists(tmp): + os.remove(tmp) + raise + + +def _coerce_asset_payload(payload: object, content_type: str) -> bytes: """Coerce an ``add_asset`` payload to bytes. Accepted forms: - - ``dict`` / ``list`` — serialized via ``json.dumps`` (requires - ``content_type="json"``). - - ``str`` — UTF-8 encoded **content** (not a path; pass a ``pathlib.Path`` - to read a file — this deliberately differs from :meth:`BendlEncoder.add_metadata`, - whose payloads are never plain text, so there a ``str`` is a path). + - ``dict`` / ``list`` — serialized via ``json.dumps`` (requires ``content_type="json"``). + - ``str`` — UTF-8 encoded **content** (not a path; pass a ``pathlib.Path`` to read a file — + this deliberately differs from :meth:`BendlEncoder.add_metadata`, whose payloads are never + plain text, so there a ``str`` is a path). - ``bytes`` / ``bytearray`` / ``memoryview`` — used verbatim. - - any object with a ``.read()`` method (open files, ``io.BytesIO``) — read, - with ``str`` results UTF-8 encoded. + - any object with a ``.read()`` method (open files, ``io.BytesIO``) — read, with ``str`` + results UTF-8 encoded. - ``os.PathLike`` (e.g. ``pathlib.Path``) — the file at that path is read. """ if isinstance(payload, (dict, list)): @@ -95,8 +113,9 @@ def _coerce_asset_payload(payload: Any, content_type: str) -> bytes: return payload.encode("utf-8") if isinstance(payload, (bytes, bytearray, memoryview)): return bytes(payload) - if hasattr(payload, "read"): - data = payload.read() + reader = getattr(payload, "read", None) + if callable(reader): + data = reader() if isinstance(data, str): return data.encode("utf-8") if isinstance(data, (bytes, bytearray, memoryview)): @@ -116,105 +135,165 @@ def _coerce_asset_payload(payload: Any, content_type: str) -> bytes: class BendlEncoder: """Writer for a ``.bendl`` bundle (create mode) or an asset appender (append mode). - In create mode (the constructor), assets may be added before or after a - single-use ``stream()``. You do **not** need to use ``BendlEncoder`` itself as - a context manager: closing the ``stream()`` context finalizes the bundle, so - the common pattern is:: + In create mode (the constructor), assets may be added before or after a single-use + ``stream()``. You do **not** need to use ``BendlEncoder`` itself as a context manager: closing + the ``stream()`` context finalizes the bundle, so the common pattern is:: enc = BendlEncoder(path, overwrite=True) graph = enc.add_graph(my_graph) # MLC-reordered by default - with enc.stream("ben") as stream: # only the stream needs ``with`` + with enc.stream() as stream: # only the stream needs ``with`` for assignment in chain: stream.write(assignment) # bundle is finalized here - The encoder is still usable as a context manager if you prefer, and that is - the easy way to finalize an *assets-only* bundle (one written with no - ``stream()``): either ``with BendlEncoder(...) as enc: ...`` or an explicit - :meth:`close`. In append mode (:meth:`append`), an existing finalized bundle - is grown with new assets and ``stream()`` is unavailable. + The encoder is still usable as a context manager if you prefer, and that is the easy way to + finalize an *assets-only* bundle (one written with no ``stream()``): either + ``with BendlEncoder(...) as enc: ...`` or an explicit :meth:`close`. In append mode + (:meth:`append`), an existing finalized bundle is grown with new assets and ``stream()`` is + unavailable. Args: - file_path: Output path for the new bundle. Must not exist unless - ``overwrite=True``. - overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. + file_path (StrPath): Output path for the new bundle (``str`` or ``os.PathLike``, e.g. + ``pathlib.Path``). Must not exist unless ``overwrite=True``. + overwrite (bool, optional): Replace an existing file at ``file_path``. Default is + ``False``. Raises: - OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it - cannot be created. + OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be created. """ - def __init__(self, file_path, overwrite: bool = False) -> None: + def __init__(self, file_path: StrPath, overwrite: bool = False) -> None: + self._path = file_path self._enc = _CoreBendlEncoder(file_path, overwrite=overwrite) @classmethod - def append(cls, file_path) -> "BendlEncoder": + def append(cls, file_path: StrPath) -> "BendlEncoder": """Open an existing *finalized* bundle to append new assets. - ``stream()`` is unavailable in append mode; each ``add_*`` commits - immediately. + ``stream()`` is unavailable in append mode; each ``add_*`` commits immediately. + + Args: + file_path (StrPath): Path to an existing, finalized ``.bendl`` bundle (``str`` or + ``os.PathLike``). + + Returns: + BendlEncoder: An encoder in append mode. + + Raises: + Exception: If the file is missing, is not a bundle, or is not finalized. """ self = cls.__new__(cls) + self._path = file_path self._enc = _CoreBendlEncoder.append(file_path) return self def add_graph( - self, graph: Any, sort: Optional[str] = "mlc", key: Optional[str] = None - ) -> Any: + self, + graph: GraphInput, + sort: SortMethod | None = "mlc", + key: str | None = None, + ) -> "nx.Graph": """Embed the dual ``graph.json`` and return the (possibly reordered) graph. - ``sort`` selects how nodes are ordered and defaults to ``"mlc"`` (so the - graph is reordered for better compression): + When reordering, both ``graph.json`` and ``node_permutation_map.json`` are stored and the + reordered graph is returned so the chain runs on that ordering. Reordering is pre-stream + only; a raw graph (``sort=None``) may also be attached post-stream / in append mode. + + Args: + graph (GraphInput): The dual graph (:data:`~binary_ensemble.types.GraphInput`): a + live ``networkx.Graph`` (subclasses such as ``gerrychain.Graph`` count; its node + iteration order is preserved), or adjacency-format JSON as a parsed ``dict`` or + ``list``, raw ``bytes``, a file-like object with ``.read()``, or a ``str`` / + ``os.PathLike`` path to a JSON file. A plain ``str`` is a *path* here. + sort (SortMethod | None, optional): How to order the nodes + (:data:`~binary_ensemble.types.SortMethod` or ``None``): ``"mlc"`` (multi-level + clustering — reorders the graph for better compression), ``"rcm"`` (reverse + Cuthill-McKee), ``"key"`` (sort by the node attribute named in ``key``), or + ``None`` to store the graph as-is with no permutation map. Default is ``"mlc"``. + key (str | None, optional): Node attribute to sort by, e.g. ``key="GEOID"``; + ``key="id"`` sorts by the NetworkX node id. Required with — and only valid with — + ``sort="key"``. Default is ``None``. + + Returns: + networkx.Graph: The stored graph after any reordering (matching + :meth:`BendlDecoder.read_graph`). Its node iteration order is the order the chain + must write assignments in. + + Raises: + ValueError: If ``sort`` / ``key`` is invalid. + Exception: If a reordering graph is added after the stream has started. + """ + return self._enc.add_graph(graph, sort, key) - - ``"mlc"`` — multi-level clustering, - - ``"rcm"`` — reverse Cuthill-McKee, - - ``"key"`` — sort by the node attribute named in ``key`` (e.g. - ``sort="key", key="GEOID"``; ``key="id"`` sorts by the NetworkX node id), - - ``None`` — store the graph as-is, with no permutation map. + def add_metadata(self, metadata: MetadataInput) -> None: + """Embed the canonical ``metadata.json`` asset (run provenance). - When reordering, both ``graph.json`` and ``node_permutation_map.json`` are - stored and the reordered graph is returned so the chain runs on that - ordering. Reordering is pre-stream only; a raw graph (``sort=None``) may - also be attached post-stream / in append mode. ``key`` is only valid with - ``sort="key"``. + Args: + metadata (MetadataInput): The JSON payload + (:data:`~binary_ensemble.types.MetadataInput`): a ``dict`` or ``list`` + (serialized for you), raw JSON ``bytes``, a file-like object with ``.read()``, or + a ``str`` / ``os.PathLike`` path to a JSON file. A plain ``str`` is a *path* + here, never inline JSON. - The graph is returned as a NetworkX graph (matching - :meth:`BendlDecoder.read_graph`), so its node order is the order the - chain should write assignments in. + Raises: + Exception: If the payload cannot be converted to JSON bytes, or the encoder is in an + invalid state. """ - return self._enc.add_graph(graph, sort, key) - - def add_metadata(self, metadata: Any) -> None: - """Embed the canonical ``metadata.json`` asset (a dict/list, bytes, or path).""" self._enc.add_metadata(metadata) + @overload + def add_asset( + self, name: str, payload: JsonAssetPayload, content_type: Literal["json"] + ) -> None: ... + @overload + def add_asset( + self, name: str, payload: TextAssetPayload, content_type: Literal["text"] + ) -> None: ... + @overload + def add_asset( + self, name: str, payload: BinaryAssetPayload, content_type: Literal["binary"] + ) -> None: ... + @overload + def add_asset(self, name: str, payload: StrPath, content_type: Literal["file"]) -> None: ... def add_asset( self, name: str, - payload: Any, + payload: object, content_type: str, ) -> None: """Embed a custom asset under ``name``. - ``payload`` may be bytes-like or a ``str`` (stored as UTF-8 content), a - ``dict``/``list`` (serialized as JSON; requires ``content_type="json"``), - an open file or other object with ``.read()``, or a ``pathlib.Path`` - whose file contents are read. A plain ``str`` is always *content*, never - a path — pass a ``Path`` to read from disk. - - ``content_type`` is ``"json"`` (payload must be valid UTF-8 JSON; the - decoder will auto-parse it), ``"text"`` (payload must be valid UTF-8), - ``"binary"`` (arbitrary bytes, stored verbatim — e.g. a zipped - shapefile or a GeoPackage), or ``"file"`` (the payload is a ``str`` or - ``pathlib.Path`` naming a file whose contents are read and stored as - binary). Every asset carries a CRC32C integrity checksum, and payloads - of 1 KiB or more are xz-compressed on disk by default (transparent on - read). - - ``"file"`` is the one content type under which a plain ``str`` payload - is a *path*; to store a typed file (e.g. JSON the decoder should - auto-parse), pass a ``pathlib.Path`` with ``content_type="json"`` - instead. + Every asset carries a CRC32C integrity checksum, and payloads of 1 KiB or more are + xz-compressed on disk by default (both transparent on read). + + Args: + name (str): Asset name, the key used to read it back (e.g. ``"params.json"``). + payload (JsonAssetPayload | TextAssetPayload | BinaryAssetPayload | StrPath): + The asset content; the accepted shapes depend on ``content_type``: + + - for ``"json"`` (:data:`~binary_ensemble.types.JsonAssetPayload`): a ``dict`` / + ``list`` (serialized via ``json.dumps``), a JSON ``str``, bytes-like JSON, a + file-like object with ``.read()``, or an ``os.PathLike`` whose file is read. + Must yield valid UTF-8 JSON; the decoder will auto-parse it. + - for ``"text"`` (:data:`~binary_ensemble.types.TextAssetPayload`): the same + shapes, minus ``dict`` / ``list``; must yield valid UTF-8. + - for ``"binary"`` (:data:`~binary_ensemble.types.BinaryAssetPayload`): the same + shapes as ``"text"``; stored verbatim (e.g. a zipped shapefile or a + GeoPackage). + - for ``"file"`` (:data:`~binary_ensemble.types.StrPath`): a ``str`` or + ``os.PathLike`` naming a file whose contents are read and stored as binary. + + Outside ``content_type="file"``, a plain ``str`` is always *content*, never a + path — pass a ``pathlib.Path`` to read from disk (e.g. a ``Path`` with + ``content_type="json"`` stores a JSON file the decoder will auto-parse). + content_type (AssetContentType): One of ``"json"``, ``"text"``, ``"binary"``, or + ``"file"`` (:data:`~binary_ensemble.types.AssetContentType`). + + Raises: + ValueError: If the payload does not satisfy ``content_type`` (e.g. malformed JSON, + non-UTF-8 text, an unknown content type). + TypeError: If the payload shape is not accepted (e.g. a ``dict`` with + ``content_type="text"``, or a non-path with ``content_type="file"``). """ if content_type == "file": if not isinstance(payload, (str, os.PathLike)): @@ -231,31 +310,73 @@ def add_asset( try: json.loads(data.decode("utf-8")) except (UnicodeDecodeError, json.JSONDecodeError) as exc: - raise ValueError( - f"content_type='json' requires valid UTF-8 JSON: {exc}" - ) from exc + raise ValueError(f"content_type='json' requires valid UTF-8 JSON: {exc}") from exc elif content_type == "text": try: data.decode("utf-8") except UnicodeDecodeError as exc: - raise ValueError( - f"content_type='text' requires valid UTF-8: {exc}" - ) from exc + raise ValueError(f"content_type='text' requires valid UTF-8: {exc}") from exc elif content_type != "binary": raise ValueError( - f"content_type must be 'json', 'text', 'binary', or 'file', " - f"got {content_type!r}" + f"content_type must be 'json', 'text', 'binary', or 'file', got {content_type!r}" ) - self._enc.add_asset(name, data, content_type) + # The branches above leave only the core-supported literals. + core_type = cast('Literal["json", "text", "binary"]', content_type) + self._enc.add_asset(name, data, core_type) + + def remove_asset(self, name: str) -> None: + """Remove a named asset from a finalized bundle, reclaiming its bytes. + + Available wherever :meth:`add_asset` commits immediately: append mode, or create mode + after the stream has closed. The directory entry is dropped and the bundle is then + compacted in place, so the asset's payload bytes are actually gone from the file — not + just unreferenced. The name (and any singleton-type claim, e.g. ``metadata.json``) + becomes free again, so remove-then-add is the way to replace an asset's payload. + + Removing appended (post-stream) assets is cheap at any scale: the compaction rebuilds + only the small post-stream tail and never touches the assignment stream, even when the + stream is tens of gigabytes. Removing a *pre-stream* asset (the graph, or metadata + added before streaming) costs one whole-file rewrite instead. For the rare bundle that + arrives with dead space from elsewhere (every public write path here leaves bundles + compact), the raw ``_core.compact_bundle_in_place`` reclaims it directly, and the raw + ``_core.BendlEncoder.remove_asset`` drops only the directory entry if you specifically + need that form. + + Args: + name (str): The asset's name, as listed by + :meth:`~binary_ensemble._core.BendlDecoder.asset_names`. + + Raises: + KeyError: If no asset with that name exists in the bundle. + Exception: If the encoder is in create mode before the stream (just don't add the + asset), is currently streaming, or is closed. + """ + self._enc.remove_asset(name) + _compact_bundle_in_place(self._path) - def stream(self, format: str = "ben", variant: Optional[str] = None): + def stream(self, *, variant: Variant = "twodelta") -> BendlStreamSession: """Open the single-use assignment stream context manager. - Only ``"ben"`` is accepted; produce XBEN bundles via - :func:`compress_stream`. ``variant`` selects the BEN variant - (default ``"twodelta"``). + The embedded stream is always written in the BEN wire format; produce an XBEN bundle with + :func:`compress_stream` after writing (XBEN is a whole-stream LZMA2 wrap, so it cannot be + written live sample-by-sample). + + Args: + variant (Variant, optional): BEN encoding variant + (:data:`~binary_ensemble.types.Variant`): ``"standard"``, ``"mkv_chain"``, or + ``"twodelta"``. Default is ``"twodelta"``. + + Returns: + BendlStreamSession: A single-use context manager. ``write`` each assignment inside + the ``with`` block; a clean close finalizes the bundle, an exception leaves it + unfinalized. + + Raises: + ValueError: If ``variant`` is invalid. + Exception: If a stream was already written, append mode is active, or the encoder is + closed. """ - return self._enc.stream(format, variant) + return self._enc.stream(variant=variant) def close(self) -> None: """Finalize (create mode) or finish (append mode) the bundle. Idempotent.""" @@ -264,59 +385,87 @@ def close(self) -> None: def __enter__(self) -> "BendlEncoder": return self - def __exit__(self, exc_type, exc, tb) -> bool: + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: "TracebackType | None", + ) -> bool: self.close() return False def compress_stream( - path, - out_file=None, - in_place: bool = False, + path: StrPath, + out_file: StrPath | None = None, + overwrite: bool = False, ) -> None: """Recompress a bundle's embedded BEN stream to XBEN, preserving every asset. - Provide exactly one of ``in_place=True`` (recompress to a temp file and - atomically swap it over ``path``) or ``out_file`` (write a new bundle). - Passing both, or neither, raises. + All assets (graph, metadata, node_permutation_map, custom blobs) are preserved by decoded + payload, name, type, and JSON flag; storage compression is normalized to the writer's default + policy. An assets-only bundle (empty stream) recompresses to an empty XBEN bundle. + + Args: + path (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). + out_file (StrPath | None, optional): Destination path for the recompressed bundle + (``str`` or ``os.PathLike``), leaving ``path`` untouched. Default is ``None`` which + recompresses in place: the result is written to a temp file and atomically swapped + over ``path``. + overwrite (bool, optional): Replace ``out_file`` if it already exists. Irrelevant in + place, which always replaces ``path``. Default is ``False``. - All assets (graph, metadata, node_permutation_map, custom blobs) are - preserved by decoded payload, name, type, and JSON flag; storage compression - is normalized to the writer's default policy. An assets-only bundle (empty - stream) recompresses to an empty XBEN bundle. + Raises: + OSError: If ``out_file`` exists and ``overwrite`` is ``False``. """ _atomic_or_out( - lambda src, dst, overwrite: _recompress_bundle(src, dst, overwrite=overwrite), + lambda src, dst, ow: _recompress_bundle(src, dst, overwrite=ow), path, out_file, - in_place, + overwrite, ) def relabel_bundle( - path, - out_file=None, - sort: str = "mlc", - key: Optional[str] = None, - in_place: bool = False, + path: StrPath, + out_file: StrPath | None = None, + sort: SortMethod = "mlc", + key: str | None = None, + overwrite: bool = False, ) -> None: """Reorder a BEN bundle's graph and relabel its stream to match. - ``sort`` selects the ordering — ``"mlc"`` (default), ``"rcm"``, or ``"key"`` - to sort by the node attribute named in ``key`` (e.g. ``sort="key", - key="GEOID"``). It reorders the embedded ``graph.json``, rewrites every - assignment into the new node order, and writes a fresh bundle storing the - reordered graph and a ``node_permutation_map.json`` (so the reordering is - reversible). Metadata and custom assets are preserved. This is the - bundle-level form of the CLI's ``reben`` ordering flow — typically run to - shrink a bundle before an XBEN recompress. - - Provide exactly one of ``in_place=True`` or ``out_file``. Only BEN bundles are - supported (relabel before compressing to XBEN); the source must carry a graph. + Reorders the embedded ``graph.json``, rewrites every assignment into the new node order, and + writes a fresh bundle storing the reordered graph and a ``node_permutation_map.json`` (so the + reordering is reversible). Metadata and custom assets are preserved. This is the bundle-level + form of the CLI's ``reben`` ordering flow — typically run to shrink a bundle before an XBEN + recompress. + + Only BEN bundles are supported (relabel before compressing to XBEN); the source must carry a + graph. + + Args: + path (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). Must + hold a BEN (not XBEN) stream and a ``graph.json``. + out_file (StrPath | None, optional): Destination path for the relabeled bundle (``str`` + or ``os.PathLike``), leaving ``path`` untouched. Default is ``None`` which relabels + in place: the result is written to a temp file and atomically swapped over ``path``. + sort (SortMethod, optional): The ordering (:data:`~binary_ensemble.types.SortMethod`): + ``"mlc"`` (multi-level clustering), ``"rcm"`` (reverse Cuthill-McKee), or ``"key"`` + (sort by the node attribute named in ``key``). Default is ``"mlc"``. + key (str | None, optional): Node attribute to sort by, e.g. ``key="GEOID"``. Required + with — and only valid with — ``sort="key"``. Default is ``None``. + overwrite (bool, optional): Replace ``out_file`` if it already exists. Irrelevant in + place, which always replaces ``path``. Default is ``False``. + + Raises: + ValueError: If ``sort`` / ``key`` is invalid, or if the bundle has no graph or a non-BEN + stream. + OSError: If ``out_file`` exists and ``overwrite`` is ``False``. """ _atomic_or_out( - lambda src, dst, overwrite: _relabel_bundle(src, dst, sort, key, overwrite), + lambda src, dst, ow: _relabel_bundle(src, dst, sort, key, ow), path, out_file, - in_place, + overwrite, ) diff --git a/ben-py/binary_ensemble/bundle.pyi b/ben-py/binary_ensemble/bundle.pyi index a37e280..400fb9a 100644 --- a/ben-py/binary_ensemble/bundle.pyi +++ b/ben-py/binary_ensemble/bundle.pyi @@ -1,7 +1,20 @@ -from typing import Any, Optional +from types import TracebackType +from typing import Literal, overload + +import networkx as nx from binary_ensemble._core import BendlDecoder as BendlDecoder from binary_ensemble._core import BendlStreamSession as BendlStreamSession +from binary_ensemble.types import ( + BinaryAssetPayload, + GraphInput, + JsonAssetPayload, + MetadataInput, + SortMethod, + StrPath, + TextAssetPayload, + Variant, +) __all__ = [ "BendlEncoder", @@ -12,35 +25,54 @@ __all__ = [ ] class BendlEncoder: - def __init__(self, file_path, overwrite: bool = False) -> None: ... + def __init__(self, file_path: StrPath, overwrite: bool = False) -> None: ... @classmethod - def append(cls, file_path) -> "BendlEncoder": ... + def append(cls, file_path: StrPath) -> "BendlEncoder": ... def add_graph( - self, graph: Any, sort: Optional[str] = "mlc", key: Optional[str] = None - ) -> Any: ... - def add_metadata(self, metadata: Any) -> None: ... - def add_asset( self, - name: str, - payload: Any, - content_type: str, + graph: GraphInput, + sort: SortMethod | None = "mlc", + key: str | None = None, + ) -> nx.Graph: ... + def add_metadata(self, metadata: MetadataInput) -> None: ... + @overload + def add_asset( + self, name: str, payload: JsonAssetPayload, content_type: Literal["json"] + ) -> None: ... + @overload + def add_asset( + self, name: str, payload: TextAssetPayload, content_type: Literal["text"] + ) -> None: ... + @overload + def add_asset( + self, name: str, payload: BinaryAssetPayload, content_type: Literal["binary"] ) -> None: ... - def stream( - self, format: str = "ben", variant: Optional[str] = None - ) -> BendlStreamSession: ... + @overload + def add_asset(self, name: str, payload: StrPath, content_type: Literal["file"]) -> None: ... + # Drops the directory entry and compacts the bundle in place, so the payload bytes are + # actually reclaimed; frees the name for re-add. KeyError if absent. (The raw + # _core.BendlEncoder.remove_asset is the cheap, directory-only form.) + def remove_asset(self, name: str) -> None: ... + def stream(self, *, variant: Variant = "twodelta") -> BendlStreamSession: ... def close(self) -> None: ... def __enter__(self) -> "BendlEncoder": ... - def __exit__(self, exc_type, exc, tb) -> bool: ... + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> bool: ... +# out_file=None means in place: the result is atomically swapped over `path`. def compress_stream( - path, - out_file=None, - in_place: bool = False, + path: StrPath, + out_file: StrPath | None = None, + overwrite: bool = False, ) -> None: ... def relabel_bundle( - path, - out_file=None, - sort: str = "mlc", - key: Optional[str] = None, - in_place: bool = False, + path: StrPath, + out_file: StrPath | None = None, + sort: SortMethod = "mlc", + key: str | None = None, + overwrite: bool = False, ) -> None: ... diff --git a/ben-py/binary_ensemble/codec.py b/ben-py/binary_ensemble/codec.py index 9049816..726008d 100644 --- a/ben-py/binary_ensemble/codec.py +++ b/ben-py/binary_ensemble/codec.py @@ -1,9 +1,8 @@ """Whole-file stream/JSONL transforms. -These helpers convert entire files in one call, without an iterator: JSONL ↔ -BEN ↔ XBEN. For streaming sample-by-sample access use -:class:`binary_ensemble.stream.BenDecoder`; for the single-file bundle format -use :mod:`binary_ensemble.bundle`. +These helpers convert entire files in one call, without an iterator: JSONL ↔ BEN ↔ XBEN. For +streaming sample-by-sample access use :class:`binary_ensemble.stream.BenDecoder`; for the +single-file bundle format use :mod:`binary_ensemble.bundle`. """ from __future__ import annotations diff --git a/ben-py/binary_ensemble/graph.py b/ben-py/binary_ensemble/graph.py index cf62009..579c1ca 100644 --- a/ben-py/binary_ensemble/graph.py +++ b/ben-py/binary_ensemble/graph.py @@ -1,26 +1,30 @@ """Graph reordering utilities (the reben orderings). -Reordering a dual graph before building a chain (or a bundle) can dramatically -improve BEN/XBEN compression. Each function takes a NetworkX adjacency-format -graph (a ``dict``/``list``, raw JSON ``bytes``, a file-like with ``.read()``, or -a path) and returns ``(reordered_graph, node_permutation_map)``: +Reordering a dual graph before building a chain (or a bundle) can dramatically improve BEN/XBEN +compression. Each function takes a graph — a live ``networkx.Graph``, or adjacency-format JSON +as a ``dict``/``list``, raw ``bytes``, a file-like with ``.read()``, or a path — and returns +``(reordered_graph, node_permutation_map)``: -- ``reordered_graph`` is a live NetworkX graph in its new node ordering (the same - shape :meth:`binary_ensemble.bundle.BendlEncoder.add_graph` and +- ``reordered_graph`` is a live NetworkX graph in its new node ordering (the same shape + :meth:`binary_ensemble.bundle.BendlEncoder.add_graph` and :meth:`binary_ensemble.bundle.BendlDecoder.read_graph` return). -- ``node_permutation_map`` is the parsed ``node_permutation_map.json`` payload — - an object with a ``node_permutation_old_to_new`` field mapping original - zero-based node positions to their new positions. +- ``node_permutation_map`` is the parsed ``node_permutation_map.json`` payload — an object with a + ``node_permutation_old_to_new`` field mapping original zero-based node positions to their new + positions. -To reorder *and* embed the result in a bundle in one step, pass ``sort`` / ``key`` -to :meth:`binary_ensemble.bundle.BendlEncoder.add_graph`. +To reorder *and* embed the result in a bundle in one step, pass ``sort`` / ``key`` to +:meth:`binary_ensemble.bundle.BendlEncoder.add_graph`. """ from __future__ import annotations -from typing import Any, Optional, Tuple +from typing import TYPE_CHECKING from binary_ensemble._core import graph_reorder +from binary_ensemble.types import GraphInput, NodePermutationMap, SortMethod + +if TYPE_CHECKING: + import networkx as nx __all__ = [ "reorder", @@ -31,28 +35,93 @@ def reorder( - graph: Any, sort: str = "mlc", key: Optional[str] = None -) -> Tuple[Any, Any]: + graph: GraphInput, sort: SortMethod = "mlc", key: str | None = None +) -> "tuple[nx.Graph, NodePermutationMap]": """Reorder ``graph`` and return ``(reordered_graph, node_permutation_map)``. - ``sort`` is ``"mlc"`` (multi-level clustering, the default), ``"rcm"`` - (reverse Cuthill-McKee), or ``"key"`` to sort by the node attribute named in - ``key`` (e.g. ``sort="key", key="GEOID"``; ``key="id"`` sorts by the NetworkX - node id). ``key`` is only valid with ``sort="key"``. + Args: + graph (GraphInput): The dual graph (:data:`~binary_ensemble.types.GraphInput`): a live + ``networkx.Graph`` (subclasses such as ``gerrychain.Graph`` count), or + adjacency-format JSON as a parsed ``dict`` or ``list``, raw ``bytes``, a file-like + object with ``.read()``, or a ``str`` / ``os.PathLike`` path to a JSON file. A plain + ``str`` is a *path* here. + sort (SortMethod, optional): The ordering (:data:`~binary_ensemble.types.SortMethod`): + ``"mlc"`` (multi-level clustering), ``"rcm"`` (reverse Cuthill-McKee), or ``"key"`` + (sort by the node attribute named in ``key``). Default is ``"mlc"``. + key (str | None, optional): Node attribute to sort by, e.g. ``key="GEOID"``; + ``key="id"`` sorts by the NetworkX node id. Required with — and only valid with — + ``sort="key"``. Default is ``None``. + + Returns: + tuple[networkx.Graph, NodePermutationMap]: The reordered graph (a live NetworkX graph, + the shape :meth:`BendlEncoder.add_graph ` + and :meth:`BendlDecoder.read_graph ` + return) and the parsed permutation map + (:class:`~binary_ensemble.types.NodePermutationMap`), whose + ``node_permutation_old_to_new`` field maps original zero-based node positions to their + new positions. + + Raises: + ValueError: If ``sort`` / ``key`` is invalid. """ return graph_reorder(graph, sort, key) -def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: - """Reorder ``graph`` using recursive multi-level clustering.""" +def reorder_multi_level_cluster( + graph: GraphInput, +) -> "tuple[nx.Graph, NodePermutationMap]": + """Reorder ``graph`` using recursive multi-level clustering. + + Equivalent to :func:`reorder` with ``sort="mlc"``. + + Args: + graph (GraphInput): The dual graph (:data:`~binary_ensemble.types.GraphInput`): a live + ``networkx.Graph``, or adjacency-format JSON as a parsed ``dict`` or ``list``, raw + ``bytes``, a file-like object with ``.read()``, or a ``str`` / ``os.PathLike`` path + to a JSON file. + + Returns: + tuple[networkx.Graph, NodePermutationMap]: The reordered graph and the parsed permutation + map — see :func:`reorder`. + """ return graph_reorder(graph, "mlc") -def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: - """Reorder ``graph`` using Reverse Cuthill-McKee.""" +def reorder_reverse_cuthill_mckee( + graph: GraphInput, +) -> "tuple[nx.Graph, NodePermutationMap]": + """Reorder ``graph`` using Reverse Cuthill-McKee. + + Equivalent to :func:`reorder` with ``sort="rcm"``. + + Args: + graph (GraphInput): The dual graph (:data:`~binary_ensemble.types.GraphInput`): a live + ``networkx.Graph``, or adjacency-format JSON as a parsed ``dict`` or ``list``, raw + ``bytes``, a file-like object with ``.read()``, or a ``str`` / ``os.PathLike`` path + to a JSON file. + + Returns: + tuple[networkx.Graph, NodePermutationMap]: The reordered graph and the parsed permutation + map — see :func:`reorder`. + """ return graph_reorder(graph, "rcm") -def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: - """Reorder ``graph`` by sorting on a node-attribute ``key`` (use ``"id"`` for node id).""" +def reorder_by_key(graph: GraphInput, key: str) -> "tuple[nx.Graph, NodePermutationMap]": + """Reorder ``graph`` by sorting on a node attribute. + + Equivalent to :func:`reorder` with ``sort="key"``. + + Args: + graph (GraphInput): The dual graph (:data:`~binary_ensemble.types.GraphInput`): a live + ``networkx.Graph``, or adjacency-format JSON as a parsed ``dict`` or ``list``, raw + ``bytes``, a file-like object with ``.read()``, or a ``str`` / ``os.PathLike`` path + to a JSON file. + key (str): Node attribute to sort by, e.g. ``key="GEOID"``; the special ``key="id"`` + sorts by the NetworkX node id. + + Returns: + tuple[networkx.Graph, NodePermutationMap]: The reordered graph and the parsed permutation + map — see :func:`reorder`. + """ return graph_reorder(graph, "key", key) diff --git a/ben-py/binary_ensemble/graph.pyi b/ben-py/binary_ensemble/graph.pyi index 0c0e70b..1aa76a5 100644 --- a/ben-py/binary_ensemble/graph.pyi +++ b/ben-py/binary_ensemble/graph.pyi @@ -1,4 +1,6 @@ -from typing import Any, Optional, Tuple +import networkx as nx + +from binary_ensemble.types import GraphInput, NodePermutationMap, SortMethod __all__ = [ "reorder", @@ -7,11 +9,15 @@ __all__ = [ "reorder_by_key", ] -# Each helper returns (reordered_graph, node_permutation_map): the graph is a live -# NetworkX graph, the map is the parsed node_permutation_map.json dict. +# Each helper returns (reordered_graph, node_permutation_map): the graph is a live NetworkX +# graph, the map is the parsed node_permutation_map.json dict. def reorder( - graph: Any, sort: str = "mlc", key: Optional[str] = None -) -> Tuple[Any, Any]: ... -def reorder_multi_level_cluster(graph: Any) -> Tuple[Any, Any]: ... -def reorder_reverse_cuthill_mckee(graph: Any) -> Tuple[Any, Any]: ... -def reorder_by_key(graph: Any, key: str) -> Tuple[Any, Any]: ... + graph: GraphInput, sort: SortMethod = "mlc", key: str | None = None +) -> tuple[nx.Graph, NodePermutationMap]: ... +def reorder_multi_level_cluster( + graph: GraphInput, +) -> tuple[nx.Graph, NodePermutationMap]: ... +def reorder_reverse_cuthill_mckee( + graph: GraphInput, +) -> tuple[nx.Graph, NodePermutationMap]: ... +def reorder_by_key(graph: GraphInput, key: str) -> tuple[nx.Graph, NodePermutationMap]: ... diff --git a/ben-py/binary_ensemble/stream.py b/ben-py/binary_ensemble/stream.py index d89852a..e18ffb1 100644 --- a/ben-py/binary_ensemble/stream.py +++ b/ben-py/binary_ensemble/stream.py @@ -1,10 +1,9 @@ """Plain BEN/XBEN stream encoding and decoding. -``BenEncoder`` writes a plain ``.ben`` stream; ``BenDecoder`` iterates a plain -``.ben`` / ``.xben`` stream. Both are stream-only: opening a decoder on a -``.bendl`` bundle, or trying to read bundle assets, raises and points you at -:mod:`binary_ensemble.bundle`. For the recommended single-file bundle format, -use :class:`binary_ensemble.bundle.BendlEncoder` / +``BenEncoder`` writes a plain ``.ben`` stream; ``BenDecoder`` iterates a plain ``.ben`` / +``.xben`` stream. Both are stream-only: opening a decoder on a ``.bendl`` bundle, or trying to +read bundle assets, raises and points you at :mod:`binary_ensemble.bundle`. For the recommended +single-file bundle format, use :class:`binary_ensemble.bundle.BendlEncoder` / :class:`binary_ensemble.bundle.BendlDecoder`. """ diff --git a/ben-py/binary_ensemble/types.py b/ben-py/binary_ensemble/types.py new file mode 100644 index 0000000..14a1e69 --- /dev/null +++ b/ben-py/binary_ensemble/types.py @@ -0,0 +1,111 @@ +"""Shared type aliases for the public API. + +These names describe the payload shapes the API accepts and returns, so user code can annotate +against them:: + + from binary_ensemble.types import GraphInput, NodePermutationMap + + def load(graph: GraphInput) -> NodePermutationMap: ... + +Nothing here changes runtime behavior; the aliases exist so the signatures in +:mod:`binary_ensemble.bundle`, :mod:`binary_ensemble.graph`, and the ``_core`` stubs say what +they actually mean. +""" + +from __future__ import annotations + +import os +from typing import Any, Literal, Protocol, TypedDict + +# Imported eagerly so GraphInput is one honest runtime definition. networkx is a hard +# dependency and its core import is light (no compiled deps). +import networkx as nx + +__all__ = [ + "AssetContentType", + "AssetEntry", + "AssignmentFormat", + "BinaryAssetPayload", + "GraphInput", + "JsonAssetPayload", + "MetadataInput", + "NodePermutationMap", + "SortMethod", + "StrPath", + "SupportsRead", + "TextAssetPayload", + "Variant", +] + +Variant = Literal["standard", "mkv_chain", "twodelta"] +"""BEN encoding variant (see the variants concept page for how to choose).""" + +AssignmentFormat = Literal["ben", "xben"] +"""Wire format of an assignment stream.""" + +SortMethod = Literal["mlc", "rcm", "key"] +"""Graph reordering method: multi-level clustering, reverse Cuthill-McKee, or +sort-by-node-attribute (which also requires ``key=``).""" + +AssetContentType = Literal["json", "text", "binary", "file"] +"""How :meth:`~binary_ensemble.bundle.BendlEncoder.add_asset` treats its payload.""" + +StrPath = str | os.PathLike[str] +"""A filesystem path.""" + + +class SupportsRead(Protocol): + """A file-like object whose ``.read()`` yields ``bytes`` or ``str``.""" + + def read(self) -> bytes | str: ... + + +GraphInput = nx.Graph | dict[str, Any] | list[Any] | bytes | bytearray | SupportsRead | StrPath +"""Accepted forms for a dual graph: a live ``networkx.Graph`` (subclasses such as +``gerrychain.Graph`` count; its node iteration order is preserved), or adjacency-format JSON as a +parsed ``dict`` / ``list``, raw ``bytes``, a file-like with ``.read()``, or a path to a JSON +file. A plain ``str`` is a *path* here.""" + +MetadataInput = dict[str, Any] | list[Any] | bytes | bytearray | SupportsRead | StrPath +"""Accepted forms for ``metadata.json`` payloads: a parsed ``dict`` / ``list``, raw JSON +``bytes``, a file-like with ``.read()``, or a path to a JSON file (a plain ``str`` is a *path*, +never inline JSON).""" + +BinaryAssetPayload = bytes | bytearray | memoryview | str | SupportsRead | os.PathLike[str] +"""``add_asset`` payloads for ``content_type="binary"``: bytes-like (stored verbatim), ``str`` +(stored as its UTF-8 encoding — *content*, not a path), a file-like with ``.read()``, or an +``os.PathLike`` whose file is read. Note that a plain ``str`` is content; only ``os.PathLike`` +objects are treated as paths.""" + +TextAssetPayload = BinaryAssetPayload +"""``add_asset`` payloads for ``content_type="text"`` — the same shapes as +:data:`BinaryAssetPayload`, but the resulting bytes must be valid UTF-8.""" + +JsonAssetPayload = dict[str, Any] | list[Any] | BinaryAssetPayload +"""``add_asset`` payloads for ``content_type="json"``: additionally accepts a ``dict`` / +``list``, which is serialized via ``json.dumps``. The resulting bytes must be valid UTF-8 +JSON.""" + + +class NodePermutationMap(TypedDict): + """The parsed ``node_permutation_map.json`` payload. + + ``node_permutation_old_to_new`` maps original zero-based node positions (as JSON string keys) + to their new positions. Exactly one of ``ordering_method`` / ``key`` records how the ordering + was produced. + """ + + node_permutation_old_to_new: dict[str, int] + ordering_method: str | None + key: str | None + + +class AssetEntry(TypedDict): + """One bundle-directory entry, as returned by + :meth:`~binary_ensemble.bundle.BendlDecoder.list_assets`.""" + + name: str + type: int + offset: int + len: int + flags: list[str] diff --git a/ben-py/pyproject.toml b/ben-py/pyproject.toml index f89ea29..5503f4b 100755 --- a/ben-py/pyproject.toml +++ b/ben-py/pyproject.toml @@ -24,6 +24,17 @@ filterwarnings = [ "ignore:.*XBEN may take a second*:UserWarning", ] +[tool.ruff] +line-length = 100 + +[tool.pyright] +include = ["binary_ensemble", "tests/typing_assertions.py"] +venvPath = "." +venv = ".venv" +# The negative assertions in tests/typing_assertions.py rely on this: a +# `type: ignore` on a call that becomes legal must fail the check. +reportUnnecessaryTypeIgnoreComment = "error" + [project.optional-dependencies] # Render the site (used by ReadTheDocs). Notebook outputs are rendered from the # committed .ipynb files, so no execution dependencies are needed here. @@ -57,7 +68,9 @@ dev = [ # Used directly by the docs how-to snippets, which run under pytest. "numpy>=1.26", "pandas>=2.0", + "pyright>=1.1", "pytest>=8.4.2", "ruff>=0.11.0", "tqdm>=4.67.1", + "ty", ] diff --git a/ben-py/src/common.rs b/ben-py/src/common.rs index 94eb2b1..841f597 100644 --- a/ben-py/src/common.rs +++ b/ben-py/src/common.rs @@ -1,7 +1,7 @@ use binary_ensemble::BenVariant; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyDict, PyList}; +use pyo3::types::{PyByteArray, PyBytes, PyDict, PyList}; use std::fs::File; use std::io::{BufReader, BufWriter}; use std::path::PathBuf; @@ -58,16 +58,22 @@ pub fn open_output(out_file: &PathBuf, overwrite: bool) -> PyResult, obj: &Bound<'_, PyAny>) -> PyResult> { +pub fn parse_json_input( + py: Python<'_>, + obj: &Bound<'_, PyAny>, + what: &str, + accepted: &str, +) -> PyResult> { // Dict / list → json.dumps. if obj.is_instance_of::() || obj.is_instance_of::() { let json_mod = py.import("json")?; @@ -76,12 +82,14 @@ pub fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult extraction would + // also accept any sequence of small ints (e.g. iterating a NetworkX graph's node ids) and + // silently store garbage. if let Ok(b) = obj.downcast::() { return Ok(b.as_bytes().to_vec()); } - if let Ok(b) = obj.extract::>() { - return Ok(b); + if let Ok(b) = obj.downcast::() { + return Ok(b.to_vec()); } // File-like: must have .read(). Check before str/path, since a plain `str` / `Path` has no @@ -97,22 +105,70 @@ pub fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult() { return Ok(s.into_bytes()); } - return Err(PyException::new_err( - "graph .read() must return bytes or str", - )); + return Err(PyException::new_err(format!( + "{what} .read() must return bytes or str" + ))); } // Path / str → read the file at that path. - let path: PathBuf = obj.extract().map_err(|_| { - PyValueError::new_err( - "graph must be a dict/list, bytes, a file-like with .read(), or a path", - ) - })?; + let path: PathBuf = obj + .extract() + .map_err(|_| PyValueError::new_err(format!("{what} must be {accepted}")))?; std::fs::read(&path).map_err(|e| { - PyIOError::new_err(format!("Failed to read graph file {}: {e}", path.display())) + PyIOError::new_err(format!( + "Failed to read {what} file {}: {e}", + path.display() + )) }) } +/// Normalize a user-supplied metadata argument into raw UTF-8 JSON bytes. +pub fn parse_metadata_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { + parse_json_input( + py, + obj, + "metadata", + "a dict/list, bytes, a file-like with .read(), or a path", + ) +} + +/// Convert a live NetworkX graph into adjacency-format JSON bytes, or return `None` if `obj` is +/// not a NetworkX graph (subclasses such as `gerrychain.Graph` count). +fn networkx_graph_to_json_bytes( + py: Python<'_>, + obj: &Bound<'_, PyAny>, +) -> PyResult>> { + let networkx = py.import("networkx")?; + let graph_cls = networkx.getattr("Graph")?; + if !obj.is_instance(&graph_cls)? { + return Ok(None); + } + // adjacency_data preserves the graph's node iteration order, so a raw (sort=None) embed + // stores exactly the order the caller's graph already has. + let json_graph = py.import("networkx.readwrite.json_graph")?; + let data = json_graph.call_method1("adjacency_data", (obj,))?; + let json_mod = py.import("json")?; + let dumped = json_mod.call_method1("dumps", (&data,))?; + let s: String = dumped.extract()?; + Ok(Some(s.into_bytes())) +} + +/// Normalize a user-supplied graph argument into raw adjacency-format UTF-8 JSON bytes. +/// +/// Accepts everything [`parse_json_input`] does, plus a live NetworkX graph (serialized via +/// `networkx.readwrite.json_graph.adjacency_data`, preserving its node order). +pub fn parse_graph_input(py: Python<'_>, obj: &Bound<'_, PyAny>) -> PyResult> { + if let Some(bytes) = networkx_graph_to_json_bytes(py, obj)? { + return Ok(bytes); + } + parse_json_input( + py, + obj, + "graph", + "a networkx.Graph, dict/list, bytes, a file-like with .read(), or a path", + ) +} + /// Build a live NetworkX graph from an already-parsed adjacency-format JSON object. /// /// The shared tail behind every API that hands a graph back to the caller — diff --git a/ben-py/src/compact.rs b/ben-py/src/compact.rs new file mode 100644 index 0000000..c00fa64 --- /dev/null +++ b/ben-py/src/compact.rs @@ -0,0 +1,100 @@ +//! Binding for compacting a `.bendl` file: rewriting it without unreferenced byte ranges. +//! +//! Thin wrapper over [`binary_ensemble::io::bundle::compact`], which owns the semantics: assets +//! are carried over by decoded payload, the assignment stream is copied verbatim through a +//! verified reader, and the wire format (BEN or XBEN) is preserved. The `bendl` CLI's `remove` +//! and `compact` subcommands share the same core implementation. + +use crate::common::open_output; +use binary_ensemble::io::bundle::compact::{ + compact_bundle as core_compact_bundle, compact_bundle_in_place as core_compact_in_place, + Compaction, +}; +use binary_ensemble::io::bundle::{BendlReader, BendlWriteError}; +use pyo3::exceptions::{PyException, PyIOError}; +use pyo3::prelude::*; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +fn map_bundle_err(err: BendlWriteError) -> PyErr { + match err { + BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + other => PyException::new_err(format!("{other}")), + } +} + +/// Rewrite the bundle at `in_file` without unreferenced byte ranges, writing the result to +/// `out_file`. +/// +/// Raw surface for the rare bundle that arrives with dead space from other tooling — every +/// facade write path (``remove_asset``, ``compress_stream``, ``relabel_bundle``) keeps bundles +/// compact automatically. See also :func:`compact_bundle_in_place`. +/// +/// Args: +/// in_file (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). +/// out_file (StrPath): Destination path for the compacted bundle (``str`` or +/// ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``. +/// Exception: If the bundle is unfinalized, or an asset or the stream fails its checksum. +#[pyfunction] +#[pyo3(signature = (in_file, out_file, overwrite = false))] +#[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] +pub fn compact_bundle(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> PyResult<()> { + let file = File::open(&in_file) + .map_err(|e| PyIOError::new_err(format!("Failed to open {}: {e}", in_file.display())))?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(|e| { + PyException::new_err(format!( + "Failed to parse bundle header in {}: {e}", + in_file.display() + )) + })?; + // Check before open_output so a doomed call cannot create or truncate the destination. + if !reader.is_finalized() { + return Err(PyException::new_err( + "compact_bundle requires a finalized bundle", + )); + } + + let buf = open_output(&out_file, overwrite)?; + core_compact_bundle(&mut reader, buf).map_err(map_bundle_err)?; + Ok(()) +} + +/// Compact the bundle at `path` in place, choosing the cheapest applicable strategy. +/// +/// When every unreferenced byte lies after the assignment stream (the layout that asset +/// removals and appends produce), only the small post-stream tail is rebuilt — O(tail), +/// independent of stream size, no scratch space, stream never read. Otherwise the bundle is +/// rewritten wholesale through a temp file (stream checksum-verified during the copy) and +/// atomically swapped over `path`. +/// +/// Raw surface, also used by :meth:`binary_ensemble.bundle.BendlEncoder.remove_asset` — every +/// facade write path keeps bundles compact automatically, so this is only needed for bundles +/// that arrive with dead space from other tooling. +/// +/// Args: +/// path (StrPath): Path to the ``.bendl`` bundle to compact (``str`` or ``os.PathLike``). +/// +/// Returns: +/// str: Which strategy ran — ``"none"`` (already compact), ``"tail"`` (post-stream tail +/// rebuilt; stream untouched and not verified), or ``"full"`` (whole-bundle rewrite). +/// +/// Raises: +/// Exception: If the bundle is unfinalized, or — on the full-rewrite path — an asset or +/// the stream fails its checksum. +#[pyfunction] +#[pyo3(signature = (path))] +#[pyo3(text_signature = "(path)")] +pub fn compact_bundle_in_place(path: PathBuf) -> PyResult<&'static str> { + let kind = core_compact_in_place(&path).map_err(map_bundle_err)?; + Ok(match kind { + Compaction::None => "none", + Compaction::TailRewrite => "tail", + Compaction::FullRewrite => "full", + }) +} diff --git a/ben-py/src/decode/bundle_decoder.rs b/ben-py/src/decode/bundle_decoder.rs index ac924d1..a7a4d61 100644 --- a/ben-py/src/decode/bundle_decoder.rs +++ b/ben-py/src/decode/bundle_decoder.rs @@ -28,9 +28,9 @@ use std::path::PathBuf; /// bundle (one written with no assignment stream) iterates to nothing with ``len() == 0``. /// /// Args: -/// file_path: Path to the input ``.bendl`` file. Whether the embedded stream is BEN or -/// XBEN is read from the bundle header; an XBEN stream warns about a one-time -/// decompression startup cost. +/// file_path (StrPath): Path to the input ``.bendl`` file (``str`` or ``os.PathLike``). Whether +/// the embedded stream is BEN or XBEN is read from the bundle header; an XBEN stream warns +/// about a one-time decompression startup cost. /// /// Raises: /// Exception: If ``file_path`` is not a bundle (use @@ -60,7 +60,7 @@ impl PyBendlDecoder { /// pays a one-time decompression startup cost. /// /// Args: - /// file_path: Path to the input ``.bendl`` file. + /// file_path (StrPath): Path to the input ``.bendl`` file (``str`` or ``os.PathLike``). /// /// Raises: /// Exception: If ``file_path`` is not a bundle (use @@ -76,7 +76,7 @@ impl PyBendlDecoder { })?; if !is_bundle { return Err(PyException::new_err(format!( - "{} is not a .bendl bundle (missing BENDL magic). Open plain BEN/XBEN \ + "{} is not a .bendl file (missing BENDL magic). Open plain BEN/XBEN \ streams with binary_ensemble.stream.BenDecoder instead.", file_path.display() ))); @@ -162,15 +162,15 @@ impl PyBendlDecoder { /// being unpacked. /// /// Args: - /// indices: The 1-indexed sample numbers to keep. An unsorted or duplicated list - /// is sorted and deduplicated, with a ``UserWarning``. + /// indices (Sequence[int]): The 1-indexed sample numbers to keep. Duplicates are dropped; + /// an unsorted list is sorted, with a ``UserWarning``. /// /// Returns: /// BendlDecoder: ``self``, so the call can be chained into a ``for`` loop. /// /// Raises: - /// Exception: If any index is ``0`` (indices are 1-based) or greater than the - /// number of samples in the stream. + /// Exception: If ``indices`` is empty, contains ``0`` (indices are 1-based), or + /// contains an index greater than the number of samples in the stream. #[pyo3(text_signature = "(self, indices, /)")] fn subsample_indices<'py>( mut slf: PyRefMut<'py, Self>, @@ -184,8 +184,8 @@ impl PyBendlDecoder { /// Restrict iteration to a contiguous, 1-indexed inclusive range of samples. /// /// Args: - /// start: First sample number to keep (1-indexed, inclusive). - /// end: Last sample number to keep (1-indexed, inclusive). + /// start (int): First sample number to keep (1-indexed, inclusive). + /// end (int): Last sample number to keep (1-indexed, inclusive). /// /// Returns: /// BendlDecoder: ``self``, for chaining into a ``for`` loop. @@ -211,8 +211,8 @@ impl PyBendlDecoder { /// Restrict iteration to every ``step``-th sample. /// /// Args: - /// step: Stride between kept samples (e.g. ``10`` keeps every tenth sample). - /// offset: 1-indexed position of the first kept sample. Defaults to ``1``. + /// step (int): Stride between kept samples (e.g. ``10`` keeps every tenth sample). + /// offset (int, optional): 1-indexed position of the first kept sample. Default is ``1``. /// /// Returns: /// BendlDecoder: ``self``, for chaining into a ``for`` loop. @@ -254,6 +254,51 @@ impl PyBendlDecoder { (h.major_version, h.minor_version) } + /// Return the on-disk byte length of the embedded assignment stream. + /// + /// Read straight from the bundle header's ``stream_len`` field — no decoding or copying. + /// This is the size of the stream region as stored (BEN bytes, or compressed XBEN bytes), + /// the same bytes ``extract_stream`` would copy out. For an unfinalized bundle the stream + /// is taken to extend to the directory (or EOF), matching recovery extraction. + /// + /// Returns: + /// int: Byte length of the embedded stream region; ``0`` for an assets-only bundle. + /// + /// Example: + /// >>> BendlDecoder("ensemble.bendl").stream_size() + /// 40110 + #[pyo3(text_signature = "(self)")] + fn stream_size(&mut self) -> PyResult { + let (_offset, len) = self + .reader + .assignment_stream_range() + .map_err(|e| PyIOError::new_err(format!("Failed to read stream range: {e}")))?; + Ok(len) + } + + /// Return the on-disk byte length of a named asset's stored payload. + /// + /// Read straight from the bundle directory — no decoding or copying. For assets stored + /// xz-compressed (the ``"xz"`` flag in :meth:`list_assets`), this is the compressed size; + /// the decoded payload can be larger — use ``len(read_asset_bytes(name))`` for that. + /// + /// Args: + /// name (str): The asset's name, as listed by :meth:`asset_names`. + /// + /// Returns: + /// int: Stored byte length of the asset's payload region. + /// + /// Raises: + /// KeyError: If no asset with that name exists in the bundle. + #[pyo3(text_signature = "(self, name, /)")] + fn asset_size(&self, name: &str) -> PyResult { + let entry = self + .reader + .find_asset_by_name(name) + .ok_or_else(|| PyKeyError::new_err(format!("no asset named {name:?} in bundle")))?; + Ok(entry.payload_len) + } + /// Whether the bundle was successfully finalized. /// /// Returns: @@ -338,7 +383,7 @@ impl PyBendlDecoder { /// Read the (decoded) bytes of a named asset as a Python ``bytes`` object. /// /// Args: - /// name: The asset's name, as listed by :meth:`asset_names`. + /// name (str): The asset's name, as listed by :meth:`asset_names`. /// /// Returns: /// bytes: The asset's decoded payload. @@ -360,7 +405,7 @@ impl PyBendlDecoder { /// Parse a JSON asset into a Python object (``dict``, ``list``, …). /// /// Args: - /// name: The asset's name, as listed by :meth:`asset_names`. + /// name (str): The asset's name, as listed by :meth:`asset_names`. /// /// Returns: /// The parsed JSON value. @@ -415,10 +460,12 @@ impl PyBendlDecoder { /// ``BenDecoder(out_path, mode=dec.assignment_format())``. /// /// Args: - /// out_path: Path to write the extracted stream to. - /// overwrite: Replace ``out_path`` if it already exists. Defaults to ``False``. - /// allow_unfinalized: Permit extraction from a bundle that was never finalized - /// (recovering a partial stream). Defaults to ``False``. + /// out_path (StrPath): Path to write the extracted stream to (``str`` or + /// ``os.PathLike``). + /// overwrite (bool, optional): Replace ``out_path`` if it already exists. Default is + /// ``False``. + /// allow_unfinalized (bool, optional): Permit extraction from a bundle that was never + /// finalized (recovering a partial stream). Default is ``False``. /// /// Raises: /// OSError: If ``out_path`` exists and ``overwrite`` is ``False``, or the copy fails. diff --git a/ben-py/src/decode/cursor.rs b/ben-py/src/decode/cursor.rs index 9357d6f..ae78198 100644 --- a/ben-py/src/decode/cursor.rs +++ b/ben-py/src/decode/cursor.rs @@ -2,7 +2,7 @@ //! //! [`SampleCursor`] owns everything needed to walk an assignment stream and to apply a subsample //! selection, independent of whether the bytes come from a plain `.ben`/`.xben` file or from a -//! `.bendl` bundle's embedded stream region. Both `PyBenDecoder` and `PyBendlDecoder` embed one and +//! `.bendl` file's embedded stream region. Both `PyBenDecoder` and `PyBendlDecoder` embed one and //! forward their iteration / `len` / `subsample_*` methods to it, so the single-pass restart logic, //! the `MkvRecord` run expansion, and the subsample bounds checks cannot drift between the two. diff --git a/ben-py/src/decode/decoder.rs b/ben-py/src/decode/decoder.rs index 327835b..e51ae5c 100644 --- a/ben-py/src/decode/decoder.rs +++ b/ben-py/src/decode/decoder.rs @@ -18,9 +18,11 @@ use std::path::PathBuf; /// ``bendl`` split of the command-line tools. /// /// Args: -/// file_path: Path to the input ``.ben`` or ``.xben`` file. -/// mode: Which reader to use — ``"ben"`` or ``"xben"``. Defaults to ``"ben"``. +/// file_path (StrPath): Path to the input ``.ben`` or ``.xben`` file (``str`` or +/// ``os.PathLike``). +/// mode (AssignmentFormat, optional): Which reader to use — ``"ben"`` or ``"xben"``. /// Opening an XBEN stream warns about a one-time decompression startup cost. +/// Default is ``"ben"``. /// /// Raises: /// Exception: If ``file_path`` is a ``.bendl`` bundle (use @@ -46,8 +48,10 @@ impl PyBenDecoder { /// decompression startup cost. /// /// Args: - /// file_path: Path to the input ``.ben`` or ``.xben`` file. - /// mode: Either ``"ben"`` or ``"xben"``. Defaults to ``"ben"``. + /// file_path (StrPath): Path to the input ``.ben`` or ``.xben`` file (``str`` or + /// ``os.PathLike``). + /// mode (AssignmentFormat, optional): Either ``"ben"`` or ``"xben"``. Default is + /// ``"ben"``. /// /// Raises: /// Exception: If ``file_path`` is a ``.bendl`` bundle (use @@ -66,7 +70,7 @@ impl PyBenDecoder { if is_bundle { return Err(PyException::new_err(format!( - "{} is a .bendl bundle, not a plain BEN/XBEN stream. Open it with \ + "{} is a .bendl file, not a plain BEN/XBEN stream. Open it with \ binary_ensemble.bundle.BendlDecoder instead.", file_path.display() ))); @@ -123,15 +127,15 @@ impl PyBenDecoder { /// being unpacked, so this stays fast on large ensembles. /// /// Args: - /// indices: The 1-indexed sample numbers to keep. An unsorted or duplicated list - /// is sorted and deduplicated, with a ``UserWarning``. + /// indices (Sequence[int]): The 1-indexed sample numbers to keep. Duplicates are dropped; + /// an unsorted list is sorted, with a ``UserWarning``. /// /// Returns: /// BenDecoder: ``self``, so the call can be chained directly into a ``for`` loop. /// /// Raises: - /// Exception: If any index is ``0`` (indices are 1-based) or greater than the - /// number of samples in the stream. + /// Exception: If ``indices`` is empty, contains ``0`` (indices are 1-based), or + /// contains an index greater than the number of samples in the stream. /// /// Example: /// >>> for plan in BenDecoder("plans.ben").subsample_indices([1, 500, 9999]): @@ -149,8 +153,8 @@ impl PyBenDecoder { /// Restrict iteration to a contiguous, 1-indexed inclusive range of samples. /// /// Args: - /// start: First sample number to keep (1-indexed, inclusive). - /// end: Last sample number to keep (1-indexed, inclusive). + /// start (int): First sample number to keep (1-indexed, inclusive). + /// end (int): Last sample number to keep (1-indexed, inclusive). /// /// Returns: /// BenDecoder: ``self``, for chaining into a ``for`` loop. @@ -176,8 +180,8 @@ impl PyBenDecoder { /// Restrict iteration to every ``step``-th sample. /// /// Args: - /// step: Stride between kept samples (e.g. ``10`` keeps every tenth sample). - /// offset: 1-indexed position of the first kept sample. Defaults to ``1``. + /// step (int): Stride between kept samples (e.g. ``10`` keeps every tenth sample). + /// offset (int, optional): 1-indexed position of the first kept sample. Default is ``1``. /// /// Returns: /// BenDecoder: ``self``, for chaining into a ``for`` loop. diff --git a/ben-py/src/decode/mod.rs b/ben-py/src/decode/mod.rs index 33e4c6e..f20204d 100644 --- a/ben-py/src/decode/mod.rs +++ b/ben-py/src/decode/mod.rs @@ -1,4 +1,4 @@ -//! Python bindings for BEN/XBEN decoding and `.bendl` bundle inspection. +//! Python bindings for BEN/XBEN decoding and `.bendl` file inspection. mod bundle_decoder; mod cursor; diff --git a/ben-py/src/decode/py_funcs.rs b/ben-py/src/decode/py_funcs.rs index 1284ee3..0fb715d 100644 --- a/ben-py/src/decode/py_funcs.rs +++ b/ben-py/src/decode/py_funcs.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; /// subsample. The encoding variant is preserved and detected automatically on the next read. /// /// Args: -/// in_file: Path to the input ``.xben`` file. -/// out_file: Path to write the ``.ben`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// in_file (StrPath): Path to the input ``.xben`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.ben`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is ``False``. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. @@ -44,9 +44,9 @@ pub fn decode_xben_to_ben(in_file: PathBuf, out_file: PathBuf, overwrite: bool) /// starting at 1. /// /// Args: -/// in_file: Path to the input ``.xben`` file. -/// out_file: Path to write the ``.jsonl`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// in_file (StrPath): Path to the input ``.xben`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.jsonl`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is ``False``. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. @@ -75,9 +75,9 @@ pub fn decode_xben_to_jsonl(in_file: PathBuf, out_file: PathBuf, overwrite: bool /// starting at 1. This is the inverse of :func:`encode_jsonl_to_ben`. /// /// Args: -/// in_file: Path to the input ``.ben`` file. -/// out_file: Path to write the ``.jsonl`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. +/// in_file (StrPath): Path to the input ``.ben`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.jsonl`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is ``False``. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. diff --git a/ben-py/src/decode/types.rs b/ben-py/src/decode/types.rs index a7ec859..b691dfe 100644 --- a/ben-py/src/decode/types.rs +++ b/ben-py/src/decode/types.rs @@ -48,7 +48,7 @@ impl DecoderMode { /// Where the iterable assignment stream lives. /// -/// A plain `.ben`/`.xben` file is read from the start; a `.bendl` bundle is read through a second +/// A plain `.ben`/`.xben` file is read from the start; a `.bendl` file is read through a second /// file handle bounded to the embedded stream region. Carrying the region offsets (rather than a /// live [`binary_ensemble::io::bundle::BendlReader`]) keeps the iteration core free of the bundle /// inspection surface, so [`super::cursor::SampleCursor`] is shared verbatim between the stream and diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index dc91ac2..425d1c3 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -1,4 +1,4 @@ -//! `.bendl` bundle authoring bindings: [`PyBendlEncoder`] and its [`PyBendlStreamSession`]. +//! `.bendl` file authoring bindings: [`PyBendlEncoder`] and its [`PyBendlStreamSession`]. //! //! The encoder threads the bundle through the library's typestate machinery — `BendlWriter` //! (assets) → `BendlStreamSession` (stream) → `BendlWriter::finish` (finalize) — for the create @@ -7,7 +7,8 @@ //! routes through the writer pre-stream and the appender afterwards. use crate::common::{ - graph_node_count, networkx_graph_from_bytes, open_output, parse_graph_input, parse_variant, + graph_node_count, networkx_graph_from_bytes, open_output, parse_graph_input, + parse_metadata_input, parse_variant, }; use crate::graph::helpers::{reorder_graph_to_bytes, resolve_reorder}; use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; @@ -16,7 +17,7 @@ use binary_ensemble::io::bundle::{ AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, }; use binary_ensemble::io::writer::BenStreamWriter; -use pyo3::exceptions::{PyException, PyIOError, PyValueError}; +use pyo3::exceptions::{PyException, PyIOError, PyKeyError, PyValueError}; use pyo3::prelude::*; use std::fs::{File, OpenOptions}; use std::io::{self, BufWriter}; @@ -25,6 +26,10 @@ use std::path::PathBuf; fn map_bundle_err(err: BendlWriteError) -> PyErr { match err { BendlWriteError::Io(e) => PyIOError::new_err(format!("{e}")), + // Matches the decoder's lookup errors (read_asset_bytes, asset_size). + BendlWriteError::UnknownAssetName(name) => { + PyKeyError::new_err(format!("no asset named {name:?} in bundle")) + } other => PyException::new_err(format!("{other}")), } } @@ -68,7 +73,7 @@ enum BundleState { Closed, } -/// Writer for a single `.bendl` bundle. +/// Writer for a single `.bendl` file. #[pyclass(module = "binary_ensemble", name = "BendlEncoder", unsendable)] pub struct PyBendlEncoder { path: PathBuf, @@ -80,13 +85,15 @@ pub struct PyBendlEncoder { impl PyBendlEncoder { /// Open a new bundle writer in create mode. /// - /// A create-mode encoder writes one `.bendl` bundle. Add graph and metadata assets, then + /// A create-mode encoder writes one `.bendl` file. Add graph and metadata assets, then /// open exactly one assignment stream with :meth:`stream`. The stream context finalizes the /// bundle on a clean close. /// /// Args: - /// file_path: Output path. Must not exist unless ``overwrite=True``. - /// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. + /// file_path (StrPath): Output path (``str`` or ``os.PathLike``). Must not exist + /// unless ``overwrite=True``. + /// overwrite (bool, optional): Replace an existing file at ``file_path``. Default is + /// ``False``. /// /// Raises: /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be @@ -95,7 +102,7 @@ impl PyBendlEncoder { /// Example: /// >>> from binary_ensemble import BendlEncoder /// >>> encoder = BendlEncoder("ensemble.bendl", overwrite=True) - /// >>> with encoder.stream("ben") as stream: + /// >>> with encoder.stream() as stream: /// ... stream.write([1, 1, 2, 2]) #[new] #[pyo3(signature = (file_path, overwrite = false))] @@ -119,7 +126,8 @@ impl PyBendlEncoder { /// one assignment stream. Each ``add_*`` operation commits immediately. /// /// Args: - /// file_path: Existing finalized ``.bendl`` bundle. + /// file_path (StrPath): Existing finalized ``.bendl`` bundle (``str`` or + /// ``os.PathLike``). /// /// Returns: /// BendlEncoder: An encoder in append mode. @@ -165,9 +173,10 @@ impl PyBendlEncoder { /// gain little from this but are not harmed by it. /// /// Args: - /// name: Asset name stored in the bundle directory. - /// payload: The bytes to store. - /// content_type: ``"json"``, ``"text"``, or ``"binary"``. JSON assets are marked so + /// name (str): Asset name stored in the bundle directory. + /// payload (bytes): The bytes to store. (The :class:`binary_ensemble.bundle.BendlEncoder` + /// facade accepts richer payload shapes and coerces them to bytes.) + /// content_type (str): ``"json"``, ``"text"``, or ``"binary"``. JSON assets are marked so /// :meth:`binary_ensemble.bundle.BendlDecoder.read_json_asset` can parse them; /// ``"text"`` and ``"binary"`` store the bytes unmarked. /// @@ -193,13 +202,46 @@ impl PyBendlEncoder { Err(state_error(&self.state, "add_asset")) } + /// Remove a named asset from a finalized bundle's directory. + /// + /// Available wherever ``add_asset`` commits immediately: append mode, or create mode after + /// the stream has closed. Only the directory entry is dropped — the payload bytes remain in + /// the file as unreferenced dead space until the next whole-bundle rewrite (e.g. + /// :func:`binary_ensemble.bundle.compress_stream` or + /// :func:`binary_ensemble.bundle.relabel_bundle`) compacts them. The name (and any + /// singleton-type claim, e.g. ``metadata.json``) becomes free again, so remove-then-add is + /// the way to replace an asset's payload. + /// + /// Args: + /// name (str): The asset's name, as listed by + /// :meth:`binary_ensemble.bundle.BendlDecoder.asset_names`. + /// + /// Raises: + /// KeyError: If no asset with that name exists in the bundle. + /// Exception: If the encoder is in create mode before the stream (just don't add the + /// asset), is currently streaming, or is closed. + /// + /// Example: + /// >>> appender = BendlEncoder.append("ensemble.bendl") + /// >>> appender.remove_asset("notes.txt") + #[pyo3(signature = (name))] + #[pyo3(text_signature = "(self, name)")] + fn remove_asset(&mut self, name: &str) -> PyResult<()> { + if matches!(self.state, BundleState::Appendable) { + return self.append_commit(|a| a.remove_asset(name)); + } + Err(state_error(&self.state, "remove_asset")) + } + /// Add the canonical ``metadata.json`` known asset. /// /// ``metadata`` accepts a Python ``dict``/``list``, UTF-8 JSON bytes, a file-like object with /// ``.read()``, or a path to JSON. The decoder returns it with :meth:`read_metadata`. /// /// Args: - /// metadata: JSON-compatible metadata payload. + /// metadata (MetadataInput): The JSON payload: a ``dict``/``list``, UTF-8 JSON + /// ``bytes``, a file-like object with ``.read()``, or a ``str``/``os.PathLike`` + /// path to a JSON file (a plain ``str`` is a *path* here). /// /// Raises: /// Exception: If the metadata cannot be converted to JSON bytes, or if the encoder is in @@ -210,7 +252,7 @@ impl PyBendlEncoder { #[pyo3(signature = (metadata))] #[pyo3(text_signature = "(self, metadata)")] fn add_metadata(&mut self, py: Python<'_>, metadata: Bound<'_, PyAny>) -> PyResult<()> { - let bytes = parse_graph_input(py, &metadata)?; + let bytes = parse_metadata_input(py, &metadata)?; let opts = AddAssetOptions::defaults().json(); if let BundleState::PreStream { writer, .. } = &mut self.state { return writer @@ -236,9 +278,14 @@ impl PyBendlEncoder { /// returned graph's node count is recorded for per-write validation. /// /// Args: - /// graph: NetworkX adjacency JSON as a dict/list, bytes, file-like object, or path. - /// sort: ``"mlc"``, ``"rcm"``, ``"key"``, or ``None``. - /// key: Node attribute used when ``sort="key"``. Use ``"id"`` for node id ordering. + /// graph (GraphInput): The dual graph: a live ``networkx.Graph`` (subclasses such as + /// ``gerrychain.Graph`` count), or NetworkX adjacency JSON as a ``dict``/``list``, + /// raw JSON ``bytes``, a file-like object with ``.read()``, or a + /// ``str``/``os.PathLike`` path to a JSON file (a plain ``str`` is a *path* here). + /// sort (SortMethod | None, optional): ``"mlc"``, ``"rcm"``, ``"key"``, or ``None`` + /// to store the graph as-is. Default is ``"mlc"``. + /// key (str | None, optional): Node attribute used when ``sort="key"``. Use ``"id"`` + /// for node id ordering. Default is ``None``. /// /// Returns: /// networkx.Graph: The stored graph, after any reordering. @@ -312,39 +359,30 @@ impl PyBendlEncoder { /// Open the single-use assignment stream. /// - /// Only ``"ben"`` is accepted today; XBEN bundles are produced by - /// :func:`binary_ensemble.bundle.compress_stream` after writing. ``variant`` selects the BEN - /// variant and defaults to ``"twodelta"``. + /// The embedded stream is always written in the BEN wire format; XBEN bundles are produced + /// by :func:`binary_ensemble.bundle.compress_stream` after writing (XBEN is a whole-stream + /// LZMA2 wrap, so it cannot be written live sample-by-sample without forfeiting its + /// compression). /// /// Args: - /// format: Stream format, currently only ``"ben"``. - /// variant: BEN variant: ``"standard"``, ``"mkv_chain"``, ``"twodelta"``, or ``None``. + /// variant (Variant, optional): BEN encoding variant — ``"standard"``, + /// ``"mkv_chain"``, or ``"twodelta"``. Default is ``"twodelta"``. /// /// Returns: /// BendlStreamSession: Context manager whose :meth:`write` method accepts assignments. /// /// Raises: - /// ValueError: If ``format`` or ``variant`` is invalid. + /// ValueError: If ``variant`` is invalid. /// Exception: If a stream has already been written, append mode is active, or the encoder /// is closed/failed. /// /// Example: - /// >>> with encoder.stream("ben", variant="twodelta") as stream: + /// >>> with encoder.stream(variant="standard") as stream: /// ... stream.write([1, 1, 2, 2]) - #[pyo3(signature = (format = "ben", variant = None))] - #[pyo3(text_signature = "(self, format='ben', variant=None)")] - fn stream( - slf: Bound<'_, Self>, - format: &str, - variant: Option, - ) -> PyResult { - if format != "ben" { - return Err(PyValueError::new_err(format!( - "stream format must be 'ben' (got {format:?}); produce XBEN via \ - binary_ensemble.bundle.compress_stream" - ))); - } - let ben_var = parse_variant(variant.as_deref())?; + #[pyo3(signature = (*, variant = "twodelta"))] + #[pyo3(text_signature = "(self, *, variant='twodelta')")] + fn stream(slf: Bound<'_, Self>, variant: &str) -> PyResult { + let ben_var = parse_variant(Some(variant))?; let encoder_handle: Py = slf.clone().unbind(); let mut me = slf.borrow_mut(); @@ -479,7 +517,8 @@ fn state_error(state: &BundleState, op: &str) -> PyErr { BundleState::Streaming => "the assignment stream is open; close it before adding assets", BundleState::Failed => "the previous stream failed; this bundle is unfinalized", BundleState::Closed => "the encoder is closed", - BundleState::PreStream { .. } | BundleState::Appendable => "invalid state", + BundleState::PreStream { .. } => "the bundle is not finalized yet", + BundleState::Appendable => "invalid state", }; PyException::new_err(format!("cannot {op}: {reason}")) } @@ -504,8 +543,8 @@ impl PyBendlStreamSession { /// Encode a single assignment into the bundle's stream. /// /// Args: - /// assignment: The plan as a ``list[int]`` of district ids, one per node in - /// dual-graph node order. + /// assignment (Sequence[int]): The plan as a sequence of district ids (e.g. a + /// ``list[int]``), one per node in dual-graph node order. /// /// Returns: /// None. diff --git a/ben-py/src/encode/encoder.rs b/ben-py/src/encode/encoder.rs index c4de457..9489585 100644 --- a/ben-py/src/encode/encoder.rs +++ b/ben-py/src/encode/encoder.rs @@ -20,10 +20,12 @@ use std::path::PathBuf; /// :class:`~binary_ensemble.bundle.BendlEncoder` instead. /// /// Args: -/// file_path: Output path. Must not exist unless ``overwrite=True``. -/// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. -/// variant: BEN encoding variant for the stream — ``"standard"``, ``"mkv_chain"``, -/// or ``"twodelta"``. ``None`` (the default) means ``"twodelta"``. +/// file_path (StrPath): Output path (``str`` or ``os.PathLike``). Must not exist unless +/// ``overwrite=True``. +/// overwrite (bool, optional): Replace an existing file at ``file_path``. Default is +/// ``False``. +/// variant (Variant, optional): BEN encoding variant for the stream — ``"standard"``, +/// ``"mkv_chain"``, or ``"twodelta"``. Default is ``"twodelta"``. /// /// Raises: /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be @@ -45,20 +47,22 @@ impl PyBenEncoder { /// Open a new encoder that writes a plain ``.ben`` stream. /// /// Args: - /// file_path: Output path. Must not exist unless ``overwrite=True``. - /// overwrite: Replace an existing file at ``file_path``. Defaults to ``False``. - /// variant: BEN encoding variant for the stream — ``"standard"``, ``"mkv_chain"``, - /// or ``"twodelta"``. Defaults to ``"twodelta"`` when ``None``. + /// file_path (StrPath): Output path (``str`` or ``os.PathLike``). Must not exist + /// unless ``overwrite=True``. + /// overwrite (bool, optional): Replace an existing file at ``file_path``. Default is + /// ``False``. + /// variant (Variant, optional): BEN encoding variant for the stream — ``"standard"``, + /// ``"mkv_chain"``, or ``"twodelta"``. Default is ``"twodelta"``. /// /// Raises: /// OSError: If ``file_path`` exists and ``overwrite`` is ``False``, or it cannot be /// created. /// ValueError: If ``variant`` is not a recognized variant name. #[new] - #[pyo3(signature = (file_path, overwrite = false, variant = None))] - #[pyo3(text_signature = "(file_path, overwrite=False, variant=None)")] - fn new(file_path: PathBuf, overwrite: bool, variant: Option) -> PyResult { - let ben_var = parse_variant(variant.as_deref())?; + #[pyo3(signature = (file_path, overwrite = false, variant = "twodelta"))] + #[pyo3(text_signature = "(file_path, overwrite=False, variant='twodelta')")] + fn new(file_path: PathBuf, overwrite: bool, variant: &str) -> PyResult { + let ben_var = parse_variant(Some(variant))?; let buf = open_output(&file_path, overwrite)?; let writer = BenStreamWriter::for_ben(buf, ben_var).map_err(Self::map_io_err)?; Ok(Self { @@ -69,8 +73,8 @@ impl PyBenEncoder { /// Encode a single assignment and append it to the output stream. /// /// Args: - /// assignment: The plan as a ``list[int]`` of district ids, one per node in - /// dual-graph node order. + /// assignment (Sequence[int]): The plan as a sequence of district ids (e.g. ``list[int]``), + /// one per node in dual-graph node order. /// /// Raises: /// OSError: If the encoder has already been closed, or the write fails. diff --git a/ben-py/src/encode/mod.rs b/ben-py/src/encode/mod.rs index 74bef58..7468fa7 100644 --- a/ben-py/src/encode/mod.rs +++ b/ben-py/src/encode/mod.rs @@ -1,4 +1,4 @@ -//! Python bindings for BEN/XBEN encoding and `.bendl` bundle authoring. +//! Python bindings for BEN/XBEN encoding and `.bendl` file authoring. mod bundle_encoder; mod encoder; diff --git a/ben-py/src/encode/py_funcs.rs b/ben-py/src/encode/py_funcs.rs index 91200dc..2f60dd6 100644 --- a/ben-py/src/encode/py_funcs.rs +++ b/ben-py/src/encode/py_funcs.rs @@ -15,13 +15,16 @@ use std::path::PathBuf; /// :func:`~binary_ensemble.bundle.relabel_bundle`) for the best ratios. /// /// Args: -/// in_file: Path to the input ``.ben`` file. -/// out_file: Path to write the ``.xben`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. -/// n_threads: Number of worker threads. ``None`` uses all available cores. -/// compression_level: LZMA2 level from 0 (fastest) to 9 (smallest). ``None`` uses the -/// default (9). -/// xz_block_size: Override the xz block size in bytes. ``None`` uses the default. +/// in_file (StrPath): Path to the input ``.ben`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.xben`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// n_threads (int | None, optional): Number of worker threads. Default is ``None`` +/// which uses all available cores. +/// compression_level (int | None, optional): LZMA2 level from 0 (fastest) to 9 +/// (smallest). Default is ``None`` which uses level 9. +/// xz_block_size (int | None, optional): Override the xz block size in bytes. Default +/// is ``None`` which uses the xz default. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. @@ -67,11 +70,12 @@ pub fn encode_ben_to_xben( /// format; encode further to XBEN with :func:`encode_ben_to_xben` for storage. /// /// Args: -/// in_file: Path to the input ``.jsonl`` file. -/// out_file: Path to write the ``.ben`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. -/// variant: BEN encoding variant — ``"standard"``, ``"mkv_chain"``, or ``"twodelta"``. -/// Defaults to ``"twodelta"``. +/// in_file (StrPath): Path to the input ``.jsonl`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.ben`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// variant (Variant, optional): BEN encoding variant — ``"standard"``, ``"mkv_chain"``, +/// or ``"twodelta"``. Default is ``"twodelta"``. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. @@ -107,15 +111,18 @@ pub fn encode_jsonl_to_ben( /// line. Compression can be slow for large block-level ensembles. /// /// Args: -/// in_file: Path to the input ``.jsonl`` file. -/// out_file: Path to write the ``.xben`` output. -/// overwrite: Replace ``out_file`` if it already exists. Defaults to ``False``. -/// variant: BEN encoding variant — ``"standard"``, ``"mkv_chain"``, or ``"twodelta"``. -/// Defaults to ``"twodelta"``. -/// n_threads: Number of worker threads. ``None`` uses all available cores. -/// compression_level: LZMA2 level from 0 (fastest) to 9 (smallest). ``None`` uses the -/// default (9). -/// xz_block_size: Override the xz block size in bytes. ``None`` uses the default. +/// in_file (StrPath): Path to the input ``.jsonl`` file (``str`` or ``os.PathLike``). +/// out_file (StrPath): Path to write the ``.xben`` output (``str`` or ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// variant (Variant, optional): BEN encoding variant — ``"standard"``, ``"mkv_chain"``, +/// or ``"twodelta"``. Default is ``"twodelta"``. +/// n_threads (int | None, optional): Number of worker threads. Default is ``None`` +/// which uses all available cores. +/// compression_level (int | None, optional): LZMA2 level from 0 (fastest) to 9 +/// (smallest). Default is ``None`` which uses level 9. +/// xz_block_size (int | None, optional): Override the xz block size in bytes. Default +/// is ``None`` which uses the xz default. /// /// Raises: /// OSError: If ``out_file`` exists and ``overwrite`` is ``False``, or the conversion fails. diff --git a/ben-py/src/graph/py_funcs.rs b/ben-py/src/graph/py_funcs.rs index 636c911..a756cb2 100644 --- a/ben-py/src/graph/py_funcs.rs +++ b/ben-py/src/graph/py_funcs.rs @@ -13,13 +13,27 @@ fn json_loads(py: Python<'_>, bytes: &[u8]) -> PyResult> { /// Reorder a NetworkX adjacency-format graph and return `(reordered_graph, node_permutation_map)`. /// -/// `reordered_graph` is a live NetworkX graph (matching `BendlEncoder.add_graph` / -/// `BendlDecoder.read_graph`); `node_permutation_map` is the parsed map JSON. +/// Args: +/// graph (GraphInput): The graph: a live ``networkx.Graph`` (subclasses such as +/// ``gerrychain.Graph`` count), or NetworkX adjacency JSON as a ``dict``/``list``, raw +/// JSON ``bytes``, a file-like object with ``.read()``, or a ``str``/``os.PathLike`` +/// path to a JSON file (a plain ``str`` is a *path* here). +/// sort (SortMethod, optional): The ordering — ``"mlc"`` (multi-level clustering), +/// ``"rcm"`` (reverse Cuthill-McKee), or ``"key"`` (sort by the node attribute named +/// in ``key``). Default is ``"mlc"``. +/// key (str | None, optional): Node attribute to sort by (e.g. ``key="GEOID"``, or the +/// special ``key="id"`` for the NetworkX node id); required with — and only valid +/// with — ``sort="key"``. Default is ``None``. /// -/// `sort` selects the ordering: `"mlc"` (multi-level clustering), `"rcm"` (reverse Cuthill-McKee), -/// or `"key"` to sort by a node attribute named via `key` (e.g. `key="GEOID"`, or the special -/// `key="id"` for the NetworkX node id). The permutation map matches the on-disk -/// `node_permutation_map.json` convention (a `node_permutation_old_to_new` object). +/// Returns: +/// tuple[networkx.Graph, NodePermutationMap]: The reordered graph (a live NetworkX graph, +/// matching ``BendlEncoder.add_graph`` / ``BendlDecoder.read_graph``) and the parsed +/// permutation map, whose required ``node_permutation_old_to_new`` field maps original +/// zero-based node positions to their new positions (the on-disk +/// ``node_permutation_map.json`` convention). +/// +/// Raises: +/// ValueError: If ``sort`` / ``key`` is invalid. #[pyfunction] #[pyo3(signature = (graph, sort = Some("mlc".to_string()), key = None))] #[pyo3(text_signature = "(graph, sort='mlc', key=None)")] diff --git a/ben-py/src/lib.rs b/ben-py/src/lib.rs index 89c31ee..4c84a78 100755 --- a/ben-py/src/lib.rs +++ b/ben-py/src/lib.rs @@ -2,6 +2,7 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; pub mod common; +pub mod compact; pub mod decode; pub mod encode; pub mod graph; @@ -22,6 +23,11 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(crate::encode::encode_jsonl_to_xben, m)?)?; m.add_function(wrap_pyfunction!(crate::encode::encode_ben_to_xben, m)?)?; m.add_function(wrap_pyfunction!(crate::graph::graph_reorder, m)?)?; + m.add_function(wrap_pyfunction!(crate::compact::compact_bundle, m)?)?; + m.add_function(wrap_pyfunction!( + crate::compact::compact_bundle_in_place, + m + )?)?; m.add_function(wrap_pyfunction!(crate::recompress::recompress_bundle, m)?)?; m.add_function(wrap_pyfunction!(crate::relabel::relabel_bundle, m)?)?; diff --git a/ben-py/src/recompress.rs b/ben-py/src/recompress.rs index a948eae..6439f4a 100644 --- a/ben-py/src/recompress.rs +++ b/ben-py/src/recompress.rs @@ -1,4 +1,4 @@ -//! Binding for recompressing a `.bendl` bundle's embedded BEN stream to XBEN. +//! Binding for recompressing a `.bendl` file's embedded BEN stream to XBEN. //! //! This repackages a bundle: it reads back every asset's decoded payload and the BEN assignment //! stream, re-encodes the stream as XBEN, and writes a fresh `Xben`-format bundle with the same @@ -59,6 +59,20 @@ fn add_preserved( /// Recompress the BEN stream of the bundle at `in_file` to XBEN, writing a new bundle at /// `out_file`. +/// +/// This is the raw core binding; prefer the :func:`binary_ensemble.bundle.compress_stream` +/// facade, which adds the ``in_place`` atomic-swap mode. +/// +/// Args: +/// in_file (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). +/// out_file (StrPath): Destination path for the recompressed bundle (``str`` or +/// ``os.PathLike``). +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// +/// Raises: +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``. +/// Exception: If the bundle is unfinalized or already holds an XBEN stream. #[pyfunction] #[pyo3(signature = (in_file, out_file, overwrite = false))] #[pyo3(text_signature = "(in_file, out_file, overwrite=False)")] diff --git a/ben-py/src/relabel.rs b/ben-py/src/relabel.rs index 29bef5a..fcb1b86 100644 --- a/ben-py/src/relabel.rs +++ b/ben-py/src/relabel.rs @@ -1,4 +1,4 @@ -//! Binding for relabeling a `.bendl` bundle: reorder its dual graph and rewrite the embedded BEN +//! Binding for relabeling a `.bendl` file: reorder its dual graph and rewrite the embedded BEN //! assignment stream into the new node order, producing a fresh bundle. //! //! This is the bundle-level form of the CLI's `reben` ordering flow. The reordered `graph.json` and @@ -81,6 +81,26 @@ fn new_to_old_from_map_bytes(map_bytes: &[u8]) -> PyResult /// Relabel the bundle at `in_file` by reordering its graph (via `sort` / `key`), writing a fresh /// BEN bundle at `out_file`. +/// +/// This is the raw core binding; prefer the :func:`binary_ensemble.bundle.relabel_bundle` +/// facade, which adds the ``in_place`` atomic-swap mode. +/// +/// Args: +/// in_file (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). +/// Must hold a BEN (not XBEN) stream and a ``graph.json``. +/// out_file (StrPath): Destination path for the relabeled bundle (``str`` or +/// ``os.PathLike``). +/// sort (SortMethod, optional): The ordering — ``"mlc"``, ``"rcm"``, or ``"key"``. +/// Default is ``"mlc"``. +/// key (str | None, optional): Node attribute to sort by; required with — and only valid +/// with — ``sort="key"``. Default is ``None``. +/// overwrite (bool, optional): Replace ``out_file`` if it already exists. Default is +/// ``False``. +/// +/// Raises: +/// ValueError: If ``sort`` / ``key`` is invalid. +/// OSError: If ``out_file`` exists and ``overwrite`` is ``False``. +/// Exception: If the bundle is unfinalized, has no graph, or holds a non-BEN stream. #[pyfunction] #[pyo3(signature = (in_file, out_file, sort = Some("mlc".to_string()), key = None, overwrite = false))] #[pyo3(text_signature = "(in_file, out_file, sort='mlc', key=None, overwrite=False)")] diff --git a/ben-py/tests/test_bundle.py b/ben-py/tests/test_bundle.py index 900c603..72c87e5 100644 --- a/ben-py/tests/test_bundle.py +++ b/ben-py/tests/test_bundle.py @@ -247,9 +247,7 @@ def _write_jsonl(samples: List[List[int]], path: Path) -> None: f.write("\n") -def _ben_bytes_for( - samples: List[List[int]], tmp: Path, variant: str = "standard" -) -> bytes: +def _ben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: ben_path = tmp / "inner.ben" with BenEncoder(ben_path, overwrite=True, variant=variant) as enc: for a in samples: @@ -257,9 +255,7 @@ def _ben_bytes_for( return ben_path.read_bytes() -def _xben_bytes_for( - samples: List[List[int]], tmp: Path, variant: str = "standard" -) -> bytes: +def _xben_bytes_for(samples: List[List[int]], tmp: Path, variant: str = "standard") -> bytes: src = tmp / "src.jsonl" _write_jsonl(samples, src) out = tmp / "inner.xben" @@ -281,9 +277,7 @@ def _write_bundle(path: Path, bundle_bytes: bytes) -> Path: def test_bundle_round_trip_ben_with_assets(tmp_path: Path) -> None: rng = random.Random(4242) - samples = [ - [rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40) - ] + samples = [[rng.randint(1, 10) for _ in range(rng.randint(1, 50))] for _ in range(40)] # NetworkX adjacency format (what read_graph rebuilds into a live graph). graph_json = ( b'{"directed":false,"multigraph":false,"graph":{},' @@ -379,9 +373,7 @@ def test_canonical_helpers_return_none_when_absent(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), sample_count=1, - assets=[ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name="only_custom.bin", payload=b"x") - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="only_custom.bin", payload=b"x")], ) path = _write_bundle(tmp_path / "sparse.bendl", bundle) dec = BendlDecoder(path) @@ -436,11 +428,7 @@ def test_read_json_asset_rejects_non_utf8(tmp_path: Path) -> None: bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1, - assets=[ - _Asset( - asset_type=ASSET_TYPE_CUSTOM, name="binary.bin", payload=b"\xff\xfe\xfd" - ) - ], + assets=[_Asset(asset_type=ASSET_TYPE_CUSTOM, name="binary.bin", payload=b"\xff\xfe\xfd")], ) path = _write_bundle(tmp_path / "bin.bendl", bundle) dec = BendlDecoder(path) @@ -484,10 +472,7 @@ def test_unicode_asset_name_round_trips(tmp_path: Path) -> None: def test_many_assets_preserve_directory_order(tmp_path: Path) -> None: payloads = {f"asset_{i:04d}.bin": bytes([i & 0xFF] * (i + 1)) for i in range(200)} - assets = [ - _Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) - for n, p in payloads.items() - ] + assets = [_Asset(asset_type=ASSET_TYPE_CUSTOM, name=n, payload=p) for n, p in payloads.items()] bundle = build_bundle( stream_bytes=_ben_bytes_for([[1, 2, 3]], tmp_path), sample_count=1, @@ -566,9 +551,7 @@ def test_zero_length_custom_payload(tmp_path: Path) -> None: def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1 - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) path = _write_bundle(tmp_path / "a.bendl", bundle) dec = BendlDecoder(path) target = tmp_path / "already.ben" @@ -579,9 +562,7 @@ def test_extract_stream_refuses_existing_file_without_overwrite(tmp_path: Path) def test_extract_stream_into_missing_parent_dir_raises(tmp_path: Path) -> None: - bundle = build_bundle( - stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1 - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for([[1, 2]], tmp_path), sample_count=1) path = _write_bundle(tmp_path / "mini.bendl", bundle) dec = BendlDecoder(path) with pytest.raises(OSError): @@ -602,7 +583,7 @@ def test_open_rejects_plain_stream(tmp_path: Path) -> None: plain = tmp_path / "plain.ben" with BenEncoder(plain, overwrite=True, variant="standard") as enc: enc.write([1, 2, 3]) - with pytest.raises(Exception, match="not a .bendl bundle"): + with pytest.raises(Exception, match="not a .bendl file"): BendlDecoder(plain) @@ -790,9 +771,7 @@ def test_finalized_bundle_with_inflated_stream_len_survives_open( ) -> None: samples = [[1, 2, 3], [4, 5, 6]] bundle = bytearray( - build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) ) old_stream_len = struct.unpack_from(" None: def test_iteration_can_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "twice.bendl", bundle) dec = BendlDecoder(path) assert list(dec) == samples @@ -900,9 +877,7 @@ def test_iteration_can_restart(tmp_path: Path) -> None: def test_partial_iteration_then_restart(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6], [7, 8]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "partial.bendl", bundle) dec = BendlDecoder(path) it = iter(dec) @@ -913,9 +888,7 @@ def test_partial_iteration_then_restart(tmp_path: Path) -> None: def test_subsample_modes(tmp_path: Path) -> None: samples = [[i] for i in range(1, 11)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "sub.bendl", bundle) dec = BendlDecoder(path).subsample_range(3, 6) @@ -931,9 +904,7 @@ def test_subsample_modes(tmp_path: Path) -> None: def test_subsample_count_preserves_filtered_len(tmp_path: Path) -> None: samples = [[i] for i in range(1, 9)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "cnt.bendl", bundle) dec = BendlDecoder(path).subsample_range(2, 5) assert len(dec) == 4 @@ -944,9 +915,7 @@ def test_subsample_count_preserves_filtered_len(tmp_path: Path) -> None: def test_subsample_out_of_bounds(tmp_path: Path) -> None: samples = [[1, 2], [3, 4], [5, 6]] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "oob.bendl", bundle) with pytest.raises(Exception, match="end must be <= number of samples"): BendlDecoder(path).subsample_range(1, 99) @@ -959,9 +928,7 @@ def test_subsample_out_of_bounds(tmp_path: Path) -> None: def test_len_uses_header_fast_path(tmp_path: Path) -> None: samples = [[i] for i in range(1, 6)] - bundle = build_bundle( - stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples) - ) + bundle = build_bundle(stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=len(samples)) path = _write_bundle(tmp_path / "fast.bendl", bundle) dec = BendlDecoder(path) assert len(dec) == len(samples) @@ -987,10 +954,7 @@ def test_seeded_fuzz_random_bundles_round_trip(tmp_path: Path) -> None: ) truth.append((f"t{trial}-a{i}.bin", payload)) n_samples = rng.randint(1, 25) - samples = [ - [rng.randint(1, 8) for _ in range(rng.randint(1, 40))] - for _ in range(n_samples) - ] + samples = [[rng.randint(1, 8) for _ in range(rng.randint(1, 40))] for _ in range(n_samples)] bundle = build_bundle( stream_bytes=_ben_bytes_for(samples, tmp_path), sample_count=n_samples, @@ -1016,7 +980,7 @@ def _checksummed_bundle(path: Path) -> None: """A small finalized bundle written by the real encoder (checksums populated).""" with BendlEncoder(path, overwrite=True) as enc: enc.add_asset("notes.txt", "integrity matters", content_type="text") - with enc.stream("ben", variant="standard") as s: + with enc.stream(variant="standard") as s: for a in ([1, 1, 2, 2], [2, 2, 1, 1]): s.write(a) @@ -1064,7 +1028,7 @@ def test_verify_rejects_unfinalized_bundle(tmp_path: Path) -> None: path = tmp_path / "unfinalized.bendl" with pytest.raises(RuntimeError, match="boom"): with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2, 3]) raise RuntimeError("boom") diff --git a/ben-py/tests/test_bundle_api.py b/ben-py/tests/test_bundle_api.py index 98899bf..1372674 100644 --- a/ben-py/tests/test_bundle_api.py +++ b/ben-py/tests/test_bundle_api.py @@ -38,7 +38,7 @@ def test_create_round_trip_all_asset_kinds(tmp_path: Path) -> None: with BendlEncoder(path, overwrite=True) as enc: returned = enc.add_graph(_graph(), sort=None) enc.add_metadata({"seed": 1234}) - with enc.stream("ben") as stream: + with enc.stream() as stream: for a in samples: stream.write(a) enc.add_asset("notes.txt", "hello world", content_type="text") @@ -66,7 +66,7 @@ def test_create_round_trip_all_asset_kinds(tmp_path: Path) -> None: def test_post_stream_add_commits_immediately(tmp_path: Path) -> None: path = tmp_path / "commit.bendl" enc = BendlEncoder(path, overwrite=True) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2]) enc.add_asset("a.txt", "one", content_type="text") # A successful post-stream add is durable on disk before close(). @@ -79,7 +79,7 @@ def test_post_stream_add_commits_immediately(tmp_path: Path) -> None: def test_context_manager_and_idempotent_close(tmp_path: Path) -> None: path = tmp_path / "ctx.bendl" enc = BendlEncoder(path, overwrite=True) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2, 3]) enc.close() enc.close() # idempotent @@ -133,7 +133,7 @@ def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: path = tmp_path / "fail.bendl" with pytest.raises(RuntimeError, match="boom"): with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2, 3]) raise RuntimeError("boom") dec = BendlDecoder(path) @@ -142,9 +142,7 @@ def test_exception_in_stream_leaves_bundle_unfinalized(tmp_path: Path) -> None: with pytest.raises(Exception, match="unfinalized"): dec.extract_stream(tmp_path / "recovered.ben") # ...but the partial write is recoverable. - dec.extract_stream( - tmp_path / "recovered.ben", overwrite=True, allow_unfinalized=True - ) + dec.extract_stream(tmp_path / "recovered.ben", overwrite=True, allow_unfinalized=True) assert (tmp_path / "recovered.ben").stat().st_size > 0 @@ -266,7 +264,7 @@ def test_add_graph_reorder_emits_graph_and_permutation_map(tmp_path: Path) -> No path = tmp_path / "reord.bendl" enc = BendlEncoder(path, overwrite=True) reordered = enc.add_graph(_graph(), sort="rcm") - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1] * n) enc.close() @@ -310,7 +308,7 @@ def test_add_graph_node_count_mismatch_raises(tmp_path: Path) -> None: n = _n() enc = BendlEncoder(tmp_path / "nc.bendl", overwrite=True) enc.add_graph(_graph(), sort=None) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1] * n) # correct with pytest.raises(ValueError, match="does not match graph node count"): s.write([1] * (n - 1)) @@ -320,7 +318,7 @@ def test_reorder_add_graph_after_stream_raises_but_raw_succeeds(tmp_path: Path) n = _n() path = tmp_path / "after.bendl" enc = BendlEncoder(path, overwrite=True) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1] * n) with pytest.raises(Exception, match="only allowed before"): enc.add_graph(_graph(), sort="rcm") @@ -338,23 +336,32 @@ def test_duplicate_graph_raises(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -# Stream-format and second-stream guards +# Stream-signature and second-stream guards # --------------------------------------------------------------------------- -def test_stream_rejects_non_ben_format(tmp_path: Path) -> None: +def test_stream_takes_no_positional_arguments(tmp_path: Path) -> None: + # The embedded stream is always BEN at write time (XBEN comes from compress_stream), so + # stream() has no format parameter and variant is keyword-only — a stale positional call + # must fail loudly, not bind to variant. enc = BendlEncoder(tmp_path / "fmt.bendl", overwrite=True) - with pytest.raises(ValueError, match="must be 'ben'"): - enc.stream("xben") + with pytest.raises(TypeError): + enc.stream("ben") # type: ignore + + +def test_stream_rejects_unknown_variant(tmp_path: Path) -> None: + enc = BendlEncoder(tmp_path / "var.bendl", overwrite=True) + with pytest.raises(ValueError, match="Unknown variant"): + enc.stream(variant="xben") def test_second_stream_refused(tmp_path: Path) -> None: path = tmp_path / "two.bendl" enc = BendlEncoder(path, overwrite=True) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2]) with pytest.raises(Exception, match="already been written"): - enc.stream("ben") + enc.stream() # --------------------------------------------------------------------------- @@ -365,7 +372,7 @@ def test_second_stream_refused(tmp_path: Path) -> None: def test_append_mode_adds_assets(tmp_path: Path) -> None: path = tmp_path / "app.bendl" with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2, 3]) ap = BendlEncoder.append(path) @@ -382,17 +389,17 @@ def test_append_mode_adds_assets(tmp_path: Path) -> None: def test_append_mode_disallows_stream(tmp_path: Path) -> None: path = tmp_path / "app2.bendl" with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1]) ap = BendlEncoder.append(path) with pytest.raises(Exception, match="append mode"): - ap.stream("ben") + ap.stream() def test_append_mode_reorder_graph_raises(tmp_path: Path) -> None: path = tmp_path / "app3.bendl" with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1] * _n()) ap = BendlEncoder.append(path) with pytest.raises(Exception, match="only allowed before"): @@ -407,8 +414,196 @@ def test_append_on_unfinalized_bundle_raises(tmp_path: Path) -> None: path = tmp_path / "unfin.bendl" with pytest.raises(RuntimeError): with BendlEncoder(path, overwrite=True) as enc: - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1, 2]) raise RuntimeError("stop") with pytest.raises(Exception): BendlEncoder.append(path) + + +# --------------------------------------------------------------------------- +# Live NetworkX graph inputs +# --------------------------------------------------------------------------- + + +def test_add_graph_accepts_live_networkx_graph(tmp_path: Path) -> None: + import networkx as nx + + live = nx.readwrite.json_graph.adjacency_graph(_graph()) + n = live.number_of_nodes() + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(4)] + + path = tmp_path / "live.bendl" + enc = BendlEncoder(path, overwrite=True) + stored = enc.add_graph(live, sort=None) + # A raw (sort=None) embed of a live graph preserves its node iteration order. + assert list(stored.nodes) == list(live.nodes) + with enc.stream() as stream: + for a in samples: + stream.write(a) + + dec = BendlDecoder(path) + assert list(dec) == samples + assert list(dec.read_graph().nodes) == list(live.nodes) + + +def test_add_graph_accepts_networkx_graph_subclass(tmp_path: Path) -> None: + # gerrychain.Graph is an nx.Graph subclass; pin that subclasses are accepted. + import networkx as nx + + class SubGraph(nx.Graph): + pass + + live = SubGraph(nx.readwrite.json_graph.adjacency_graph(_graph())) + enc = BendlEncoder(tmp_path / "sub.bendl", overwrite=True) + stored = enc.add_graph(live, sort=None) + assert stored.number_of_nodes() == live.number_of_nodes() + enc.close() + + +def test_add_metadata_rejects_networkx_graph(tmp_path: Path) -> None: + # Graphs are graph.json material, not metadata; the metadata path must not + # silently serialize one. + import networkx as nx + + enc = BendlEncoder(tmp_path / "meta.bendl", overwrite=True) + with pytest.raises(ValueError, match="metadata must be"): + enc.add_metadata(nx.Graph()) + + +# --------------------------------------------------------------------------- +# Stream size (header-recorded, no decoding) +# --------------------------------------------------------------------------- + + +def test_stream_size_matches_extracted_bytes(tmp_path: Path) -> None: + n = _n() + path = tmp_path / "s.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_graph(_graph(), sort=None) + with enc.stream() as s: + for i in range(5): + s.write([(i + j) % 3 + 1 for j in range(n)]) + + # stream_size comes straight from the header and must equal the byte count + # extract_stream copies out. + dec = BendlDecoder(path) + out = tmp_path / "out.ben" + dec.extract_stream(out) + assert dec.stream_size() > 0 + assert dec.stream_size() == out.stat().st_size + + +def test_stream_size_zero_for_assets_only(tmp_path: Path) -> None: + path = tmp_path / "a.bendl" + with BendlEncoder(path, overwrite=True) as enc: + enc.add_metadata({"x": 1}) + assert BendlDecoder(path).stream_size() == 0 + + +def test_asset_size_matches_directory_and_distinguishes_stored_from_decoded( + tmp_path: Path, +) -> None: + path = tmp_path / "sizes.bendl" + with BendlEncoder(path, overwrite=True) as enc: + enc.add_asset("small.txt", "tiny", content_type="text") + # 5000 highly compressible bytes: above the 1 KiB threshold, so stored xz-compressed. + enc.add_asset("big.txt", "x" * 5000, content_type="text") + + dec = BendlDecoder(path) + # asset_size is the directory's stored length, for every entry. + for entry in dec.list_assets(): + assert dec.asset_size(entry["name"]) == entry["len"] + + # Sub-threshold assets are stored raw: stored size == decoded size. + assert dec.asset_size("small.txt") == len(dec.read_asset_bytes("small.txt")) == 4 + # Compressed assets: stored size is the xz size, smaller than the decoded payload. + flags = {a["name"]: a["flags"] for a in dec.list_assets()} + assert "xz" in flags["big.txt"] + assert dec.asset_size("big.txt") < len(dec.read_asset_bytes("big.txt")) == 5000 + + with pytest.raises(KeyError, match="no asset named"): + dec.asset_size("missing.bin") + + +# --------------------------------------------------------------------------- +# Asset removal +# --------------------------------------------------------------------------- + + +def test_remove_asset_drops_entry_and_preserves_everything_else(tmp_path: Path) -> None: + n = _n() + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(4)] + path = tmp_path / "rm.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_graph(_graph(), sort=None) + with enc.stream() as s: + for a in samples: + s.write(a) + enc.add_asset("notes.txt", "scratch notes", content_type="text") + enc.add_asset("keep.json", {"keep": True}, content_type="json") + + appender = BendlEncoder.append(path) + appender.remove_asset("notes.txt") + + dec = BendlDecoder(path) + assert "notes.txt" not in dec.asset_names() + with pytest.raises(KeyError, match="no asset named"): + dec.read_asset_bytes("notes.txt") + # Everything else is untouched: assets, stream, and every remaining checksum. + assert dec.read_json_asset("keep.json") == {"keep": True} + assert list(dec) == samples + dec.verify() + + +def test_remove_then_add_replaces_an_asset(tmp_path: Path) -> None: + path = tmp_path / "update.bendl" + with BendlEncoder(path, overwrite=True) as enc: + enc.add_metadata({"seed": 1}) + + appender = BendlEncoder.append(path) + # metadata.json is a singleton, so a bare re-add is refused... + with pytest.raises(Exception, match="duplicate"): + appender.add_metadata({"seed": 2}) + # ...but remove-then-add is the update idiom. + appender.remove_asset("metadata.json") + appender.add_metadata({"seed": 2}) + assert BendlDecoder(path).read_metadata() == {"seed": 2} + + +def test_remove_asset_reclaims_bytes_automatically(tmp_path: Path) -> None: + path = tmp_path / "reclaim.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream() as s: + s.write([1, 2, 3]) + import random + + blob = random.Random(0).randbytes(64 * 1024) # incompressible, so it really occupies bytes + enc.add_asset("bloat.bin", blob, content_type="binary") + bloated = path.stat().st_size + assert bloated > 64 * 1024 + + # The facade's removal compacts in place: the payload bytes are really gone, + # not just unreferenced. + enc.remove_asset("bloat.bin") + assert path.stat().st_size < bloated - 60_000 + dec = BendlDecoder(path) + assert dec.asset_names() == [] + assert list(dec) == [[1, 2, 3]] + dec.verify() + + +def test_remove_asset_guards(tmp_path: Path) -> None: + path = tmp_path / "guards.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_asset("a.txt", "a", content_type="text") + # Pre-stream create mode: nothing is committed yet, so there is nothing to remove. + with pytest.raises(Exception, match="not finalized"): + enc.remove_asset("a.txt") + with enc.stream() as s: + s.write([1, 2]) + # Post-stream (finalized) the same encoder can remove, and unknown names are KeyErrors. + with pytest.raises(KeyError, match="no asset named"): + enc.remove_asset("missing.txt") + enc.remove_asset("a.txt") + assert BendlDecoder(path).asset_names() == [] diff --git a/ben-py/tests/test_compact.py b/ben-py/tests/test_compact.py new file mode 100644 index 0000000..89eb5b2 --- /dev/null +++ b/ben-py/tests/test_compact.py @@ -0,0 +1,250 @@ +"""Tests for the ``_core`` bundle-compaction machinery (dead-space reclamation). + +Compaction must be *semantically invisible*: same stream bytes, same decoded asset payloads, +same metadata, same wire format — just no unreferenced byte ranges (left behind by +directory-only removals and superseded directories). These tests pin both halves: the space +actually comes back, and nothing else changes. The public facade has no standalone compact — +every public write path (``remove_asset``, ``compress_stream``, ``relabel_bundle``) keeps +bundles compact automatically — so the machinery is exercised through ``_core``, which also +reports which strategy ran (``"none"`` / ``"tail"`` / ``"full"``). +""" + +from __future__ import annotations + +import json +import random +from pathlib import Path + +import pytest + +from binary_ensemble import _core +from binary_ensemble.bundle import BendlDecoder, BendlEncoder, compress_stream + +EXAMPLE_GRAPH = Path(__file__).resolve().parent / "data" / "gerrymandria.json" + + +def _graph(): + return json.loads(EXAMPLE_GRAPH.read_text()) + + +def _n(): + return len(_graph()["nodes"]) + + +def _build_bundle_with_dead_space(path: Path) -> tuple[list[list[int]], int]: + """A finalized bundle that has been appended to and had a large asset removed. + + Returns ``(samples, live_size)`` where ``live_size`` is the file size before the bloating + asset was added — an upper bound on what a compacted file may occupy (compaction also drops + the superseded directories the appends left behind). + """ + n = _n() + samples = [[(i + j) % 4 + 1 for j in range(n)] for i in range(8)] + enc = BendlEncoder(path, overwrite=True) + enc.add_graph(_graph(), sort=None) + enc.add_metadata({"seed": 99}) + with enc.stream() as s: + for a in samples: + s.write(a) + enc.add_asset("notes.txt", "keep me", content_type="text") + live_size = path.stat().st_size + + # Bloat: a genuinely incompressible 64 KiB blob (seeded random bytes — a periodic pattern + # would be crushed by the xz storage compression and leave no dead space), removed through + # the *core* binding, whose removal is directory-only (the facade's remove_asset compacts + # automatically, which would destroy the dead space these tests exist to exercise). + blob = random.Random(0).randbytes(64 * 1024) + core_appender = _core.BendlEncoder.append(path) + core_appender.add_asset("bloat.bin", blob, "binary") + core_appender.remove_asset("bloat.bin") + return samples, live_size + + +def test_compact_reclaims_dead_space_and_preserves_everything(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + samples, live_size = _build_bundle_with_dead_space(path) + bloated_size = path.stat().st_size + assert bloated_size > live_size + 60_000 # the dead bytes really are in the file + + before = BendlDecoder(path) + stream_size_before = before.stream_size() + names_before = before.asset_names() + + _core.compact_bundle_in_place(path) # in place + + assert path.stat().st_size <= live_size + after = BendlDecoder(path) + # Semantically identical: same plans, same assets, same metadata, same wire format. + assert list(after) == samples + assert after.asset_names() == names_before + assert after.read_metadata() == {"seed": 99} + assert after.read_asset_bytes("notes.txt") == b"keep me" + assert after.assignment_format() == "ben" + assert len(after) == len(samples) + # The stream is copied verbatim, so its recorded size is unchanged. + assert after.stream_size() == stream_size_before + # And every checksum in the compacted bundle holds. + after.verify() + + +def test_compact_copies_stream_bytes_verbatim(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) + + before_stream = tmp_path / "before.ben" + BendlDecoder(path).extract_stream(before_stream) + _core.compact_bundle_in_place(path) + after_stream = tmp_path / "after.ben" + BendlDecoder(path).extract_stream(after_stream) + + assert before_stream.read_bytes() == after_stream.read_bytes() + + +def test_in_place_compaction_picks_tail_rewrite_for_post_stream_dead_space( + tmp_path: Path, +) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) # dead space is post-stream by construction + # The fast path rebuilds only the tail (the stream is never read), and a second pass + # finds nothing left to reclaim. + assert _core.compact_bundle_in_place(path) == "tail" + assert _core.compact_bundle_in_place(path) == "none" + BendlDecoder(path).verify() + + +def test_in_place_compaction_full_rewrite_for_pre_stream_dead_space(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) + # graph.json is a pre-stream asset: removing it (directory-only, via the core binding) + # leaves dead bytes before the stream, which only the full rewrite can reclaim. + core = _core.BendlEncoder.append(path) + core.remove_asset("graph.json") + assert _core.compact_bundle_in_place(path) == "full" + dec = BendlDecoder(path) + assert "graph.json" not in dec.asset_names() + dec.verify() + + +def test_compact_is_idempotent(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) + _core.compact_bundle_in_place(path) + once = path.read_bytes() + _core.compact_bundle_in_place(path) + assert path.read_bytes() == once + + +def test_compact_preserves_xben_bundles(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + samples, _live = _build_bundle_with_dead_space(src) + xben = tmp_path / "in.xben.bendl" + compress_stream(src, out_file=xben) + + # Manufacture dead space in the XBEN bundle via the directory-only core removal, + # then compact it. + core_appender = _core.BendlEncoder.append(xben) + core_appender.add_asset("temp.bin", b"\x00" * 4096, "binary") + core_appender.remove_asset("temp.bin") + bloated = xben.stat().st_size + _core.compact_bundle_in_place(xben) + + assert xben.stat().st_size < bloated + after = BendlDecoder(xben) + assert after.assignment_format() == "xben" # wire format preserved, not re-encoded + assert list(after) == samples + after.verify() + + +def test_compact_out_file_mode_and_overwrite(tmp_path: Path) -> None: + src = tmp_path / "in.bendl" + samples, _live = _build_bundle_with_dead_space(src) + src_bytes = src.read_bytes() + + out = tmp_path / "out.bendl" + out.write_bytes(b"existing") + with pytest.raises(OSError, match="already exists"): + _core.compact_bundle(src, out) + _core.compact_bundle(src, out, overwrite=True) + + # Original untouched; the copy is the compacted one. + assert src.read_bytes() == src_bytes + assert out.stat().st_size < src.stat().st_size + assert list(BendlDecoder(out)) == samples + + +def test_compact_assets_only_bundle(tmp_path: Path) -> None: + path = tmp_path / "assets.bendl" + with BendlEncoder(path, overwrite=True) as enc: + enc.add_metadata({"only": "assets"}) + enc.add_asset("a.txt", "alpha", content_type="text") + enc = BendlEncoder.append(path) + enc.add_asset("b.txt", "beta", content_type="text") + enc.remove_asset("a.txt") + + _core.compact_bundle_in_place(path) + + dec = BendlDecoder(path) + assert dec.is_complete() + assert dec.stream_size() == 0 + assert list(dec) == [] + assert dec.asset_names() == ["metadata.json", "b.txt"] + assert dec.read_asset_bytes("b.txt") == b"beta" + dec.verify() + + +def test_compact_rejects_unfinalized_bundle(tmp_path: Path) -> None: + path = tmp_path / "partial.bendl" + with pytest.raises(RuntimeError, match="boom"): + with BendlEncoder(path, overwrite=True) as enc: + with enc.stream() as s: + s.write([1] * _n()) + raise RuntimeError("boom") + with pytest.raises(Exception, match="finalized"): + _core.compact_bundle_in_place(path) + + +def _flip_byte_at(path: Path, marker: bytes) -> None: + """XOR the first byte of ``marker`` wherever it occurs in the file.""" + data = bytearray(path.read_bytes()) + pos = data.find(marker) + assert pos != -1, f"marker {marker!r} not found" + data[pos] ^= 0xFF + path.write_bytes(bytes(data)) + + +def test_full_compact_refuses_corrupt_stream(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) + # Flip a byte inside the stream region (the stream's banner — the default variant is + # twodelta). The full rewrite copies the stream through the verified reader, so it must + # refuse and must not leave a destination file behind. + _flip_byte_at(path, b"TWODELTA BEN FILE") + corrupted = path.read_bytes() + + out = tmp_path / "out.bendl" + with pytest.raises(Exception): + _core.compact_bundle(path, out, overwrite=True) + assert path.read_bytes() == corrupted # source untouched + + # The in-place form takes the tail-rewrite fast path here (all dead space is post-stream), + # which by design never reads the stream — so it succeeds, the corruption travels along + # unread, and verify() is what catches it. This is the documented trade-off that makes + # removal O(tail) instead of O(stream) on huge bundles. + _core.compact_bundle_in_place(path) + with pytest.raises(Exception): + BendlDecoder(path).verify() + + +def test_compact_refuses_corrupt_asset(tmp_path: Path) -> None: + path = tmp_path / "in.bendl" + _build_bundle_with_dead_space(path) + _flip_byte_at(path, b"keep me") # corrupt the notes.txt payload bytes (post-stream) + # The full rewrite decodes every asset (verify-on-touch) and must refuse. + with pytest.raises(Exception): + _core.compact_bundle(path, tmp_path / "out.bendl", overwrite=True) + # The in-place tail path relocates post-stream assets as raw bytes without decoding; the + # corruption travels along with its (now mismatching) stored checksum, and verify() + # catches it. + _core.compact_bundle_in_place(path) + with pytest.raises(Exception): + BendlDecoder(path).verify() diff --git a/ben-py/tests/test_docs_snippets.py b/ben-py/tests/test_docs_snippets.py index 79dc430..256d5c9 100644 --- a/ben-py/tests/test_docs_snippets.py +++ b/ben-py/tests/test_docs_snippets.py @@ -56,9 +56,7 @@ def _markdown_files() -> list[Path]: ) -@pytest.mark.parametrize( - "doc", _markdown_files(), ids=lambda p: str(p.relative_to(DOCS_DIR)) -) +@pytest.mark.parametrize("doc", _markdown_files(), ids=lambda p: str(p.relative_to(DOCS_DIR))) def test_markdown_snippets_execute(doc: Path, tmp_path, monkeypatch) -> None: runnable = [ (i, code) diff --git a/ben-py/tests/test_graph.py b/ben-py/tests/test_graph.py index a0a45ed..9fed52b 100644 --- a/ben-py/tests/test_graph.py +++ b/ben-py/tests/test_graph.py @@ -103,3 +103,12 @@ def test_reorder_accepts_bytes_and_path() -> None: def test_reorder_rejects_unparseable_graph() -> None: with pytest.raises(Exception, match="Failed to reorder graph"): g.reorder(b"not valid json at all", "rcm") + + +def test_reorder_accepts_live_networkx_graph() -> None: + import networkx as nx + + live = nx.readwrite.json_graph.adjacency_graph(_graph()) + reordered, pmap = g.reorder(live, "rcm") + _check_consistent(reordered, pmap, live.number_of_nodes()) + assert pmap["ordering_method"] == "reverse-cuthill-mckee" diff --git a/ben-py/tests/test_python_pipelines.py b/ben-py/tests/test_python_pipelines.py index 7338459..8f5087a 100644 --- a/ben-py/tests/test_python_pipelines.py +++ b/ben-py/tests/test_python_pipelines.py @@ -36,9 +36,7 @@ def expand_rle(rle: Iterable[tuple[int, int]], cap: int) -> list[int]: return out -def gen_assignment( - rng: random.Random, max_val: int, max_run: int, max_len: int -) -> list[int]: +def gen_assignment(rng: random.Random, max_val: int, max_run: int, max_len: int) -> list[int]: rle = [] n_runs = rng.randint(10, 50) for _ in range(n_runs): @@ -297,9 +295,7 @@ def test_benencoder_rejects_overwrite_and_unknown_variant(tmp_path: Path) -> Non with pytest.raises(OSError, match="already exists"): BenEncoder(out, overwrite=False, variant="standard") with pytest.raises(OSError, match="Failed to create"): - BenEncoder( - tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard" - ) + BenEncoder(tmp_path / "missing-dir" / "out.ben", overwrite=False, variant="standard") # ---------- Decoder error / laziness paths ---------- @@ -413,9 +409,7 @@ def test_codec_helpers_reject_same_path_missing_input_and_bad_json( with pytest.raises(OSError, match="must differ"): encode_jsonl_to_ben(src, src, overwrite=True, variant="standard") with pytest.raises(OSError, match="does not exist"): - encode_jsonl_to_ben( - tmp_path / "missing.jsonl", tmp_path / "o.ben", overwrite=True - ) + encode_jsonl_to_ben(tmp_path / "missing.jsonl", tmp_path / "o.ben", overwrite=True) bad_json = tmp_path / "bad.jsonl" bad_json.write_text("not json\n", encoding="utf-8") with pytest.raises(OSError, match="Failed to convert JSONL to BEN"): @@ -424,9 +418,7 @@ def test_codec_helpers_reject_same_path_missing_input_and_bad_json( def test_encode_ben_to_xben_error_paths(tmp_path: Path) -> None: with pytest.raises(OSError, match="does not exist"): - encode_ben_to_xben( - tmp_path / "missing.ben", tmp_path / "o.xben", overwrite=True - ) + encode_ben_to_xben(tmp_path / "missing.ben", tmp_path / "o.xben", overwrite=True) bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="must differ"): @@ -437,9 +429,7 @@ def test_encode_ben_to_xben_error_paths(tmp_path: Path) -> None: def test_decode_helpers_error_paths(tmp_path: Path) -> None: with pytest.raises(OSError, match="does not exist"): - decode_ben_to_jsonl( - tmp_path / "missing.ben", tmp_path / "o.jsonl", overwrite=True - ) + decode_ben_to_jsonl(tmp_path / "missing.ben", tmp_path / "o.jsonl", overwrite=True) bad_ben = tmp_path / "bad.ben" bad_ben.write_bytes(b"garbage") with pytest.raises(OSError, match="Failed to convert BEN to JSONL"): diff --git a/ben-py/tests/test_recompress.py b/ben-py/tests/test_recompress.py index 21fc139..e32296f 100644 --- a/ben-py/tests/test_recompress.py +++ b/ben-py/tests/test_recompress.py @@ -22,7 +22,7 @@ def _build_ben_bundle(path: Path): with BendlEncoder(path, overwrite=True) as enc: enc.add_graph(_graph(), sort="rcm") enc.add_metadata({"seed": 99}) - with enc.stream("ben") as s: + with enc.stream() as s: for a in samples: s.write(a) enc.add_asset("notes.txt", "hi", content_type="text") @@ -57,13 +57,14 @@ def test_compress_stream_explicit_out_path(tmp_path: Path) -> None: assert BendlDecoder(src).assignment_format() == "ben" -def test_compress_stream_in_place(tmp_path: Path) -> None: +def test_compress_stream_in_place_by_default(tmp_path: Path) -> None: src = tmp_path / "in.bendl" samples = _build_ben_bundle(src) before = BendlDecoder(src) before_assets = {n: before.read_asset_bytes(n) for n in before.asset_names()} - compress_stream(src, in_place=True) + # out_file=None means in place: src is atomically replaced. + compress_stream(src) after = BendlDecoder(src) assert after.assignment_format() == "xben" @@ -72,15 +73,6 @@ def test_compress_stream_in_place(tmp_path: Path) -> None: assert after.read_asset_bytes(name) == payload -def test_compress_stream_arg_validation(tmp_path: Path) -> None: - src = tmp_path / "in.bendl" - _build_ben_bundle(src) - with pytest.raises(ValueError, match="either in_place=True or out_file"): - compress_stream(src) - with pytest.raises(ValueError, match="not both"): - compress_stream(src, out_file=tmp_path / "o.bendl", in_place=True) - - def test_compress_stream_assets_only_bundle(tmp_path: Path) -> None: src = tmp_path / "assets.bendl" enc = BendlEncoder(src, overwrite=True) @@ -105,3 +97,6 @@ def test_compress_stream_out_file_refuses_existing(tmp_path: Path) -> None: out.write_bytes(b"existing") with pytest.raises(OSError, match="already exists"): compress_stream(src, out_file=out) + # overwrite=True is the explicit opt-in to replace it. + compress_stream(src, out_file=out, overwrite=True) + assert BendlDecoder(out).assignment_format() == "xben" diff --git a/ben-py/tests/test_relabel.py b/ben-py/tests/test_relabel.py index 91cd6bc..fd48edd 100644 --- a/ben-py/tests/test_relabel.py +++ b/ben-py/tests/test_relabel.py @@ -32,7 +32,7 @@ def _build_ben_bundle(path: Path, with_graph: bool = True): if with_graph: enc.add_graph(_graph(), sort=None) # store in raw order enc.add_metadata({"seed": 99}) - with enc.stream("ben") as s: + with enc.stream() as s: for a in samples: s.write(a) enc.add_asset("notes.txt", "hi", content_type="text") @@ -77,21 +77,19 @@ def test_relabel_out_file_is_lossless_and_preserves_assets(tmp_path: Path) -> No assert list(BendlDecoder(src)) == samples -def test_relabel_in_place(tmp_path: Path) -> None: +def test_relabel_in_place_by_default(tmp_path: Path) -> None: src = tmp_path / "in.bendl" samples = _build_ben_bundle(src) - relabel_bundle(src, in_place=True, sort="rcm") + # out_file=None means in place: src is atomically replaced. + relabel_bundle(src, sort="rcm") dec = BendlDecoder(src) assert dec.assignment_format() == "ben" assert len(dec) == len(samples) assert dec.read_node_permutation_map()["ordering_method"] == "reverse-cuthill-mckee" old_to_new = { - int(k): v - for k, v in dec.read_node_permutation_map()[ - "node_permutation_old_to_new" - ].items() + int(k): v for k, v in dec.read_node_permutation_map()["node_permutation_old_to_new"].items() } assert [_depermute(p, old_to_new) for p in dec] == samples @@ -114,10 +112,6 @@ def test_relabel_by_key(tmp_path: Path) -> None: def test_relabel_arg_validation(tmp_path: Path) -> None: src = tmp_path / "in.bendl" _build_ben_bundle(src) - with pytest.raises(ValueError, match="either in_place=True or out_file"): - relabel_bundle(src) - with pytest.raises(ValueError, match="not both"): - relabel_bundle(src, out_file=tmp_path / "o.bendl", in_place=True) with pytest.raises(ValueError, match="sort='key' requires key"): relabel_bundle(src, out_file=tmp_path / "o.bendl", sort="key") @@ -143,7 +137,7 @@ def test_relabel_rejects_unfinalized_bundle(tmp_path: Path) -> None: with pytest.raises(RuntimeError, match="boom"): with BendlEncoder(src, overwrite=True) as enc: enc.add_graph(_graph(), sort=None) - with enc.stream("ben") as s: + with enc.stream() as s: s.write([1] * _n()) raise RuntimeError("boom") @@ -162,8 +156,11 @@ def test_relabel_rejects_empty_stream_bundle(tmp_path: Path) -> None: def test_relabel_out_file_refuses_existing(tmp_path: Path) -> None: src = tmp_path / "in.bendl" - _build_ben_bundle(src) + samples = _build_ben_bundle(src) out = tmp_path / "exists.bendl" out.write_bytes(b"existing") with pytest.raises(OSError, match="already exists"): relabel_bundle(src, out_file=out) + # overwrite=True is the explicit opt-in to replace it. + relabel_bundle(src, out_file=out, overwrite=True) + assert len(BendlDecoder(out)) == len(samples) diff --git a/ben-py/tests/test_surface.py b/ben-py/tests/test_surface.py index 9f158f6..cc767df 100644 --- a/ben-py/tests/test_surface.py +++ b/ben-py/tests/test_surface.py @@ -215,16 +215,13 @@ def test_core_stub_covers_runtime_and_matches_signatures() -> None: stub_methods = {m for m in payload if not m.startswith("__")} runtime_methods = _runtime_public_names(obj) assert stub_methods == runtime_methods, ( - f"method set drift on _core.{name}: " - f"stub={stub_methods} runtime={runtime_methods}" + f"method set drift on _core.{name}: stub={stub_methods} runtime={runtime_methods}" ) for method in stub_methods: runtime = _params_from_text_sig(getattr(obj, method).__text_signature__) if runtime is None: continue - assert runtime == payload[method], ( - f"signature drift on _core.{name}.{method}" - ) + assert runtime == payload[method], f"signature drift on _core.{name}.{method}" # --------------------------------------------------------------------------- @@ -249,13 +246,9 @@ def test_bundle_facade_matches_stub() -> None: # Module-level functions. assert ( - _params_from_inspect(bundle.compress_stream, drop_self=False) - == stub["compress_stream"][1] - ) - assert ( - _params_from_inspect(bundle.relabel_bundle, drop_self=False) - == stub["relabel_bundle"][1] + _params_from_inspect(bundle.compress_stream, drop_self=False) == stub["compress_stream"][1] ) + assert _params_from_inspect(bundle.relabel_bundle, drop_self=False) == stub["relabel_bundle"][1] # BendlEncoder methods. enc_methods = stub["BendlEncoder"][1] diff --git a/ben-py/tests/typing_assertions.py b/ben-py/tests/typing_assertions.py new file mode 100644 index 0000000..cba1408 --- /dev/null +++ b/ben-py/tests/typing_assertions.py @@ -0,0 +1,114 @@ +"""Static typing assertions for the public ben-py surface. + +This is not a pytest module — the type checkers check it (via ``task typecheck-python``) and +fail if the public signatures regress. Positive assertions use :func:`typing.assert_type`; +negative assertions are calls that *must not* type-check, suppressed with bare +``# type: ignore`` comments (both ty and pyright honor those) and kept honest by pyright's +``reportUnnecessaryTypeIgnoreComment`` — if the call ever becomes legal, the now-unused ignore +fails the check. + +Nothing here executes; the module exists purely for static analysis. +""" + +from __future__ import annotations + +import io +from pathlib import Path +from typing import assert_type + +import networkx as nx + +from binary_ensemble import ( + BenDecoder, + BenEncoder, + BendlDecoder, + BendlEncoder, + compress_stream, + relabel_bundle, +) +from binary_ensemble import graph as bgraph +from binary_ensemble.types import ( + AssetEntry, + AssignmentFormat, + NodePermutationMap, +) + + +def bundle_encoder_surface(tmp: Path) -> None: + enc = BendlEncoder(tmp / "out.bendl", overwrite=True) + + # Graph inputs: live NetworkX graphs, dicts, bytes, file-likes, and paths all type-check. + enc.add_graph(nx.Graph(), sort=None) + enc.add_graph({"nodes": [], "adjacency": []}, sort=None) + enc.add_graph(b"{}", sort="rcm") + enc.add_graph(io.BytesIO(b"{}")) + enc.add_graph(tmp / "graph.json", sort="key", key="GEOID") + enc.add_graph("graph.json") # a plain str is a path for graphs + enc.add_graph(tmp, sort="bogus") # type: ignore + + enc.add_metadata({"seed": 1234}) + enc.add_metadata(tmp / "metadata.json") + + # add_asset overloads: payload shape is tied to content_type. + enc.add_asset("params.json", {"node_repeats": 2}, "json") + enc.add_asset("notes.txt", "plain text content", "text") + enc.add_asset("blob.bin", b"\x00\x01", "binary") + enc.add_asset("tracts.gpkg", tmp / "tracts.gpkg", "file") + enc.add_asset("bad.txt", {"not": "text"}, "text") # type: ignore + enc.add_asset("bad.any", b"x", "blob") # type: ignore + + with enc.stream(variant="twodelta") as stream: + stream.write([1, 2, 3]) + stream.write((1, 2, 3)) + BendlEncoder.append(tmp / "out.bendl").remove_asset("notes.txt") + + # stream() has no format parameter, and variant is keyword-only with a literal "twodelta" + # default — None is not a legal stand-in for it. + enc.stream("ben") # type: ignore + enc.stream(variant="xben") # type: ignore + enc.stream(variant=None) # type: ignore + + +def bundle_decoder_surface(dec: BendlDecoder) -> None: + for assignment in dec: + assert_type(assignment, list[int]) + + assert_type(dec.assignment_format(), AssignmentFormat) + assert_type(dec.version(), tuple[int, int]) + assert_type(dec.stream_size(), int) + assert_type(dec.asset_size("blob.bin"), int) + assert_type(dec.read_asset_bytes("blob.bin"), bytes) + + entries = dec.list_assets() + assert_type(entries[0], AssetEntry) + assert_type(entries[0]["flags"], list[str]) + + pmap = dec.read_node_permutation_map() + if pmap is not None: + assert_type(pmap["node_permutation_old_to_new"], dict[str, int]) + + dec.subsample_indices([1, 500, 1000]) + dec.subsample_every(250, offset=2) + dec.verify() + + +def graph_surface(tmp: Path) -> None: + _graph, pmap = bgraph.reorder(tmp / "graph.json", sort="mlc") + assert_type(pmap, NodePermutationMap) + bgraph.reorder_by_key({"nodes": []}, key="GEOID") + bgraph.reorder(tmp, sort="fancy") # type: ignore + + +def stream_and_transforms_surface(tmp: Path) -> None: + with BenEncoder(tmp / "t.ben", overwrite=True, variant="mkv_chain") as enc: + enc.write([0, 1]) + dec = BenDecoder(tmp / "t.xben", mode="xben") + dec.subsample_range(1, 3) + BenDecoder(tmp, mode="jsonl") # type: ignore + + compress_stream(tmp / "a.bendl") # out_file=None means in place + compress_stream(tmp / "a.bendl", out_file=tmp / "b.bendl", overwrite=True) + relabel_bundle(tmp / "a.bendl", out_file=tmp / "c.bendl", overwrite=True) + relabel_bundle(tmp / "a.bendl", sort="rcm") + relabel_bundle(tmp / "a.bendl", in_place=True) # type: ignore + relabel_bundle(tmp / "a.bendl", sort="random") # type: ignore diff --git a/ben-py/uv.lock b/ben-py/uv.lock index 6d8db84..e57da9a 100755 --- a/ben-py/uv.lock +++ b/ben-py/uv.lock @@ -121,9 +121,11 @@ dev = [ { name = "maturin" }, { name = "numpy" }, { name = "pandas" }, + { name = "pyright" }, { name = "pytest" }, { name = "ruff" }, { name = "tqdm" }, + { name = "ty" }, ] [package.metadata] @@ -151,9 +153,11 @@ dev = [ { name = "maturin", specifier = ">=1.9.6" }, { name = "numpy", specifier = ">=1.26" }, { name = "pandas", specifier = ">=2.0" }, + { name = "pyright", specifier = ">=1.1" }, { name = "pytest", specifier = ">=8.4.2" }, { name = "ruff", specifier = ">=0.11.0" }, { name = "tqdm", specifier = ">=4.67.1" }, + { name = "ty" }, ] [[package]] @@ -1266,6 +1270,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, ] +[[package]] +name = "nodeenv" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, +] + [[package]] name = "numpy" version = "2.3.4" @@ -1705,6 +1718,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" }, ] +[[package]] +name = "pyright" +version = "1.1.410" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nodeenv" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/53/e4d8ea1391bd4355231be6f91bf239479aa0014260ed3fb5526eeb12a1f2/pyright-1.1.410.tar.gz", hash = "sha256:07a073b8ba6749826773c1269773efa11b93440d9a6aa60419d9a3172d6dc488", size = 4062013, upload-time = "2026-06-01T17:35:48.894Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/33/288b5868fa00846dacf249633719d747893e54aebd196b9968ac1878a5d3/pyright-1.1.410-py3-none-any.whl", hash = "sha256:5e961bed37cacf96b3f7cd7b1da39b350a9239aa2e69138d0e88f728cfaf296c", size = 6082448, upload-time = "2026-06-01T17:35:46.387Z" }, +] + [[package]] name = "pytest" version = "8.4.2" @@ -2462,6 +2488,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "ty" +version = "0.0.49" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/8d/37cb91808069509d43a2a11743e12f1e854fd808dbef2203309d256718cd/ty-0.0.49.tar.gz", hash = "sha256:0a027bd0c9c75d035641a365d087ad883446057f9be0b9826251c2aecafbf145", size = 5884753, upload-time = "2026-06-12T03:08:20.221Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/de/9237c6a96356612dd0393db1e94cf21f903616adf3a3701bf3da6e4adc92/ty-0.0.49-py3-none-linux_armv6l.whl", hash = "sha256:12c0c4310b936d762a8586c210b53d4fa4bb361a04429afa89bf84b922e5e065", size = 11834671, upload-time = "2026-06-12T03:07:53.062Z" }, + { url = "https://files.pythonhosted.org/packages/8f/15/daf5a14a5e07012277d450c75325c94614e2acfec4c620c881486118c410/ty-0.0.49-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:737bfdc2caf9712a8580944dcdc80a450a37a4f2bc83c8fa9b7433b374f9e471", size = 11589570, upload-time = "2026-06-12T03:08:25.779Z" }, + { url = "https://files.pythonhosted.org/packages/7d/58/30bdf98436488aca25f0763bf7f92a061528d42461b686453029e845e4c5/ty-0.0.49-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ab90c1baf3b1701d282fce4b02fa552a962d109f8972c46ef6b22429503bfea4", size = 10985236, upload-time = "2026-06-12T03:08:36.664Z" }, + { url = "https://files.pythonhosted.org/packages/22/45/ece503e4a1396e13a1a9a0cde51afe476a6506a1d557eeadf8ad45c83bc0/ty-0.0.49-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4ce8ecf6ba6fc79bd137cc0557a754f7e5f2dfe9436412551d480d680e248ad", size = 11504302, upload-time = "2026-06-12T03:08:01.664Z" }, + { url = "https://files.pythonhosted.org/packages/17/dc/5d09333d289dfbca1804eaade125c9e8a1a992a2a592a8b80c5e9b589ca9/ty-0.0.49-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:10d85c6865c984e78661e0bd20b180514b4a289739224e84816e342bdf381e04", size = 11626629, upload-time = "2026-06-12T03:08:06.844Z" }, + { url = "https://files.pythonhosted.org/packages/f2/36/155f41c9dd7237c4b609211f29f77755a139ee6218605dadc7fe21d5e3c8/ty-0.0.49-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d96a67a206619e01fa92f35a22267ec634bba62be24b1d0e947020cc179995b", size = 12074481, upload-time = "2026-06-12T03:08:09.643Z" }, + { url = "https://files.pythonhosted.org/packages/96/4c/998ee13cd5045f1f8b36982de7343163832ac53f27debe01b0de0e8bd968/ty-0.0.49-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3de9f648564e0a66344ef397770387cb0d093735f8679d2c5a08a4741e79814d", size = 12678042, upload-time = "2026-06-12T03:08:39.319Z" }, + { url = "https://files.pythonhosted.org/packages/85/c9/9a505aba85c41ce54cbcaa14f8d79aa084b86151d2d70df11c4655b92898/ty-0.0.49-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5779179ab397d15f8c9dbb8f506ec1b1745f54eac639982f76ef3ce538943b50", size = 12316194, upload-time = "2026-06-12T03:08:18.023Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b8/ded37fb93503294abbc83c36470bb1413bea05048b745881d4470b518a06/ty-0.0.49-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:792d4974e93cc09bd32f934586080bbbe21b8e777099cb521cb2de18b68a49f0", size = 12145507, upload-time = "2026-06-12T03:07:56.505Z" }, + { url = "https://files.pythonhosted.org/packages/2f/07/392e80d78f02445f695b815bb9eb0fffacda68b03faee38c900f7b990815/ty-0.0.49-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:727bda86deb136073e525c2e78d60e38aedcce5d80579170844a52bbf7c1440d", size = 12365967, upload-time = "2026-06-12T03:08:12.553Z" }, + { url = "https://files.pythonhosted.org/packages/50/d3/31b0c2a7fbedd3373e389cb1d81b8d2128f6f868fafb46557736a6f9aca8/ty-0.0.49-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4f2fc2bc4a8d2ff1cca59fd94772cabdfec4062d47a0b3a0784be46d94d0540b", size = 11475283, upload-time = "2026-06-12T03:08:28.334Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5b/329e101638920b468a3bb63059c9f66ef99b44aac501222c44832a507321/ty-0.0.49-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:3724bd9badef333321578b6a941fbc571ebf49141ec2356a8590fbe4c9aa588d", size = 11645343, upload-time = "2026-06-12T03:08:15.246Z" }, + { url = "https://files.pythonhosted.org/packages/a9/76/c897e615e32f80ca81c8c1bc49b9a1f72ff9e3cfea0f8345ba505fe28472/ty-0.0.49-py3-none-musllinux_1_2_i686.whl", hash = "sha256:166c6eb52ee4af3c5a9bb267d165d93000daa55c6758cd8ff3199741fb75917d", size = 11725585, upload-time = "2026-06-12T03:08:33.915Z" }, + { url = "https://files.pythonhosted.org/packages/59/e1/fdb42ee239f618800842681af5bb8598117e74512c10974a8b7b9086a898/ty-0.0.49-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:91e81d832c287b05782ee32eb1b801f62c1fa08df37d589d2b88c3f1d51c9731", size = 12237261, upload-time = "2026-06-12T03:08:31.105Z" }, + { url = "https://files.pythonhosted.org/packages/98/0f/a2d6a5fc9d0786cbeb3c200786da4e18c203589be3984bb5def83ca92320/ty-0.0.49-py3-none-win32.whl", hash = "sha256:7186af5ca9829d1f5d8916bcf767b8e819bfbf61b1b8ec843bb3fc699cb502e1", size = 11100789, upload-time = "2026-06-12T03:07:59.092Z" }, + { url = "https://files.pythonhosted.org/packages/d0/9d/473ac8bc57b5a2d121da893bf9dd74a118efb19a01d711df1a6e397f05cc/ty-0.0.49-py3-none-win_amd64.whl", hash = "sha256:ae2142fc126a01effcca0c222908b0e6654b5ba1266d4e4d406e4866aef8e1d1", size = 12204644, upload-time = "2026-06-12T03:08:04.327Z" }, + { url = "https://files.pythonhosted.org/packages/ef/a2/8959249da951ba3977fee20e688d28678b8a1d30a9ed4464228a85d45853/ty-0.0.49-py3-none-win_arm64.whl", hash = "sha256:75d5e2e7649765f31f4bed6c8adb149a75b18edd3fa6336dac4d0efc1a66466f", size = 11558965, upload-time = "2026-06-12T03:08:23.012Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" diff --git a/ben/src/cli/ben/args.rs b/ben/src/cli/ben/args.rs index 9f07451..bdd9908 100644 --- a/ben/src/cli/ben/args.rs +++ b/ben/src/cli/ben/args.rs @@ -54,7 +54,7 @@ pub(super) enum Mode { )] /// Defines the command line arguments accepted by the program. pub(super) struct Args { - /// Mode to run the program in (encode, decode, or read). + /// Mode to run the program in (e.g. encode, decode, or lookup). #[arg(short, long, value_enum)] pub mode: Mode, /// Input file to read from. diff --git a/ben/src/cli/ben/bundle.rs b/ben/src/cli/ben/bundle.rs index 4ccaa4b..a54560f 100644 --- a/ben/src/cli/ben/bundle.rs +++ b/ben/src/cli/ben/bundle.rs @@ -34,7 +34,7 @@ pub(super) fn append_graph_asset(out_path: &str, graph_path: &Path) -> Result<() Ok(()) } -/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` bundle at `out_path` and then append +/// Encode `input_path` (JSONL) to BEN inside a fresh `.bendl` file at `out_path` and then append /// the graph as a post-stream asset. pub(super) fn run_encode_bundle_with_graph( input_path: &Path, @@ -67,7 +67,7 @@ pub(super) fn run_encode_bundle_with_graph( append_graph_asset(out_path, graph_path) } -/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` bundle at `out_path` and +/// Encode `input_path` (JSONL or `.ben`) to XBEN inside a fresh `.bendl` file at `out_path` and /// then append the graph as a post-stream asset. #[allow(clippy::too_many_arguments)] pub(super) fn run_xencode_bundle_with_graph( diff --git a/ben/src/cli/ben/modes/encode.rs b/ben/src/cli/ben/modes/encode.rs index b6b48df..590f70f 100644 --- a/ben/src/cli/ben/modes/encode.rs +++ b/ben/src/cli/ben/modes/encode.rs @@ -12,7 +12,7 @@ use std::path::Path; pub(in crate::cli::ben) fn run(args: Args) -> CliResult { tracing::trace!("Running in encode mode"); - // --graph path: produce a .bendl bundle with the BEN stream plus a post-stream graph asset. + // --graph path: produce a .bendl file with the BEN stream plus a post-stream graph asset. if let Some(graph_path) = args.graph.as_ref() { let in_file = args.input_file.as_ref().ok_or_else(|| { CliError::other("--graph requires an input file (stdin not supported).") diff --git a/ben/src/cli/ben/modes/xencode.rs b/ben/src/cli/ben/modes/xencode.rs index 9113647..ce33d69 100644 --- a/ben/src/cli/ben/modes/xencode.rs +++ b/ben/src/cli/ben/modes/xencode.rs @@ -23,7 +23,7 @@ pub(in crate::cli::ben) fn run(args: Args) -> CliResult { } } - // --graph path: produce a .bendl bundle with the XBEN stream plus a post-stream graph asset. + // --graph path: produce a .bendl file with the XBEN stream plus a post-stream graph asset. if let Some(graph_path) = args.graph.as_ref() { let in_file = args.input_file.as_ref().ok_or_else(|| { CliError::other("--graph requires an input file (stdin not supported).") diff --git a/ben/src/cli/ben/paths.rs b/ben/src/cli/ben/paths.rs index 5b404ce..f1be1f9 100644 --- a/ben/src/cli/ben/paths.rs +++ b/ben/src/cli/ben/paths.rs @@ -15,7 +15,7 @@ pub(super) type DynWriter = Box; /// * `input_file_name` - The input file path supplied by the user. /// * `output_file_name` - An optional explicit output path. /// * `overwrite` - Whether to skip overwrite prompting. -/// * `with_graph` - When true, the output is a `.bendl` bundle instead of a bare `.ben`/`.xben` +/// * `with_graph` - When true, the output is a `.bendl` file instead of a bare `.ben`/`.xben` /// stream, so the derived extension is `.bendl` regardless of `mode`. /// /// # Returns diff --git a/ben/src/cli/bendl/args.rs b/ben/src/cli/bendl/args.rs index f073ff3..a2f8ea4 100644 --- a/ben/src/cli/bendl/args.rs +++ b/ben/src/cli/bendl/args.rs @@ -28,7 +28,7 @@ impl std::str::FromStr for NamedAsset { #[derive(Parser, Debug)] #[command( name = "bendl", - about = "Create, inspect, extract from, and append to .bendl bundle files.", + about = "Create, inspect, extract from, and append to .bendl file files.", version )] pub(super) struct Args { @@ -52,8 +52,14 @@ pub(super) enum Command { Inspect(InspectArgs), /// Extract the embedded stream or a named asset to a file. Extract(ExtractArgs), - /// Append new assets to an already-finalized `.bendl` bundle. + /// Append new assets to an already-finalized `.bendl` file. Append(AppendArgs), + /// Remove named assets from a finalized `.bendl` file, compacting it afterwards so the + /// payload bytes are actually reclaimed. + Remove(RemoveArgs), + /// Rewrite a `.bendl` file in place without unreferenced byte ranges (dead space left by + /// asset removals and superseded directories). + Compact(CompactArgs), } #[derive(Parser, Debug)] @@ -115,6 +121,21 @@ pub(super) struct ExtractArgs { pub overwrite: bool, } +#[derive(Parser, Debug)] +pub(super) struct RemoveArgs { + /// `.bendl` file to remove assets from. Must be finalized. + pub input: PathBuf, + /// Name of an asset to remove (e.g. `notes.txt`). May be repeated. + #[arg(long = "asset", required = true)] + pub assets: Vec, +} + +#[derive(Parser, Debug)] +pub(super) struct CompactArgs { + /// `.bendl` file to compact in place. + pub input: PathBuf, +} + #[derive(Parser, Debug)] pub(super) struct AppendArgs { /// `.bendl` file to append to. Must be finalized (`complete == 1`). diff --git a/ben/src/cli/bendl/mod.rs b/ben/src/cli/bendl/mod.rs index 62bbdb5..d8200e2 100644 --- a/ben/src/cli/bendl/mod.rs +++ b/ben/src/cli/bendl/mod.rs @@ -1,12 +1,15 @@ -//! CLI front-end for the `.bendl` bundle container. +//! CLI front-end for the `.bendl` file container. //! -//! Exposes four subcommands: +//! Exposes six subcommands: //! //! - `create` — wrap a `.ben` / `.xben` assignment stream plus optional asset files into a -//! finalized `.bendl` bundle. +//! finalized `.bendl` file. //! - `inspect` — print the header and directory of a `.bendl` file. //! - `extract` — copy the embedded stream region or a named asset out of a bundle to disk. //! - `append` — add new asset files to an already-finalized bundle without rewriting the stream. +//! - `remove` — drop named assets from a finalized bundle and compact it, so the payload bytes are +//! actually reclaimed. +//! - `compact` — rewrite a bundle in place without unreferenced byte ranges. mod append; mod args; @@ -14,6 +17,7 @@ mod create; mod extract; mod helpers; mod inspect; +mod remove; #[cfg(test)] mod tests; @@ -23,6 +27,7 @@ use args::{Args, Command}; use create::run_create; use extract::run_extract; use inspect::run_inspect; +use remove::{run_compact, run_remove}; use crate::cli::common::{set_quiet, set_verbose, CliError, CliResult}; use clap::Parser; @@ -38,6 +43,8 @@ pub fn run() -> CliResult { Command::Inspect(a) => run_inspect(a), Command::Extract(a) => run_extract(a), Command::Append(a) => run_append(a), + Command::Remove(a) => run_remove(a), + Command::Compact(a) => run_compact(a), } .map_err(CliError::from) } diff --git a/ben/src/cli/bendl/remove.rs b/ben/src/cli/bendl/remove.rs new file mode 100644 index 0000000..b08121c --- /dev/null +++ b/ben/src/cli/bendl/remove.rs @@ -0,0 +1,55 @@ +//! `bendl remove` and `bendl compact`: drop assets from a bundle and reclaim dead space. +//! +//! Removal at the appender level only rewrites the directory; the payload bytes stay behind as +//! unreferenced dead space. The `remove` subcommand therefore compacts the bundle afterwards, so +//! "removed" means the bytes are actually gone from the file. `compact` is the standalone form, +//! useful after many appends (each of which leaves a superseded directory behind). + +use super::args::{CompactArgs, RemoveArgs}; +use crate::io::bundle::compact::{compact_bundle_in_place, Compaction}; +use crate::io::bundle::writer::BendlAppender; +use std::fs::OpenOptions; + +fn describe(kind: Compaction) -> &'static str { + match kind { + Compaction::None => "already compact", + Compaction::TailRewrite => "tail rewrite; stream untouched", + Compaction::FullRewrite => "full rewrite", + } +} + +pub(super) fn run_remove(args: RemoveArgs) -> Result<(), String> { + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&args.input) + .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; + let mut appender = + BendlAppender::open(file).map_err(|e| format!("failed to open appender: {e}"))?; + for name in &args.assets { + appender + .remove_asset(name) + .map_err(|e| format!("failed to remove asset: {e}"))?; + } + appender + .commit() + .map_err(|e| format!("failed to commit removal: {e}"))?; + + // Removal only rewrites the directory; compact so the payload bytes are actually gone. + let kind = compact_bundle_in_place(&args.input) + .map_err(|e| format!("failed to compact bundle after removal: {e}"))?; + eprintln!( + "Removed {} asset(s) from {:?} and compacted it ({})", + args.assets.len(), + args.input, + describe(kind) + ); + Ok(()) +} + +pub(super) fn run_compact(args: CompactArgs) -> Result<(), String> { + let kind = compact_bundle_in_place(&args.input) + .map_err(|e| format!("failed to compact bundle: {e}"))?; + eprintln!("Compacted {:?} ({})", args.input, describe(kind)); + Ok(()) +} diff --git a/ben/src/io/bundle/compact.rs b/ben/src/io/bundle/compact.rs new file mode 100644 index 0000000..aa84548 --- /dev/null +++ b/ben/src/io/bundle/compact.rs @@ -0,0 +1,320 @@ +//! Compaction: rewriting a bundle without its unreferenced byte ranges. +//! +//! Removing an asset through [`super::writer::BendlAppender::remove_asset`] only drops the +//! directory entry, and every append leaves a superseded directory behind — both leave dead bytes +//! in the file that no reader ever touches. Compaction reclaims them. The user-facing removal +//! paths (the `bendl remove` CLI command and the Python facade) compact automatically, so for +//! them "removed" means the bytes are actually gone. +//! +//! Two strategies, chosen automatically by [`compact_bundle_in_place`]: +//! +//! - **Tail rewrite.** Asset removals and appends only ever create dead space *after* the +//! assignment stream (pre-stream assets are written back-to-back, and appends land past the +//! stream). When the prefix through the stream is fully live, only the small post-stream tail +//! (surviving appended assets + directory) is rebuilt, in place, and the file is truncated. Cost +//! is O(tail), independent of stream size — removing an appended asset from a 50 GB bundle costs +//! milliseconds and needs no scratch space. The stream is never read, so this path performs no +//! stream checksum verification. +//! - **Full rewrite.** When dead space exists before the stream (a removed pre-stream asset), the +//! bundle is rewritten wholesale: assets carried by decoded payload (verify-on-touch), stream +//! copied verbatim through the verified reader, temp file atomically swapped in. +//! +//! Both strategies preserve the stream's wire format (BEN or XBEN) as-is. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; +use std::path::Path; + +use super::format::{ + encode_directory, AssignmentFormat, BendlDirectoryEntry, BendlHeader, KnownAssetKind, + ASSET_FLAG_JSON, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, + HEADER_SIZE, +}; +use super::reader::BendlReader; +use super::writer::{AddAssetOptions, BendlWriteError, BendlWriter}; + +/// Which compaction strategy [`compact_bundle_in_place`] ended up using. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Compaction { + /// The bundle had no unreferenced bytes; the file was left untouched. + None, + /// Only the post-stream tail was rebuilt; the stream region was never read or moved. + TailRewrite, + /// The whole bundle was rewritten through a temp file and atomically swapped in. + FullRewrite, +} + +/// A single asset read back from the source bundle, ready to be re-added to the new one. +struct PreservedAsset { + asset_type: u16, + name: String, + is_json: bool, + payload: Vec, +} + +fn known_kind(asset_type: u16) -> Option { + match asset_type { + ASSET_TYPE_METADATA => Some(KnownAssetKind::Metadata), + ASSET_TYPE_GRAPH => Some(KnownAssetKind::Graph), + ASSET_TYPE_NODE_PERMUTATION_MAP => Some(KnownAssetKind::NodePermutationMap), + _ => None, + } +} + +fn add_preserved( + writer: &mut BendlWriter, + asset: &PreservedAsset, +) -> Result<(), BendlWriteError> { + let opts = if asset.is_json { + AddAssetOptions::defaults().json() + } else { + AddAssetOptions::defaults() + }; + match known_kind(asset.asset_type) { + Some(kind) => writer.add_known_asset(kind, &asset.payload, opts), + None => writer.add_custom_asset(&asset.name, &asset.payload, opts), + } +} + +/// Rewrite the finalized bundle behind `reader` into `out`, dropping unreferenced byte ranges. +/// +/// This is the full-rewrite strategy: assets are carried over by decoded payload (verify-on-touch +/// applies), and the assignment stream is copied verbatim through the verified stream reader, so +/// a checksum mismatch anywhere in the source surfaces as an error here instead of propagating. +/// Asset storage compression is normalized to the writer's default policy. Returns the +/// destination writer on success. +pub fn compact_bundle(reader: &mut BendlReader, out: W) -> Result +where + R: Read + Seek, + W: Write + Seek, +{ + if !reader.is_finalized() { + return Err(BendlWriteError::BundleIncomplete); + } + let sample_count = reader.header().sample_count; + let stream_len = reader.header().stream_len; + let assignment_format = reader + .header() + .assignment_format_typed() + .unwrap_or(AssignmentFormat::Ben); + + // Read every asset's decoded payload up front (each read borrows the reader exclusively). + let entries: Vec<_> = reader.assets().to_vec(); + let mut assets = Vec::with_capacity(entries.len()); + for entry in &entries { + let payload = reader.asset_bytes(entry).map_err(io::Error::other)?; + assets.push(PreservedAsset { + asset_type: entry.asset_type, + name: entry.name.clone(), + is_json: entry.asset_flags & ASSET_FLAG_JSON != 0, + payload, + }); + } + + let mut writer = BendlWriter::new(out, assignment_format)?; + for asset in &assets { + add_preserved(&mut writer, asset)?; + } + + if stream_len == 0 { + writer.finish() + } else { + let mut stream = reader + .assignment_stream_reader() + .map_err(io::Error::other)?; + let mut session = writer.into_stream_session()?; + io::copy(&mut stream, &mut session)?; + let writer = session.finish_into_writer(sample_count); + writer.finish() + } +} + +/// The post-stream tail to rebuild: surviving appended assets (raw on-disk bytes, so their +/// storage form and checksums carry over unchanged) followed by the new directory. +struct PlannedTail { + /// Concatenated raw bytes to write at the stream end: survivor payloads then directory. + block: Vec, + /// Final directory offset (stream end + survivor payload bytes). + directory_offset: u64, + /// Final directory length. + directory_len: u64, + /// Final file length. + file_len: u64, +} + +/// Decide whether the tail-rewrite strategy applies and, if so, plan it. +/// +/// Applicable iff the prefix `[0, stream_end)` is fully live: the pre-stream assets tile +/// `[HEADER_SIZE, stream_offset)` exactly and every other live payload sits at or beyond the +/// stream end. Returns `None` when dead bytes exist before the stream end (full rewrite needed). +fn plan_tail( + file: &mut File, + header: &BendlHeader, + entries: &[BendlDirectoryEntry], +) -> Result, BendlWriteError> { + let stream_end = header + .stream_offset + .checked_add(header.stream_len) + .ok_or_else(|| io::Error::other("stream_offset + stream_len overflowed"))?; + + let mut pre: Vec<&BendlDirectoryEntry> = Vec::new(); + let mut post: Vec<&BendlDirectoryEntry> = Vec::new(); + for entry in entries { + if entry.payload_offset < header.stream_offset { + pre.push(entry); + } else if entry.payload_offset >= stream_end { + post.push(entry); + } else { + // A payload inside the stream region is malformed; let the full path report it. + return Ok(None); + } + } + + // The prefix must be exactly tiled: header, then pre-stream payloads back-to-back, then the + // stream. Any gap means pre-stream dead space, which only a full rewrite can reclaim. + pre.sort_by_key(|e| e.payload_offset); + let mut cursor = HEADER_SIZE as u64; + for entry in &pre { + if entry.payload_offset != cursor { + return Ok(None); + } + cursor = cursor + .checked_add(entry.payload_len) + .ok_or_else(|| io::Error::other("payload range overflowed"))?; + } + if cursor != header.stream_offset { + return Ok(None); + } + + // Read survivors' raw on-disk bytes and lay them out from the stream end. + post.sort_by_key(|e| e.payload_offset); + let mut block = Vec::new(); + let mut new_entries: Vec = pre.iter().map(|e| (*e).clone()).collect(); + let mut offset = stream_end; + for entry in &post { + let mut payload = vec![0u8; entry.payload_len as usize]; + file.seek(SeekFrom::Start(entry.payload_offset))?; + file.read_exact(&mut payload)?; + block.extend_from_slice(&payload); + let mut moved = (*entry).clone(); + moved.payload_offset = offset; + new_entries.push(moved); + offset += entry.payload_len; + } + let directory_offset = offset; + let directory_bytes = encode_directory(&new_entries)?; + let directory_len = directory_bytes.len() as u64; + block.extend_from_slice(&directory_bytes); + + Ok(Some(PlannedTail { + block, + directory_offset, + directory_len, + file_len: directory_offset + directory_len, + })) +} + +/// Write `header` (patched with the given directory location) at offset 0 and sync. +fn patch_header( + file: &mut File, + header: &mut BendlHeader, + directory_offset: u64, + directory_len: u64, +) -> io::Result<()> { + header.directory_offset = directory_offset; + header.directory_len = directory_len; + file.seek(SeekFrom::Start(0))?; + header.write_to(file)?; + file.sync_data() +} + +/// Execute a planned tail rewrite crash-safely. +/// +/// Phase 1 appends the new tail block (survivor payloads + directory) at the current EOF and +/// patches the header to the appended directory — pure append, so a crash anywhere leaves either +/// the old or the appended directory authoritative over intact bytes. Phase 2 writes the same +/// block at the stream end and patches the header again; every byte it touches is dead under the +/// phase-1 state (the block is never larger than the dead region, which contains the survivors' +/// old payloads plus at least one superseded directory of equal entry count). The trailing +/// truncate runs last. +fn execute_tail( + file: &mut File, + header: &mut BendlHeader, + plan: &PlannedTail, +) -> Result<(), BendlWriteError> { + let block_start = plan.directory_offset - (plan.block.len() as u64 - plan.directory_len); + + // Phase 1: relocate the tail to the end of the file (append-only), then adopt it. + let eof = file.seek(SeekFrom::End(0))?; + debug_assert!( + plan.file_len <= eof, + "tail block must fit in the dead region" + ); + file.write_all(&plan.block)?; + file.sync_data()?; + let staged_dir_offset = eof + (plan.directory_offset - block_start); + patch_header(file, header, staged_dir_offset, plan.directory_len)?; + + // Phase 2: write the block at its final home (every touched byte is dead), adopt, truncate. + file.seek(SeekFrom::Start(block_start))?; + file.write_all(&plan.block)?; + file.sync_data()?; + patch_header(file, header, plan.directory_offset, plan.directory_len)?; + file.set_len(plan.file_len)?; + file.sync_data()?; + Ok(()) +} + +/// Compact the bundle at `path` in place, choosing the cheapest applicable strategy. +/// +/// Returns which strategy ran. [`Compaction::TailRewrite`] never reads or moves the assignment +/// stream (and therefore performs no stream checksum verification); [`Compaction::FullRewrite`] +/// streams the whole bundle through verified readers into a temp file and atomically swaps it +/// over `path`. On any error the original file is left untouched. +pub fn compact_bundle_in_place(path: &Path) -> Result { + // Parse and validate through the reader so malformed bundles are rejected up front. + let file = File::open(path)?; + let reader = BendlReader::open(BufReader::new(file)).map_err(BendlWriteError::Format)?; + if !reader.is_finalized() { + return Err(BendlWriteError::BundleIncomplete); + } + let mut header = *reader.header(); + let entries: Vec = reader.assets().to_vec(); + drop(reader); + + let mut file = OpenOptions::new().read(true).write(true).open(path)?; + if let Some(plan) = plan_tail(&mut file, &header, &entries)? { + // Already compact? Then the directory sits right at its planned offset and the file ends + // right after it — nothing to do. + let eof = file.seek(SeekFrom::End(0))?; + if header.directory_offset == plan.directory_offset && eof == plan.file_len { + return Ok(Compaction::None); + } + execute_tail(&mut file, &mut header, &plan)?; + return Ok(Compaction::TailRewrite); + } + drop(file); + + // Dead space before the stream: full rewrite through a temp file. + let file = File::open(path)?; + let mut reader = BendlReader::open(BufReader::new(file)).map_err(BendlWriteError::Format)?; + + let mut tmp = path.as_os_str().to_owned(); + tmp.push(".compact-tmp"); + let tmp = Path::new(&tmp).to_path_buf(); + + let result: Result<(), BendlWriteError> = (|| { + let out = BufWriter::new(File::create(&tmp)?); + let out = compact_bundle(&mut reader, out)?; + out.into_inner() + .map_err(|e| io::Error::other(e.to_string()))? + .sync_all()?; + fs::rename(&tmp, path)?; + Ok(()) + })(); + + if result.is_err() && tmp.exists() { + let _ = fs::remove_file(&tmp); + } + result.map(|()| Compaction::FullRewrite) +} diff --git a/ben/src/io/bundle/error.rs b/ben/src/io/bundle/error.rs index becdec0..92bf23e 100644 --- a/ben/src/io/bundle/error.rs +++ b/ben/src/io/bundle/error.rs @@ -1,4 +1,4 @@ -//! Read-side error types for `.bendl` bundles. +//! Read-side error types for `.bendl` files. //! //! [`BendlReadError`] is the canonical error type for high-level BENDL convenience APIs (anything //! that returns an owned value: `asset_bytes`, reader constructors that consume internally, etc.). diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index b075107..475c7a7 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -12,6 +12,7 @@ //! Pure functions over byte buffers; no I/O. //! - [`manifest`] — serde structs for the optional `metadata.json` asset. +pub mod compact; pub mod error; pub mod format; pub mod manifest; @@ -22,6 +23,7 @@ pub mod writer; #[cfg(test)] mod tests; +pub use compact::{compact_bundle, compact_bundle_in_place, Compaction}; pub use error::{BendlReadError, ChecksumError, ChecksumTarget}; pub use reader::{BendlReader, BendlVerifiedStreamReader, BundleValidationError}; pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 98fa2b7..d6d2782 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -182,6 +182,13 @@ impl AssetNameRegistry { } Ok(()) } + + /// Release a name (and any singleton-type claim) after its entry is removed, so the name can + /// be reused by a subsequent claim in the same session. + fn release(&mut self, asset_type: u16, name: &str) { + self.names.remove(name); + self.singleton_types.remove(&asset_type); + } } /// Writer for a single `.bendl` file. @@ -505,6 +512,10 @@ pub enum BendlWriteError { #[error("duplicate asset name: {0:?}")] DuplicateName(String), + /// A removal named an asset that does not exist in the bundle's directory. + #[error("no asset named {0:?} in bundle")] + UnknownAssetName(String), + /// A second singleton asset of this type was requested. #[error("duplicate singleton asset type: {0}")] DuplicateSingletonType(u16), @@ -572,8 +583,12 @@ pub struct BendlAppender { existing_entries: Vec, pending: Vec, /// Names and singleton types claimed by the existing directory plus any pending adds. Seeded - /// from the existing entries at open time, then extended as each pending asset is enqueued. + /// from the existing entries at open time, then extended as each pending asset is enqueued + /// (and shrunk by removals, so a removed name can be re-added in the same session). registry: AssetNameRegistry, + /// Whether any existing entry was removed; forces a directory rewrite on commit even when + /// nothing was added. + removed_any: bool, } /// An asset queued for append but not yet written to disk. @@ -634,9 +649,30 @@ impl BendlAppender { existing_entries, pending: Vec::new(), registry, + removed_any: false, }) } + /// Remove the named asset from the bundle's directory. + /// + /// Only the directory entry is dropped: the payload bytes remain in the file as unreferenced + /// dead space until the next whole-bundle rewrite (e.g. a recompression) compacts them. + /// Readers navigate solely via directory offsets, so the gap is invisible to them. The name + /// (and any singleton-type claim) becomes reusable by a subsequent add in the same session, + /// which makes remove-then-add the way to replace an asset's payload. + /// + /// Removal targets *committed* entries only; it does not touch assets enqueued with + /// [`Self::add_asset`] but not yet committed. + pub fn remove_asset(&mut self, name: &str) -> Result<(), BendlWriteError> { + let Some(pos) = self.existing_entries.iter().position(|e| e.name == name) else { + return Err(BendlWriteError::UnknownAssetName(name.to_string())); + }; + let entry = self.existing_entries.remove(pos); + self.registry.release(entry.asset_type, &entry.name); + self.removed_any = true; + Ok(()) + } + /// Enqueue a new asset for append. /// /// This validates the new asset against both the loaded directory and any previously-enqueued @@ -737,8 +773,8 @@ impl BendlAppender { /// /// If compression fails, the file is left unchanged. pub fn commit(mut self) -> Result { - // If nothing was enqueued, commit is a no-op — return the file untouched. - if self.pending.is_empty() { + // If nothing was enqueued or removed, commit is a no-op — return the file untouched. + if self.pending.is_empty() && !self.removed_any { return Ok(self.inner); } diff --git a/ben/tests/test_bendl_append_proptest.rs b/ben/tests/test_bendl_append_proptest.rs index d070339..5b4dbfc 100644 --- a/ben/tests/test_bendl_append_proptest.rs +++ b/ben/tests/test_bendl_append_proptest.rs @@ -17,9 +17,11 @@ //! was opened. use binary_ensemble::io::bundle::format::{ - AssignmentFormat, BendlDirectoryEntry, ASSET_TYPE_CUSTOM, + AssignmentFormat, BendlDirectoryEntry, KnownAssetKind, ASSET_TYPE_CUSTOM, +}; +use binary_ensemble::io::bundle::writer::{ + AddAssetOptions, BendlAppender, BendlWriteError, BendlWriter, }; -use binary_ensemble::io::bundle::writer::{AddAssetOptions, BendlAppender, BendlWriter}; use binary_ensemble::io::bundle::BendlReader; use proptest::prelude::*; use std::io::{Cursor, Read, Seek, SeekFrom}; @@ -392,3 +394,275 @@ proptest! { run_sequence(&ops); } } + +// --------------------------------------------------------------------------- +// Deterministic asset-removal tests +// --------------------------------------------------------------------------- + +#[test] +fn remove_asset_drops_entry_and_preserves_everything_else() { + let bytes = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "extra.bin", + b"keep me", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender.remove_asset("seed.bin").unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let names: Vec<_> = reader.assets().iter().map(|e| e.name.clone()).collect(); + assert_eq!(names, vec!["extra.bin".to_string()]); + // The survivor still reads back, and the stream + every remaining checksum still verify + // (the removed payload's bytes are dead space readers never touch). + let entry = reader.assets()[0].clone(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), b"keep me"); + reader.verify_all_asset_checksums().unwrap(); + reader.verify_stream_checksum().unwrap(); +} + +#[test] +fn remove_then_add_same_name_replaces_payload_in_one_session() { + let bytes = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender.remove_asset("seed.bin").unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "seed.bin", + b"new payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = find_entry(reader.assets(), "seed.bin").clone(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), b"new payload"); + reader.verify_all_asset_checksums().unwrap(); +} + +#[test] +fn remove_singleton_frees_its_type_for_re_add() { + // metadata.json is a singleton: a second add is refused until the first is removed. + let bytes = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender + .add_known_asset( + KnownAssetKind::Metadata, + b"{\"v\":1}", + AddAssetOptions::defaults().json(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + assert!(matches!( + appender.add_known_asset( + KnownAssetKind::Metadata, + b"{\"v\":2}", + AddAssetOptions::defaults().json(), + ), + Err(BendlWriteError::DuplicateSingletonType(_)) + )); + appender.remove_asset("metadata.json").unwrap(); + appender + .add_known_asset( + KnownAssetKind::Metadata, + b"{\"v\":2}", + AddAssetOptions::defaults().json(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + let entry = find_entry(reader.assets(), "metadata.json").clone(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), b"{\"v\":2}"); +} + +#[test] +fn compact_drops_dead_space_and_preserves_semantics() { + use binary_ensemble::io::bundle::compact::compact_bundle; + + // Manufacture dead space: append a 64 KiB incompressible blob (stored raw), then remove it + // with the directory-only appender removal. + let bytes = build_seed_bundle(); + let blob: Vec = (0u32..65536) + .map(|i| (i.wrapping_mul(2654435761) >> 13) as u8) + .collect(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "bloat.bin", + &blob, + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender.remove_asset("bloat.bin").unwrap(); + let bloated = appender.commit().unwrap().into_inner(); + + let mut reader = BendlReader::open(Cursor::new(bloated.clone())).unwrap(); + let compacted = compact_bundle(&mut reader, Cursor::new(Vec::new())) + .unwrap() + .into_inner(); + assert!(compacted.len() + 60_000 < bloated.len()); + + // Semantically identical: same assets, verbatim stream bytes, and every checksum holds. + let mut reader = BendlReader::open(Cursor::new(compacted)).unwrap(); + let names: Vec<_> = reader.assets().iter().map(|e| e.name.clone()).collect(); + assert_eq!(names, vec!["seed.bin".to_string()]); + let entry = reader.assets()[0].clone(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), b"seed payload bytes"); + let mut stream = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream) + .unwrap(); + assert_eq!(stream, b"STANDARD BEN FILE\x00\x01\x02"); + reader.verify_all_asset_checksums().unwrap(); + reader.verify_stream_checksum().unwrap(); +} + +#[test] +fn compact_rejects_unfinalized_bundle() { + use binary_ensemble::io::bundle::compact::compact_bundle; + + // Clear the header's `finalized` flag (byte 12 per the spec's fixed-header layout) so the + // bundle reads as incomplete. + let mut bytes = build_seed_bundle(); + assert_eq!(bytes[12], 1); + bytes[12] = 0; + + let mut reader = BendlReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!( + compact_bundle(&mut reader, Cursor::new(Vec::new())), + Err(BendlWriteError::BundleIncomplete) + )); +} + +/// Write `bytes` to a unique file under the cargo test tmpdir and return its path. +fn write_tmp_bundle(name: &str, bytes: &[u8]) -> std::path::PathBuf { + let path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR")).join(name); + std::fs::write(&path, bytes).unwrap(); + path +} + +#[test] +fn in_place_compaction_picks_tail_rewrite_and_never_touches_the_stream() { + use binary_ensemble::io::bundle::compact::{compact_bundle_in_place, Compaction}; + + // Append two assets, then remove the first via the directory-only appender removal: the + // dead space (its payload + superseded directories) is entirely post-stream. + let bytes = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "dead.bin", + &[0xAAu8; 4096], + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + appender + .add_asset( + ASSET_TYPE_CUSTOM, + "survivor.bin", + b"survivor payload", + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + let bytes = appender.commit().unwrap().into_inner(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender.remove_asset("dead.bin").unwrap(); + let bloated = appender.commit().unwrap().into_inner(); + + // Stream end = the prefix that the tail rewrite must leave byte-identical. + let stream_end = { + let reader = BendlReader::open(Cursor::new(bloated.clone())).unwrap(); + reader.header().stream_offset + reader.header().stream_len + }; + let path = write_tmp_bundle("tail-compact.bendl", &bloated); + + assert_eq!( + compact_bundle_in_place(&path).unwrap(), + Compaction::TailRewrite + ); + let compacted = std::fs::read(&path).unwrap(); + assert!(compacted.len() + 4096 <= bloated.len()); + // Everything between the (re-patched) header and the stream end is byte-identical: the + // pre-stream assets and the stream itself were never read or moved. + let header_len = 64; + assert_eq!( + &compacted[header_len..stream_end as usize], + &bloated[header_len..stream_end as usize] + ); + + // Survivor and seed assets intact (raw storage form preserved), every checksum holds. + let mut reader = BendlReader::open(Cursor::new(compacted)).unwrap(); + let names: Vec<_> = reader.assets().iter().map(|e| e.name.clone()).collect(); + assert_eq!( + names, + vec!["seed.bin".to_string(), "survivor.bin".to_string()] + ); + let survivor = find_entry(reader.assets(), "survivor.bin").clone(); + assert_eq!(reader.asset_bytes(&survivor).unwrap(), b"survivor payload"); + reader.verify_all_asset_checksums().unwrap(); + reader.verify_stream_checksum().unwrap(); + + // A second compaction finds nothing to do and leaves the file byte-identical. + let before = std::fs::read(&path).unwrap(); + assert_eq!(compact_bundle_in_place(&path).unwrap(), Compaction::None); + assert_eq!(std::fs::read(&path).unwrap(), before); +} + +#[test] +fn in_place_compaction_falls_back_to_full_rewrite_for_pre_stream_dead_space() { + use binary_ensemble::io::bundle::compact::{compact_bundle_in_place, Compaction}; + + // seed.bin is a pre-stream asset: removing it leaves dead bytes before the stream, which + // only the full rewrite can reclaim. + let bytes = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(bytes)).unwrap(); + appender.remove_asset("seed.bin").unwrap(); + let bloated = appender.commit().unwrap().into_inner(); + let path = write_tmp_bundle("full-compact.bendl", &bloated); + + assert_eq!( + compact_bundle_in_place(&path).unwrap(), + Compaction::FullRewrite + ); + let mut reader = BendlReader::open(Cursor::new(std::fs::read(&path).unwrap())).unwrap(); + assert!(reader.assets().is_empty()); + let mut stream = Vec::new(); + reader + .assignment_stream_reader() + .unwrap() + .read_to_end(&mut stream) + .unwrap(); + assert_eq!(stream, b"STANDARD BEN FILE\x00\x01\x02"); + reader.verify_stream_checksum().unwrap(); +} + +#[test] +fn remove_unknown_asset_errors_and_commit_stays_a_no_op() { + let original = build_seed_bundle(); + let mut appender = BendlAppender::open(Cursor::new(original.clone())).unwrap(); + assert!(matches!( + appender.remove_asset("missing.bin"), + Err(BendlWriteError::UnknownAssetName(_)) + )); + // The failed removal queued nothing, so commit must leave the file byte-identical. + let bytes = appender.commit().unwrap().into_inner(); + assert_eq!(bytes, original); +} diff --git a/ben/tests/test_cli.rs b/ben/tests/test_cli.rs index af41af8..e2c7d42 100644 --- a/ben/tests/test_cli.rs +++ b/ben/tests/test_cli.rs @@ -1809,6 +1809,127 @@ fn bendl_cli_create_inspect_extract_append_roundtrip() { assert_failure(&append_duplicate); } +#[test] +fn bendl_cli_remove_reclaims_bytes_and_compact_is_stable() { + let temp = TempDir::new("bendl-remove"); + + // Seed: a .ben assignment file plus a large incompressible custom asset. + let jsonl_path = temp.path().join("samples.jsonl"); + let ben_path = temp.path().join("samples.ben"); + fs::write(&jsonl_path, sample_jsonl()).unwrap(); + assert_success(&run( + "ben", + &[ + "--mode", + "encode", + jsonl_path.to_str().unwrap(), + "--output-file", + ben_path.to_str().unwrap(), + "--save-all", + "--overwrite", + ], + temp.path(), + )); + // xorshift32 output is effectively incompressible, so the blob genuinely occupies bytes + // even though `bendl create` stores large assets xz-compressed by default. + let mut state = 0x1234_5678u32; + let blob: Vec = (0..65536u32) + .map(|_| { + state ^= state << 13; + state ^= state >> 17; + state ^= state << 5; + (state >> 24) as u8 + }) + .collect(); + let blob_path = temp.path().join("bloat.bin"); + fs::write(&blob_path, &blob).unwrap(); + + let bundle_path = temp.path().join("out.bendl"); + assert_success(&run( + "bendl", + &[ + "create", + "--input", + ben_path.to_str().unwrap(), + "--output", + bundle_path.to_str().unwrap(), + "--asset", + &format!("bloat.bin={}", blob_path.display()), + "--overwrite", + ], + temp.path(), + )); + let bloated = fs::metadata(&bundle_path).unwrap().len(); + assert!(bloated > 60_000, "blob should dominate the file size"); + + // `bendl remove` drops the asset AND reclaims its bytes (auto-compaction). + assert_success(&run( + "bendl", + &[ + "remove", + bundle_path.to_str().unwrap(), + "--asset", + "bloat.bin", + ], + temp.path(), + )); + let after = fs::metadata(&bundle_path).unwrap().len(); + assert!( + after + 60_000 < bloated, + "removal must reclaim the blob's bytes ({bloated} -> {after})" + ); + + // The bundle stays finalized, the asset is gone, and the stream is byte-identical. + let inspect = run( + "bendl", + &["inspect", bundle_path.to_str().unwrap()], + temp.path(), + ); + assert_success(&inspect); + let inspect_out = String::from_utf8_lossy(&inspect.stdout); + assert!(!inspect_out.contains("bloat.bin")); + assert!(inspect_out.contains("finalized: true")); + + let recovered = temp.path().join("recovered.ben"); + assert_success(&run( + "bendl", + &[ + "extract", + bundle_path.to_str().unwrap(), + "--stream", + "--output", + recovered.to_str().unwrap(), + "--overwrite", + ], + temp.path(), + )); + assert_eq!(fs::read(&recovered).unwrap(), fs::read(&ben_path).unwrap()); + + // Removing a missing asset fails and leaves the file byte-identical (the command is + // atomic: nothing commits unless every removal succeeds). + let before = fs::read(&bundle_path).unwrap(); + let missing = run( + "bendl", + &[ + "remove", + bundle_path.to_str().unwrap(), + "--asset", + "nope.bin", + ], + temp.path(), + ); + assert_failure(&missing); + assert_eq!(fs::read(&bundle_path).unwrap(), before); + + // Standalone `bendl compact` on an already-compact bundle is byte-stable. + assert_success(&run( + "bendl", + &["compact", bundle_path.to_str().unwrap()], + temp.path(), + )); + assert_eq!(fs::read(&bundle_path).unwrap(), before); +} + // ===================================================================== // `ben encode --graph` and `ben x-encode --graph` // ===================================================================== diff --git a/fuzz/fuzz_targets/bendl_reader.rs b/fuzz/fuzz_targets/bendl_reader.rs index 42952e1..727cb98 100644 --- a/fuzz/fuzz_targets/bendl_reader.rs +++ b/fuzz/fuzz_targets/bendl_reader.rs @@ -1,4 +1,4 @@ -//! Coverage-guided fuzzing of the `.bendl` bundle read surface. +//! Coverage-guided fuzzing of the `.bendl` file read surface. //! //! Mutants split into the same two classes as the deterministic harness: open-rejected (the //! constructor is the whole reachable surface) and openable (every accessor must then hold the From 99c568e9d2a701c269974b1393a09195e04a7f44 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:16:07 -0600 Subject: [PATCH 169/221] Docs..... --- README.md | 439 ++++---- ben-py/docs/_refresh_notebooks.py | 34 + ben-py/docs/api/bundle.md | 13 +- ben-py/docs/api/codec.md | 2 +- ben-py/docs/api/index.md | 2 +- ben-py/docs/changelog.md | 6 +- ben-py/docs/concepts/api-map.md | 2 +- ben-py/docs/concepts/cli-parity.md | 2 + ben-py/docs/concepts/compatibility.md | 2 +- ben-py/docs/concepts/compression.md | 2 +- ben-py/docs/concepts/data-model.md | 10 +- ben-py/docs/concepts/formats.md | 8 +- ben-py/docs/concepts/jsonl-schema.md | 8 +- ben-py/docs/concepts/limitations.md | 10 +- ben-py/docs/concepts/ordering-deep-dive.md | 4 +- ben-py/docs/concepts/overview.md | 2 +- ben-py/docs/concepts/performance.md | 2 +- ben-py/docs/concepts/vocabulary.md | 6 +- ben-py/docs/conf.py | 20 +- ben-py/docs/getting-started/quickstart.md | 6 +- ben-py/docs/how-to/anti-patterns.md | 4 +- ben-py/docs/how-to/api-cookbook.md | 8 +- ben-py/docs/how-to/compress-gerrychain-run.md | 6 +- ben-py/docs/how-to/convert-formats.md | 2 +- .../docs/how-to/custom-assets-and-append.md | 39 +- ben-py/docs/how-to/end-to-end-workflow.md | 6 +- ben-py/docs/how-to/error-reference.md | 28 +- ben-py/docs/how-to/examples-gallery.md | 4 +- ben-py/docs/how-to/index.md | 6 +- ben-py/docs/how-to/read-and-iterate.md | 2 +- ben-py/docs/how-to/shrink-for-sharing.md | 15 +- ben-py/docs/how-to/subsample.md | 4 +- ben-py/docs/how-to/troubleshooting.md | 19 +- ben-py/docs/index.md | 6 +- ben-py/docs/user/using_ben_py.ipynb | 268 ++--- ben-py/docs/user/using_bendl.ipynb | 951 +++++++++--------- docs/ben-format-spec.md | 2 +- docs/bendl-format-spec.md | 8 + docs/coding-standards.md | 14 + docs/twodelta-format-spec.md | 4 +- 40 files changed, 1018 insertions(+), 958 deletions(-) create mode 100644 ben-py/docs/_refresh_notebooks.py diff --git a/README.md b/README.md index dfc9cb8..ca8f929 100755 --- a/README.md +++ b/README.md @@ -1,302 +1,229 @@ -# Binary Ensemble Compression (BEN) +# Binary Ensemble (BEN) -This library is built as an analogue to [PCompress](https://github.com/mggg/pcompress) and is -designed to help improve storage of general ensembles of redistricting plans. +**Compress, store, and stream massive ensembles of districting plans.** -More specifically, this package is designed to take canonicalized JSONL files which store ensembles -using lines of the form +Redistricting samplers like [GerryChain](https://gerrychain.readthedocs.io)'s ReCom, +ForestReCom, and Sequential Monte Carlo routinely emit millions of plans. Stored as JSONL, a +single ensemble can run to tens of gigabytes — most of it redundant, because each plan is +mostly long runs of the same district id and consecutive plans barely differ. BEN is a +compression format and toolkit built for exactly this data. It was built as an analogue to +[PCompress](https://github.com/mggg/pcompress) and interoperates with it. -``` - {"assignment": , "sample": } -``` - -and compress them into pure binary files. - -## Usage - -From a checkout, install the command-line tools with Cargo using - -``` -cargo install --path ben -``` - -[Here](./example/small_example.jsonl) is a link to a small example file that you can use to see what -the binary-ensemble package is capable of. - -Then you can run the package in one of the following modes (assuming that `~/.cargo/bin` is in your -path): - -- Encode - -``` -ben -m encode small_example.jsonl # Outputs small_example.jsonl.ben -``` - -- XEncode - -``` -ben -m x-encode small_example.jsonl # Outputs small_example.jsonl.xben -``` - -- Decode - -``` -ben -m decode small_example.jsonl.ben -o re_small_example.jsonl # Outputs re_small_example.jsonl -``` - -- XDecode - -``` -ben -m x-decode -p small_example.jsonl.xben # Prints decoding to the console -``` +> A real 100k-plan ensemble on Colorado's ~140k census blocks is **27 GB** as JSONL. +> Relabeled and sorted by `GEOID20` it becomes a **~550 MB** BEN file, and then a **~6 MB** +> XBEN file — over a **4500× reduction**, fully lossless. -- Read +The expected input is canonicalized JSONL, one plan per line: ``` -ben -m read -n 4 small_example.jsonl # Outputs [1,1,1,2,2,2,3,2,3,1,4,4,4,3,3,4] +{"assignment": [1, 1, 2, 2, ...], "sample": 1} +{"assignment": [1, 2, 2, 2, ...], "sample": 2} ``` -- XZCompress +## The format family -``` -ben -m xz-compress small_example.jsonl # Outputs small_example.jsonl.xz -``` - -- XZDecompress - -``` -ben -m xz-decompress small_example.jsonl.xz # Outputs small_example.jsonl -``` - -There is also a `reben` CLI tool that is available through this package, but there is more -information about that in the [Relabeling](#relabeling-for-smaller-files) section +| Format | What it is | Use it for | +|---|---|---| +| `.ben` | Bit-packed, run-length-encoded stream | Working with an ensemble: reading, replaying, subsampling | +| `.xben` | A BEN stream further compressed with LZMA2 | Long-term storage and transfer (slow to create, fast to extract) | +| `.bendl` | A BEN or XBEN stream plus the dual graph, metadata, and custom assets in one self-describing file | Sharing an ensemble without losing its context | -## How it works +The byte-level layouts are specified in [`docs/`](./docs): +[BEN/XBEN](./docs/ben-format-spec.md) · [TwoDelta variant](./docs/twodelta-format-spec.md) · +[BENDL](./docs/bendl-format-spec.md) · [format stability policy](./docs/format-stability.md). -There isn't a lot of complexity to the algorithm that we employ here; the power of the compression -comes from the fact that we use some of the expected information regarding the assignment vectors to -shrink the size of the compressed vector substantially. +## What's in the repository -The BEN compression format is a bit-level compression algorithm (as compared to a byte-level -compression seen in most compression applications) which works in two stages: +| Component | What it does | +|---|---| +| [`ben/`](./ben) | The Rust crate ([`binary-ensemble` on crates.io](https://crates.io/crates/binary-ensemble)) and the CLI tools below | +| `ben` (CLI) | Encode/decode between JSONL, BEN, and XBEN; random-access sample lookup | +| `reben` (CLI) | Relabel and reorder ensembles so they compress dramatically better | +| `bendl` (CLI) | Create, inspect, extract from, and append assets to `.bendl` files | +| `pcben` (CLI) | Convert between BEN and [PCompress](https://github.com/mggg/pcompress) formats | +| [`ben-py/`](./ben-py) | The Python package ([`binary-ensemble` on PyPI](https://pypi.org/project/binary-ensemble/)) — full docs at [binary-ensemble.readthedocs.io](https://binary-ensemble.readthedocs.io/) | +| [`docs/`](./docs) | Format specifications, stability policy, and project glossary | +| [`example/`](./example) | Small sample files used throughout this README | +| [`fuzz/`](./fuzz) | Fuzz targets for the readers and writers | -1. Run length-encoding (RLE) -1. Bit compatification +## Install -The first step is pretty simple: given an assignment like +Command-line tools (installs `ben`, `reben`, `bendl`, and `pcben`): -``` -[1,1,1,2,2,2,2,3,1,3,3,3] +```bash +cargo install binary-ensemble # from crates.io +cargo install --path ben # from a repository checkout ``` -We can encode this vector using an ordered pair $(value,, length)$ to get the vector +Python package: -``` -[(1,3), (2,4), (3,1), (1,1), (3,3)] +```bash +pip install binary-ensemble ``` -While not very efficient in the above example, in the majority of districting plans, if we order the -nodes in the assignment vector according to something geographical, (e.g. GEOID), then the savings -can be substantial. +## CLI quick start -The BEN standard then takes this vector and compactifies it into the following series of bytes: +Using [`example/small_example.jsonl`](./example/small_example.jsonl): -``` -00000010_ <- the maximum number of bits needed to store the assignment values (2) -00000011_ <- the maximum number of bits needed to store the length values (3) -00000000 -00000000 -00000000 -00000100_ <- the number of bytes needed to store the entire vector (4) -01011_101 -00_11001_0 -1001_1101 -1_0000000 <- the bit-packed assignment vector +```bash +ben -m encode small_example.jsonl # -> small_example.jsonl.ben +ben -m x-encode small_example.jsonl.ben # -> small_example.jsonl.xben +ben -m decode small_example.jsonl.xben -w # XBEN -> BEN (one layer down) +ben -m decode small_example.jsonl.ben -o roundtrip.jsonl # BEN -> JSONL +ben -m lookup -n 4 small_example.jsonl.ben # prints sample 4: [1, 1, 1, 2, ...] ``` -### Relabeling for Smaller Files +`ben` also has `x-decode` (XBEN straight to JSONL) and general-purpose `xz-compress` / +`xz-decompress` modes. The `--variant` flag selects the frame encoding (`standard`, +`mkvchain`, or `twodelta`); readers detect the variant automatically, so it is only ever +specified when encoding. -Since `ben` uses RLE under the hood, anything that can be done to improve the likelihood of long -runs of the same assignment in the output assignment vector will improve the compression -substantially. In general, this just means that we need to sort the nodes in the JSON file in such a -way that they have a high likelihood of being grouped together in the final assignment. Of course, -there is no BEST way to do this, but, generally, sorting the JSON file according to geographic -markers is a pretty good way to go. +### One self-describing file: `bendl` -Consider, for example [this](./example/CO_small.json) dual-graph file for the state of Colorado. -This is a block-level dual-graph file containing ~140k nodes in no particular order. If we then use -this dual-graph file to generate 100k example plans and store the result in an XBEN file, we end up -with something like [this](./example/100k_CO_chain.jsonl.xben). +A plain stream is just assignments — it is meaningless without the dual graph that defines +its node order. A `.bendl` file keeps the stream and that context together: -While this file is substantially smaller than the original JSONL file (which clocks in at a whopping -27GB), it is still not as small as we might like. However, we can improve the size of these files -with a little bit of relabeling. - -Before we can see the gains that relabeling buy us, we will need to extract the XBEN file back into -a BEN file so that we can work with it (WARNING: The BEN file that this will generate is ~7GB, but -we will fix that soon), so we will need to run the command - -``` -ben -m decode 100k_CO_chain.jsonl.xben +```bash +bendl create -i small_example.jsonl.ben -o run.bendl --metadata meta.json +bendl inspect run.bendl # header, sample count, asset directory +bendl append run.bendl --asset notes.txt=notes.txt +bendl extract run.bendl --asset metadata.json -o meta-out.json +bendl extract run.bendl --stream -o extracted.ben ``` -The first thing that we can do is change up the labeling of our plans so that districts are labeled -in the order that they appear in the assignment vector. For example, if we have the assignment -vectors +`create` and `append` also take `--graph` and `--node-permutation-map` for the standardized +assets. Asset payloads are checksummed (CRC32C) and xz-compressed on disk by default. -``` -[2,2,3,3,1,1,4,4] -[2,2,3,3,4,4,1,1] -``` +## Making files smaller: `reben` -we, as humans can see that these are just the same assignments with some numbers switched around. -However, the XBEN compressor (which uses LZMA2 for compression) does not have the context that we -have, so, it is necessary for us to help it along a little bit. +BEN's core compression is run-length encoding, so anything that produces longer runs of the +same district id shrinks the files dramatically. `reben` provides the two big levers: -Here, we can make use of the `reben` (short for relabeling-BEN) CLI tool to do this for us: +1. **First-seen relabeling.** `[2,2,3,3,1,1]` and `[1,1,2,2,3,3]` are the same partition + with different labels, but the XBEN compressor cannot know that. Renumbering districts in + order of first appearance makes equivalent plans encode identically: -``` -reben -m ben 100k_CO_chain.jsonl.ben -``` + ```bash + reben -m ben 100k_CO_chain.jsonl.ben + # -> 100k_CO_chain_first_seen_relabeled.jsonl.ben + ``` -This generally produces an improvement on the XBEN compression without fundamentally altering -anything about the underlying data (beyond the relabeling), so it's generally recommended that to -run things through `reben` before compressing into an XBEN format. In our running example, we can -then compress this file back down to an XBEN format using +2. **Node reordering.** Nearby geographic units tend to share a district, so sorting the + dual graph's nodes by a geographic key (or a topology-based ordering via `--ordering mlc` + / `--ordering rcm`) turns each plan into a handful of long runs: -``` -ben -m x-encode 100k_CO_chain_first_seen_relabeled.jsonl.ben -``` + ```bash + reben -m ben -d CO_small.json -k GEOID20 100k_CO_chain_first_seen_relabeled.jsonl.ben + # -> ..._sorted_by_GEOID20.jsonl.ben (the rewritten ensemble) + # -> CO_small_sorted_by_GEOID20.json (the reordered dual graph) + # -> CO_small_sorted_by_GEOID20_map.json (the reversible permutation map) + ``` -DON'T ACTUALLY DO THIS, IT WILL TAKE OVER AN HOUR!!! +On the Colorado example ([`example/CO_small.json`](./example/CO_small.json), with the +100k-plan ensemble in [`example/100k_CO_chain.jsonl.xben`](./example/100k_CO_chain.jsonl.xben)), +this takes the BEN file from ~7 GB to ~550 MB, and the final `ben -m x-encode` brings it to +~6 MB. + +**A note on speed:** XBEN *decoding* is fast — a large file extracts in minutes. High-ratio +XBEN *encoding* is slow (an hour or more for block-level ensembles; ~10 minutes for VTD-level +ones). Encode to XBEN once for storage; work against BEN day to day. -**NOTE:** Decoding is fast, but encoding with high compression does take time (maybe an hour or 2 -with how big this ensemble is, but that is only because this file is on census blocks. Files with -VTDs tend to only take 10 minutes or so.) +## Python -However, this is not the only thing that we can do to make the file smaller. An often more effective -strategy is to sort the file by using some geographical information before we run the chain. Since -nearby geographical regions tend to be placed into the same districts as each other, sorting the -nodes in the original JSON file according to something like GEOID tends to produce exceptionally -short run-length encodings (and thus, exceptionally small BEN files). However, it is not always the -case that we have the foresight to do this, so the question then becomes "can we sort the vectors -after we have run the simulation already?" to which the answer is "of course!" +The Python package wraps the same engine and adds an ergonomic streaming API: -This is where the other aspects of `reben`come into play. If we would like to produce a new mapping -for our dual-graph files so that they are sorted according to some key value then we may use the -command +```python +from binary_ensemble import BendlEncoder, BendlDecoder + +plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] -``` -reben -m ben -s -k -``` - -In our example, the CO_small.json file has the GEOID20 key that we would like to sort on, so we call -the command +encoder = BendlEncoder("ensemble.bendl", overwrite=True) +encoder.add_metadata({"sampler": "demo", "seed": 1234}) +with encoder.stream("ben") as stream: + for assignment in plans: + stream.write(assignment) -``` -reben -m ben -s CO_small.json -k GEOID20 100k_CO_chain_first_seen_relabeled.jsonl.ben +for assignment in BendlDecoder("ensemble.bendl"): + print(assignment) ``` -This will produce the files +See [binary-ensemble.readthedocs.io](https://binary-ensemble.readthedocs.io/) for the +quickstart, concept guides, how-to recipes (including streaming a GerryChain run straight +into a `.bendl` file), and the full API reference. -- 100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.ben (~550Mb) -- CO_small_sorted_by_GEOID20_map.json (a map file containing the new data) -- CO_small_sorted_by_GEOID20.json (a dual-graph file with the nodes shifted around) +## How the compression works -Notice, our BEN file has now shrunk from ~7Gb to around 0.5Gb, which is pretty good! Now, we can -further compress this file using the `x-encode` mode of the `ben` CLI - -``` -ben -m x-encode 100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.ben -``` +A BEN stream encodes each assignment vector in two stages: -And this will produce the file -`100k_CO_chain_first_seen_relabeled_sorted_by_GEOID20.jsonl.xben` which will only be ~6Mb! That -is over a 1000x improvement over the original BEN file, and over a 4500x improvement on the JSONL -file! - -### Assumptions - -The BEN compressor does make some assumptions about the data that the user should be aware of: - -- When using the `ben` CLI tool, it is assumed that the assignments in the assignment vector are - stored in the same order as the nodes in some JSON dual-graph file. While this seems to be the - standard, it is incumbent on the user to make sure that they know which dual-graph file / node - labeling produced these assignment vectors. - -- When the samples are encoded into BEN formatting, the decoded samples will always start at 1. - -- The maximum value of any assignment value is assumed to be 65535 (the largest number that can be - stored in 16 bits) and, likewise, the longest a single assignment run length is assumed to be - 65535\. In all practical applications, this should not cause any issues unless the user is - specifically looking at ways to split Idaho into congressional districts at the census block level - between the years 2010-2020 and they make the decision to sort the dual-graph file according to - the congressional assignments (and if you are doing that, 1. why? and 2. maybe sort the dual-graph - file in some other way that is more meaningful using `reben` first). None of the other states can - cause any issues for any of the state-wide races. - -- The computer that is applying the BEN and XBEN encoding and decoding algorithms is assumed to have - sufficient memory to store an entire assignment vector. This should not be an issue since any - computer that is trying to extract information from these files is presumably also doing analysis - with the ensemble of plans, but it is worth mentioning. - -## More on the XBEN format - -The XBEN (short for eXtreme-BEN) format is the ideal format to use for large data storage and for -the transferring of ensembles from user-to-user. Compared to BEN, XBEN uses an implementation of -[LZMA2](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Markov_chain_algorithm) to further -take advantage of the repetition often present across ensembles of redistricting plans to further -reduce the size of the file. - -This, as one might think from the name, the Lempel-Ziv Markov Algorithm (named because it uses a -Markov Algorithm under the hood, not because it is good at compressing data arising from a Markov -process), is particularly good at improving the compression ration of ensembles generated by Markov -methods like GerryChain or Forest Recom. It also works well at generally improving the compression -of BEN files. The LZMA2 algorithm is based off of the -[LZ-77](https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ77) algorithm which replaces repeated -occurrences of data with a single copy of that data appearing earlier in the data stream. LZMA2 then -uses a Markov process to dynamically determine the best Variable-Length Encoding (VLE) that can be -assigned to the most frequently occurring sequential bytes of data that appear within a data stream, -and then encodes the data using these codes (for a simplified idea of what is going on here, see -[Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding)). - -Of course, in order for the LZMA2 to work, we need for the data to be encoded using bytes instead of -the bit-packing method employed by BEN, so, when converting from a BEN file to an XBEN file (the bit -packing tends to produce data that looks like a random series of bits which is generally -incompressable), we actually unpack each assignment vector into an intermediate format known as -BEN32, which is an RLE that uses 32 bits to encode each assignment vector (using a big-endian -formatted u16 for the value and a big-endian formatted u16 for the length and a null u32 as the -separator). The LZMA2 algorithm is then able to make use of the repetition of the bytes across the -BEN32 file to substantially improve the compression. - -**NOTE:** The decompression from the BEN to the BEN32 format only processes one RLE assignment -vector at a time, so the memory requirement for the extra compression using LZMA2 is not that large. - -## When to use ben vs xben - -The BEN file format is designed to be a usable format for working with ensembles of plans. That is -to say, it comes with some auxiliary functionality that allows for the user to read off the -assignment vector for a particular sample number easily, and thus can be used to "replay" an -ensemble coming from a Markov chain if desired. - -The XBEN file format is designed to be used for storage only. The `ben` CLI tool has been built with -an emphasis on fast decompression, so any file that is stored as an XBEN file can quickly (in under -5 minutes) be extracted into a usable BEN format. Of course, the trade-off for this is that the -compression itself is fairly slow, and can sometimes take several hours to finish if the data is not -relabeled to improve the efficiency. However, considering any method used to create an ensemble of -plans is likely to take several hours anyway, the additional compression time to get a small XBEN -file should be mostly inconsequential in the grand scheme of things. - -## Limitations - -Since the BEN format and CLI tool is designed to work with general ensembles of districting plans, -it does come with some limitations. First and foremost, while BEN excels at the storage of ensembles -of plans built on census blocks, the compression ratios tend to be smaller when considering -ensembles of plans for things like VTDs or Tracts. In practice, since the assignment vectors for -these plans do not tend to be very long (maybe 10-20k), this is not that big of an issue, but it is -worth keeping that in mind. - -In the event that an exceptionally small file is needed for compressing a districting ensemble -arising from a Markov Chain Monte Carlo method (e.g. Recom or Forest Recom) on larger subunits like -tracts, the [PCompress](https://github.com/mggg/pcompress) format, which employs a byte-level delta -encoding, is a good alternative choice. +1. **Run-length encoding.** `[1,1,1,2,2,2,2,3,1,3,3,3]` becomes + `[(1,3), (2,4), (3,1), (1,1), (3,3)]` — districting plans on a well-ordered graph are + mostly long runs. +2. **Bit-packing.** Each frame stores its values and lengths at the minimum bit width its + own maxima require. The example above has a max id of 3 (2 bits) and a max run length of + 4 (3 bits), so the frame is: + + ``` + 00000010 <- bits per district id (2) + 00000011 <- bits per run length (3) + 00000000 00000000 00000000 00000100 <- payload byte length (4, big-endian u32) + 01011_10100_11001_01001_11011_0000000 <- the five runs, 5 bits each, zero-padded + ``` + +XBEN then runs LZMA2 over the stream to exploit the repetition *across* plans. Bit-packed +frames don't line up byte-for-byte, so the encoder first re-expands them into a byte-aligned +intermediate form (BEN32, one `(value, length)` pair per 4 bytes) that LZMA2 can deduplicate +— one frame at a time, so memory stays flat. This is also why first-seen relabeling helps: +structurally identical plans become byte-identical, and LZMA2 collapses them. + +The `pcben` tool converts between BEN and PCompress (`pcben -m ben-to-pc / pc-to-ben / +pc-to-xben -i `), so ensembles can move between the two ecosystems. + +## Assumptions and limitations + +- **Node order is the contract.** Assignments are stored in dual-graph node order; the + format cannot detect a mismatched graph. Decoding a stream against the wrong node order + yields valid-looking but wrong plans — which is exactly why `.bendl` files embed the + graph. +- **Samples are 1-indexed**, and decoded ensembles always start at sample 1. +- **District ids and run lengths are 16-bit** (ids 0–65535, run lengths 1–65535) — far + beyond any statewide redistricting use. +- A machine running the codecs only needs memory for one assignment vector at a time. +- BEN shines on long assignment vectors (census blocks). On coarser units (VTDs, tracts, + ~10–20k nodes) the ratios are more modest; for exceptionally small MCMC ensembles on + coarse units, [PCompress](https://github.com/mggg/pcompress)'s byte-level delta encoding + is a good alternative. + +## Testing and format stability + +A compression format's worst failure mode is decoding *silently wrong*, so the test policy +is built around the wire formats rather than just the code: + +- **Golden fixtures.** Byte-exact reference files for every format and variant — including + one minted by the real PCompress encoder for interop — are committed under + [`ben/tests/fixtures/`](./ben/tests/fixtures). Any later version of the readers must + accept them unchanged; format additions mint new fixtures instead of rewriting old ones. + The rules are spelled out in the [format stability policy](./docs/format-stability.md). +- **Mutation tests.** Every committed fixture is re-read under exhaustive single-byte + mutations: corruption must fail loudly, never decode into plausible-looking data. +- **Property-based tests** (proptest) check encode/decode round-trips, boundary conditions, + and that the high-level operations agree with their naive reference implementations. +- **Streaming soak test.** An encoder thread pipes a multi-gigabyte logical ensemble into a + decoder while the test asserts peak memory stays bounded — pinning the "streaming, not + slurping" invariant that nothing else in the suite would catch. +- **Coverage-guided fuzzing** ([`fuzz/`](./fuzz)) of the four read surfaces (BEN, XBEN + container and body, BENDL), since readers must survive arbitrary untrusted bytes. +- **Big-endian suite.** The full Rust suite also runs on s390x under QEMU, because the + formats are byte-order-sensitive and an endianness bug would corrupt silently. +- **Docs are tests.** Every Python code snippet in the documentation executes under pytest, + and the tutorial notebooks run end to end in CI with warnings as errors. + +CI runs format and lint checks on every push; the heavy gates (full Rust + Python suites, +big-endian, time-boxed fuzzing) run on demand via `/ci-full`, `/ci-endian`, and `/ci-fuzz` +PR comments or a manual workflow dispatch. Release builds produce wheels for Linux, macOS, +and Windows (including ARM), smoke-testing every wheel that can execute on its build +runner. Locally, the same gates are exposed as `task test`, `task fuzz`, `task test-endian`, +and `task coverage-summary`. + +## License + +MIT — see [LICENSE](./LICENSE.md). diff --git a/ben-py/docs/_refresh_notebooks.py b/ben-py/docs/_refresh_notebooks.py new file mode 100644 index 0000000..7f646d0 --- /dev/null +++ b/ben-py/docs/_refresh_notebooks.py @@ -0,0 +1,34 @@ +"""Re-execute tutorial notebooks in place, refreshing their committed outputs. + +The docs site renders the outputs committed inside each ``.ipynb`` (Sphinx runs with +``nb_execution_mode = "off"`` by default), so whenever a notebook's code cells change, this +script must be run to regenerate those outputs. Use the ``docs-refresh-notebooks`` task, which +runs it with the docs execution extras installed. + +Each notebook executes with its own directory as the working directory, so relative paths +(``example_data/``) behave exactly as they do in CI. +""" + +import sys +from pathlib import Path + +import nbformat +from nbclient import NotebookClient + + +def refresh(path: Path) -> None: + nb = nbformat.read(path, as_version=4) + client = NotebookClient( + nb, + timeout=1800, + kernel_name="python3", + resources={"metadata": {"path": str(path.parent)}}, + ) + client.execute() + nbformat.write(nb, path) + print(f"refreshed {path}") + + +if __name__ == "__main__": + for arg in sys.argv[1:]: + refresh(Path(arg)) diff --git a/ben-py/docs/api/bundle.md b/ben-py/docs/api/bundle.md index 0b3ea47..322586a 100644 --- a/ben-py/docs/api/bundle.md +++ b/ben-py/docs/api/bundle.md @@ -14,7 +14,7 @@ was written against. |---|---| | Create a new bundle | `BendlEncoder(path, overwrite=True)` | | Attach a dual graph | `encoder.add_graph(graph, sort=...)` | -| Stream assignments while sampling | `with encoder.stream("ben") as stream: ...` | +| Stream assignments while sampling | `with encoder.stream() as stream: ...` | | Read assignments and assets | `BendlDecoder(path)` | | Reorder/relabel an existing bundle | `relabel_bundle(...)` | | Recompress a bundle to XBEN | `compress_stream(...)` | @@ -50,7 +50,7 @@ from binary_ensemble import BendlEncoder encoder = BendlEncoder("api-demo.bendl", overwrite=True) encoder.add_metadata({"sampler": "demo"}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) stream.write([1, 2, 2, 2]) ``` @@ -80,7 +80,7 @@ for node in graph.nodes: encoder = BendlEncoder("api-graph.bendl", overwrite=True) ordered_graph = encoder.add_graph(nx.adjacency_data(graph), sort="key", key="GEOID20") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) assert ordered_graph.number_of_nodes() == 4 @@ -100,7 +100,7 @@ assignments, then close. A bundle can have only one assignment stream. from binary_ensemble import BendlEncoder encoder = BendlEncoder("api-session.bendl", overwrite=True) -with encoder.stream("ben", variant="twodelta") as stream: +with encoder.stream(variant="twodelta") as stream: for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: stream.write(assignment) ``` @@ -120,6 +120,7 @@ with encoder.stream("ben", variant="twodelta") as stream: | `assignment_format()` | `"ben"` or `"xben"` for the embedded stream | | `version()` / `is_complete()` | Bundle header inspection | | `asset_names()` / `list_assets()` | Asset directory inspection | +| `verify()` | Check every asset and stream checksum; raises on corruption | | `read_graph()` | `networkx.Graph` rebuilt from `graph.json`, or `None` | | `read_metadata()` | Parsed `metadata.json`, or `None` | | `read_node_permutation_map()` | Parsed permutation map, or `None` | @@ -159,8 +160,8 @@ relabel_bundle("ensemble.bendl", out_file="api-sorted.bendl", sort="mlc") compress_stream("api-sorted.bendl", out_file="api-archive.bendl") ``` -Both transforms require exactly one output mode: pass `out_file=...` to create a new file or -`in_place=True` to atomically replace the input. +Both transforms take an optional `out_file`: pass one to create a new file (`overwrite=True` +replaces an existing one), or leave it off to atomically replace the input in place. ```{eval-rst} .. autofunction:: binary_ensemble.bundle.compress_stream diff --git a/ben-py/docs/api/codec.md b/ben-py/docs/api/codec.md index 287d4b0..c3739ea 100644 --- a/ben-py/docs/api/codec.md +++ b/ben-py/docs/api/codec.md @@ -24,7 +24,7 @@ The expected JSONL shape is: ``` Only the `assignment` values are encoded into the stream. Store graph data, sampler -settings, scores, and provenance in a `.bendl` bundle if they need to travel with the file. +settings, scores, and provenance in a `.bendl` file if they need to travel with the file. ```{eval-rst} .. automodule:: binary_ensemble.codec diff --git a/ben-py/docs/api/index.md b/ben-py/docs/api/index.md index 63a0814..450c2ef 100644 --- a/ben-py/docs/api/index.md +++ b/ben-py/docs/api/index.md @@ -6,7 +6,7 @@ listed here is also re-exported from the top-level `binary_ensemble` namespace, `from binary_ensemble.bundle import BendlEncoder` are equivalent. ```{tip} -New here? Reach for **{mod}`binary_ensemble.bundle`** first. A `.bendl` bundle keeps the +New here? Reach for **{mod}`binary_ensemble.bundle`** first. A `.bendl` file keeps the assignment stream and its dual graph together in one self-describing file, which is what you want the vast majority of the time. The other modules are for plain streams, whole-file conversions, and graph preprocessing. diff --git a/ben-py/docs/changelog.md b/ben-py/docs/changelog.md index eef3331..fe8ee9f 100644 --- a/ben-py/docs/changelog.md +++ b/ben-py/docs/changelog.md @@ -8,11 +8,15 @@ stability promises for the BEN/XBEN/BENDL formats themselves are covered separat First stable release of the rewritten Python API. -- **`.bendl` bundles** — `BendlEncoder` / `BendlDecoder` read and write the single-file +- **`.bendl` files** — `BendlEncoder` / `BendlDecoder` read and write the single-file bundle format: an assignment stream plus the dual graph, node permutation map, metadata, and custom assets. `compress_stream` recompresses a bundle's stream to XBEN and `relabel_bundle` reorders a bundle's graph and rewrites its stream to match, both preserving every asset. +- **Custom assets** — `add_asset` accepts JSON, text, and arbitrary binary payloads (plus a + `"file"` content type that reads a path), with CRC32C checksums on every asset, + transparent xz compression for payloads of 1 KiB or more, and `BendlDecoder.verify()` to + validate a whole bundle's checksums in one call. - **Plain streams** — `BenEncoder` / `BenDecoder` write and iterate plain `.ben`/`.xben` streams, with frame-skipping subsampling (`subsample_indices`, `subsample_range`, `subsample_every`) shared with the bundle decoder. diff --git a/ben-py/docs/concepts/api-map.md b/ben-py/docs/concepts/api-map.md index df0a662..adc9858 100644 --- a/ben-py/docs/concepts/api-map.md +++ b/ben-py/docs/concepts/api-map.md @@ -5,7 +5,7 @@ Knowing which module owns which job makes the whole surface easy to navigate. | Module | Mirrors CLI | Owns | |---|---|---| -| {mod}`binary_ensemble.bundle` | `bendl` | Creating, reading, and transforming `.bendl` bundles | +| {mod}`binary_ensemble.bundle` | `bendl` | Creating, reading, and transforming `.bendl` files | | {mod}`binary_ensemble.stream` | `ben` | Reading and writing plain `.ben`/`.xben` streams | | {mod}`binary_ensemble.codec` | `ben` (encode/decode modes) | Whole-file conversions between JSONL, BEN, and XBEN | | {mod}`binary_ensemble.graph` | `reben` (orderings) | Reordering a dual graph before encoding | diff --git a/ben-py/docs/concepts/cli-parity.md b/ben-py/docs/concepts/cli-parity.md index ec66881..047773b 100644 --- a/ben-py/docs/concepts/cli-parity.md +++ b/ben-py/docs/concepts/cli-parity.md @@ -17,6 +17,8 @@ the CLI split so workflows can move between notebooks, scripts, and shell pipeli | Inspect a BENDL bundle | `BendlDecoder(...).list_assets()` | Also exposes graph and metadata helpers | | Extract a bundle stream | `BendlDecoder(...).extract_stream(...)` | Copies embedded BEN/XBEN stream bytes | | Append bundle assets | `BendlEncoder.append(...)` | Asset appends only; no stream appends | +| Remove bundle assets | `BendlEncoder.remove_asset(...)` | Compacts automatically, like `bendl remove` | +| Compact a bundle | automatic | Every Python write path keeps bundles compact; `bendl compact` exists for files from other tools | | Relabel/reorder a bundle | `relabel_bundle(...)` | Requires BEN stream plus graph | | Recompress bundle stream | `compress_stream(...)` | Embedded BEN stream to embedded XBEN stream | | Reorder a graph | `binary_ensemble.graph.reorder(...)` | Same orderings as bundle relabeling | diff --git a/ben-py/docs/concepts/compatibility.md b/ben-py/docs/concepts/compatibility.md index c8b1943..0b6a052 100644 --- a/ben-py/docs/concepts/compatibility.md +++ b/ben-py/docs/concepts/compatibility.md @@ -97,6 +97,6 @@ encoder.add_metadata( "binary_ensemble": binary_ensemble.__version__, } ) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` diff --git a/ben-py/docs/concepts/compression.md b/ben-py/docs/concepts/compression.md index 6e63c9b..dc5aabb 100644 --- a/ben-py/docs/concepts/compression.md +++ b/ben-py/docs/concepts/compression.md @@ -125,7 +125,7 @@ equivalent plans encode identically and compress better. Run it before encoding The recommended pipeline for a small, shareable archive is: -1. Build a `.bendl` bundle with a BEN stream while sampling (ideally on an already-reordered graph). +1. Build a `.bendl` file with a BEN stream while sampling (ideally on an already-reordered graph). 2. **Relabel and reorder** the bundle to maximize run length and cross-plan repetition. 3. **Recompress** the bundle's stream to XBEN. diff --git a/ben-py/docs/concepts/data-model.md b/ben-py/docs/concepts/data-model.md index f2edb3e..5616bc4 100644 --- a/ben-py/docs/concepts/data-model.md +++ b/ben-py/docs/concepts/data-model.md @@ -19,7 +19,7 @@ node. | Rule | Why it matters | |---|---| | Every assignment in one stream must have the same length. | A stream represents one ensemble over one fixed dual graph. | -| Values must be positive district ids that fit in 16 bits. | The binary format stores district ids compactly. | +| Values must be district ids in `0..=65535` (16-bit). | The binary format stores district ids compactly. | | The order must match the graph order you intend to use when reading. | BEN cannot infer geographic meaning from the values alone. | | Missing nodes are not represented. | Use one entry per graph node, even for islands or zero-population units. | @@ -67,7 +67,7 @@ ordered_graph = encoder.add_graph(adjacency, sort="rcm") write_order = list(ordered_graph.nodes) assert len(write_order) == 4 -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -88,7 +88,7 @@ The whole-file codec helpers expect JSON Lines with one JSON object per line and Extra fields such as `sample`, scores, or metadata can be present in the input JSONL, but only the assignment stream is encoded into `.ben` or `.xben`. If you need graph metadata, -sampler settings, or scores to travel with the file, put the stream in a `.bendl` bundle and +sampler settings, or scores to travel with the file, put the stream in a `.bendl` file and attach those payloads as assets. ```python @@ -99,14 +99,14 @@ encode_jsonl_to_ben("plans.jsonl", "plans.ben", overwrite=True) ## Bundle assets -A `.bendl` bundle can carry well-known assets and custom assets: +A `.bendl` file can carry well-known assets and custom assets: | Asset | Reader helper | Typical payload | |---|---|---| | `graph.json` | `read_graph()` or `read_json_asset("graph.json")` | NetworkX adjacency JSON | | `metadata.json` | `read_metadata()` | Sampler name, seed, date, chain settings | | `node_permutation_map.json` | `read_node_permutation_map()` | Reversible old-to-new node order map | -| Custom JSON/text asset | `read_json_asset()` or `read_asset_bytes()` | Scores, notes, provenance, run manifests | +| Custom JSON/text/binary asset | `read_json_asset()` or `read_asset_bytes()` | Scores, notes, provenance, run manifests, geometry blobs | ```python from binary_ensemble import BendlDecoder diff --git a/ben-py/docs/concepts/formats.md b/ben-py/docs/concepts/formats.md index f322bb1..3ec300c 100644 --- a/ben-py/docs/concepts/formats.md +++ b/ben-py/docs/concepts/formats.md @@ -41,7 +41,7 @@ plans — no separate graph file to track down, no chance of pairing the wrong o the bundle is the recommended default. A bundle can wrap *either* a BEN stream (the working form) or an XBEN stream (the compressed -form). You typically build a `.bendl` bundle with a BEN stream while sampling, then +form). You typically build a `.bendl` file with a BEN stream while sampling, then [recompress it to XBEN](../how-to/shrink-for-sharing.md) for distribution. ## Choosing a format @@ -54,7 +54,7 @@ form). You typically build a `.bendl` bundle with a BEN stream while sampling, t | Interoperate with the JSONL world | convert with the [codec helpers](../how-to/convert-formats.md) | ```{tip} -When in doubt, use a `.bendl` bundle. You only need the plain `.ben`/`.xben` stream classes +When in doubt, use a `.bendl` file. You only need the plain `.ben`/`.xben` stream classes when you specifically don't want the bundle packaging — for example, feeding a raw stream to another tool that expects it. ``` @@ -121,7 +121,9 @@ the embedded assignment stream, then a directory table at the end: CRC32C checksum over the stream bytes. - The **directory table** indexes every asset — the dual graph, the node permutation map, the metadata, and any custom assets — by offset and length, each with its own CRC32C. A reader can - pull out just the graph without scanning the file, and verify it before trusting it. + pull out just the graph without scanning the file, and verify it before trusting it + (`BendlDecoder.verify()` checks every asset and the stream in one call). Large asset payloads + are xz-compressed on disk by the writer and decompressed transparently on read. - The **assignment stream** is stored opaquely: the bundle never parses BEN/XBEN internals, it just carries the bytes and notes the format. That's what lets you replace the embedded BEN stream with an embedded XBEN stream by recompressing only the inner stream. diff --git a/ben-py/docs/concepts/jsonl-schema.md b/ben-py/docs/concepts/jsonl-schema.md index afae4ad..7e6050c 100644 --- a/ben-py/docs/concepts/jsonl-schema.md +++ b/ben-py/docs/concepts/jsonl-schema.md @@ -33,7 +33,7 @@ The codec ignores every field except `assignment`. Fields like `sample`, `score` `cut_edges`, or sampler metadata may be useful in the source file, but they are not stored in plain `.ben` or `.xben` streams. -If those fields need to travel with the compressed ensemble, use a `.bendl` bundle and store +If those fields need to travel with the compressed ensemble, use a `.bendl` file and store them as metadata or custom assets. ```python @@ -41,7 +41,7 @@ from binary_ensemble import BendlEncoder encoder = BendlEncoder("jsonl-contract.bendl", overwrite=True) encoder.add_metadata({"source": "plans.jsonl", "assignment_field": "assignment"}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -66,8 +66,8 @@ with open("plans.jsonl") as handle: if len(assignment) != expected_length: raise ValueError(f"line {line_number}: assignment length changed") - if not all(isinstance(value, int) and value > 0 for value in assignment): - raise ValueError(f"line {line_number}: assignment values must be positive integers") + if not all(isinstance(value, int) and 0 <= value <= 65535 for value in assignment): + raise ValueError(f"line {line_number}: assignment values must be 16-bit district ids") ``` That check does not prove the assignments match the intended graph order. It only verifies diff --git a/ben-py/docs/concepts/limitations.md b/ben-py/docs/concepts/limitations.md index 1d9807e..6e7ff74 100644 --- a/ben-py/docs/concepts/limitations.md +++ b/ben-py/docs/concepts/limitations.md @@ -38,7 +38,7 @@ The resulting plans are wrong, not unreadable. ## One stream per bundle -A `.bendl` bundle carries one assignment stream. You can append assets after finalization, but +A `.bendl` file carries one assignment stream. You can append assets after finalization, but you cannot append more samples or add a second stream. ```python @@ -57,7 +57,7 @@ subsampling; recompress to XBEN once the file is ready to share. ## Relabel before XBEN -`relabel_bundle()` expects a `.bendl` bundle with an embedded BEN stream and graph. Run it +`relabel_bundle()` expects a `.bendl` file with an embedded BEN stream and graph. Run it before `compress_stream()`. ```python @@ -69,9 +69,9 @@ compress_stream("limited-sorted.bendl", out_file="limited-archive.bendl") ## District ids are integers -Assignments store integer district ids. The practical limit is 16-bit positive district ids, -which is far above normal statewide redistricting use. Non-integer labels should be mapped to -integers before encoding. +Assignments store integer district ids in the range 0–65535 (16-bit), which is far above +normal statewide redistricting use. Non-integer labels should be mapped to integers before +encoding; values outside the 16-bit range raise an `OverflowError` at write time. ## Geospatial data travels as opaque blobs diff --git a/ben-py/docs/concepts/ordering-deep-dive.md b/ben-py/docs/concepts/ordering-deep-dive.md index 2050736..c0240f5 100644 --- a/ben-py/docs/concepts/ordering-deep-dive.md +++ b/ben-py/docs/concepts/ordering-deep-dive.md @@ -59,7 +59,7 @@ encoder = BendlEncoder("ordering.bendl", overwrite=True) ordered_graph = encoder.add_graph(adjacency, sort="rcm") write_order = list(ordered_graph.nodes) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) assert len(write_order) == 4 @@ -67,7 +67,7 @@ assert len(write_order) == 4 ## Use an ordering after a bundle already exists -If you already have a `.bendl` bundle with a BEN stream and a graph, use `relabel_bundle()`. +If you already have a `.bendl` file with a BEN stream and a graph, use `relabel_bundle()`. It reorders the graph, rewrites every assignment into that new order, and stores a fresh permutation map. diff --git a/ben-py/docs/concepts/overview.md b/ben-py/docs/concepts/overview.md index 5fcbd19..038d33f 100644 --- a/ben-py/docs/concepts/overview.md +++ b/ben-py/docs/concepts/overview.md @@ -57,7 +57,7 @@ BEN comes as three on-disk **containers**, each suited to a different job: The Python package mirrors the project's CLI tools: -- **{mod}`binary_ensemble.bundle`** — read and write `.bendl` bundles (start here). +- **{mod}`binary_ensemble.bundle`** — read and write `.bendl` files (start here). - **{mod}`binary_ensemble.stream`** — read and write plain `.ben`/`.xben` streams. - **{mod}`binary_ensemble.codec`** — convert whole files between JSONL, BEN, and XBEN. - **{mod}`binary_ensemble.graph`** — reorder a dual graph before encoding. diff --git a/ben-py/docs/concepts/performance.md b/ben-py/docs/concepts/performance.md index f979eb4..ad89ef5 100644 --- a/ben-py/docs/concepts/performance.md +++ b/ben-py/docs/concepts/performance.md @@ -110,7 +110,7 @@ access is the dominant workload. For serious runs: 1. Reorder the graph before or during bundle creation. -2. Write a `.bendl` bundle with a BEN stream while sampling. +2. Write a `.bendl` file with a BEN stream while sampling. 3. Attach metadata, graph, and provenance assets. 4. Use BEN for quality checks and analysis. 5. Relabel/reorder the final bundle if needed. diff --git a/ben-py/docs/concepts/vocabulary.md b/ben-py/docs/concepts/vocabulary.md index 3a0d290..b2b771b 100644 --- a/ben-py/docs/concepts/vocabulary.md +++ b/ben-py/docs/concepts/vocabulary.md @@ -39,8 +39,8 @@ silent nonsense. This is why bundles embed the graph — so the order travels wi ## District id -The integer values inside an assignment. The maximum supported district id is **65535** -(it must fit in 16 bits), which is far beyond any real statewide map. +The integer values inside an assignment. A district id is any integer from **0** to +**65535** (it must fit in 16 bits) — far beyond any real statewide map. ## Sample @@ -78,7 +78,7 @@ Use these terms consistently in docs, examples, and user-facing messages. | Prefer | Avoid | Reason | |---|---|---| -| `.bendl` bundle | `xben bundle`, `BEN-DL file` | The container is BENDL; the embedded stream may be BEN or XBEN. | +| `.bendl` file | `xben bundle`, `BEN-DL file` | The container is BENDL; the embedded stream may be BEN or XBEN. | | assignment stream | plan stream, map stream | The bytes store assignment vectors, not geometries or rendered maps. | | assignment | encoded plan, vector plan | An assignment is the concrete `list[int]` representation of a plan. | | sample | step, row | A sample is one decoded assignment in an ensemble. | diff --git a/ben-py/docs/conf.py b/ben-py/docs/conf.py index ee72230..b53f12a 100644 --- a/ben-py/docs/conf.py +++ b/ben-py/docs/conf.py @@ -510,12 +510,8 @@ def _pygments_theme_css(): from pygments.formatters import HtmlFormatter menu = [s for group in CODE_THEMES.values() for s in group] - dark_defaults = [ - p["dark_pygments"] for p in PALETTES.values() if p.get("dark_pygments") - ] - light_defaults = [ - p["light_pygments"] for p in PALETTES.values() if p.get("light_pygments") - ] + dark_defaults = [p["dark_pygments"] for p in PALETTES.values() if p.get("dark_pygments")] + light_defaults = [p["light_pygments"] for p in PALETTES.values() if p.get("light_pygments")] # A style name may resolve to a builtin (the string) or a registered custom class. def make_formatter(style): @@ -535,9 +531,7 @@ def rules(formatter, prefix): # Explicit picks (and any palette default, so it resolves even if absent from the # menu) apply in any mode via the order-independent `html body` prefix. for style in dict.fromkeys(menu + dark_defaults + light_defaults): - blocks.append( - rules(make_formatter(style), f'html body[data-code-theme="{style}"]') - ) + blocks.append(rules(make_formatter(style), f'html body[data-code-theme="{style}"]')) # "Auto" applies a palette's dark/light default, each scoped to its own mode so the # other mode keeps the global Pygments style. The auto-mode (`prefers-color-scheme`) # variants mirror Furo's `:not([data-theme=…])` selectors for system readers. @@ -548,12 +542,8 @@ def rules(formatter, prefix): blocks.append("@media (prefers-color-scheme: dark){\n" + auto + "\n}") for style in dict.fromkeys(light_defaults): fmt = make_formatter(style) - blocks.append( - rules(fmt, f'body[data-theme="light"][data-code-auto-light="{style}"]') - ) - auto = rules( - fmt, f'body:not([data-theme="dark"])[data-code-auto-light="{style}"]' - ) + blocks.append(rules(fmt, f'body[data-theme="light"][data-code-auto-light="{style}"]')) + auto = rules(fmt, f'body:not([data-theme="dark"])[data-code-auto-light="{style}"]') blocks.append("@media (prefers-color-scheme: light){\n" + auto + "\n}") return "\n".join(blocks) diff --git a/ben-py/docs/getting-started/quickstart.md b/ben-py/docs/getting-started/quickstart.md index c731de1..c53adfe 100644 --- a/ben-py/docs/getting-started/quickstart.md +++ b/ben-py/docs/getting-started/quickstart.md @@ -17,7 +17,7 @@ An **ensemble** is just a sequence of these. `binary-ensemble` compresses that s ## Write an ensemble -The recommended container is a **`.bendl` bundle** — a single self-describing file. Open a +The recommended container is a **`.bendl` file** — a single self-describing file. Open a `BendlEncoder`, attach any metadata, then write assignments through a `stream` context that finalizes the bundle when it closes: @@ -28,7 +28,7 @@ plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] encoder = BendlEncoder("ensemble.bendl", overwrite=True) encoder.add_metadata({"sampler": "demo", "seed": 1234}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for assignment in plans: stream.write(assignment) # bundle is finalized here @@ -68,7 +68,7 @@ adjacency = nx.adjacency_data(graph) # the dict shape add_graph expects encoder = BendlEncoder("ensemble.bendl", overwrite=True) encoder.add_graph(adjacency, sort=None) # store as-is; see below for reordering -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: stream.write(assignment) diff --git a/ben-py/docs/how-to/anti-patterns.md b/ben-py/docs/how-to/anti-patterns.md index eff3488..afd53f9 100644 --- a/ben-py/docs/how-to/anti-patterns.md +++ b/ben-py/docs/how-to/anti-patterns.md @@ -36,7 +36,7 @@ relabel_bundle("ensemble.bendl", out_file="ensemble-sorted.bendl", sort="mlc") ## Using XBEN as the working format XBEN is for archive and transfer. It is small, but compression is expensive and repeated -inspection pays decompression startup costs. Work in BEN or a BEN-backed `.bendl` bundle, +inspection pays decompression startup costs. Work in BEN or a BEN-backed `.bendl` file, then recompress once. ## Shipping a plain stream without its graph @@ -61,7 +61,7 @@ Plain streams should still use `.ben` and `.xben`. ## Appending samples to a finalized bundle -Append mode is for assets only. A `.bendl` bundle has one assignment stream. To add more +Append mode is for assets only. A `.bendl` file has one assignment stream. To add more samples, write a new bundle. ```python diff --git a/ben-py/docs/how-to/api-cookbook.md b/ben-py/docs/how-to/api-cookbook.md index 4d4bbb1..2c88ab3 100644 --- a/ben-py/docs/how-to/api-cookbook.md +++ b/ben-py/docs/how-to/api-cookbook.md @@ -11,7 +11,7 @@ from binary_ensemble import BendlEncoder plans = [[1, 1, 2, 2], [1, 2, 2, 2]] encoder = BendlEncoder("cookbook-minimal.bendl", overwrite=True) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for plan in plans: stream.write(plan) ``` @@ -24,7 +24,7 @@ from binary_ensemble import BendlEncoder encoder = BendlEncoder("cookbook-metadata.bendl", overwrite=True) encoder.add_metadata({"sampler": "demo", "seed": 1234}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -40,7 +40,7 @@ graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) encoder = BendlEncoder("cookbook-graph.bendl", overwrite=True) ordered_graph = encoder.add_graph(nx.adjacency_data(graph), sort="rcm") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) assert ordered_graph.number_of_nodes() == 4 @@ -92,7 +92,7 @@ encoder = BendlEncoder("cookbook-assets.bendl", overwrite=True) encoder.add_asset("scores.json", '{"cut_edges": [10, 11]}', content_type="json") encoder.add_asset("notes.txt", "Created for cookbook example.", content_type="text") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` diff --git a/ben-py/docs/how-to/compress-gerrychain-run.md b/ben-py/docs/how-to/compress-gerrychain-run.md index 22a6d6b..6b9a8ea 100644 --- a/ben-py/docs/how-to/compress-gerrychain-run.md +++ b/ben-py/docs/how-to/compress-gerrychain-run.md @@ -1,7 +1,7 @@ # Compress a GerryChain run The most common workflow: run a [GerryChain](https://gerrychain.readthedocs.io) ReCom chain -and stream every plan straight into a single self-describing `.bendl` bundle, so you never +and stream every plan straight into a single self-describing `.bendl` file, so you never materialize a giant JSONL file. ```{note} @@ -71,7 +71,7 @@ encoder.add_metadata( } ) -with encoder.stream("ben", variant="twodelta") as stream: # twodelta suits ReCom chains +with encoder.stream(variant="twodelta") as stream: # twodelta suits ReCom chains for partition in chain: series = partition.assignment.to_series() assignment = series.loc[node_order].astype(int).tolist() @@ -84,7 +84,7 @@ file. To read it back, see [Read and iterate an ensemble](read-and-iterate.md). ## Why this is better than reordering later -You *can* write a raw-order `.bendl` bundle with a BEN stream and later call +You *can* write a raw-order `.bendl` file with a BEN stream and later call `relabel_bundle()` to reorder the graph and rewrite the stream. But when you control the sampling code, it is cleaner to reorder first: diff --git a/ben-py/docs/how-to/convert-formats.md b/ben-py/docs/how-to/convert-formats.md index c4dd799..a7c64eb 100644 --- a/ben-py/docs/how-to/convert-formats.md +++ b/ben-py/docs/how-to/convert-formats.md @@ -70,5 +70,5 @@ replace it. You never specify a variant when decoding — it's detected from the ## Working with bundles instead? These helpers operate on plain streams and JSONL. To recompress the stream *inside* a -`.bendl` bundle (keeping its graph and metadata), use +`.bendl` file (keeping its graph and metadata), use [`compress_stream`](shrink-for-sharing.md) instead. diff --git a/ben-py/docs/how-to/custom-assets-and-append.md b/ben-py/docs/how-to/custom-assets-and-append.md index 201c9b5..fcd7733 100644 --- a/ben-py/docs/how-to/custom-assets-and-append.md +++ b/ben-py/docs/how-to/custom-assets-and-append.md @@ -5,26 +5,39 @@ you can add more to a bundle even after it's finalized. ## Attach metadata and custom assets -`add_metadata` writes the canonical `metadata.json`. `add_asset` writes any named blob; its -`content_type` is `"json"` (the payload must be valid UTF-8 JSON, and the decoder will parse -it for you) or `"text"` (any UTF-8 text): +`add_metadata` writes the canonical `metadata.json`. `add_asset` writes any named blob with a +`content_type` describing the payload: + +| `content_type` | Payload | Decoder behavior | +|---|---|---| +| `"json"` | Valid UTF-8 JSON (a `dict`/`list` is serialized for you) | `read_json_asset` auto-parses it | +| `"text"` | Any UTF-8 text | Raw bytes via `read_asset_bytes` | +| `"binary"` | Arbitrary bytes — a zipped shapefile, a GeoPackage, anything | Raw bytes via `read_asset_bytes` | +| `"file"` | A `str` or `Path` naming a file, whose contents are read and stored as binary | Raw bytes via `read_asset_bytes` | + +The payload itself may be bytes-like, a `str` (always stored as UTF-8 *content* — a plain +string is never treated as a path except under `content_type="file"`), a `dict`/`list` (JSON +only), an open file or other object with `.read()`, or a `pathlib.Path` whose file is read: ```python from binary_ensemble import BendlEncoder encoder = BendlEncoder("ensemble.bendl", overwrite=True) encoder.add_metadata({"sampler": "ReCom", "seed": 1234}) -encoder.add_asset("scores.json", '{"mean_cut_edges": 41.2}', content_type="json") +encoder.add_asset("scores.json", {"mean_cut_edges": 41.2}, content_type="json") encoder.add_asset("README.txt", "Generated for the 2026 analysis.", content_type="text") +encoder.add_asset("render.png", b"\x89PNG\r\n\x1a\n...", content_type="binary") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for assignment in [[1, 1, 2, 2], [1, 2, 2, 2]]: stream.write(assignment) ``` -Assets may be added before *or* after the stream — only the stream itself is single-use. (The -one exception is a *reordering* `add_graph`, which must come before the stream because it sets -the node order the chain writes in.) +Every asset is stored with a CRC32C integrity checksum, and payloads of 1 KiB or more are +xz-compressed on disk by default — both invisible on read. Assets may be added before *or* +after the stream — only the stream itself is single-use. (The one exception is a *reordering* +`add_graph`, which must come before the stream because it sets the node order the chain +writes in.) ## Read assets back @@ -33,7 +46,7 @@ from binary_ensemble import BendlDecoder decoder = BendlDecoder("ensemble.bendl") -print(decoder.asset_names()) # ['metadata.json', 'scores.json', 'README.txt'] +print(decoder.asset_names()) # [...'metadata.json', 'scores.json', 'README.txt', 'render.png'] print(decoder.read_json_asset("scores.json")) # {'mean_cut_edges': 41.2} (parsed) print(decoder.read_asset_bytes("README.txt")) # b'Generated for the 2026 analysis.' (raw bytes) ``` @@ -42,6 +55,14 @@ Use `read_json_asset` for JSON assets (it parses them) and `read_asset_bytes` fo anything. The canonical getters `read_metadata()`, `read_graph()`, and `read_node_permutation_map()` are shortcuts for the well-known assets. +To check integrity before trusting a bundle you received, `decoder.verify()` validates every +asset checksum and the stream checksum against the on-disk bytes, raising on any mismatch +(iteration alone does not check checksums): + +```python +BendlDecoder("ensemble.bendl").verify() # raises on corruption or an unfinalized bundle +``` + ## Append to a finalized bundle To add assets to a bundle that's already finalized, open it with `BendlEncoder.append`. In diff --git a/ben-py/docs/how-to/end-to-end-workflow.md b/ben-py/docs/how-to/end-to-end-workflow.md index ea5c4e4..0c8fb62 100644 --- a/ben-py/docs/how-to/end-to-end-workflow.md +++ b/ben-py/docs/how-to/end-to-end-workflow.md @@ -3,7 +3,7 @@ This tutorial follows the recommended lifecycle: 1. prepare a graph, -2. write a `.bendl` bundle with a BEN stream while producing assignments, +2. write a `.bendl` file with a BEN stream while producing assignments, 3. inspect and analyze the bundle, 4. add provenance, 5. relabel and recompress for sharing. @@ -41,7 +41,7 @@ encoder.add_metadata({"sampler": "toy-grid", "seed": 2026, "node_order": "GEOID2 node_count = ordered_graph.number_of_nodes() -with encoder.stream("ben", variant="twodelta") as stream: +with encoder.stream(variant="twodelta") as stream: for step in range(20): assignment = [(node + step) % 4 + 1 for node in range(node_count)] stream.write(assignment) @@ -106,7 +106,7 @@ graph stored in the bundle. ```python write_order = list(ordered_graph.nodes) -with encoder.stream("ben", variant="twodelta") as stream: +with encoder.stream(variant="twodelta") as stream: for partition in chain: series = partition.assignment.to_series() stream.write(series.loc[write_order].astype(int).tolist()) diff --git a/ben-py/docs/how-to/error-reference.md b/ben-py/docs/how-to/error-reference.md index cf86877..687aa96 100644 --- a/ben-py/docs/how-to/error-reference.md +++ b/ben-py/docs/how-to/error-reference.md @@ -25,7 +25,7 @@ encode_jsonl_to_ben("plans.jsonl", "error-reference.ben", overwrite=True) ## Wrong reader for the file type **Symptom:** opening a file raises an `Exception` whose message names the decoder to use — -for example *"…is a .bendl bundle, not a plain BEN/XBEN stream. Open it with +for example *"…is a .bendl file, not a plain BEN/XBEN stream. Open it with binary_ensemble.bundle.BendlDecoder instead."* **Cause:** `.bendl`, `.ben`, and `.xben` are different containers. (A missing or unreadable @@ -75,7 +75,7 @@ graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) encoder = BendlEncoder("error-with-graph.bendl", overwrite=True) encoder.add_graph(nx.adjacency_data(graph), sort=None) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -84,7 +84,7 @@ with encoder.stream("ben") as stream: **Symptom:** `ValueError: relabel_bundle only supports BEN bundles; relabel before compressing to XBEN`. -**Cause:** `relabel_bundle()` works on `.bendl` bundles with embedded BEN streams. XBEN is the +**Cause:** `relabel_bundle()` works on `.bendl` files with embedded BEN streams. XBEN is the final archive step. **Fix:** relabel first, then recompress. @@ -96,24 +96,26 @@ relabel_bundle("ensemble.bendl", out_file="error-sorted.bendl", sort="mlc") compress_stream("error-sorted.bendl", out_file="error-archive.bendl") ``` -## `content_type` is rejected +## `content_type` or asset payload is rejected -**Symptom:** `ValueError: content_type must be 'json' or 'text'`, or a `ValueError` about -invalid UTF-8 / invalid JSON. +**Symptom:** `ValueError: content_type must be 'json', 'text', 'binary', or 'file'`, a +`ValueError` about invalid UTF-8 / invalid JSON, or a `TypeError` about the payload type. -**Cause:** `add_asset()` accepts only `content_type="json"` or `content_type="text"` from the -Python wrapper. JSON payloads must be valid UTF-8 JSON; text payloads must be valid UTF-8. +**Cause:** `add_asset()` accepts `content_type="json"` (valid UTF-8 JSON; a `dict`/`list` +payload is serialized for you), `"text"` (valid UTF-8), `"binary"` (arbitrary bytes), or +`"file"` (a `str`/`Path` naming a file to read in, which requires a path-like payload). -**Fix:** choose the right content type and validate payloads before writing. +**Fix:** choose the content type that matches the payload. ```python from binary_ensemble import BendlEncoder encoder = BendlEncoder("error-assets.bendl", overwrite=True) -encoder.add_asset("valid.json", '{"ok": true}', content_type="json") +encoder.add_asset("valid.json", {"ok": True}, content_type="json") encoder.add_asset("valid.txt", "plain text", content_type="text") +encoder.add_asset("valid.bin", b"\x00\x01\x02", content_type="binary") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -152,8 +154,8 @@ of samples in base data`, `range must be 1-based and end >= start`, or `step and must be >= 1`. **Cause:** sample positions are 1-based everywhere, and out-of-range positions raise -rather than being silently dropped. (An unsorted or duplicated index list does not raise — -it is sorted and deduplicated with a `UserWarning`.) +rather than being silently dropped. (Duplicate indices do not raise — they are dropped; an +*unsorted* list is sorted with a `UserWarning`. An empty index list raises.) **Fix:** clamp the request to `len(decoder)` first. diff --git a/ben-py/docs/how-to/examples-gallery.md b/ben-py/docs/how-to/examples-gallery.md index 906cf76..22b7890 100644 --- a/ben-py/docs/how-to/examples-gallery.md +++ b/ben-py/docs/how-to/examples-gallery.md @@ -9,7 +9,7 @@ links from each example. from binary_ensemble import BendlEncoder, BendlDecoder encoder = BendlEncoder("gallery-minimal.bendl", overwrite=True) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) stream.write([1, 2, 2, 2]) @@ -32,7 +32,7 @@ encoder.add_graph(nx.adjacency_data(graph), sort=None) encoder.add_metadata({"seed": 2026, "sampler": "demo"}) encoder.add_asset("notes.txt", "Toy gallery bundle.", content_type="text") -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) decoder = BendlDecoder("gallery-rich.bendl") diff --git a/ben-py/docs/how-to/index.md b/ben-py/docs/how-to/index.md index 7d57818..1e7cde7 100644 --- a/ben-py/docs/how-to/index.md +++ b/ben-py/docs/how-to/index.md @@ -38,7 +38,7 @@ plans = [[(node + step) % 4 + 1 for node in range(n_nodes)] for step in range(12 encoder = BendlEncoder("ensemble.bendl", overwrite=True) encoder.add_graph(adjacency, sort=None) encoder.add_metadata({"sampler": "demo", "seed": 0}) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for plan in plans: stream.write(plan) @@ -61,7 +61,7 @@ encode_ben_to_xben("chain.ben", "chain.xben", overwrite=True) :link: end-to-end-workflow :link-type: doc -Build a working `.bendl` bundle, inspect it, attach provenance, and archive it with XBEN. +Build a working `.bendl` file, inspect it, attach provenance, and archive it with XBEN. ::: :::{grid-item-card} API cookbook @@ -89,7 +89,7 @@ Avoid node-order mistakes, repeated bundle extensions, wrong working formats, an :link: compress-gerrychain-run :link-type: doc -Stream a ReCom chain straight into a self-describing `.bendl` bundle. +Stream a ReCom chain straight into a self-describing `.bendl` file. ::: :::{grid-item-card} Read and iterate an ensemble diff --git a/ben-py/docs/how-to/read-and-iterate.md b/ben-py/docs/how-to/read-and-iterate.md index 1a66269..82ae694 100644 --- a/ben-py/docs/how-to/read-and-iterate.md +++ b/ben-py/docs/how-to/read-and-iterate.md @@ -1,6 +1,6 @@ # Read and iterate an ensemble -Open a `.bendl` bundle with `BendlDecoder` and you get the assignment stream *and* everything +Open a `.bendl` file with `BendlDecoder` and you get the assignment stream *and* everything the bundle carries alongside it. ## Inspect before you iterate diff --git a/ben-py/docs/how-to/shrink-for-sharing.md b/ben-py/docs/how-to/shrink-for-sharing.md index d3fe2b7..003fb93 100644 --- a/ben-py/docs/how-to/shrink-for-sharing.md +++ b/ben-py/docs/how-to/shrink-for-sharing.md @@ -1,6 +1,6 @@ # Shrink a bundle for sharing -A `.bendl` bundle you build while sampling usually has an embedded BEN stream in the graph's +A `.bendl` file you build while sampling usually has an embedded BEN stream in the graph's original node order — convenient, but not as small as it could be. Before handing it to a collaborator or archiving it, two steps get it to its smallest form: @@ -41,16 +41,17 @@ self-describing, since the graph and permutation map travel inside it. ## In place vs. a new file -Both transforms take **either** `out_file` (write a new bundle) **or** `in_place=True` -(atomically replace the original). Passing both, or neither, raises: +Both transforms take an optional `out_file`. Pass one to write a new bundle and leave the +original untouched (`overwrite=True` replaces an existing `out_file`); leave it off to +transform the bundle in place: ```python -relabel_bundle("ensemble.bendl", in_place=True, sort="key", key="GEOID20") -compress_stream("ensemble.bendl", in_place=True) +relabel_bundle("ensemble.bendl", sort="key", key="GEOID20") +compress_stream("ensemble.bendl") ``` -`in_place=True` writes to a temporary file and swaps it over the original only on success, so -an interrupted run won't corrupt your bundle. +The in-place mode writes to a temporary file and swaps it over the original only on success, +so an interrupted run won't corrupt your bundle. ```{tip} Reorder *before* compressing. Relabeling and node reordering are what create the long runs and diff --git a/ben-py/docs/how-to/subsample.md b/ben-py/docs/how-to/subsample.md index 00b1b1c..75c98c2 100644 --- a/ben-py/docs/how-to/subsample.md +++ b/ben-py/docs/how-to/subsample.md @@ -7,8 +7,8 @@ frames without unpacking them. All three methods are available on both `BendlDecoder` (for bundles) and `BenDecoder` (for plain streams). Each returns a decoder you iterate. Indices are **1-based**; out-of-range -indices raise rather than being silently dropped, and an unsorted or duplicated index list -is sorted and deduplicated (with a warning). +indices raise rather than being silently dropped, duplicate indices are dropped, and an +unsorted index list is sorted (with a `UserWarning`). ```{note} How cheap skipping is depends on the stream's [encoding variant](../concepts/variants.md). diff --git a/ben-py/docs/how-to/troubleshooting.md b/ben-py/docs/how-to/troubleshooting.md index 641aa02..353a48e 100644 --- a/ben-py/docs/how-to/troubleshooting.md +++ b/ben-py/docs/how-to/troubleshooting.md @@ -52,7 +52,7 @@ Use context managers around stream writes so finalization happens at the right t from binary_ensemble import BendlEncoder encoder = BendlEncoder("new.bendl", overwrite=True) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -95,7 +95,7 @@ while True: # Re-encode the salvaged samples into a fresh, finalized bundle. encoder = BendlEncoder("recovered.bendl", overwrite=True) -with encoder.stream("ben") as out: +with encoder.stream() as out: for assignment in recovered: out.write(assignment) ``` @@ -137,7 +137,7 @@ run. ## `read_graph()` returns `None` The bundle does not contain `graph.json`. Plain `.ben` and `.xben` streams never contain a -graph, and a `.bendl` bundle only contains one if the writer called `add_graph()`. +graph, and a `.bendl` file only contains one if the writer called `add_graph()`. ```python from binary_ensemble import BendlDecoder @@ -157,7 +157,7 @@ graph = nx.convert_node_labels_to_integers(nx.path_graph(4)) encoder = BendlEncoder("with-graph.bendl", overwrite=True) encoder.add_graph(nx.adjacency_data(graph), sort=None) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) ``` @@ -177,12 +177,13 @@ or: ```python from binary_ensemble import compress_stream, relabel_bundle -relabel_bundle("ensemble.bendl", in_place=True, sort="mlc") -compress_stream("ensemble.bendl", in_place=True) +relabel_bundle("ensemble.bendl", sort="mlc") +compress_stream("ensemble.bendl") ``` -Passing both `out_file` and `in_place=True`, or passing neither, raises `ValueError`. Relabel -before recompressing to XBEN; relabeling needs a BEN stream and an embedded graph. +With no `out_file`, both transforms work in place (a temp file is atomically swapped over the +original). Relabel before recompressing to XBEN; relabeling needs a BEN stream and an embedded +graph. ## XBEN compression is slow @@ -194,7 +195,7 @@ the bundle is ready to share. from binary_ensemble import BendlEncoder, compress_stream encoder = BendlEncoder("to-archive.bendl", overwrite=True) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: stream.write([1, 1, 2, 2]) compress_stream("to-archive.bendl", out_file="archive-copy.bendl") diff --git a/ben-py/docs/index.md b/ben-py/docs/index.md index 64febb9..18c7a36 100644 --- a/ben-py/docs/index.md +++ b/ben-py/docs/index.md @@ -45,7 +45,7 @@ pip install binary-ensemble ## A first taste -Write an ensemble into one self-describing `.bendl` bundle, then read it back: +Write an ensemble into one self-describing `.bendl` file, then read it back: ```python from binary_ensemble import BendlEncoder, BendlDecoder @@ -54,7 +54,7 @@ plans = [[1, 1, 2, 2], [1, 2, 2, 2], [1, 1, 1, 2]] # The stream context finalizes the bundle when it closes. encoder = BendlEncoder("ensemble.bendl", overwrite=True) -with encoder.stream("ben") as stream: +with encoder.stream() as stream: for assignment in plans: stream.write(assignment) @@ -159,8 +159,8 @@ how-to/error-reference :hidden: :caption: Tutorials -user/using_ben_py user/using_bendl +user/using_ben_py ``` ```{toctree} diff --git a/ben-py/docs/user/using_ben_py.ipynb b/ben-py/docs/user/using_ben_py.ipynb index 91aab0d..f3750f6 100644 --- a/ben-py/docs/user/using_ben_py.ipynb +++ b/ben-py/docs/user/using_ben_py.ipynb @@ -5,18 +5,23 @@ "id": "cf968bd9", "metadata": {}, "source": [ - "# BEN & XBEN streams: compressing and converting ensembles\n", - "\n", - "This tutorial covers the plain **BEN/XBEN streams** and the whole-file conversion helpers:\n", - "`binary_ensemble.codec` (JSONL ↔ BEN ↔ XBEN) and `binary_ensemble.stream`\n", - "(`BenEncoder` / `BenDecoder`). Its companion, **Working with `.bendl` bundles**, covers the\n", - "recommended self-describing bundle format and the graph-reordering utilities.\n", - "\n", - "**BEN** (Binary-Ensemble) compresses ensembles of districting plans. An ensemble is usually\n", - "stored as [JSONL](https://jsonlines.org) — one plan per line, like\n", - "`{\"assignment\": [...], \"sample\": n}` — which is simple but can balloon to tens of gigabytes.\n", - "BEN shrinks that losslessly; **XBEN** adds LZMA2 on top for archival-grade compression. See\n", - "[Formats](../concepts/formats.md) for the full picture." + "# Plain BEN & XBEN streams\n", + "\n", + "The [bundle tutorial](using_bendl.ipynb) is the main event around here — `.bendl`\n", + "bundles are what we recommend for storing and sharing ensembles. But underneath every\n", + "bundle sits a plain **BEN/XBEN stream**, and sometimes that layer is all you need:\n", + "\n", + "- you have an existing [JSONL](https://jsonlines.org) ensemble to convert,\n", + "- someone handed you a bare `.ben` / `.xben` file,\n", + "- or you're working alongside the `ben` CLI and want the same conversions from Python.\n", + "\n", + "This notebook covers that layer: the whole-file converters in `binary_ensemble.codec`\n", + "(JSONL ↔ BEN ↔ XBEN) and the streaming `BenEncoder` / `BenDecoder` classes.\n", + "\n", + "The quick mental model: an ensemble usually starts life as JSONL — one\n", + "`{\"assignment\": [...], \"sample\": n}` line per plan — which is simple but balloons to\n", + "tens of gigabytes fast. BEN shrinks that losslessly; XBEN adds LZMA2 on top for\n", + "archival-grade compression. See [Formats](../concepts/formats.md) for the full picture." ] }, { @@ -26,10 +31,11 @@ "source": [ "## Setup: generate a small ensemble\n", "\n", - "So this tutorial is self-contained and reproducible, we *generate* a small ensemble instead\n", - "of downloading one: a short [GerryChain](https://gerrychain.readthedocs.io) ReCom chain on a\n", - "16×16 grid (256 nodes), written out as a JSONL file. `binary-ensemble` only ever sees lists\n", - "of integers, so any sampler — or any existing JSONL file — works the same way." + "So this notebook is self-contained and reproducible, we *generate* a small ensemble\n", + "instead of downloading one: a short [GerryChain](https://gerrychain.readthedocs.io)\n", + "ReCom chain on a 16×16 grid (256 nodes), written out as a JSONL file.\n", + "`binary-ensemble` only ever sees lists of integers, so any sampler — or any JSONL\n", + "file you already have lying around — works exactly the same way." ] }, { @@ -38,10 +44,10 @@ "id": "ed9904bc", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:42.359600Z", - "iopub.status.busy": "2026-06-06T03:02:42.359414Z", - "iopub.status.idle": "2026-06-06T03:02:44.247643Z", - "shell.execute_reply": "2026-06-06T03:02:44.242891Z" + "iopub.execute_input": "2026-06-12T20:37:51.059919Z", + "iopub.status.busy": "2026-06-12T20:37:51.059831Z", + "iopub.status.idle": "2026-06-12T20:37:52.009625Z", + "shell.execute_reply": "2026-06-12T20:37:52.009294Z" } }, "outputs": [ @@ -83,9 +89,7 @@ ")\n", "ideal = sum(initial[\"population\"].values()) / len(initial)\n", "chain = MarkovChain(\n", - " proposal=partial(\n", - " recom, pop_col=\"TOTPOP\", pop_target=ideal, epsilon=0.05, node_repeats=2\n", - " ),\n", + " proposal=partial(recom, pop_col=\"TOTPOP\", pop_target=ideal, epsilon=0.05, node_repeats=2),\n", " constraints=[constraints.contiguous],\n", " accept=accept.always_accept,\n", " initial_state=initial,\n", @@ -94,9 +98,7 @@ "\n", "with open(\"example_data/small_example.jsonl\", \"w\") as f:\n", " for i, partition in enumerate(chain, start=1):\n", - " assignment = (\n", - " partition.assignment.to_series().loc[node_order].astype(int).tolist()\n", - " )\n", + " assignment = partition.assignment.to_series().loc[node_order].astype(int).tolist()\n", " f.write(json.dumps({\"assignment\": assignment, \"sample\": i}) + \"\\n\")\n", "\n", "jsonl_size = os.path.getsize(\"example_data/small_example.jsonl\")\n", @@ -112,7 +114,9 @@ "source": [ "## Converting between file types\n", "\n", - "The `binary_ensemble.codec` helpers convert whole files in a single call." + "The `binary_ensemble.codec` helpers convert whole files in a single call. (These are\n", + "the same conversions the `ben` CLI tool performs — see\n", + "[CLI parity](../concepts/cli-parity.md) for the mapping.)" ] }, { @@ -121,10 +125,10 @@ "id": "1ec1d4ed", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.251325Z", - "iopub.status.busy": "2026-06-06T03:02:44.250847Z", - "iopub.status.idle": "2026-06-06T03:02:44.256236Z", - "shell.execute_reply": "2026-06-06T03:02:44.255813Z" + "iopub.execute_input": "2026-06-12T20:37:52.010868Z", + "iopub.status.busy": "2026-06-12T20:37:52.010686Z", + "iopub.status.idle": "2026-06-12T20:37:52.014415Z", + "shell.execute_reply": "2026-06-12T20:37:52.014124Z" } }, "outputs": [], @@ -146,8 +150,8 @@ "source": [ "### JSONL → BEN\n", "\n", - "The quickest format is BEN. `encode_jsonl_to_ben` reads the JSONL ensemble and writes a\n", - "compact `.ben` stream." + "The quickest format is BEN. `encode_jsonl_to_ben` reads the JSONL ensemble and writes\n", + "a compact `.ben` stream — watch the size drop:" ] }, { @@ -156,10 +160,10 @@ "id": "ef701764", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.257724Z", - "iopub.status.busy": "2026-06-06T03:02:44.257583Z", - "iopub.status.idle": "2026-06-06T03:02:44.261414Z", - "shell.execute_reply": "2026-06-06T03:02:44.261122Z" + "iopub.execute_input": "2026-06-12T20:37:52.015374Z", + "iopub.status.busy": "2026-06-12T20:37:52.015259Z", + "iopub.status.idle": "2026-06-12T20:37:52.027030Z", + "shell.execute_reply": "2026-06-12T20:37:52.026709Z" } }, "outputs": [ @@ -167,7 +171,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "BEN bytes: 4303\n" + "BEN bytes: 4366\n" ] } ], @@ -177,7 +181,7 @@ " out_file=\"example_data/small_example.ben\",\n", " overwrite=True,\n", ")\n", - "print(\"BEN bytes:\", os.path.getsize(\"example_data/small_example.ben\"))" + "print(f\"BEN bytes: {os.path.getsize('example_data/small_example.ben')}\")" ] }, { @@ -186,7 +190,7 @@ "metadata": {}, "source": [ "By default the conversion functions refuse to overwrite an existing output file — pass\n", - "`overwrite=True` to replace it." + "`overwrite=True` when you actually mean to replace it:" ] }, { @@ -195,10 +199,10 @@ "id": "cce1caac", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.267885Z", - "iopub.status.busy": "2026-06-06T03:02:44.266218Z", - "iopub.status.idle": "2026-06-06T03:02:44.279005Z", - "shell.execute_reply": "2026-06-06T03:02:44.276912Z" + "iopub.execute_input": "2026-06-12T20:37:52.027867Z", + "iopub.status.busy": "2026-06-12T20:37:52.027764Z", + "iopub.status.idle": "2026-06-12T20:37:52.029496Z", + "shell.execute_reply": "2026-06-12T20:37:52.029295Z" } }, "outputs": [ @@ -217,7 +221,7 @@ " out_file=\"example_data/small_example.ben\",\n", " )\n", "except OSError as e:\n", - " print(\"refused to overwrite:\", e)" + " print(f\"refused to overwrite: {e}\")" ] }, { @@ -229,12 +233,15 @@ "\n", "A BEN stream is encoded with one of three **variants**, chosen with `variant=`:\n", "\n", - "- `\"twodelta\"` (the **default**) delta-encodes pairwise ReCom moves — ideal for ReCom chains.\n", - "- `\"mkv_chain\"` collapses identical consecutive plans — for full MCMC chains with rejections.\n", + "- `\"twodelta\"` (the **default**) delta-encodes pairwise ReCom moves — ideal for ReCom\n", + " chains like ours.\n", + "- `\"mkv_chain\"` collapses identical consecutive plans — for full MCMC chains with\n", + " rejections.\n", "- `\"standard\"` stores each plan independently — a simple baseline.\n", "\n", - "Decoding **auto-detects** the variant, so you never specify it when reading. See\n", - "[Encoding variants](../concepts/variants.md)." + "You never specify the variant when reading — decoding **auto-detects** it. Here's how\n", + "much the choice matters for a ReCom ensemble (see\n", + "[Encoding variants](../concepts/variants.md) for how to choose on other samplers):" ] }, { @@ -243,10 +250,10 @@ "id": "7646489b", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.283412Z", - "iopub.status.busy": "2026-06-06T03:02:44.282472Z", - "iopub.status.idle": "2026-06-06T03:02:44.311097Z", - "shell.execute_reply": "2026-06-06T03:02:44.310379Z" + "iopub.execute_input": "2026-06-12T20:37:52.030244Z", + "iopub.status.busy": "2026-06-12T20:37:52.030151Z", + "iopub.status.idle": "2026-06-12T20:37:52.060691Z", + "shell.execute_reply": "2026-06-12T20:37:52.060261Z" } }, "outputs": [ @@ -254,9 +261,9 @@ "name": "stdout", "output_type": "stream", "text": [ - " standard: 8216 bytes\n", - " mkv_chain: 8576 bytes\n", - " twodelta: 4303 bytes\n" + " standard: 8359 bytes\n", + " mkv_chain: 8759 bytes\n", + " twodelta: 4366 bytes\n" ] } ], @@ -268,9 +275,7 @@ " overwrite=True,\n", " variant=variant,\n", " )\n", - " print(\n", - " f\"{variant:>10}: {os.path.getsize(f'example_data/small_example.{variant}.ben'):>6} bytes\"\n", - " )" + " print(f\"{variant:>10}: {os.path.getsize(f'example_data/small_example.{variant}.ben'):>6} bytes\")" ] }, { @@ -280,8 +285,9 @@ "source": [ "### BEN → XBEN\n", "\n", - "XBEN wraps a BEN stream in LZMA2 for much smaller files, at the cost of slower compression.\n", - "The XBEN encoders accept `n_threads` and `compression_level` (0 fastest … 9 smallest)." + "When a file is done changing, XBEN wraps the BEN stream in LZMA2 for much smaller\n", + "files, at the cost of slower compression. The XBEN encoders accept `n_threads` and\n", + "`compression_level` (0 fastest … 9 smallest):" ] }, { @@ -290,10 +296,10 @@ "id": "97d5e070", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.312703Z", - "iopub.status.busy": "2026-06-06T03:02:44.312531Z", - "iopub.status.idle": "2026-06-06T03:02:44.327111Z", - "shell.execute_reply": "2026-06-06T03:02:44.326520Z" + "iopub.execute_input": "2026-06-12T20:37:52.061902Z", + "iopub.status.busy": "2026-06-12T20:37:52.061752Z", + "iopub.status.idle": "2026-06-12T20:37:52.079946Z", + "shell.execute_reply": "2026-06-12T20:37:52.079618Z" } }, "outputs": [ @@ -302,8 +308,8 @@ "output_type": "stream", "text": [ " small_example.jsonl: 159892 bytes\n", - " small_example.ben: 4303 bytes\n", - " small_example.xben: 2076 bytes\n" + " small_example.ben: 4366 bytes\n", + " small_example.xben: 2104 bytes\n" ] } ], @@ -333,7 +339,8 @@ "source": [ "### Decoding\n", "\n", - "The decoders mirror the encoders and all take `(in_file, out_file, overwrite=False)`." + "The decoders mirror the encoders, and all of them take\n", + "`(in_file, out_file, overwrite=False)`:" ] }, { @@ -342,10 +349,10 @@ "id": "5a315a19", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.328951Z", - "iopub.status.busy": "2026-06-06T03:02:44.328764Z", - "iopub.status.idle": "2026-06-06T03:02:44.336173Z", - "shell.execute_reply": "2026-06-06T03:02:44.335649Z" + "iopub.execute_input": "2026-06-12T20:37:52.080885Z", + "iopub.status.busy": "2026-06-12T20:37:52.080767Z", + "iopub.status.idle": "2026-06-12T20:37:52.098712Z", + "shell.execute_reply": "2026-06-12T20:37:52.098285Z" } }, "outputs": [ @@ -364,9 +371,7 @@ "decode_xben_to_jsonl(\n", " \"example_data/small_example.xben\", \"example_data/from_xben.jsonl\", overwrite=True\n", ")\n", - "decode_xben_to_ben(\n", - " \"example_data/small_example.xben\", \"example_data/from_xben.ben\", overwrite=True\n", - ")\n", + "decode_xben_to_ben(\"example_data/small_example.xben\", \"example_data/from_xben.ben\", overwrite=True)\n", "print(\"decoded BEN -> JSONL, XBEN -> JSONL, and XBEN -> BEN\")" ] }, @@ -375,7 +380,8 @@ "id": "dcd1f6b4", "metadata": {}, "source": [ - "Encoding is lossless. Decoding a BEN stream back to JSONL recovers the original plans exactly:" + "Encoding is lossless — decoding a BEN stream back to JSONL recovers the original plans\n", + "exactly:" ] }, { @@ -384,10 +390,10 @@ "id": "d37a01b9", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.339313Z", - "iopub.status.busy": "2026-06-06T03:02:44.339066Z", - "iopub.status.idle": "2026-06-06T03:02:44.355306Z", - "shell.execute_reply": "2026-06-06T03:02:44.352869Z" + "iopub.execute_input": "2026-06-12T20:37:52.099553Z", + "iopub.status.busy": "2026-06-12T20:37:52.099464Z", + "iopub.status.idle": "2026-06-12T20:37:52.104685Z", + "shell.execute_reply": "2026-06-12T20:37:52.104397Z" } }, "outputs": [ @@ -405,10 +411,8 @@ " return [json.loads(line)[\"assignment\"] for line in f]\n", "\n", "\n", - "print(\n", - " \"round-trip identical:\",\n", - " load(\"example_data/small_example.jsonl\") == load(\"example_data/roundtrip.jsonl\"),\n", - ")" + "identical = load(\"example_data/small_example.jsonl\") == load(\"example_data/roundtrip.jsonl\")\n", + "print(f\"round-trip identical: {identical}\")" ] }, { @@ -418,9 +422,12 @@ "source": [ "## Streaming with `BenEncoder` / `BenDecoder`\n", "\n", - "When you'd rather write plans one at a time as they're produced (instead of from a JSONL\n", - "file), use `BenEncoder`. It's a context manager: write each assignment, and the stream is\n", - "flushed on exit. `BenDecoder` reads a stream back, one assignment at a time." + "When plans are *produced* one at a time — by a chain, not a file — there's no reason\n", + "to stage them through JSONL. `BenEncoder` writes assignments as they arrive; it's a\n", + "context manager, and the stream is flushed and finished on exit. `BenDecoder` reads a\n", + "stream back the same way, one assignment at a time. (If this pattern looks familiar,\n", + "it should: the bundle's `stream()` writer and iterator are built on the same\n", + "machinery.)" ] }, { @@ -429,10 +436,10 @@ "id": "e693be01", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.357968Z", - "iopub.status.busy": "2026-06-06T03:02:44.357728Z", - "iopub.status.idle": "2026-06-06T03:02:44.365655Z", - "shell.execute_reply": "2026-06-06T03:02:44.364033Z" + "iopub.execute_input": "2026-06-12T20:37:52.105479Z", + "iopub.status.busy": "2026-06-12T20:37:52.105354Z", + "iopub.status.idle": "2026-06-12T20:37:52.108153Z", + "shell.execute_reply": "2026-06-12T20:37:52.107863Z" } }, "outputs": [ @@ -456,7 +463,7 @@ " encoder.write(plan)\n", "\n", "decoder = BenDecoder(\"example_data/tiny.ben\")\n", - "print(\"samples:\", len(decoder))\n", + "print(f\"samples: {len(decoder)}\")\n", "for assignment in decoder:\n", " print(assignment)" ] @@ -466,7 +473,17 @@ "id": "e6bc1ecd", "metadata": {}, "source": [ - "## Subsampling\\n\\nFor winnowing a large ensemble you rarely want every plan. `BenDecoder` can yield just a\\nsubset without materializing the rest. Indices are 1-based. (How cheap a skipped sample is\\ndepends on the variant: `standard` and `mkv_chain` frames are skipped wholesale, while\\n`twodelta` — the default — replays the deltas between snapshots.)\\n\\nA decoder is reusable: call the `subsample_*` methods on the **same** `BenDecoder` as many\\ntimes as you like. Each call rewinds to the start of the stream and applies the new\\nselection, so there's no need to open a fresh decoder per subsample." + "## Subsampling\n", + "\n", + "`BenDecoder` can yield just a subset of plans without materializing the rest, using\n", + "the same three `subsample_*` methods you saw on the bundle decoder (one machinery,\n", + "two decoders — the [bundle tutorial](using_bendl.ipynb) has the full tour). Indices\n", + "are 1-based, and a decoder is reusable: each call rewinds the stream and applies the\n", + "new selection, so there's no need to open a fresh decoder per subsample.\n", + "\n", + "One thing worth knowing at this layer: how cheap a *skipped* sample is depends on the\n", + "variant. `standard` and `mkv_chain` frames are skipped wholesale, while `twodelta` —\n", + "the default — replays the deltas between snapshots." ] }, { @@ -475,10 +492,10 @@ "id": "9d706ebc", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.367358Z", - "iopub.status.busy": "2026-06-06T03:02:44.367181Z", - "iopub.status.idle": "2026-06-06T03:02:44.372134Z", - "shell.execute_reply": "2026-06-06T03:02:44.371724Z" + "iopub.execute_input": "2026-06-12T20:37:52.108837Z", + "iopub.status.busy": "2026-06-12T20:37:52.108724Z", + "iopub.status.idle": "2026-06-12T20:37:52.116231Z", + "shell.execute_reply": "2026-06-12T20:37:52.115822Z" } }, "outputs": [ @@ -486,10 +503,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "indices [1, 100, 200] -> [[0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 3, 3], [2, 2, 2, 2, 2, 2]]\n", - "range(50, 53) -> [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]]\n", + "indices [1, 100, 200] -> [[0, 0, 0, 0, 1, 1], [3, 3, 3, 3, 3, 3], [2, 2, 2, 2, 2, 2]]\n", + "range(50, 53) -> [[2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0]]\n", "every 50th -> 4 plans\n", - "indices again -> [[0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 3, 3], [2, 2, 2, 2, 2, 2]]\n" + "indices again -> [[0, 0, 0, 0, 1, 1], [3, 3, 3, 3, 3, 3], [2, 2, 2, 2, 2, 2]]\n" ] } ], @@ -497,22 +514,18 @@ "ben_file = \"example_data/small_example.ben\"\n", "decoder = BenDecoder(ben_file) # one decoder, reused for every subsample below\n", "\n", - "print(\n", - " \"indices [1, 100, 200] ->\",\n", - " [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])],\n", - ")\n", - "print(\n", - " \"range(50, 53) ->\",\n", - " [assignment[:6] for assignment in decoder.subsample_range(50, 53)],\n", - ")\n", - "print(\"every 50th ->\", sum(1 for _ in decoder.subsample_every(50)), \"plans\")\n", + "picked = [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])]\n", + "print(f\"indices [1, 100, 200] -> {picked}\")\n", + "\n", + "ranged = [assignment[:6] for assignment in decoder.subsample_range(50, 53)]\n", + "print(f\"range(50, 53) -> {ranged}\")\n", + "\n", + "print(f\"every 50th -> {sum(1 for _ in decoder.subsample_every(50))} plans\")\n", "\n", "# The same decoder rewinds and re-selects on each call, so you can run subsamples\n", "# repeatedly without building a new decoder:\n", - "print(\n", - " \"indices again ->\",\n", - " [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])],\n", - ")" + "again = [assignment[:6] for assignment in decoder.subsample_indices([1, 100, 200])]\n", + "print(f\"indices again -> {again}\")" ] }, { @@ -520,9 +533,9 @@ "id": "10268f1c", "metadata": {}, "source": [ - "The same methods work on an XBEN stream — pass `mode=\"xben\"`. Reading XBEN pays a one-time\n", - "decompression startup cost, so if you'll subsample repeatedly, extract to BEN first with\n", - "`decode_xben_to_ben`." + "The same methods work on an XBEN stream — pass `mode=\"xben\"`. Reading XBEN pays a\n", + "one-time decompression startup cost, so if you'll be subsampling repeatedly, extract\n", + "to BEN first with `decode_xben_to_ben`:" ] }, { @@ -531,10 +544,10 @@ "id": "59c78e95", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:02:44.373339Z", - "iopub.status.busy": "2026-06-06T03:02:44.373192Z", - "iopub.status.idle": "2026-06-06T03:02:44.379619Z", - "shell.execute_reply": "2026-06-06T03:02:44.379163Z" + "iopub.execute_input": "2026-06-12T20:37:52.117427Z", + "iopub.status.busy": "2026-06-12T20:37:52.117303Z", + "iopub.status.idle": "2026-06-12T20:37:52.123128Z", + "shell.execute_reply": "2026-06-12T20:37:52.122713Z" } }, "outputs": [ @@ -551,15 +564,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/claude-1000/ipykernel_3711264/1859154157.py:1: UserWarning: XBEN may take a second to start decoding.\n", + "/tmp/claude-1000/ipykernel_1810084/1859154157.py:1: UserWarning: XBEN may take a second to start decoding.\n", " for assignment in BenDecoder(\"example_data/small_example.xben\", mode=\"xben\").subsample_range(1, 3):\n" ] } ], "source": [ - "for assignment in BenDecoder(\n", - " \"example_data/small_example.xben\", mode=\"xben\"\n", - ").subsample_range(1, 3):\n", + "for assignment in BenDecoder(\"example_data/small_example.xben\", mode=\"xben\").subsample_range(1, 3):\n", " print(assignment[:6])" ] }, @@ -570,9 +581,12 @@ "source": [ "## Where to next\n", "\n", - "- [Working with `.bendl` bundles](using_bendl.ipynb) — the recommended self-describing\n", - " container, plus graph reordering for much better compression.\n", - "- [Concepts](../concepts/overview.md) — the formats, variants, and how the compression works.\n", + "- [Working with `.bendl` files](using_bendl.ipynb) — if you skipped it, go back: one\n", + " self-describing file with the graph, metadata, and checksums built in is almost\n", + " always what you want. Everything here (plus graph reordering for much better\n", + " compression) is available at the bundle level.\n", + "- [Concepts](../concepts/overview.md) — the formats, variants, and how the\n", + " compression works.\n", "- [API reference](../api/index.md) — every public class and function." ] } diff --git a/ben-py/docs/user/using_bendl.ipynb b/ben-py/docs/user/using_bendl.ipynb index d648bd9..c55e02c 100644 --- a/ben-py/docs/user/using_bendl.ipynb +++ b/ben-py/docs/user/using_bendl.ipynb @@ -5,47 +5,41 @@ "id": "dc9c2020", "metadata": {}, "source": [ - "# Working with `.bendl` bundles\n", - "\n", - "This tutorial is a companion to `using_ben_py.ipynb`. That notebook covers the\n", - "plain BEN/XBEN *streams* (`binary_ensemble.stream` + `binary_ensemble.codec`);\n", - "this one covers the **`.bendl` bundle** — the recommended, self-describing\n", - "container format — and walks the full `binary_ensemble.bundle` /\n", - "`binary_ensemble.graph` API, driving it with a live GerryChain ReCom run.\n", - "\n", - "It is written in the `# %%` \"percent\" cell format, so you can step through it\n", - "cell-by-cell in VS Code / Jupyter (via Jupytext) or just run it top-to-bottom\n", - "as a plain script: `python using_bendl.py`.\n", - "\n", - "## What is a bundle, and why use one?\n", - "\n", - "A plain `.ben` file is *just* the assignment stream: a sequence of districting\n", - "plans, with no record of the graph they were drawn on. To use it, a\n", - "collaborator has to separately track down the matching dual-graph JSON **and**\n", - "know the exact node ordering the assignments were written in. Lose either and\n", - "the file is undecodable.\n", - "\n", - "A `.bendl` bundle fixes this by wrapping the stream together with *assets* in a\n", - "single file:\n", - "\n", - "- the **dual graph** (`graph.json`), so the file is self-describing;\n", - "- an optional **`node_permutation_map.json`**, recording any reordering applied\n", - " to the graph for better compression;\n", - "- **`metadata.json`**, for run provenance (seed, parameters, generator, …);\n", - "- arbitrary **custom assets** (notes, analysis results, plots-as-bytes, …).\n", - "\n", - "Intended use cases:\n", - "\n", - "1. **Shareable, reproducible ensembles** — hand someone one file; they can\n", - " recover the graph and replay the plans with no side files.\n", - "2. **Provenance** — stamp the seed / chain parameters into the bundle.\n", - "3. **Better compression** — reorder the graph (RCM / multi-level clustering)\n", - " before writing so the BEN/XBEN delta-encoding shrinks; the permutation map\n", - " keeps the reordering reversible.\n", - "4. **A lifecycle** — work in BEN (fast) while a project is active, then\n", - " recompress the bundle to XBEN for long-term archival, assets preserved.\n", - "5. **Extensibility** — append analysis results to a finished bundle later,\n", - " without rewriting the stream." + "# Working with `.bendl` files\n", + "\n", + "BENDL is a portmanteau of \"BEN\" (Binary-ENsemble) and \"bundle\", and is the generally\n", + "recommended format for storage and transmission of ensembles of districting plans. It is intended \n", + "to be a single self-describing file that contains a districting ensemble, its associated graph, and\n", + "any metadata / accompanying artifacts that the user would like to include.\n", + "\n", + "The BENDL format was created to alleviate two common pain points for redistricting analysts:\n", + "\n", + "1. How do you store millions of districting plans efficiently?\n", + "2. Which graph did you use to make that ensemble again? There are like 7 in this folder...\n", + "\n", + "The purpose of this tutorial is to re-work one of the classic ReCom tutorials from\n", + "[GerryChain](https://gerrychain.readthedocs.io), but with the additional support of\n", + "`binary-ensemble`, to demonstrate the intended workflow.\n", + "\n", + "> Note: There is also a [companion notebook](using_ben_py.ipynb) on the plain BEN/XBEN\n", + "> *streams*: the central layer of every bundle that encodes a redistricting ensemble.\n", + "\n", + "## What goes in a bundle?\n", + "\n", + "A plain `.ben` file is just the assignment stream: a long sequence of districting plans and\n", + "nothing else. A `.bendl` file wraps that stream together with *assets*:\n", + "\n", + "- the **dual graph** (`graph.json`), so the file explains itself;\n", + "- an optional **`node_permutation_map.json`**, recording any reordering applied to the\n", + " graph for better compression;\n", + "- **`metadata.json`**, for run provenance (seed, parameters, generator, ...);\n", + "- arbitrary **custom assets**: notes, analysis results, plots, even geospatial blobs.\n", + "\n", + "Keeping everything in one file means an ensemble can be shared and reproduced with no side\n", + "files, the chain parameters travel with the plans, and analysis results can be appended to a\n", + "finished bundle later without rewriting the stream. Bundles also have a natural lifecycle:\n", + "work in BEN (fast) while a project is active, then recompress to XBEN for long-term archival\n", + "with every asset preserved." ] }, { @@ -55,19 +49,18 @@ "source": [ "## Setup\n", "\n", - "We need a dual graph to draw plans on. Rather than download a multi-megabyte\n", - "real-world graph, we *generate* a `SIDE × SIDE` grid (here 32×32 = 1024 nodes) —\n", - "big enough to feel like a real ensemble, small enough to run in seconds, and\n", - "fully reproducible. Each node gets unit population (`TOTPOP = 1`) and an initial\n", - "`district` label of vertical stripes, which gives ReCom a contiguous, balanced\n", - "starting partition.\n", - "\n", - "Then we deliberately **shuffle the node order**. Real-world dual graphs rarely\n", - "arrive in a compression-friendly order (think census blocks listed by GEOID, or\n", - "nodes in arbitrary shapefile order), so the stored order has no relationship to\n", - "graph locality. Shuffling reproduces that — and it's exactly the situation where\n", - "reordering before encoding pays off, which we'll see below. We write the graph\n", - "out as NetworkX adjacency JSON under `example_data/`, the shape a bundle stores." + "First we need a dual graph to draw plans on. Rather than download a multi-megabyte\n", + "real-world graph, we generate a `SIDE × SIDE` grid (here 32×32 = 1024 nodes): big enough to\n", + "behave like a real ensemble, small enough to run in seconds, and fully reproducible. Each\n", + "node gets unit population (`TOTPOP = 1`) and an initial `district` label of vertical\n", + "stripes, which gives ReCom a contiguous, balanced starting partition.\n", + "\n", + "Then we deliberately **shuffle the node order**. Real-world dual graphs rarely arrive in a\n", + "compression-friendly order (census blocks listed by GEOID, nodes in whatever order the\n", + "shapefile happened to have), so the stored order usually has no relationship to graph\n", + "locality. Shuffling reproduces that situation, and it is exactly the one where reordering\n", + "before encoding pays off, as we will see below. The graph is written out as NetworkX\n", + "adjacency JSON under `example_data/`, the shape a bundle stores." ] }, { @@ -76,10 +69,10 @@ "id": "351e870b", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:05:40.368921Z", - "iopub.status.busy": "2026-06-06T03:05:40.368852Z", - "iopub.status.idle": "2026-06-06T03:05:40.416711Z", - "shell.execute_reply": "2026-06-06T03:05:40.416323Z" + "iopub.execute_input": "2026-06-12T20:37:19.743842Z", + "iopub.status.busy": "2026-06-12T20:37:19.743771Z", + "iopub.status.idle": "2026-06-12T20:37:19.789863Z", + "shell.execute_reply": "2026-06-12T20:37:19.789504Z" } }, "outputs": [ @@ -125,9 +118,7 @@ "\n", "grid = build_grid_graph(SIDE, N_DISTRICTS)\n", "GRAPH_PATH.write_text(json.dumps(nx.readwrite.json_graph.adjacency_data(grid)))\n", - "print(\n", - " f\"graph file: {GRAPH_PATH} ({GRAPH_PATH.stat().st_size} bytes, {SIDE * SIDE} nodes)\"\n", - ")" + "print(f\"graph file: {GRAPH_PATH} ({GRAPH_PATH.stat().st_size} bytes, {SIDE * SIDE} nodes)\")" ] }, { @@ -137,15 +128,15 @@ "source": [ "### The public surface\n", "\n", - "Everything bundle-related is re-exported from the top-level package, but it\n", - "lives in two submodules:\n", + "Everything bundle-related is re-exported from the top-level package, but it lives in two\n", + "submodules:\n", "\n", "- `binary_ensemble.bundle` — `BendlEncoder`, `BendlDecoder`, `compress_stream`\n", "- `binary_ensemble.graph` — `reorder`, `reorder_multi_level_cluster`,\n", " `reorder_reverse_cuthill_mckee`, `reorder_by_key`\n", "\n", - "(The plain-stream `BenEncoder` / `BenDecoder` and the whole-file `encode_*` /\n", - "`decode_*` codec helpers are the subject of the BEN tutorial.)" + "> Note: The plain-stream `BenEncoder` / `BenDecoder` and the whole-file `encode_*` /\n", + "> `decode_*` codec helpers are covered in the [streams notebook](using_ben_py.ipynb)." ] }, { @@ -154,10 +145,10 @@ "id": "e0fa37a7", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:05:40.417603Z", - "iopub.status.busy": "2026-06-06T03:05:40.417449Z", - "iopub.status.idle": "2026-06-06T03:05:40.420025Z", - "shell.execute_reply": "2026-06-06T03:05:40.419752Z" + "iopub.execute_input": "2026-06-12T20:37:19.790796Z", + "iopub.status.busy": "2026-06-12T20:37:19.790634Z", + "iopub.status.idle": "2026-06-12T20:37:19.794060Z", + "shell.execute_reply": "2026-06-12T20:37:19.793798Z" } }, "outputs": [], @@ -171,13 +162,12 @@ "id": "42d0386c", "metadata": {}, "source": [ - "## The GerryChain ingredients\n", + "## Adding in GerryChain\n", "\n", - "We drive everything with a short ReCom chain. The chain's *recipe* (proposal,\n", - "constraints, updaters) is independent of how nodes are ordered, so we factor it\n", - "into a helper that builds a fresh chain on whatever graph we hand it. We'll call\n", - "this once per bundle and **stream each plan to disk as the chain produces it** —\n", - "no need to hold the whole ensemble in memory." + "As per usual, our example ReCom chain will use a ReCom proposal and a couple of updaters.\n", + "The standard chain recipe is independent of how the nodes are ordered, so we factor it into a\n", + "helper that builds a fresh chain on whatever graph we hand it. We will call this once per bundle\n", + "and stream each plan to disk as the chain produces it." ] }, { @@ -186,17 +176,17 @@ "id": "b34e60bd", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:05:40.420533Z", - "iopub.status.busy": "2026-06-06T03:05:40.420472Z", - "iopub.status.idle": "2026-06-06T03:05:40.684962Z", - "shell.execute_reply": "2026-06-06T03:05:40.684568Z" + "iopub.execute_input": "2026-06-12T20:37:19.794965Z", + "iopub.status.busy": "2026-06-12T20:37:19.794887Z", + "iopub.status.idle": "2026-06-12T20:37:20.060421Z", + "shell.execute_reply": "2026-06-12T20:37:20.059964Z" } }, "outputs": [], "source": [ "from functools import partial\n", "\n", - "from gerrychain import Graph, MarkovChain, Partition, accept, constraints, updaters\n", + "from gerrychain import Graph, MarkovChain, Partition, accept, updaters\n", "from gerrychain.proposals import recom\n", "\n", "\n", @@ -212,7 +202,7 @@ " proposal=partial(\n", " recom, pop_col=\"TOTPOP\", pop_target=ideal_pop, epsilon=0.05, node_repeats=2\n", " ),\n", - " constraints=[constraints.contiguous],\n", + " constraints=[],\n", " accept=accept.always_accept,\n", " initial_state=initial,\n", " total_steps=steps,\n", @@ -224,26 +214,35 @@ "id": "66229db9", "metadata": {}, "source": [ - "## Writing your first bundle — encoding as the chain runs\n", + "## Writing your first BENDL\n", "\n", - "You do **not** need to use `BendlEncoder` itself as a context manager. Only the\n", - "`stream(...)` writer needs a `with` block: closing the stream context is what\n", - "finalizes the bundle. So the pattern is:\n", + "When working with the BENDL format, the general workflow is as follows:\n", "\n", "1. create the encoder and add the graph (and any other assets),\n", "2. open the single-use `stream(...)` in a `with` block,\n", - "3. iterate the chain and `write` each plan inside it,\n", + "3. loop through the chain and `write` each plan to the stream,\n", "4. when the `with enc.stream(...)` block exits, the bundle is finalized on disk.\n", "\n", - "The one rule when writing: every assignment must be in a **fixed, known node\n", - "order**. GerryChain makes no ordering promise, so we pin the order to the graph's\n", - "node iteration order and reindex each plan to it.\n", + "It is important to note that, when writing to a BENDL file, every assignment must be in a fixed,\n", + "known node order. GerryChain makes no ordering promise, so we pin the order to the graph's node\n", + "iteration order and reindex each plan to it.\n", + "\n", + "To take full advantage of the BENDL format, we need to first store the graph that we intend to pass\n", + "to our Markov chain in the bundle itself. The `BendlEncoder` class provides a `sort` parameter that\n", + "you can modify to make the ensemble storage more efficient (more on that later).\n", "\n", - "A convenient trick: `add_graph` *returns* the embedded graph (as a NetworkX\n", - "graph), so we can build the GerryChain graph straight from it and guarantee the\n", - "write order matches what gets stored. For this first bundle we pass\n", - "`sort=None` to store the graph in its raw (shuffled) order — a\n", - "deliberately un-optimized baseline we'll improve on next." + "The `add_graph` method accepts most of the standard formats that you would expect:\n", + "\n", + "- a `networkx.Graph` instance (subclasses such as `gerrychain.Graph` count),\n", + "- [adjacency-format](https://networkx.org/documentation/stable/_modules/networkx/readwrite/json_graph/adjacency.html#adjacency_data)\n", + " JSON as a parsed `dict` or `list`, raw `bytes`, or a file-like object with `.read()`,\n", + "- a `str` / `os.PathLike` path to a JSON file. A plain `str` is interpreted as a path.\n", + "\n", + "The `add_graph` function also *returns* the embedded graph (as a NetworkX graph) for immediate use,\n", + "i.e. for building the GerryChain graph, and the write order is guaranteed to match what gets stored.\n", + "\n", + "For this first bundle we pass `sort=None` to store the graph in its raw order, and we will show\n", + "some optional pre-processing steps that can decrease the bundle size momentarily." ] }, { @@ -252,10 +251,10 @@ "id": "68ad7766", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:05:40.686373Z", - "iopub.status.busy": "2026-06-06T03:05:40.686121Z", - "iopub.status.idle": "2026-06-06T03:06:04.364287Z", - "shell.execute_reply": "2026-06-06T03:06:04.363727Z" + "iopub.execute_input": "2026-06-12T20:37:20.061642Z", + "iopub.status.busy": "2026-06-12T20:37:20.061477Z", + "iopub.status.idle": "2026-06-12T20:37:33.501548Z", + "shell.execute_reply": "2026-06-12T20:37:33.501204Z" } }, "outputs": [ @@ -268,16 +267,16 @@ } ], "source": [ - "encoder = BendlEncoder(\"example_data/basic.bendl\", overwrite=True) # no `with` needed\n", + "encoder = BendlEncoder(\"example_data/basic.bendl\", overwrite=True)\n", "stored_graph = encoder.add_graph(GRAPH_PATH, sort=None)\n", "gc_graph = Graph.from_networkx(stored_graph)\n", - "write_order = list(gc_graph.nodes) # the order stored == the order we write\n", + "node_order = list(gc_graph.nodes) # the order stored == the order we write assignments in\n", "\n", - "with encoder.stream(\"ben\") as stream: # only the stream is context-managed\n", + "with encoder.stream() as stream: # only the stream is context-managed\n", " for partition in make_chain(gc_graph, steps=1000):\n", " series = partition.assignment.to_series()\n", - " stream.write(series.loc[write_order].astype(int).tolist())\n", - "# the bundle is finalized now that the stream context has closed\n", + " stream.write(series.loc[node_order].astype(int).tolist())\n", + "# the bundle is finalized now that the stream context has closed and can no longer be updated\n", "\n", "print(\"wrote example_data/basic.bendl\")" ] @@ -287,11 +286,10 @@ "id": "ac4a2306", "metadata": {}, "source": [ - "A note on validation: because we embedded a graph *before* the stream, the\n", - "encoder knows the node count and checks every `write` against it. A\n", - "wrong-length assignment raises immediately instead of silently corrupting the\n", - "file (and because the exception escapes the stream context, the bundle is left\n", - "unfinalized rather than stamped complete — more on that at the end):" + "Because we embedded the graph *before* the stream, the encoder knows the node count and\n", + "checks every `write` against it. A wrong-length assignment raises immediately instead of\n", + "silently corrupting the file, and since the exception escapes the stream context, the\n", + "bundle is left unfinalized rather than stamped complete (more on that at the end):" ] }, { @@ -300,10 +298,10 @@ "id": "4723655b", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:04.365628Z", - "iopub.status.busy": "2026-06-06T03:06:04.365482Z", - "iopub.status.idle": "2026-06-06T03:06:04.399476Z", - "shell.execute_reply": "2026-06-06T03:06:04.398745Z" + "iopub.execute_input": "2026-06-12T20:37:33.502333Z", + "iopub.status.busy": "2026-06-12T20:37:33.502254Z", + "iopub.status.idle": "2026-06-12T20:37:33.530755Z", + "shell.execute_reply": "2026-06-12T20:37:33.530414Z" } }, "outputs": [ @@ -319,10 +317,10 @@ "encoder = BendlEncoder(\"example_data/willfail.bendl\", overwrite=True)\n", "encoder.add_graph(GRAPH_PATH, sort=None)\n", "try:\n", - " with encoder.stream(\"ben\") as stream:\n", + " with encoder.stream() as stream:\n", " stream.write([0, 1, 2]) # too short\n", "except ValueError as e:\n", - " print(\"rejected as expected:\", e)" + " print(f\"rejected as expected: {e}\")" ] }, { @@ -332,26 +330,29 @@ "source": [ "## Reordering for compression (the default)\n", "\n", - "BEN/XBEN compress *runs of equal adjacent labels* well, so a node ordering that\n", - "keeps neighbouring nodes near each other in the stream compresses much better.\n", - "Because our grid's stored order is shuffled, the raw `basic.bendl` above is a\n", - "worst case. Fixing it is the encoder's default behaviour: `add_graph` reorders\n", - "the graph with **multi-level clustering (`sort=\"mlc\"`)** unless you opt out with\n", - "`sort=None`. Reordering:\n", + "BEN and XBEN compress *runs of equal adjacent labels*, so a node ordering that keeps\n", + "neighbouring nodes near each other in the stream compresses much better. Our grid's stored\n", + "order is shuffled, which makes the raw `basic.bendl` above close to a worst case. Fixing\n", + "this is the encoder's default behaviour: `add_graph` reorders the graph with multi-level\n", + "clustering (`sort=\"mlc\"`) unless you opt out with `sort=None`. Reordering:\n", "\n", - "- reorders the graph — `sort=\"mlc\"` (default), `sort=\"rcm\"`, or `sort=\"key\"`\n", - " with `key=\"\"` (e.g. `key=\"GEOID\"`) to sort by a node attribute,\n", + "- reorders the graph — `sort=\"mlc\"` (default), `sort=\"rcm\"`, or `sort=\"key\"` with\n", + " `key=\"\"` (e.g. `key=\"GEOID\"`) to sort by a node attribute, \n", "- stores both the reordered `graph.json` **and** a `node_permutation_map.json`,\n", "- and **returns the reordered graph**.\n", "\n", - "Returning the reordered graph is what makes this ergonomic: we build the *entire\n", - "ReCom chain on that ordering*, so the chain's natural node order already equals\n", - "the stored order — streaming needs no extra bookkeeping. **Reordering is\n", - "pre-stream only** (it decides the write order), so `add_graph(...)` must come\n", - "before `stream()`.\n", + "That last point is what makes the workflow tidy: build the entire ReCom chain on the\n", + "returned graph, and the chain's natural node order already equals the stored order, so\n", + "streaming needs no extra bookkeeping.\n", + "\n", + "> Note: For graphs composed of census blocks, sorting by the \"GEOID\" attribute generally produces\n", + "> the best compression\n", + "\n", + "> Note: Reordering is pre-stream only — it decides the write order — so `add_graph(...)`\n", + "> must come before `stream()`.\n", "\n", - "We'll make this the \"real\" bundle for the rest of the tutorial, so we also stamp\n", - "in metadata and a couple of custom assets while we're here." + "This will be the \"real\" bundle for the rest of the tutorial, so we also stamp in metadata\n", + "and a custom asset while we are here." ] }, { @@ -360,10 +361,10 @@ "id": "af9eb82e", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:04.400699Z", - "iopub.status.busy": "2026-06-06T03:06:04.400539Z", - "iopub.status.idle": "2026-06-06T03:06:24.003515Z", - "shell.execute_reply": "2026-06-06T03:06:24.001220Z" + "iopub.execute_input": "2026-06-12T20:37:33.531799Z", + "iopub.status.busy": "2026-06-12T20:37:33.531708Z", + "iopub.status.idle": "2026-06-12T20:37:45.929921Z", + "shell.execute_reply": "2026-06-12T20:37:45.929571Z" } }, "outputs": [ @@ -371,12 +372,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "wrote example_data/rich.bendl\n" + "wrote example_data/mlc_reordered.bendl\n" ] } ], "source": [ - "encoder = BendlEncoder(\"example_data/rich.bendl\", overwrite=True)\n", + "from datetime import datetime\n", + "\n", + "encoder = BendlEncoder(\"example_data/mlc_reordered.bendl\", overwrite=True)\n", "\n", "# add_graph reorders with MLC by default; build the chain on the returned graph.\n", "reordered_graph = encoder.add_graph(GRAPH_PATH)\n", @@ -385,18 +388,24 @@ "\n", "# Provenance + extra assets (covered in detail in the next section).\n", "encoder.add_metadata(\n", - " {\"generator\": \"gerrychain\", \"proposal\": \"recom\", \"epsilon\": 0.05, \"seed\": 1234}\n", - ")\n", - "encoder.add_asset(\n", - " \"readme.txt\", \"ReCom ensemble on a 32x32 grid, MLC-reordered.\", \"text\"\n", + " {\n", + " \"generator\": \"gerrychain\",\n", + " \"proposal\": \"recom\",\n", + " \"epsilon\": 0.05,\n", + " \"seed\": 1234,\n", + " \"created_by\": \"me\",\n", + " \"created_at\": datetime(1970, 1, 1).isoformat(),\n", + " \"description\": \"ReCom ensemble on a 32x32 grid, MLC-reordered.\",\n", + " }\n", ")\n", + "encoder.add_asset(\"readme.txt\", \"ReCom ensemble on a 32x32 grid, MLC-reordered.\", \"text\")\n", "\n", - "with encoder.stream(\"ben\") as stream:\n", + "with encoder.stream() as stream:\n", " for partition in make_chain(gc_graph, steps=1000):\n", " series = partition.assignment.to_series()\n", " stream.write(series.loc[write_order].astype(int).tolist())\n", "\n", - "print(\"wrote example_data/rich.bendl\")" + "print(\"wrote example_data/mlc_reordered.bendl\")" ] }, { @@ -406,12 +415,12 @@ "source": [ "### Did reordering actually help?\n", "\n", - "Tempting as it is to compare `basic.bendl` against `rich.bendl`, that isn't a\n", - "fair fight: they hold **different ensembles** — each was streamed live from its\n", - "own independent ReCom run — so their stream sizes mix the ordering effect with\n", - "run-to-run randomness. Let's look anyway, then do it properly. We compare the\n", - "*embedded BEN stream* sizes (the assignment data only, excluding assets and\n", - "header) by extracting each stream and measuring it:" + "It is tempting to just compare `basic.bendl` against `mlc_reordered.bendl`, but that is not a fair\n", + "fight: they hold different ensembles (each was streamed live from its own independent ReCom\n", + "run), so their stream sizes mix the ordering effect with run-to-run randomness. We'll take a\n", + "look anyway, and then do the comparison properly. The bundle header records the exact byte\n", + "length of the embedded assignment stream, so `stream_size()` reads it back without decoding\n", + "or copying anything:" ] }, { @@ -420,10 +429,10 @@ "id": "d3ae7f81", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.012031Z", - "iopub.status.busy": "2026-06-06T03:06:24.011612Z", - "iopub.status.idle": "2026-06-06T03:06:24.030110Z", - "shell.execute_reply": "2026-06-06T03:06:24.024395Z" + "iopub.execute_input": "2026-06-12T20:37:45.930923Z", + "iopub.status.busy": "2026-06-12T20:37:45.930833Z", + "iopub.status.idle": "2026-06-12T20:37:45.932573Z", + "shell.execute_reply": "2026-06-12T20:37:45.932347Z" } }, "outputs": [ @@ -431,27 +440,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "basic.bendl (raw, run A): 135840 bytes\n", - "rich.bendl (mlc, run B): 40081 bytes\n" + "basic.bendl (raw, run A): 135772 bytes\n", + "mlc_reordered.bendl (mlc, run B): 40521 bytes\n" ] } ], "source": [ - "import os\n", - "\n", - "\n", - "def stream_size(path):\n", - " \"\"\"Size in bytes of a bundle's embedded BEN stream (extracted).\"\"\"\n", - " decoder = BendlDecoder(path)\n", - " tmp = \"example_data/_measure.ben\"\n", - " decoder.extract_stream(tmp, overwrite=True)\n", - " size = os.path.getsize(tmp)\n", - " os.remove(tmp)\n", - " return size\n", - "\n", + "basic_size = BendlDecoder(\"example_data/basic.bendl\").stream_size()\n", + "mlc_size = BendlDecoder(\"example_data/mlc_reordered.bendl\").stream_size()\n", "\n", - "print(f\"basic.bendl (raw, run A): {stream_size('example_data/basic.bendl'):>8} bytes\")\n", - "print(f\"rich.bendl (mlc, run B): {stream_size('example_data/rich.bendl'):>8} bytes\")" + "print(f\"basic.bendl (raw, run A): {basic_size:>8} bytes\")\n", + "print(f\"mlc_reordered.bendl (mlc, run B): {mlc_size:>8} bytes\")" ] }, { @@ -459,13 +458,12 @@ "id": "b0d561f5", "metadata": {}, "source": [ - "For a true **apples-to-apples** measurement we need the *same* plans in two\n", - "orderings. We can get that without running a second chain by **relabeling**\n", - "`basic.bendl`'s exact ensemble into MLC order. `relabel_bundle` does exactly\n", - "this in one call: it reorders the stored graph, rewrites every assignment into\n", - "the new node order, and stores a `node_permutation_map.json` so the change stays\n", - "reversible (it preserves metadata and custom assets too). It's the bundle-level\n", - "form of the CLI's `reben` ordering step:" + "For an apples-to-apples measurement we need the *same* plans in two orderings. We can get\n", + "that without running a second chain by relabeling `basic.bendl`'s exact ensemble into MLC\n", + "order. `relabel_bundle` does the whole thing in one call: it reorders the stored graph,\n", + "rewrites every assignment into the new node order, and stores a\n", + "`node_permutation_map.json` so the change stays reversible (metadata and custom assets come\n", + "along too). It is the bundle-level form of the CLI's `reben` ordering step:" ] }, { @@ -474,10 +472,10 @@ "id": "a0f34268", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.037694Z", - "iopub.status.busy": "2026-06-06T03:06:24.037235Z", - "iopub.status.idle": "2026-06-06T03:06:24.087832Z", - "shell.execute_reply": "2026-06-06T03:06:24.086921Z" + "iopub.execute_input": "2026-06-12T20:37:45.933211Z", + "iopub.status.busy": "2026-06-12T20:37:45.933153Z", + "iopub.status.idle": "2026-06-12T20:37:46.037855Z", + "shell.execute_reply": "2026-06-12T20:37:46.037492Z" } }, "outputs": [ @@ -485,8 +483,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "same ensemble, raw order: 135840 bytes\n", - "same ensemble, MLC order: 39908 bytes\n", + "same ensemble, raw order: 135772 bytes\n", + "same ensemble, MLC order: 40192 bytes\n", "-> 3.4x smaller from reordering alone\n" ] } @@ -494,14 +492,16 @@ "source": [ "from binary_ensemble import relabel_bundle\n", "\n", - "# out_file won't overwrite an existing file, so clear any copy from a previous run.\n", - "Path(\"example_data/relabeled.bendl\").unlink(missing_ok=True)\n", + "# overwrite=True replaces any copy left behind by a previous run of this notebook.\n", "relabel_bundle(\n", - " \"example_data/basic.bendl\", out_file=\"example_data/relabeled.bendl\", sort=\"mlc\"\n", + " \"example_data/basic.bendl\",\n", + " out_file=\"example_data/basic_mlc_relabeled.bendl\",\n", + " sort=\"mlc\",\n", + " overwrite=True,\n", ")\n", "\n", - "raw_bytes = stream_size(\"example_data/basic.bendl\")\n", - "mlc_bytes = stream_size(\"example_data/relabeled.bendl\")\n", + "raw_bytes = BendlDecoder(\"example_data/basic.bendl\").stream_size()\n", + "mlc_bytes = BendlDecoder(\"example_data/basic_mlc_relabeled.bendl\").stream_size()\n", "print(f\"same ensemble, raw order: {raw_bytes:>8} bytes\")\n", "print(f\"same ensemble, MLC order: {mlc_bytes:>8} bytes\")\n", "print(f\"-> {raw_bytes / mlc_bytes:.1f}x smaller from reordering alone\")" @@ -512,13 +512,14 @@ "id": "f80e0806", "metadata": {}, "source": [ - "Now the *only* thing that changed is the node ordering, so that ratio is the\n", - "real compression win from MLC — and it's why MLC is the **default** in\n", - "`add_graph`. (On a graph that already arrives in a locality-friendly order the\n", - "gain is smaller, and the extra `node_permutation_map.json` can even make a tiny\n", - "file net-larger, but reordering is cheap and rarely hurts — so the encoder does\n", - "it for you unless you ask for raw with `sort=None`.) It matters most\n", - "right before an expensive XBEN recompress, where every byte of BEN is amplified." + "Now the only thing that changed is the node ordering, so that ratio is the real compression\n", + "win from MLC. On more complicated dual graphs, the savings can be very significant, and these\n", + "savings matters most right before an expensive XBEN recompress.\n", + "\n", + "> Note: On a graph that already arrives in a locality-friendly order the gain is smaller,\n", + "> and the extra `node_permutation_map.json` can even make a tiny file net-larger.\n", + "> Reordering is cheap and rarely hurts, though, so the encoder does it unless you ask for\n", + "> raw order with `sort=None`." ] }, { @@ -528,10 +529,10 @@ "source": [ "### Reordering under the hood: the standalone utilities\n", "\n", - "`add_graph(..., sort=..., key=...)` is built on the `binary_ensemble.graph`\n", - "utilities, which you can also call directly — handy when you want to compute an\n", - "ordering once and reuse it, or inspect the permutation map before committing.\n", - "Each returns `(reordered_graph, node_permutation_map)`: a live NetworkX graph\n", + "`add_graph(..., sort=..., key=...)` is built on the `binary_ensemble.graph` utilities,\n", + "which you can also call directly. This is handy when you want to compute an ordering once\n", + "and reuse it across several bundles, or inspect the permutation map before committing to\n", + "anything. Each returns `(reordered_graph, node_permutation_map)`: a live NetworkX graph\n", "plus the map dict." ] }, @@ -541,10 +542,10 @@ "id": "4ccffbe8", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.089778Z", - "iopub.status.busy": "2026-06-06T03:06:24.089558Z", - "iopub.status.idle": "2026-06-06T03:06:24.210989Z", - "shell.execute_reply": "2026-06-06T03:06:24.207417Z" + "iopub.execute_input": "2026-06-12T20:37:46.038737Z", + "iopub.status.busy": "2026-06-12T20:37:46.038661Z", + "iopub.status.idle": "2026-06-12T20:37:46.163228Z", + "shell.execute_reply": "2026-06-12T20:37:46.162805Z" } }, "outputs": [ @@ -561,31 +562,26 @@ ], "source": [ "reordered, permutation_map = bgraph.reorder(GRAPH_PATH, sort=\"rcm\")\n", - "print(\n", - " \"reorder(sort='rcm') ->\",\n", - " type(reordered).__name__,\n", - " \"with\",\n", - " reordered.number_of_nodes(),\n", - " \"nodes\",\n", - ")\n", + "print(f\"reorder(sort='rcm') -> {type(reordered).__name__} with {reordered.number_of_nodes()} nodes\")\n", "\n", - "# Sort by a node attribute with sort=\"key\" + key=... (on real data this is how\n", - "# you'd order by, say, \"GEOID\"; here the grid only has \"district\"/\"id\"):\n", + "# Sort by a node attribute with sort=\"key\" + key=... (on real data this is how you'd order by,\n", + "# say, \"GEOID\"; here the grid only has \"district\"/\"id\"):\n", "graph_mlc, _ = bgraph.reorder(GRAPH_PATH, sort=\"mlc\")\n", "graph_rcm, _ = bgraph.reorder(GRAPH_PATH, sort=\"rcm\")\n", "graph_by_district, _ = bgraph.reorder(GRAPH_PATH, sort=\"key\", key=\"district\")\n", - "# reorder_multi_level_cluster / reorder_reverse_cuthill_mckee / reorder_by_key are\n", - "# thin convenience wrappers over these.\n", + "\n", + "# reorder_multi_level_cluster / reorder_reverse_cuthill_mckee / reorder_by_key are thin convenience\n", + "# wrappers over these.\n", "print(\"orderings: sort='mlc', sort='rcm', or sort='key' with key=''\")\n", "\n", "# The permutation map is what makes a reordering reversible: its required field\n", "# `node_permutation_old_to_new` maps original 0-based node positions -> new ones.\n", "old_to_new = permutation_map[\"node_permutation_old_to_new\"]\n", - "print(\n", - " \"old_to_new is a bijection over [0, n):\",\n", - " sorted(old_to_new.values()) == list(range(reordered.number_of_nodes())),\n", - ")\n", - "print(\"provenance fields:\", {k: permutation_map[k] for k in (\"ordering_method\", \"key\")})" + "is_bijection = sorted(old_to_new.values()) == list(range(reordered.number_of_nodes()))\n", + "provenance = {k: permutation_map[k] for k in (\"ordering_method\", \"key\")}\n", + "\n", + "print(f\"old_to_new is a bijection over [0, n): {is_bijection}\")\n", + "print(f\"provenance fields: {provenance}\")" ] }, { @@ -595,18 +591,26 @@ "source": [ "## Metadata and custom assets\n", "\n", - "We already used these while building `rich.bendl`. `add_metadata` writes the\n", - "canonical `metadata.json` (provenance). `add_asset` writes a *custom* asset\n", - "under a name you choose, with a `content_type` of `\"json\"` or `\"text\"`:\n", - "\n", - "- `\"json\"` — payload must be valid UTF-8 JSON; the decoder will auto-parse it.\n", - "- `\"text\"` — payload must be valid UTF-8; stored without the JSON flag.\n", - "\n", - "The facade validates the payload, so a malformed `\"json\"` asset is caught at\n", - "write time. Assets may be added before *or* after the stream — only the stream\n", - "itself is single-use. Post-stream adds commit immediately (one directory\n", - "rewrite each), so use them sparingly. Here we tack a JSON asset onto an\n", - "already-finalized bundle to show both behaviours:" + "We already used both of these while building `mlc_reordered.bendl`. `add_metadata` writes the\n", + "canonical `metadata.json` (provenance). `add_asset` stores a custom asset under a name you\n", + "choose, and the `content_type` tells the facade how to treat the payload:\n", + "\n", + "- `\"json\"` — a `dict`/`list` (serialized for you) or a JSON string; the decoder\n", + " auto-parses it on the way out.\n", + "- `\"text\"` — any UTF-8 string.\n", + "- `\"binary\"` — raw bytes, stored verbatim: plots, pickles, zipped shapefiles, anything.\n", + "- `\"file\"` — a path (`str` or `pathlib.Path`); the file's bytes are read in and stored.\n", + " This is the easy way to ship an existing file inside the bundle.\n", + "\n", + "Every asset carries a CRC32C checksum, and payloads of 1 KiB or more are xz-compressed on\n", + "disk (both are transparent on read). The facade validates payloads up front, so a malformed\n", + "`\"json\"` asset is caught at write time rather than discovered by a collaborator at read\n", + "time.\n", + "\n", + "Assets may be added before or after the stream; only the stream itself is single-use.\n", + "Post-stream adds commit immediately (one directory rewrite each), so use them sparingly.\n", + "Here we tack a couple of assets onto the already-finalized `mlc_reordered.bendl` to show all of this\n", + "at once:" ] }, { @@ -615,10 +619,10 @@ "id": "f4db28a9", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.218758Z", - "iopub.status.busy": "2026-06-06T03:06:24.217041Z", - "iopub.status.idle": "2026-06-06T03:06:24.230725Z", - "shell.execute_reply": "2026-06-06T03:06:24.229938Z" + "iopub.execute_input": "2026-06-12T20:37:46.164026Z", + "iopub.status.busy": "2026-06-12T20:37:46.163949Z", + "iopub.status.idle": "2026-06-12T20:37:46.166275Z", + "shell.execute_reply": "2026-06-12T20:37:46.166060Z" } }, "outputs": [ @@ -631,17 +635,21 @@ } ], "source": [ - "# Add to rich.bendl after the fact (this finalized bundle is reopened to append).\n", - "# In append mode each add_* commits immediately, so there is nothing to finalize.\n", - "appender = BendlEncoder.append(\"example_data/rich.bendl\")\n", - "appender.add_asset(\"params.json\", json.dumps({\"node_repeats\": 2}), \"json\")\n", + "# Reopen the finalized bundle in append mode. Each add_* commits immediately,\n", + "# so there is nothing to finalize afterwards.\n", + "appender = BendlEncoder.append(\"example_data/mlc_reordered.bendl\")\n", + "appender.add_asset(\"params.json\", {\"node_repeats\": 2}, \"json\") # dicts are fine\n", + "\n", + "# Ship an existing file (here a stand-in for, say, a geopackage) straight off disk:\n", + "Path(\"example_data/tracts.gpkg\").write_bytes(b\"GPKG\\x00stand-in geospatial bytes\")\n", + "appender.add_asset(\"tracts.gpkg\", \"example_data/tracts.gpkg\", \"file\")\n", "\n", "# Validation in action — a \"json\" asset that isn't JSON is rejected up front:\n", "encoder = BendlEncoder(\"example_data/tmp.bendl\", overwrite=True)\n", "try:\n", " encoder.add_asset(\"bad.json\", \"this is not json\", \"json\")\n", "except ValueError as e:\n", - " print(\"rejected as expected:\", e)" + " print(f\"rejected as expected: {e}\")" ] }, { @@ -651,16 +659,16 @@ "source": [ "## Reading a bundle\n", "\n", - "`BendlDecoder(path)` opens a bundle. The **canonical getters** pull the\n", - "well-known assets back in convenient form:\n", + "`BendlDecoder(path)` opens a bundle. The canonical getters pull the well-known assets back\n", + "out in convenient form:\n", "\n", "- `read_graph()` → a live **NetworkX graph** (or `None` if absent),\n", "- `read_metadata()` → parsed `metadata.json` (or `None`),\n", "- `read_node_permutation_map()` → parsed map dict (or `None`).\n", "\n", - "Crucially, `read_graph()` returns the graph in the node order the assignments\n", - "were written in — which, because we built the chain on the reordered graph, is\n", - "exactly the reordered order. It lines up with the stream with no extra work." + "The important detail: `read_graph()` returns the graph in the node order the assignments\n", + "were written in. Since we built the chain on the reordered graph, everything will line up with the \n", + "stream automatically:" ] }, { @@ -669,10 +677,10 @@ "id": "56fdac96", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.233188Z", - "iopub.status.busy": "2026-06-06T03:06:24.232969Z", - "iopub.status.idle": "2026-06-06T03:06:24.272046Z", - "shell.execute_reply": "2026-06-06T03:06:24.270881Z" + "iopub.execute_input": "2026-06-12T20:37:46.166904Z", + "iopub.status.busy": "2026-06-12T20:37:46.166825Z", + "iopub.status.idle": "2026-06-12T20:37:46.172294Z", + "shell.execute_reply": "2026-06-12T20:37:46.172000Z" } }, "outputs": [ @@ -681,27 +689,21 @@ "output_type": "stream", "text": [ "read_graph() -> Graph with 1024 nodes\n", - "read_metadata() -> {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234}\n", + "read_metadata() -> {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234, 'created_by': 'me', 'created_at': '1970-01-01T00:00:00', 'description': 'ReCom ensemble on a 32x32 grid, MLC-reordered.'}\n", "read_node_permutation_map() has old_to_new: True\n" ] } ], "source": [ - "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", "\n", "packaged_graph = decoder.read_graph()\n", - "print(\n", - " \"read_graph() ->\",\n", - " type(packaged_graph).__name__,\n", - " \"with\",\n", - " packaged_graph.number_of_nodes(),\n", - " \"nodes\",\n", - ")\n", - "print(\"read_metadata() ->\", decoder.read_metadata())\n", - "print(\n", - " \"read_node_permutation_map() has old_to_new:\",\n", - " \"node_permutation_old_to_new\" in decoder.read_node_permutation_map(),\n", - ")" + "graph_desc = f\"{type(packaged_graph).__name__} with {packaged_graph.number_of_nodes()} nodes\"\n", + "has_map = \"node_permutation_old_to_new\" in decoder.read_node_permutation_map()\n", + "\n", + "print(f\"read_graph() -> {graph_desc}\")\n", + "print(f\"read_metadata() -> {decoder.read_metadata()}\")\n", + "print(f\"read_node_permutation_map() has old_to_new: {has_map}\")" ] }, { @@ -709,13 +711,13 @@ "id": "45770f97", "metadata": {}, "source": [ - "**Generic accessors** reach any asset by name:\n", + "The generic accessors reach any asset by name:\n", "\n", "- `read_asset_bytes(name)` → raw `bytes`,\n", "- `read_json_asset(name)` → parsed JSON.\n", "\n", - "Note `read_json_asset(\"graph.json\")` gives you the *raw* adjacency dict, in case\n", - "you want the JSON rather than the rebuilt NetworkX object." + "> Note: `read_json_asset(\"graph.json\")` gives you the raw adjacency dict, in case you want\n", + "> the JSON rather than the rebuilt NetworkX object." ] }, { @@ -724,10 +726,10 @@ "id": "e76546f3", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.278023Z", - "iopub.status.busy": "2026-06-06T03:06:24.277591Z", - "iopub.status.idle": "2026-06-06T03:06:24.294493Z", - "shell.execute_reply": "2026-06-06T03:06:24.292037Z" + "iopub.execute_input": "2026-06-12T20:37:46.172840Z", + "iopub.status.busy": "2026-06-12T20:37:46.172781Z", + "iopub.status.idle": "2026-06-12T20:37:46.175240Z", + "shell.execute_reply": "2026-06-12T20:37:46.175060Z" } }, "outputs": [ @@ -737,17 +739,18 @@ "text": [ "readme.txt -> b'ReCom ensemble on a 32x32 grid, MLC-reordered.'\n", "params.json -> {'node_repeats': 2}\n", + "tracts.gpkg -> b'GPKG\\x00stand-in geospatial bytes'\n", "graph.json (raw dict) top-level keys: ['directed', 'multigraph', 'graph', 'nodes', 'adjacency']\n" ] } ], "source": [ - "print(\"readme.txt ->\", decoder.read_asset_bytes(\"readme.txt\"))\n", - "print(\"params.json ->\", decoder.read_json_asset(\"params.json\"))\n", - "print(\n", - " \"graph.json (raw dict) top-level keys:\",\n", - " list(decoder.read_json_asset(\"graph.json\").keys()),\n", - ")" + "graph_keys = list(decoder.read_json_asset(\"graph.json\").keys())\n", + "\n", + "print(f\"readme.txt -> {decoder.read_asset_bytes('readme.txt')}\")\n", + "print(f\"params.json -> {decoder.read_json_asset('params.json')}\")\n", + "print(f\"tracts.gpkg -> {decoder.read_asset_bytes('tracts.gpkg')}\")\n", + "print(f\"graph.json (raw dict) top-level keys: {graph_keys}\")" ] }, { @@ -757,14 +760,17 @@ "source": [ "## Inspecting a bundle\n", "\n", - "Before (or instead of) reading payloads, you can inspect structure — handy for\n", - "tooling, debugging, or deciding whether a file is what you expect:\n", + "Before (or instead of) reading payloads, you can inspect structure. This is handy for\n", + "tooling, debugging, or deciding whether a file is what you think it is:\n", "\n", - "- `version` → `(major, minor)` format version,\n", + "- `version()` → `(major, minor)` format version,\n", "- `is_complete()` → was it finalized cleanly,\n", "- `assignment_format()` → `\"ben\"` or `\"xben\"`,\n", + "- `stream_size()` → byte length of the embedded stream, straight from the header,\n", + "- `asset_size(name)` → stored byte length of one asset, straight from the directory\n", + " (the compressed size for xz-flagged assets),\n", "- `asset_names()` → directory names in order,\n", - "- `list_assets()` → full directory: name, type, offset, len, flag tags,\n", + "- `list_assets()` → the full directory: name, type, offset, len, flag tags,\n", "- `len(dec)` / `count_samples()` → number of plans in the stream." ] }, @@ -774,10 +780,10 @@ "id": "59091af4", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.299715Z", - "iopub.status.busy": "2026-06-06T03:06:24.298245Z", - "iopub.status.idle": "2026-06-06T03:06:24.319565Z", - "shell.execute_reply": "2026-06-06T03:06:24.317012Z" + "iopub.execute_input": "2026-06-12T20:37:46.175851Z", + "iopub.status.busy": "2026-06-12T20:37:46.175798Z", + "iopub.status.idle": "2026-06-12T20:37:46.177529Z", + "shell.execute_reply": "2026-06-12T20:37:46.177324Z" } }, "outputs": [ @@ -785,30 +791,76 @@ "name": "stdout", "output_type": "stream", "text": [ - "version: (1, 0)\n", + "format version: (1, 0)\n", "is_complete: True\n", "assignment_format: ben\n", + "stream_size: 40521 bytes\n", "sample count: 1000\n", - "asset_names: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json']\n", + "asset_names: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'tracts.gpkg']\n", "full directory:\n", " {'name': 'graph.json', 'type': 2, 'offset': 64, 'len': 6788, 'flags': ['json', 'xz', 'checksum']}\n", - " {'name': 'node_permutation_map.json', 'type': 3, 'offset': 6852, 'len': 10152, 'flags': ['json', 'checksum']}\n", - " {'name': 'metadata.json', 'type': 1, 'offset': 17004, 'len': 79, 'flags': ['json', 'checksum']}\n", - " {'name': 'readme.txt', 'type': 4, 'offset': 17083, 'len': 46, 'flags': ['checksum']}\n", - " {'name': 'params.json', 'type': 4, 'offset': 57400, 'len': 19, 'flags': ['json', 'checksum']}\n" + " {'name': 'node_permutation_map.json', 'type': 3, 'offset': 6852, 'len': 2964, 'flags': ['json', 'xz', 'checksum']}\n", + " {'name': 'metadata.json', 'type': 1, 'offset': 9816, 'len': 201, 'flags': ['json', 'checksum']}\n", + " {'name': 'readme.txt', 'type': 4, 'offset': 10017, 'len': 46, 'flags': ['checksum']}\n", + " {'name': 'params.json', 'type': 4, 'offset': 50774, 'len': 19, 'flags': ['json', 'checksum']}\n", + " {'name': 'tracts.gpkg', 'type': 4, 'offset': 51026, 'len': 30, 'flags': ['checksum']}\n" ] } ], "source": [ - "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", - "print(\"version: \", decoder.version())\n", - "print(\"is_complete: \", decoder.is_complete())\n", - "print(\"assignment_format:\", decoder.assignment_format())\n", - "print(\"sample count: \", len(decoder))\n", - "print(\"asset_names: \", decoder.asset_names())\n", + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", + "print(f\"format version: {decoder.version()}\")\n", + "print(f\"is_complete: {decoder.is_complete()}\")\n", + "print(f\"assignment_format: {decoder.assignment_format()}\")\n", + "print(f\"stream_size: {decoder.stream_size()} bytes\")\n", + "print(f\"sample count: {len(decoder)}\")\n", + "print(f\"asset_names: {decoder.asset_names()}\")\n", "print(\"full directory:\")\n", "for entry in decoder.list_assets():\n", - " print(\" \", entry)" + " print(f\" {entry}\")" + ] + }, + { + "cell_type": "markdown", + "id": "bdf0a1db", + "metadata": {}, + "source": [ + "## Trust, but verify\n", + "\n", + "Notice the `'checksum'` flag on every directory entry above: the writer checksums each\n", + "asset and the embedded stream (CRC32C) as it goes. Reading, iterating, and subsampling do\n", + "**not** re-verify those checksums — a partial read cannot prove a whole-stream checksum,\n", + "and you do not want to pay for full verification on every loop. When integrity actually\n", + "matters (you just downloaded a bundle, or you are about to archive one), ask for it\n", + "explicitly. `verify()` checks every asset checksum and the stream checksum against the raw\n", + "bytes on disk, and raises on the first mismatch:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fcbfcdd8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-12T20:37:46.178061Z", + "iopub.status.busy": "2026-06-12T20:37:46.178007Z", + "iopub.status.idle": "2026-06-12T20:37:46.179354Z", + "shell.execute_reply": "2026-06-12T20:37:46.179168Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Every byte accounted for; assets and stream both check out!\n" + ] + } + ], + "source": [ + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", + "decoder.verify()\n", + "print(\"Every byte accounted for; assets and stream both check out!\")" ] }, { @@ -818,22 +870,21 @@ "source": [ "## Iterating the stream and reconstructing plans\n", "\n", - "A `BendlDecoder` iterates its embedded stream, yielding each assignment as a\n", - "`list[int]`. Combined with `read_graph()`, you can rebuild GerryChain\n", - "`Partition`s straight from the bundle — no separate graph file, no remembered\n", - "node order:" + "A `BendlDecoder` iterates its embedded stream, yielding each assignment as a `list[int]`.\n", + "Combined with `read_graph()`, that is enough to rebuild GerryChain `Partition`s straight\n", + "from the bundle which eliminates the need to remember the correct graph file or node order." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "4686c660", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:24.325095Z", - "iopub.status.busy": "2026-06-06T03:06:24.324485Z", - "iopub.status.idle": "2026-06-06T03:06:33.608232Z", - "shell.execute_reply": "2026-06-06T03:06:33.607233Z" + "iopub.execute_input": "2026-06-12T20:37:46.179899Z", + "iopub.status.busy": "2026-06-12T20:37:46.179848Z", + "iopub.status.idle": "2026-06-12T20:37:49.318050Z", + "shell.execute_reply": "2026-06-12T20:37:49.317755Z" } }, "outputs": [ @@ -842,14 +893,14 @@ "output_type": "stream", "text": [ "reconstructed 1000 partitions from the bundle alone\n", - "first five cut-edge counts: [96, 87, 96, 100, 122]\n" + "first five cut-edge counts: [96, 87, 94, 85, 98]\n" ] } ], "source": [ "import pandas as pd\n", "\n", - "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", "packaged_graph = decoder.read_graph()\n", "order = pd.Index(packaged_graph.nodes) # matches the written assignment order\n", "\n", @@ -863,7 +914,7 @@ " cut_edge_counts.append(len(partition[\"cut_edges\"]))\n", "\n", "print(f\"reconstructed {len(cut_edge_counts)} partitions from the bundle alone\")\n", - "print(\"first five cut-edge counts:\", cut_edge_counts[:5])" + "print(f\"first five cut-edge counts: {cut_edge_counts[:5]}\")" ] }, { @@ -873,10 +924,9 @@ "source": [ "## Subsampling\n", "\n", - "For winnowing a large ensemble you rarely want every plan. `BendlDecoder`\n", - "supports three native subsamplers; each returns the decoder set up to yield\n", - "only the chosen plans, so you still just iterate. **Indices are 1-based** (plan\n", - "1 is the first sample):\n", + "When winnowing a large ensemble you rarely want every plan. `BendlDecoder` has three native\n", + "subsamplers; each returns the decoder set up to yield only the chosen plans, so you still\n", + "just iterate. Indices are 1-based (plan 1 is the first sample):\n", "\n", "- `subsample_indices([...])` — exactly these 1-based indices (sorted, unique),\n", "- `subsample_range(start, end)` — the 1-based *inclusive* range `[start, end]`,\n", @@ -886,14 +936,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "b2879d36", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.609437Z", - "iopub.status.busy": "2026-06-06T03:06:33.609296Z", - "iopub.status.idle": "2026-06-06T03:06:33.619502Z", - "shell.execute_reply": "2026-06-06T03:06:33.619228Z" + "iopub.execute_input": "2026-06-12T20:37:49.319022Z", + "iopub.status.busy": "2026-06-12T20:37:49.318943Z", + "iopub.status.idle": "2026-06-12T20:37:49.360891Z", + "shell.execute_reply": "2026-06-12T20:37:49.360491Z" } }, "outputs": [ @@ -901,35 +951,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "indices [1, 500, 1000] -> [[2, 2, 2, 2], [3, 3, 3, 3], [2, 2, 2, 2]]\n", - "range(100, 104) -> [[0, 0, 0, 0], [0, 0, 0, 0], [3, 3, 3, 3], [3, 3, 3, 3], [3, 3, 3, 3]]\n", + "indices [1, 500, 1000] -> [[2, 2, 2, 2], [1, 1, 1, 1], [1, 1, 1, 1]]\n", + "range(100, 104) -> [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]\n", "every 250th -> 4 plans\n", - "indices again -> [[2, 2, 2, 2], [3, 3, 3, 3], [2, 2, 2, 2]]\n" + "indices again -> [[2, 2, 2, 2], [1, 1, 1, 1], [1, 1, 1, 1]]\n" ] } ], "source": [ - "bundle_file = \"example_data/rich.bendl\"\n", + "bundle_file = \"example_data/mlc_reordered.bendl\"\n", "decoder = BendlDecoder(bundle_file) # one decoder, reused for every subsample below\n", "\n", - "print(\n", - " \"indices [1, 500, 1000] ->\",\n", - " [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])],\n", - ")\n", - "print(\n", - " \"range(100, 104) ->\", # plans 100..104 inclusive = 5 plans\n", - " [assignment[:4] for assignment in decoder.subsample_range(100, 104)],\n", - ")\n", - "print(\n", - " \"every 250th ->\", sum(1 for _ in decoder.subsample_every(250)), \"plans\"\n", - ")\n", + "picked = [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])]\n", + "print(f\"indices [1, 500, 1000] -> {picked}\")\n", + "\n", + "ranged = [assignment[:4] for assignment in decoder.subsample_range(100, 104)]\n", + "print(f\"range(100, 104) -> {ranged}\") # plans 100..104 inclusive = 5 plans\n", + "\n", + "print(f\"every 250th -> {sum(1 for _ in decoder.subsample_every(250))} plans\")\n", "\n", "# The same decoder rewinds and re-selects on each call, so you can run subsamples\n", "# repeatedly without building a new decoder:\n", - "print(\n", - " \"indices again ->\",\n", - " [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])],\n", - ")" + "again = [assignment[:4] for assignment in decoder.subsample_indices([1, 500, 1000])]\n", + "print(f\"indices again -> {again}\")" ] }, { @@ -939,22 +983,22 @@ "source": [ "## Extracting the raw stream\n", "\n", - "Sometimes you want the bare assignment stream back out — e.g. to hand it to the\n", - "plain-stream tools or a different pipeline. `extract_stream` copies the\n", - "embedded stream region verbatim to a standalone `.ben`/`.xben` file, which you\n", - "can then open with the stream-only `BenDecoder`." + "Sometimes you want the bare assignment stream back out, e.g. to hand it to the plain-stream\n", + "tools or a different pipeline. `extract_stream` copies the embedded stream region verbatim\n", + "to a standalone `.ben`/`.xben` file, which the stream-only `BenDecoder` (covered in the\n", + "[streams notebook](using_ben_py.ipynb)) opens directly:" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "e27b2818", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.620112Z", - "iopub.status.busy": "2026-06-06T03:06:33.620051Z", - "iopub.status.idle": "2026-06-06T03:06:33.626399Z", - "shell.execute_reply": "2026-06-06T03:06:33.626105Z" + "iopub.execute_input": "2026-06-12T20:37:49.361801Z", + "iopub.status.busy": "2026-06-12T20:37:49.361714Z", + "iopub.status.idle": "2026-06-12T20:37:49.400897Z", + "shell.execute_reply": "2026-06-12T20:37:49.400582Z" } }, "outputs": [ @@ -969,12 +1013,12 @@ "source": [ "from binary_ensemble import BenDecoder\n", "\n", - "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", "decoder.extract_stream(\"example_data/extracted.ben\", overwrite=True)\n", "\n", "# Open the extracted file with the plain stream decoder (mode matches the bundle).\n", "ben = BenDecoder(\"example_data/extracted.ben\", mode=decoder.assignment_format())\n", - "print(\"extracted stream yields\", sum(1 for _ in ben), \"plans\")" + "print(f\"extracted stream yields {sum(1 for _ in ben)} plans\")" ] }, { @@ -984,22 +1028,25 @@ "source": [ "## Appending analysis back onto the bundle\n", "\n", - "A finished, finalized bundle isn't frozen: `BendlEncoder.append(path)` opens it\n", - "to add more assets later — say, the cut-edge summary we just computed. The\n", - "stream is *not* re-opened (it's already written); each `add_*` commits\n", - "immediately to disk." + "A finished, finalized bundle is not frozen. `BendlEncoder.append(path)` reopens it to add\n", + "more assets later — say, the cut-edge summary we just computed, so the analysis travels\n", + "with the plans it describes. The stream is not re-opened (it is already written); each\n", + "`add_*` commits immediately to disk:\n", + "\n", + "> Note: Assets can also be removed with `remove_asset(name)`. Since the asset name becomes free\n", + "> again, remove-then-add is the way to *replace* an asset (e.g. to update `metadata.json`)." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "8e9437f5", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.627298Z", - "iopub.status.busy": "2026-06-06T03:06:33.627187Z", - "iopub.status.idle": "2026-06-06T03:06:33.629095Z", - "shell.execute_reply": "2026-06-06T03:06:33.628894Z" + "iopub.execute_input": "2026-06-12T20:37:49.401695Z", + "iopub.status.busy": "2026-06-12T20:37:49.401618Z", + "iopub.status.idle": "2026-06-12T20:37:49.403893Z", + "shell.execute_reply": "2026-06-12T20:37:49.403625Z" } }, "outputs": [ @@ -1007,28 +1054,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "assets after append: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'cut_edge_summary.json']\n", - "appended summary: {'mean': 130.707, 'min': 87, 'max': 186}\n" + "assets after append: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'tracts.gpkg', 'cut_edge_summary.json']\n", + "appended summary: {'mean': 132.448, 'min': 85, 'max': 191}\n" ] } ], "source": [ - "appender = BendlEncoder.append(\"example_data/rich.bendl\")\n", + "appender = BendlEncoder.append(\"example_data/mlc_reordered.bendl\")\n", "appender.add_asset(\n", " \"cut_edge_summary.json\",\n", - " json.dumps(\n", - " {\n", - " \"mean\": sum(cut_edge_counts) / len(cut_edge_counts),\n", - " \"min\": min(cut_edge_counts),\n", - " \"max\": max(cut_edge_counts),\n", - " }\n", - " ),\n", + " {\n", + " \"mean\": sum(cut_edge_counts) / len(cut_edge_counts),\n", + " \"min\": min(cut_edge_counts),\n", + " \"max\": max(cut_edge_counts),\n", + " },\n", " \"json\",\n", ")\n", "\n", - "decoder = BendlDecoder(\"example_data/rich.bendl\")\n", - "print(\"assets after append:\", decoder.asset_names())\n", - "print(\"appended summary:\", decoder.read_json_asset(\"cut_edge_summary.json\"))" + "decoder = BendlDecoder(\"example_data/mlc_reordered.bendl\")\n", + "print(f\"assets after append: {decoder.asset_names()}\")\n", + "print(f\"appended summary: {decoder.read_json_asset('cut_edge_summary.json')}\")" ] }, { @@ -1038,24 +1083,22 @@ "source": [ "## Assets-only bundles (no stream)\n", "\n", - "You don't have to write a stream at all. This is the one case where you finalize\n", - "the bundle yourself — since there's no `stream()` context to do it — with an\n", - "explicit `close()` (or by using the encoder as a context manager). The result is\n", - "a valid **assets-only** bundle, useful for shipping a graph + metadata package\n", - "on its own. It decodes to an empty iteration with `len == 0` (no spurious\n", - "\"missing stream\" error)." + "Technically, the BENDL format does not require an ensemble stream at all, and it is possible to make \n", + "an assets-only bundle. This is useful for shipping a graph + metadata package on its own and the\n", + "\"stream\" section will still decode to an empty iteration with `len == 0`, not a spurious \n", + "\"missing stream\" error:" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "cf9903e8", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.629606Z", - "iopub.status.busy": "2026-06-06T03:06:33.629548Z", - "iopub.status.idle": "2026-06-06T03:06:33.664827Z", - "shell.execute_reply": "2026-06-06T03:06:33.664354Z" + "iopub.execute_input": "2026-06-12T20:37:49.404495Z", + "iopub.status.busy": "2026-06-12T20:37:49.404434Z", + "iopub.status.idle": "2026-06-12T20:37:49.432034Z", + "shell.execute_reply": "2026-06-12T20:37:49.431782Z" } }, "outputs": [ @@ -1075,12 +1118,8 @@ "\n", "decoder = BendlDecoder(\"example_data/assets_only.bendl\")\n", "print(\n", - " \"assets-only: is_complete =\",\n", - " decoder.is_complete(),\n", - " \"| len =\",\n", - " len(decoder),\n", - " \"| assets =\",\n", - " decoder.asset_names(),\n", + " f\"assets-only: is_complete = {decoder.is_complete()} \"\n", + " f\"| len = {len(decoder)} | assets = {decoder.asset_names()}\"\n", ")" ] }, @@ -1091,25 +1130,26 @@ "source": [ "## Recompressing to XBEN for archival\n", "\n", - "BEN is fast to write and good for active work. For long-term storage, XBEN\n", - "squeezes much harder (at a real CPU/time cost). `compress_stream` repackages a\n", - "bundle's BEN stream as XBEN, **preserving every asset** (graph, metadata,\n", - "permutation map, custom blobs). Choose exactly one of:\n", - "\n", - "- `in_place=True` — recompress to a temp file and atomically swap it in, or\n", - "- `out_file=...` — write a new bundle and leave the original untouched." + "BEN is the best working format for an active project, but when the project\n", + "wraps up, XBEN is a better compression mechanism. We don't use it as a working format since the\n", + "the CPU overhead is significant, but the savings are substantial enough to be worth the cost of \n", + "encoding once (fortunately, decoding is very fast from XBEN). `compress_stream` repackages a \n", + "bundle's BEN stream as XBEN, preserving every asset: graph, metadata, permutation map, custom \n", + "blobs. Pass `out_file=...` to write a new bundle and leave the original untouched \n", + "(`overwrite=True` replaces an existing copy); with no `out_file` the bundle is recompressed in \n", + "place, atomically." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "e79a35c8", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.666037Z", - "iopub.status.busy": "2026-06-06T03:06:33.665888Z", - "iopub.status.idle": "2026-06-06T03:06:33.711857Z", - "shell.execute_reply": "2026-06-06T03:06:33.711295Z" + "iopub.execute_input": "2026-06-12T20:37:49.432805Z", + "iopub.status.busy": "2026-06-12T20:37:49.432735Z", + "iopub.status.idle": "2026-06-12T20:37:49.465140Z", + "shell.execute_reply": "2026-06-12T20:37:49.464865Z" } }, "outputs": [ @@ -1118,8 +1158,8 @@ "output_type": "stream", "text": [ "recompressed format: xben\n", - "assets preserved: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'cut_edge_summary.json']\n", - "metadata preserved: {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234}\n", + "assets preserved: ['graph.json', 'node_permutation_map.json', 'metadata.json', 'readme.txt', 'params.json', 'tracts.gpkg', 'cut_edge_summary.json']\n", + "metadata preserved: {'generator': 'gerrychain', 'proposal': 'recom', 'epsilon': 0.05, 'seed': 1234, 'created_by': 'me', 'created_at': '1970-01-01T00:00:00', 'description': 'ReCom ensemble on a 32x32 grid, MLC-reordered.'}\n", "plans unchanged: 1000 == 1000\n" ] }, @@ -1127,27 +1167,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/claude-1000/ipykernel_3730522/3018985229.py:6: UserWarning: XBEN may take a second to start decoding.\n", - " xben_decoder = BendlDecoder(\"example_data/rich-archive.bendl\")\n" + "/tmp/claude-1000/ipykernel_1800399/2142953642.py:9: UserWarning: XBEN may take a second to start decoding.\n", + " xben_decoder = BendlDecoder(\"example_data/archive.bendl\")\n" ] } ], "source": [ - "# Write a fresh XBEN copy, original preserved. (out_file won't overwrite an\n", - "# existing file, so clear any copy from a previous run first.)\n", - "Path(\"example_data/rich-archive.bendl\").unlink(missing_ok=True)\n", - "compress_stream(\"example_data/rich.bendl\", out_file=\"example_data/rich-archive.bendl\")\n", - "\n", - "xben_decoder = BendlDecoder(\"example_data/rich-archive.bendl\")\n", - "print(\"recompressed format:\", xben_decoder.assignment_format())\n", - "print(\"assets preserved: \", xben_decoder.asset_names())\n", - "print(\"metadata preserved: \", xben_decoder.read_metadata())\n", - "print(\n", - " \"plans unchanged: \",\n", - " len(xben_decoder),\n", - " \"==\",\n", - " len(BendlDecoder(\"example_data/rich.bendl\")),\n", - ")" + "# Write a fresh XBEN copy, original preserved. overwrite=True replaces any copy\n", + "# left behind by a previous run of this notebook.\n", + "compress_stream(\n", + " \"example_data/mlc_reordered.bendl\",\n", + " out_file=\"example_data/archive.bendl\",\n", + " overwrite=True,\n", + ")\n", + "\n", + "xben_decoder = BendlDecoder(\"example_data/archive.bendl\")\n", + "n_plans_before = len(BendlDecoder(\"example_data/mlc_reordered.bendl\"))\n", + "print(f\"recompressed format: {xben_decoder.assignment_format()}\")\n", + "print(f\"assets preserved: {xben_decoder.asset_names()}\")\n", + "print(f\"metadata preserved: {xben_decoder.read_metadata()}\")\n", + "print(f\"plans unchanged: {len(xben_decoder)} == {n_plans_before}\")" ] }, { @@ -1155,9 +1194,9 @@ "id": "0a8025fd", "metadata": {}, "source": [ - "(Passing both `in_place=True` and `out_file=`, or neither, raises — the choice\n", - "is exclusive. Note `.bendl` bundles with XBEN streams emit a one-time startup warning on decode,\n", - "since opening them does real decompression work.)" + "> Note: Bundles with XBEN streams emit a one-time startup warning on decode, since opening\n", + "> them does real decompression work. The in-place mode writes to a temp file and swaps it\n", + "> over the original only on success, so an interrupted run cannot corrupt the bundle." ] }, { @@ -1167,23 +1206,23 @@ "source": [ "## Lifecycle and failure semantics\n", "\n", - "A subtle but important guarantee: if an exception escapes the `stream()`\n", - "context — say the chain or your write logic throws partway through — the bundle\n", - "is left **unfinalized** rather than stamped complete over a half-written\n", - "stream. You can detect this (`is_complete()` is `False`) and still recover what\n", - "was written via `extract_stream(..., allow_unfinalized=True)`." + "One last guarantee, subtle but important: if an exception escapes the `stream()` context (e.g.\n", + "the chain throws, the write logic has a bug, the process dies partway, etc.) the bundle is left\n", + "unfinalized rather than stamped complete over a half-written stream. You can detect this\n", + "(`is_complete()` is `False`) and still recover everything that was written via\n", + "`extract_stream(..., allow_unfinalized=True)`:" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "6fc232f9", "metadata": { "execution": { - "iopub.execute_input": "2026-06-06T03:06:33.713277Z", - "iopub.status.busy": "2026-06-06T03:06:33.713121Z", - "iopub.status.idle": "2026-06-06T03:06:34.663699Z", - "shell.execute_reply": "2026-06-06T03:06:34.663011Z" + "iopub.execute_input": "2026-06-12T20:37:49.465810Z", + "iopub.status.busy": "2026-06-12T20:37:49.465741Z", + "iopub.status.idle": "2026-06-12T20:37:50.255405Z", + "shell.execute_reply": "2026-06-12T20:37:50.255079Z" } }, "outputs": [ @@ -1203,22 +1242,20 @@ "gc_graph = Graph.from_networkx(stored_graph)\n", "write_order = list(gc_graph.nodes)\n", "try:\n", - " with encoder.stream(\"ben\") as stream:\n", + " with encoder.stream() as stream:\n", " for i, partition in enumerate(make_chain(gc_graph, steps=1000)):\n", " if i == 50:\n", " raise RuntimeError(\"simulated crash mid-stream\")\n", " series = partition.assignment.to_series()\n", " stream.write(series.loc[write_order].astype(int).tolist())\n", "except RuntimeError as e:\n", - " print(\"caught:\", e)\n", + " print(f\"caught: {e}\")\n", "\n", "decoder = BendlDecoder(\"example_data/partial.bendl\")\n", - "print(\"is_complete:\", decoder.is_complete(), \"(left unfinalized, as intended)\")\n", - "decoder.extract_stream(\n", - " \"example_data/partial.ben\", overwrite=True, allow_unfinalized=True\n", - ")\n", + "print(f\"is_complete: {decoder.is_complete()} (left unfinalized, as intended)\")\n", + "decoder.extract_stream(\"example_data/partial.ben\", overwrite=True, allow_unfinalized=True)\n", "recovered = sum(1 for _ in BenDecoder(\"example_data/partial.ben\", mode=\"ben\"))\n", - "print(\"recovered\", recovered, \"plans written before the crash\")" + "print(f\"recovered {recovered} plans written before the crash\")" ] }, { @@ -1228,33 +1265,35 @@ "source": [ "## Recap — when to reach for what\n", "\n", - "- **`BendlEncoder` / `BendlDecoder`** are your default for storing an ensemble:\n", - " one self-describing file, graph + metadata included, encoded live as the\n", - " chain runs. You only ever need a `with` block around the `stream()` writer —\n", - " closing it finalizes the bundle (use `close()` for an assets-only bundle).\n", + "- **`BendlEncoder` / `BendlDecoder`** are the default for storing an ensemble: one\n", + " self-describing file, graph + metadata included, encoded live as the chain runs.\n", + " Only the `stream()` writer needs a `with` block — closing it finalizes the bundle\n", + " (use `close()` for an assets-only bundle).\n", "- **`add_graph(graph)`** before the stream (MLC-reordered by default; pass\n", - " `sort=\"rcm\"`, `sort=\"key\", key=\"GEOID\"`, or `sort=None` for raw), then build\n", - " the chain on the returned graph — you get a compression win *and* a write order\n", - " that already matches the stored graph.\n", - "- **`relabel_bundle`** to reorder an existing `.bendl` bundle with a BEN stream and rewrite its stream\n", - " to match (in place or to a new file) — e.g. to optimize a bundle you received\n", - " raw, before archiving it.\n", - "- **`binary_ensemble.graph.reorder*`** when you want the reordering standalone\n", - " (e.g. to reuse an ordering across several bundles).\n", - "- **`add_metadata` / `add_asset`** to stamp provenance and ship analysis\n", - " alongside the plans; **`append`** to add results to a finished bundle.\n", - "- **`compress_stream`** to graduate an active `.bendl` bundle from an embedded BEN stream to an embedded XBEN\n", - " stream without losing any asset.\n", - "- Drop to the plain **`binary_ensemble.stream`** API (via `extract_stream`)\n", - " only when you specifically need the bare stream and are tracking the graph\n", - " and node order yourself.\n", - "print(\"done — see the example_data/ folder for the bundles this tutorial wrote\")" + " `sort=\"rcm\"`, `sort=\"key\", key=\"GEOID\"`, or `sort=None` for raw), then build the\n", + " chain on the returned graph — a compression win *and* a write order that already\n", + " matches the stored graph.\n", + "- **`add_metadata` / `add_asset`** to stamp provenance and ship anything else\n", + " alongside the plans (`\"json\"` takes dicts directly, `\"binary\"` takes raw bytes,\n", + " `\"file\"` reads a path off disk); **`append`** to add results to a finished bundle,\n", + " and **`remove_asset`** to drop one (remove-then-add replaces an asset).\n", + "- **`verify()`** when integrity matters: after a download, before an archive.\n", + "- **`relabel_bundle`** to reorder an existing BEN-stream bundle and rewrite its\n", + " stream to match (in place or to a new file), e.g. to optimize a bundle you\n", + " received raw before archiving it.\n", + "- **`binary_ensemble.graph.reorder*`** when you want the reordering standalone,\n", + " e.g. to reuse one ordering across several bundles.\n", + "- **`compress_stream`** to graduate an active bundle from an embedded BEN stream to\n", + " an embedded XBEN stream without losing any asset.\n", + "- The plain **stream layer** (via `extract_stream` and the\n", + " [streams notebook](using_ben_py.ipynb)) is there for when you specifically need\n", + " the bare stream and are tracking the graph and node order yourself." ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, diff --git a/docs/ben-format-spec.md b/docs/ben-format-spec.md index bf5e4ab..af8b3f5 100644 --- a/docs/ben-format-spec.md +++ b/docs/ben-format-spec.md @@ -11,7 +11,7 @@ The **TwoDelta** variant is a delta encoding with a different frame shape and a layout. It is out of scope here and specified separately. This specification covers the `.ben` container and the BEN32 body carried inside a `.xben` -container. It does not cover the `.bendl` bundle container, which embeds a BEN/XBEN stream as an +container. It does not cover the `.bendl` file container, which embeds a BEN/XBEN stream as an opaque payload; see the BENDL format specification for that. ## Design Goals diff --git a/docs/bendl-format-spec.md b/docs/bendl-format-spec.md index 91d0dcd..96a1085 100644 --- a/docs/bendl-format-spec.md +++ b/docs/bendl-format-spec.md @@ -205,6 +205,14 @@ Each asset payload is raw bytes referenced by the directory table. The bundle do per-asset wrapper headers in the payload region because offsets and lengths are already described by the directory entries. +The directory is the sole authority on which payloads exist. A bundle MAY contain byte ranges that +no directory entry (and no header field) references — for example, the payload left behind when a +writer removes an asset by rewriting the directory without its entry, or a superseded directory +left behind by an append. Readers MUST locate payloads solely via directory offsets and MUST NOT +assume payloads are contiguous. Whole-bundle rewrites (a compaction or a recompression) reclaim +unreferenced ranges; the user-facing removal paths (the `bendl remove` CLI command and the Python +facade) compact automatically after removing. + Examples of assets: - graph file diff --git a/docs/coding-standards.md b/docs/coding-standards.md index bc1788c..9bcc0ef 100644 --- a/docs/coding-standards.md +++ b/docs/coding-standards.md @@ -238,6 +238,20 @@ ______________________________________________________________________ - **Ship typing metadata:** the package includes `py.typed` and a `_core.pyi` stub; keep the stub in sync with the exported surface. Python users import re-exported names from the `binary_ensemble` package, not from `_core` directly. +- **Type the surface precisely, with shared aliases.** Public payload shapes live in + `binary_ensemble.types` (`GraphInput`, `StrPath`, `Variant`, `SortMethod`, the asset-payload + unions, and the `NodePermutationMap` / `AssetEntry` TypedDicts) and are used by the facades and + every `.pyi` stub — no `Any` where the accepted shapes are known. Use modern hints (`X | None`, + builtin generics, `collections.abc`); the floor is Python 3.11. Type checking is two-stage — + `ty` then `pyright` (`task typecheck-python`, part of `task lint`) — and + `tests/typing_assertions.py` pins the surface from the consumer side: `assert_type` for + positives, bare `# type: ignore` for calls that must NOT type-check (kept honest by pyright's + `reportUnnecessaryTypeIgnoreComment`). +- **Python-visible docstrings document every argument** (facade `.py` files and the Rust `///` + docs alike, Google style): each `Args:` entry carries its type in parentheses — the shared alias + name where one exists, e.g. ``graph (GraphInput):`` — with custom-type shapes spelled out in the + description. Defaulted parameters are marked ``(, optional)`` and state the default as + "Default is ``X``." — or, when ``None`` is meaningful, "Default is ``None`` which ⟨meaning⟩." ______________________________________________________________________ diff --git a/docs/twodelta-format-spec.md b/docs/twodelta-format-spec.md index 749db70..b8cc62d 100644 --- a/docs/twodelta-format-spec.md +++ b/docs/twodelta-format-spec.md @@ -13,7 +13,7 @@ specification and are only summarized here. TwoDelta differs in that most frames against the previous sample rather than independent snapshots, which gives it a different frame layout and a different `.xben` body. -The `.bendl` bundle container embeds a BEN/XBEN stream as an opaque payload and is unaffected by the +The `.bendl` file container embeds a BEN/XBEN stream as an opaque payload and is unaffected by the variant; see the BENDL format specification. ## Design Goals @@ -277,5 +277,5 @@ existing fixtures are never regenerated in place. See the format-stability polic ## Out of Scope - The Standard and MkvChain variants (independent snapshot frames; BEN32 `.xben` body). -- The `.bendl` bundle container that embeds a BEN/XBEN stream as an opaque payload. +- The `.bendl` file container that embeds a BEN/XBEN stream as an opaque payload. - LZMA2 framing details; XBEN treats LZMA2 as an opaque outer wrapper around the columnar body. From f26b2b19b2e5256ea6f8920f3d66a3b116ccab30 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:29:36 -0600 Subject: [PATCH 170/221] naming fixes --- ben-py/binary_ensemble/bundle.py | 8 +++--- ben-py/docs/concepts/cli-parity.md | 2 +- ben-py/src/compact.rs | 15 ++++++----- ben-py/tests/test_compact.py | 42 +++++++++++++++++++++++++++--- 4 files changed, 54 insertions(+), 13 deletions(-) diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 1293f45..ceb3248 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -336,9 +336,11 @@ def remove_asset(self, name: str) -> None: Removing appended (post-stream) assets is cheap at any scale: the compaction rebuilds only the small post-stream tail and never touches the assignment stream, even when the stream is tens of gigabytes. Removing a *pre-stream* asset (the graph, or metadata - added before streaming) costs one whole-file rewrite instead. For the rare bundle that - arrives with dead space from elsewhere (every public write path here leaves bundles - compact), the raw ``_core.compact_bundle_in_place`` reclaims it directly, and the raw + added before streaming) costs one whole-file rewrite instead. Note that each + immediate-commit ``add_asset`` (append mode, or create mode after the stream) leaves + the superseded directory behind as a few dead bytes; the compaction here reclaims + those too. For a bundle that arrives with dead space from other tooling, the raw + ``_core.compact_bundle_in_place`` reclaims it directly, and the raw ``_core.BendlEncoder.remove_asset`` drops only the directory entry if you specifically need that form. diff --git a/ben-py/docs/concepts/cli-parity.md b/ben-py/docs/concepts/cli-parity.md index 047773b..8a090a2 100644 --- a/ben-py/docs/concepts/cli-parity.md +++ b/ben-py/docs/concepts/cli-parity.md @@ -18,7 +18,7 @@ the CLI split so workflows can move between notebooks, scripts, and shell pipeli | Extract a bundle stream | `BendlDecoder(...).extract_stream(...)` | Copies embedded BEN/XBEN stream bytes | | Append bundle assets | `BendlEncoder.append(...)` | Asset appends only; no stream appends | | Remove bundle assets | `BendlEncoder.remove_asset(...)` | Compacts automatically, like `bendl remove` | -| Compact a bundle | automatic | Every Python write path keeps bundles compact; `bendl compact` exists for files from other tools | +| Compact a bundle | automatic | `remove_asset`, `compress_stream`, and `relabel_bundle` emit compact bundles (appends leave only a superseded directory behind, a few bytes each); `bendl compact` reclaims dead space in files from other tools | | Relabel/reorder a bundle | `relabel_bundle(...)` | Requires BEN stream plus graph | | Recompress bundle stream | `compress_stream(...)` | Embedded BEN stream to embedded XBEN stream | | Reorder a graph | `binary_ensemble.graph.reorder(...)` | Same orderings as bundle relabeling | diff --git a/ben-py/src/compact.rs b/ben-py/src/compact.rs index c00fa64..9b52902 100644 --- a/ben-py/src/compact.rs +++ b/ben-py/src/compact.rs @@ -27,9 +27,11 @@ fn map_bundle_err(err: BendlWriteError) -> PyErr { /// Rewrite the bundle at `in_file` without unreferenced byte ranges, writing the result to /// `out_file`. /// -/// Raw surface for the rare bundle that arrives with dead space from other tooling — every -/// facade write path (``remove_asset``, ``compress_stream``, ``relabel_bundle``) keeps bundles -/// compact automatically. See also :func:`compact_bundle_in_place`. +/// Raw surface for a bundle carrying dead space: one that arrives from other tooling, or one +/// grown by appends (each immediate-commit add supersedes the previous directory, leaving a +/// few dead bytes). The facade transforms (``remove_asset``, ``compress_stream``, +/// ``relabel_bundle``) emit compact bundles themselves. See also +/// :func:`compact_bundle_in_place`. /// /// Args: /// in_file (StrPath): Path to the source ``.bendl`` bundle (``str`` or ``os.PathLike``). @@ -73,9 +75,10 @@ pub fn compact_bundle(in_file: PathBuf, out_file: PathBuf, overwrite: bool) -> P /// rewritten wholesale through a temp file (stream checksum-verified during the copy) and /// atomically swapped over `path`. /// -/// Raw surface, also used by :meth:`binary_ensemble.bundle.BendlEncoder.remove_asset` — every -/// facade write path keeps bundles compact automatically, so this is only needed for bundles -/// that arrive with dead space from other tooling. +/// Raw surface, also used by :meth:`binary_ensemble.bundle.BendlEncoder.remove_asset`. The +/// facade transforms emit compact bundles themselves, so calling this directly is only needed +/// for a bundle that arrived with dead space from other tooling or accumulated superseded +/// directories from appends. /// /// Args: /// path (StrPath): Path to the ``.bendl`` bundle to compact (``str`` or ``os.PathLike``). diff --git a/ben-py/tests/test_compact.py b/ben-py/tests/test_compact.py index 89eb5b2..27c9473 100644 --- a/ben-py/tests/test_compact.py +++ b/ben-py/tests/test_compact.py @@ -4,9 +4,10 @@ same metadata, same wire format — just no unreferenced byte ranges (left behind by directory-only removals and superseded directories). These tests pin both halves: the space actually comes back, and nothing else changes. The public facade has no standalone compact — -every public write path (``remove_asset``, ``compress_stream``, ``relabel_bundle``) keeps -bundles compact automatically — so the machinery is exercised through ``_core``, which also -reports which strategy ran (``"none"`` / ``"tail"`` / ``"full"``). +the facade transforms (``remove_asset``, ``compress_stream``, ``relabel_bundle``) emit +compact bundles themselves, while appends leave a small superseded directory behind — so the +machinery is exercised through ``_core``, which also reports which strategy ran +(``"none"`` / ``"tail"`` / ``"full"``). """ from __future__ import annotations @@ -248,3 +249,38 @@ def test_compact_refuses_corrupt_asset(tmp_path: Path) -> None: _core.compact_bundle_in_place(path) with pytest.raises(Exception): BendlDecoder(path).verify() + + +def test_public_append_leaves_a_superseded_directory(tmp_path: Path) -> None: + """Pins the dead-space story the docs tell: an immediate-commit ``add_asset`` supersedes + the previous directory (a few dead bytes, reported as ``"tail"``-reclaimable), while the + facade transforms emit compact bundles themselves.""" + path = tmp_path / "appended.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream() as s: + s.write([1, 2, 3]) + enc.add_asset("notes.txt", "hello", content_type="text") # commits immediately + + assert _core.compact_bundle_in_place(path) == "tail" + assert _core.compact_bundle_in_place(path) == "none" + + dec = BendlDecoder(path) + assert dec.read_asset_bytes("notes.txt") == b"hello" + assert list(dec) == [[1, 2, 3]] + dec.verify() + + +def test_facade_remove_asset_leaves_bundle_fully_compact(tmp_path: Path) -> None: + """The facade's remove_asset compacts in place: a follow-up compaction finds nothing.""" + path = tmp_path / "removed.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream() as s: + s.write([1, 2, 3]) + enc.add_asset("a.txt", "a", content_type="text") + enc.add_asset("b.txt", "b", content_type="text") + enc.remove_asset("a.txt") + + assert _core.compact_bundle_in_place(path) == "none" + dec = BendlDecoder(path) + assert dec.asset_names() == ["b.txt"] + dec.verify() From 44dbe9b795ff34f3e867f8baf458c788102e33bd Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:46:33 -0600 Subject: [PATCH 171/221] make sure payload lengths do not overrun file size --- ben/src/io/bundle/compact.rs | 22 +++++++++++--- ben/src/io/bundle/reader.rs | 45 ++++++++++++++++++++++++++++ ben/src/io/bundle/tests/compact.rs | 47 ++++++++++++++++++++++++++++++ ben/src/io/bundle/tests/mod.rs | 1 + ben/src/io/bundle/tests/reader.rs | 34 +++++++++++++++++++-- ben/src/io/bundle/tests/writer.rs | 20 ++++++++++++- ben/src/io/bundle/writer.rs | 4 +++ 7 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 ben/src/io/bundle/tests/compact.rs diff --git a/ben/src/io/bundle/compact.rs b/ben/src/io/bundle/compact.rs index aa84548..be3b3c7 100644 --- a/ben/src/io/bundle/compact.rs +++ b/ben/src/io/bundle/compact.rs @@ -30,7 +30,7 @@ use super::format::{ ASSET_FLAG_JSON, ASSET_TYPE_GRAPH, ASSET_TYPE_METADATA, ASSET_TYPE_NODE_PERMUTATION_MAP, HEADER_SIZE, }; -use super::reader::BendlReader; +use super::reader::{validate_entry_extents, BendlReader}; use super::writer::{AddAssetOptions, BendlWriteError, BendlWriter}; /// Which compaction strategy [`compact_bundle_in_place`] ended up using. @@ -186,7 +186,8 @@ fn plan_tail( return Ok(None); } - // Read survivors' raw on-disk bytes and lay them out from the stream end. + // Read survivors' raw on-disk bytes and lay them out from the stream end. The allocation is + // bounded: open-time extent validation guarantees every payload range lies within the file. post.sort_by_key(|e| e.payload_offset); let mut block = Vec::new(); let mut new_entries: Vec = pre.iter().map(|e| (*e).clone()).collect(); @@ -199,7 +200,9 @@ fn plan_tail( let mut moved = (*entry).clone(); moved.payload_offset = offset; new_entries.push(moved); - offset += entry.payload_len; + offset = offset + .checked_add(entry.payload_len) + .ok_or_else(|| io::Error::other("payload range overflowed"))?; } let directory_offset = offset; let directory_bytes = encode_directory(&new_entries)?; @@ -210,7 +213,9 @@ fn plan_tail( block, directory_offset, directory_len, - file_len: directory_offset + directory_len, + file_len: directory_offset + .checked_add(directory_len) + .ok_or_else(|| io::Error::other("directory range overflowed"))?, })) } @@ -274,10 +279,19 @@ fn execute_tail( pub fn compact_bundle_in_place(path: &Path) -> Result { // Parse and validate through the reader so malformed bundles are rejected up front. let file = File::open(path)?; + let file_len = file.metadata()?.len(); let reader = BendlReader::open(BufReader::new(file)).map_err(BendlWriteError::Format)?; if !reader.is_finalized() { return Err(BendlWriteError::BundleIncomplete); } + // Compaction trusts the directory's lengths to size allocations and the rewritten layout + // (reader opens stay lenient so truncated bundles remain inspectable), so out-of-range + // extents must be rejected before any planning. + validate_entry_extents(reader.assets(), file_len).map_err(|e| { + BendlWriteError::Format(super::format::BendlFormatError::MalformedDirectory( + e.to_string(), + )) + })?; let mut header = *reader.header(); let entries: Vec = reader.assets().to_vec(); drop(reader); diff --git a/ben/src/io/bundle/reader.rs b/ben/src/io/bundle/reader.rs index 2d78f9b..ee66733 100644 --- a/ben/src/io/bundle/reader.rs +++ b/ben/src/io/bundle/reader.rs @@ -517,6 +517,35 @@ fn classify_read_error(err: io::Error, entry: &BendlDirectoryEntry) -> BendlRead } } +/// Validate that every entry's payload range lies within the backing file. +/// +/// Read paths stay lenient at open — a truncated bundle remains inspectable, and every byte +/// access surfaces a strict-EOF error at touch — but paths that *trust* the declared lengths +/// (the appender, which carries entries into a rewritten directory, and in-place compaction, +/// which sizes allocations and the new layout from them) must reject out-of-range extents up +/// front, so a corrupt or adversarial length surfaces as an error instead of an oversized +/// reservation or a garbage layout. +pub(crate) fn validate_entry_extents( + directory: &[BendlDirectoryEntry], + file_len: u64, +) -> Result<(), BundleValidationError> { + for entry in directory { + let in_bounds = entry + .payload_offset + .checked_add(entry.payload_len) + .is_some_and(|end| end <= file_len); + if !in_bounds { + return Err(BundleValidationError::PayloadOutOfBounds { + name: entry.name.clone(), + payload_offset: entry.payload_offset, + payload_len: entry.payload_len, + file_len, + }); + } + } + Ok(()) +} + pub(crate) fn validate_directory_entries( directory: &[BendlDirectoryEntry], ) -> Result<(), BundleValidationError> { @@ -556,4 +585,20 @@ pub enum BundleValidationError { /// The name that was actually written. found: String, }, + + /// An entry's payload range extends beyond the end of the file. + #[error( + "asset {name:?} declares payload bytes {payload_offset}..{payload_offset}+{payload_len} \ + beyond the file end ({file_len} bytes)" + )] + PayloadOutOfBounds { + /// The asset whose payload range is out of bounds. + name: String, + /// The payload offset declared in the directory. + payload_offset: u64, + /// The payload length declared in the directory. + payload_len: u64, + /// The actual length of the backing file. + file_len: u64, + }, } diff --git a/ben/src/io/bundle/tests/compact.rs b/ben/src/io/bundle/tests/compact.rs new file mode 100644 index 0000000..5431631 --- /dev/null +++ b/ben/src/io/bundle/tests/compact.rs @@ -0,0 +1,47 @@ +use std::fs; +use std::path::PathBuf; + +use super::writer::build_base_bundle; +use crate::io::bundle::compact::compact_bundle_in_place; + +/// A temp file that removes itself on drop, so failed assertions don't leak files. +struct TempBundle(PathBuf); + +impl Drop for TempBundle { + fn drop(&mut self) { + let _ = fs::remove_file(&self.0); + } +} + +fn temp_bundle(bytes: &[u8], tag: &str) -> TempBundle { + let path = std::env::temp_dir().join(format!( + "bendl-compact-test-{}-{tag}.bendl", + std::process::id() + )); + fs::write(&path, bytes).unwrap(); + TempBundle(path) +} + +#[test] +fn in_place_compaction_rejects_out_of_bounds_payload_len_without_panicking() { + // A corrupt directory length used to reach the tail planner and size an allocation from the + // untrusted value (an abort on adversarial input); open-time extent validation must surface + // it as an error and leave the file untouched. + let (mut bytes, _) = build_base_bundle(); + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let payload_len_offset = directory_offset + 4 + 16; + bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&(1u64 << 48).to_le_bytes()); + + let tmp = temp_bundle(&bytes, "oob-payload"); + let before = fs::read(&tmp.0).unwrap(); + let err = compact_bundle_in_place(&tmp.0).unwrap_err(); + assert!( + err.to_string().contains("beyond the file end"), + "unexpected error: {err}" + ); + assert_eq!( + fs::read(&tmp.0).unwrap(), + before, + "failed compaction must leave the file untouched" + ); +} diff --git a/ben/src/io/bundle/tests/mod.rs b/ben/src/io/bundle/tests/mod.rs index 6d7d6ee..9ce437b 100644 --- a/ben/src/io/bundle/tests/mod.rs +++ b/ben/src/io/bundle/tests/mod.rs @@ -1,3 +1,4 @@ +mod compact; mod format; mod manifest; mod reader; diff --git a/ben/src/io/bundle/tests/reader.rs b/ben/src/io/bundle/tests/reader.rs index 5b99c1e..c4075c0 100644 --- a/ben/src/io/bundle/tests/reader.rs +++ b/ben/src/io/bundle/tests/reader.rs @@ -9,7 +9,9 @@ use crate::io::bundle::format::{ BENDL_MINOR_VERSION, FINALIZED_NO, FINALIZED_YES, HEADER_FLAG_STREAM_CHECKSUM, HEADER_SIZE, MAX_DIRECTORY_ENTRIES, }; -use crate::io::bundle::reader::{validate_directory_entries, BendlReader, BundleValidationError}; +use crate::io::bundle::reader::{ + validate_directory_entries, validate_entry_extents, BendlReader, BundleValidationError, +}; /// Stamp a valid CRC32C and `ASSET_FLAG_CHECKSUM` onto a hand-built directory entry whose on-disk /// payload bytes are `payload`. Use this in test fixtures so the entry round-trips through the @@ -426,7 +428,9 @@ fn asset_bytes_errors_with_unexpected_eof_when_payload_len_runs_past_eof() { // Strict-EOF contract: a directory entry whose payload_len claims more bytes than the backing // file provides must surface as BendlReadError::Io wrapping io::ErrorKind::UnexpectedEof. // Returning a short successful read on a corrupt bundle is exactly the silent-corruption - // failure mode this contract exists to prevent. + // failure mode this contract exists to prevent. (Open itself stays lenient so a truncated + // bundle remains inspectable; paths that trust the lengths — append, in-place compaction — + // reject via validate_entry_extents instead.) let mut bytes = build_basic_finalized_bundle(); let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; let entry_start = directory_offset + 4; @@ -442,6 +446,32 @@ fn asset_bytes_errors_with_unexpected_eof_when_payload_len_runs_past_eof() { } } +#[test] +fn validate_entry_extents_accepts_eof_boundary_and_rejects_past_it() { + let entry = |offset: u64, len: u64| BendlDirectoryEntry { + asset_type: ASSET_TYPE_CUSTOM, + asset_flags: 0, + name: "a.bin".to_string(), + payload_offset: offset, + payload_len: len, + checksum: None, + }; + // A range ending exactly at EOF is in bounds. + validate_entry_extents(&[entry(64, 36)], 100).expect("range ending at EOF is in bounds"); + // One byte past EOF is rejected. + let err = validate_entry_extents(&[entry(64, 37)], 100).unwrap_err(); + assert!(matches!( + err, + BundleValidationError::PayloadOutOfBounds { .. } + )); + // offset + len overflowing u64 is rejected, not wrapped. + let err = validate_entry_extents(&[entry(u64::MAX, 2)], 100).unwrap_err(); + assert!(matches!( + err, + BundleValidationError::PayloadOutOfBounds { .. } + )); +} + #[test] fn incomplete_bundle_sample_count_is_none_even_if_header_value_is_nonzero() { // Build an incomplete bundle but stuff a stale sample count into the header. `sample_count()` diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index 839fcce..db63ea9 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -215,7 +215,7 @@ fn finalized_directory_lives_at_eof() { /// Build a finalized bundle with a single `metadata.json` asset and a short fake stream, then /// return both the bytes and the byte range (offset, len) occupied by the stream region. -fn build_base_bundle() -> (Vec, (u64, u64)) { +pub(super) fn build_base_bundle() -> (Vec, (u64, u64)) { let mut writer = BendlWriter::new(make_buffer(), AssignmentFormat::Ben).unwrap(); writer .add_json_asset(ASSET_TYPE_METADATA, "metadata.json", b"{\"version\":1}") @@ -229,6 +229,24 @@ fn build_base_bundle() -> (Vec, (u64, u64)) { (buf, range) } +#[test] +fn appender_open_rejects_entry_with_payload_len_past_eof() { + // The appender clones existing entries into the directory it rewrites, so an out-of-bounds + // payload range must be rejected at open rather than propagated into a new directory. + let (mut bytes, _) = build_base_bundle(); + let directory_offset = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize; + let payload_len_offset = directory_offset + 4 + 16; + bytes[payload_len_offset..payload_len_offset + 8].copy_from_slice(&u64::MAX.to_le_bytes()); + + let Err(err) = BendlAppender::open(Cursor::new(bytes)) else { + panic!("appender open accepted an out-of-bounds payload range"); + }; + assert!( + err.to_string().contains("beyond the file end"), + "unexpected error: {err}" + ); +} + #[test] fn append_adds_new_asset_and_preserves_old_entries() { let (bundle, _) = build_base_bundle(); diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index d6d2782..79e60a6 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -619,6 +619,7 @@ impl BendlAppender { /// Returns [`BendlWriteError::BundleIncomplete`] if the header's `finalized` flag is not set — /// append is unsafe on unfinalized bundles because the stream region has no authoritative end. pub fn open(mut inner: W) -> Result { + let file_len = inner.seek(SeekFrom::End(0))?; inner.seek(SeekFrom::Start(0))?; let header = BendlHeader::read_from(&mut inner).map_err(BendlWriteError::Format)?; if !header.is_finalized() { @@ -640,6 +641,9 @@ impl BendlAppender { super::reader::validate_directory_entries(&existing_entries).map_err(|e| { BendlWriteError::Format(BendlFormatError::MalformedDirectory(e.to_string())) })?; + super::reader::validate_entry_extents(&existing_entries, file_len).map_err(|e| { + BendlWriteError::Format(BendlFormatError::MalformedDirectory(e.to_string())) + })?; let registry = AssetNameRegistry::from_entries(&existing_entries); From 60f6bce1e8429b1d40d696b40fe9e6665153bb35 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:52:19 -0600 Subject: [PATCH 172/221] fix mid-write crash bug --- ben/src/io/bundle/compact.rs | 109 +++++++++++++++++++++-------- ben/src/io/bundle/tests/compact.rs | 86 ++++++++++++++++++++++- 2 files changed, 165 insertions(+), 30 deletions(-) diff --git a/ben/src/io/bundle/compact.rs b/ben/src/io/bundle/compact.rs index be3b3c7..6df3a40 100644 --- a/ben/src/io/bundle/compact.rs +++ b/ben/src/io/bundle/compact.rs @@ -131,9 +131,13 @@ where /// The post-stream tail to rebuild: surviving appended assets (raw on-disk bytes, so their /// storage form and checksums carry over unchanged) followed by the new directory. -struct PlannedTail { - /// Concatenated raw bytes to write at the stream end: survivor payloads then directory. - block: Vec, +pub(super) struct PlannedTail { + /// Concatenated raw survivor payload bytes, in final layout order. + payloads: Vec, + /// All surviving entries (pre-stream entries unchanged, survivors at their final offsets). + final_entries: Vec, + /// The encoded form of `final_entries`. + final_directory_bytes: Vec, /// Final directory offset (stream end + survivor payload bytes). directory_offset: u64, /// Final directory length. @@ -147,7 +151,7 @@ struct PlannedTail { /// Applicable iff the prefix `[0, stream_end)` is fully live: the pre-stream assets tile /// `[HEADER_SIZE, stream_offset)` exactly and every other live payload sits at or beyond the /// stream end. Returns `None` when dead bytes exist before the stream end (full rewrite needed). -fn plan_tail( +pub(super) fn plan_tail( file: &mut File, header: &BendlHeader, entries: &[BendlDirectoryEntry], @@ -187,30 +191,32 @@ fn plan_tail( } // Read survivors' raw on-disk bytes and lay them out from the stream end. The allocation is - // bounded: open-time extent validation guarantees every payload range lies within the file. + // bounded: extent validation before planning guarantees every payload range lies within the + // file. post.sort_by_key(|e| e.payload_offset); - let mut block = Vec::new(); - let mut new_entries: Vec = pre.iter().map(|e| (*e).clone()).collect(); + let mut payloads = Vec::new(); + let mut final_entries: Vec = pre.iter().map(|e| (*e).clone()).collect(); let mut offset = stream_end; for entry in &post { let mut payload = vec![0u8; entry.payload_len as usize]; file.seek(SeekFrom::Start(entry.payload_offset))?; file.read_exact(&mut payload)?; - block.extend_from_slice(&payload); + payloads.extend_from_slice(&payload); let mut moved = (*entry).clone(); moved.payload_offset = offset; - new_entries.push(moved); + final_entries.push(moved); offset = offset .checked_add(entry.payload_len) .ok_or_else(|| io::Error::other("payload range overflowed"))?; } let directory_offset = offset; - let directory_bytes = encode_directory(&new_entries)?; - let directory_len = directory_bytes.len() as u64; - block.extend_from_slice(&directory_bytes); + let final_directory_bytes = encode_directory(&final_entries)?; + let directory_len = final_directory_bytes.len() as u64; Ok(Some(PlannedTail { - block, + payloads, + final_entries, + final_directory_bytes, directory_offset, directory_len, file_len: directory_offset @@ -233,36 +239,83 @@ fn patch_header( file.sync_data() } -/// Execute a planned tail rewrite crash-safely. +/// Execute a planned tail rewrite crash-safely: stage at EOF, adopt, rewrite at the final home, +/// adopt, truncate. /// -/// Phase 1 appends the new tail block (survivor payloads + directory) at the current EOF and -/// patches the header to the appended directory — pure append, so a crash anywhere leaves either -/// the old or the appended directory authoritative over intact bytes. Phase 2 writes the same -/// block at the stream end and patches the header again; every byte it touches is dead under the -/// phase-1 state (the block is never larger than the dead region, which contains the survivors' -/// old payloads plus at least one superseded directory of equal entry count). The trailing -/// truncate runs last. +/// The invariant both phases preserve is that the authoritative directory only ever references +/// bytes that have already been written and synced. Re-running compaction from any intermediate +/// state converges losslessly, because each state's directory points at intact payload copies. fn execute_tail( file: &mut File, header: &mut BendlHeader, plan: &PlannedTail, ) -> Result<(), BendlWriteError> { - let block_start = plan.directory_offset - (plan.block.len() as u64 - plan.directory_len); + stage_tail(file, header, plan)?; + finalize_tail(file, header, plan) +} + +/// Phase 1: append the survivor payloads and a *staged* directory — one whose entries point at +/// those appended copies — at the current EOF, then patch the header to adopt it. +/// +/// Every write is append-only and the staged directory references only bytes that already exist +/// (the live prefix plus the appended copies), so a crash anywhere up to and including the header +/// patch leaves an intact bundle: either the old directory or the fully self-consistent staged +/// one is authoritative. +pub(super) fn stage_tail( + file: &mut File, + header: &mut BendlHeader, + plan: &PlannedTail, +) -> Result<(), BendlWriteError> { + let payloads_len = plan.payloads.len() as u64; + let block_start = plan.directory_offset - payloads_len; - // Phase 1: relocate the tail to the end of the file (append-only), then adopt it. let eof = file.seek(SeekFrom::End(0))?; debug_assert!( plan.file_len <= eof, "tail block must fit in the dead region" ); - file.write_all(&plan.block)?; + let staged_entries: Vec = plan + .final_entries + .iter() + .map(|entry| { + let mut staged = entry.clone(); + if staged.payload_offset >= block_start { + staged.payload_offset = eof + (staged.payload_offset - block_start); + } + staged + }) + .collect(); + let staged_directory_bytes = encode_directory(&staged_entries)?; + debug_assert_eq!( + staged_directory_bytes.len() as u64, + plan.directory_len, + "staged and final directories must encode to the same length" + ); + + file.write_all(&plan.payloads)?; + file.write_all(&staged_directory_bytes)?; file.sync_data()?; - let staged_dir_offset = eof + (plan.directory_offset - block_start); - patch_header(file, header, staged_dir_offset, plan.directory_len)?; + patch_header(file, header, eof + payloads_len, plan.directory_len) + .map_err(BendlWriteError::Io) +} + +/// Phase 2: write the payloads and the final directory at the stream end, patch the header to +/// the final directory, and truncate the staged tail away. +/// +/// Every byte this touches is dead under the staged state: the staged directory references only +/// the live prefix (which ends at the stream end) and the staged copies at or beyond the old EOF, +/// and the final tail never extends past the old EOF. The truncate runs only after the final +/// header patch is synced. +fn finalize_tail( + file: &mut File, + header: &mut BendlHeader, + plan: &PlannedTail, +) -> Result<(), BendlWriteError> { + let block_start = plan.directory_offset - plan.payloads.len() as u64; - // Phase 2: write the block at its final home (every touched byte is dead), adopt, truncate. file.seek(SeekFrom::Start(block_start))?; - file.write_all(&plan.block)?; + file.write_all(&plan.payloads)?; + file.write_all(&plan.final_directory_bytes)?; file.sync_data()?; patch_header(file, header, plan.directory_offset, plan.directory_len)?; file.set_len(plan.file_len)?; diff --git a/ben/src/io/bundle/tests/compact.rs b/ben/src/io/bundle/tests/compact.rs index 5431631..42815fc 100644 --- a/ben/src/io/bundle/tests/compact.rs +++ b/ben/src/io/bundle/tests/compact.rs @@ -1,8 +1,11 @@ -use std::fs; +use std::fs::{self, File, OpenOptions}; +use std::io::BufReader; use std::path::PathBuf; use super::writer::build_base_bundle; -use crate::io::bundle::compact::compact_bundle_in_place; +use crate::io::bundle::compact::{compact_bundle_in_place, plan_tail, stage_tail, Compaction}; +use crate::io::bundle::reader::BendlReader; +use crate::io::bundle::writer::{AddAssetOptions, BendlAppender}; /// A temp file that removes itself on drop, so failed assertions don't leak files. struct TempBundle(PathBuf); @@ -45,3 +48,82 @@ fn in_place_compaction_rejects_out_of_bounds_payload_len_without_panicking() { "failed compaction must leave the file untouched" ); } + +#[test] +fn crash_after_stage_leaves_consistent_bundle_and_recompaction_recovers() { + // The tail rewrite stages the survivors plus a directory addressing those staged copies at + // EOF, adopts it, and only then rewrites the final tail. Simulate a crash immediately after + // the staged adoption (the widest window) and check both halves of the crash-safety + // invariant: the staged state is a fully intact bundle, and re-running compaction from it + // converges losslessly. The staged directory used to carry the survivors' *final* offsets — + // bytes not yet written — so in the crash state the survivor failed its checksum, and the + // re-run rewrote it from those dead bytes and truncated away the only good copy. + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "crash-after-stage"); + let path = tmp.0.clone(); + let open_rw = || { + OpenOptions::new() + .read(true) + .write(true) + .open(&path) + .unwrap() + }; + + // Two appends (each leaves a superseded directory behind), then a directory-only removal of + // the first appended asset: the second must move left when the tail is compacted. + let survivor_payload: Vec = (0..600u32).map(|i| (i * 7 % 251) as u8).collect(); + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender + .add_custom_asset("doomed.bin", &[0xAB; 700], AddAssetOptions::defaults().raw()) + .unwrap(); + appender.commit().unwrap(); + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender + .add_custom_asset( + "survivor.bin", + &survivor_payload, + AddAssetOptions::defaults().raw(), + ) + .unwrap(); + appender.commit().unwrap(); + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender.remove_asset("doomed.bin").unwrap(); + appender.commit().unwrap(); + + // Run phase 1 only — up to the moment the staged directory becomes authoritative — then + // "crash" by dropping the handle before phase 2. + { + let reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + let mut header = *reader.header(); + let entries = reader.assets().to_vec(); + drop(reader); + let mut file = open_rw(); + let plan = plan_tail(&mut file, &header, &entries) + .unwrap() + .expect("post-stream dead space must be tail-compactable"); + stage_tail(&mut file, &mut header, &plan).unwrap(); + } + + // The crash state must be a fully consistent bundle: the directory loads, every checksum + // holds, and the survivor reads back intact. + { + let mut reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + reader.verify_all_asset_checksums().unwrap(); + let entry = reader.find_asset_by_name("survivor.bin").cloned().unwrap(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), survivor_payload); + assert!(reader.find_asset_by_name("doomed.bin").is_none()); + } + + // Recovery is the natural next step: re-running compaction must converge to the compact + // bundle with the survivor intact, and a further run must find nothing to do. + assert_eq!( + compact_bundle_in_place(&path).unwrap(), + Compaction::TailRewrite + ); + let mut reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + reader.verify_all_asset_checksums().unwrap(); + let entry = reader.find_asset_by_name("survivor.bin").cloned().unwrap(); + assert_eq!(reader.asset_bytes(&entry).unwrap(), survivor_payload); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + assert_eq!(compact_bundle_in_place(&path).unwrap(), Compaction::None); +} From 3e2fac41f679ab8d09732994f5f9b548d97880fc Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:01:10 -0600 Subject: [PATCH 173/221] unify remove and compact --- ben-py/binary_ensemble/_core.pyi | 3 + ben-py/binary_ensemble/bundle.py | 13 ++- ben-py/src/encode/bundle_encoder.rs | 29 ++++++ ben-py/tests/test_compact.py | 38 +++++++ ben/src/cli/bendl/remove.rs | 35 ++----- ben/src/io/bundle/compact.rs | 68 ++++++++++++- ben/src/io/bundle/mod.rs | 2 +- ben/src/io/bundle/tests/compact.rs | 148 +++++++++++++++++++++++++++- 8 files changed, 296 insertions(+), 40 deletions(-) diff --git a/ben-py/binary_ensemble/_core.pyi b/ben-py/binary_ensemble/_core.pyi index c22be45..ddbf02d 100644 --- a/ben-py/binary_ensemble/_core.pyi +++ b/ben-py/binary_ensemble/_core.pyi @@ -144,6 +144,9 @@ class BendlEncoder: # Drops the directory entry only (payload bytes become dead space until the next # whole-bundle rewrite compacts them); frees the name for re-add. KeyError if absent. def remove_asset(self, name: str) -> None: ... + # Drops the entry and reclaims its bytes as one operation; on error the bundle is left + # untouched with the asset still present. The facade's remove_asset calls this. + def remove_asset_compacting(self, name: str) -> None: ... def add_metadata(self, metadata: MetadataInput) -> None: ... # Returns the (possibly reordered) graph as a NetworkX graph, matching # BendlDecoder.read_graph. sort defaults to "mlc"; sort="key" sorts by `key`; sort=None diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index ceb3248..651d68e 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -31,7 +31,6 @@ from binary_ensemble._core import BendlDecoder, BendlStreamSession from binary_ensemble._core import BendlEncoder as _CoreBendlEncoder -from binary_ensemble._core import compact_bundle_in_place as _compact_bundle_in_place from binary_ensemble._core import recompress_bundle as _recompress_bundle from binary_ensemble._core import relabel_bundle as _relabel_bundle from binary_ensemble.types import ( @@ -328,10 +327,11 @@ def remove_asset(self, name: str) -> None: """Remove a named asset from a finalized bundle, reclaiming its bytes. Available wherever :meth:`add_asset` commits immediately: append mode, or create mode - after the stream has closed. The directory entry is dropped and the bundle is then - compacted in place, so the asset's payload bytes are actually gone from the file — not - just unreferenced. The name (and any singleton-type claim, e.g. ``metadata.json``) - becomes free again, so remove-then-add is the way to replace an asset's payload. + after the stream has closed. The directory drop and the compaction commit as one + operation, so the asset's payload bytes are actually gone from the file — not just + unreferenced — and on any error the bundle is left untouched, the asset still present + for a retry. The name (and any singleton-type claim, e.g. ``metadata.json``) becomes + free again, so remove-then-add is the way to replace an asset's payload. Removing appended (post-stream) assets is cheap at any scale: the compaction rebuilds only the small post-stream tail and never touches the assignment stream, even when the @@ -353,8 +353,7 @@ def remove_asset(self, name: str) -> None: Exception: If the encoder is in create mode before the stream (just don't add the asset), is currently streaming, or is closed. """ - self._enc.remove_asset(name) - _compact_bundle_in_place(self._path) + self._enc.remove_asset_compacting(name) def stream(self, *, variant: Variant = "twodelta") -> BendlStreamSession: """Open the single-use assignment stream context manager. diff --git a/ben-py/src/encode/bundle_encoder.rs b/ben-py/src/encode/bundle_encoder.rs index 425d1c3..5b022ba 100644 --- a/ben-py/src/encode/bundle_encoder.rs +++ b/ben-py/src/encode/bundle_encoder.rs @@ -11,6 +11,7 @@ use crate::common::{ parse_metadata_input, parse_variant, }; use crate::graph::helpers::{reorder_graph_to_bytes, resolve_reorder}; +use binary_ensemble::io::bundle::compact::remove_assets_in_place; use binary_ensemble::io::bundle::format::{AssignmentFormat, KnownAssetKind}; use binary_ensemble::io::bundle::writer::BendlAppender; use binary_ensemble::io::bundle::{ @@ -233,6 +234,34 @@ impl PyBendlEncoder { Err(state_error(&self.state, "remove_asset")) } + /// Remove a named asset and reclaim its bytes, as one operation. + /// + /// Available wherever ``add_asset`` commits immediately: append mode, or create mode after + /// the stream has closed. The directory drop and the compaction commit together, so there is + /// never a published state in which the asset is unreferenced but its bytes remain — and on + /// any error the bundle is left untouched, the asset still present. This is what + /// :meth:`binary_ensemble.bundle.BendlEncoder.remove_asset` calls; ``remove_asset`` here is + /// the raw directory-only form. + /// + /// Args: + /// name (str): The asset's name, as listed by + /// :meth:`binary_ensemble.bundle.BendlDecoder.asset_names`. + /// + /// Raises: + /// KeyError: If no asset with that name exists in the bundle. + /// Exception: If the encoder is in create mode before the stream (just don't add the + /// asset), is currently streaming, or is closed. + #[pyo3(signature = (name))] + #[pyo3(text_signature = "(self, name)")] + fn remove_asset_compacting(&mut self, name: &str) -> PyResult<()> { + if matches!(self.state, BundleState::Appendable) { + return remove_assets_in_place(&self.path, &[name]) + .map(|_| ()) + .map_err(map_bundle_err); + } + Err(state_error(&self.state, "remove_asset_compacting")) + } + /// Add the canonical ``metadata.json`` known asset. /// /// ``metadata`` accepts a Python ``dict``/``list``, UTF-8 JSON bytes, a file-like object with diff --git a/ben-py/tests/test_compact.py b/ben-py/tests/test_compact.py index 27c9473..5e335fd 100644 --- a/ben-py/tests/test_compact.py +++ b/ben-py/tests/test_compact.py @@ -284,3 +284,41 @@ def test_facade_remove_asset_leaves_bundle_fully_compact(tmp_path: Path) -> None dec = BendlDecoder(path) assert dec.asset_names() == ["b.txt"] dec.verify() + + +def test_facade_remove_asset_failure_leaves_bundle_untouched(tmp_path: Path) -> None: + """Removal and compaction commit together: when the rewrite fails mid-way (a corrupt + surviving asset caught by verify-on-touch), the bundle is left byte-identical — the asset + is still present and a retry still sees it. The removal used to commit its directory drop + first, so a failed compaction left the asset already unreachable and a retry raised + KeyError.""" + path = tmp_path / "atomic.bendl" + enc = BendlEncoder(path, overwrite=True) + enc.add_graph(_graph(), sort=None) + with enc.stream() as s: + s.write([1] * _n()) + enc.add_asset("notes.txt", "keep me", content_type="text") + _flip_byte_at(path, b"keep me") # corrupt the surviving post-stream asset + + before = path.read_bytes() + # Removing the pre-stream graph forces the full rewrite, which reads every survivor. + with pytest.raises(Exception, match="checksum"): + enc.remove_asset("graph.json") + assert path.read_bytes() == before + assert "graph.json" in BendlDecoder(path).asset_names() + + +def test_facade_remove_asset_can_remove_a_corrupt_asset(tmp_path: Path) -> None: + """The asset being removed is never read, so removal is the way out of a corrupt-asset + situation, not blocked by it.""" + path = tmp_path / "corrupt-removal.bendl" + enc = BendlEncoder(path, overwrite=True) + with enc.stream() as s: + s.write([1, 2, 3]) + enc.add_asset("bad.txt", "doomed bytes", content_type="text") + _flip_byte_at(path, b"doomed bytes") + + enc.remove_asset("bad.txt") + dec = BendlDecoder(path) + assert dec.asset_names() == [] + dec.verify() diff --git a/ben/src/cli/bendl/remove.rs b/ben/src/cli/bendl/remove.rs index b08121c..55ba49f 100644 --- a/ben/src/cli/bendl/remove.rs +++ b/ben/src/cli/bendl/remove.rs @@ -1,14 +1,13 @@ //! `bendl remove` and `bendl compact`: drop assets from a bundle and reclaim dead space. //! -//! Removal at the appender level only rewrites the directory; the payload bytes stay behind as -//! unreferenced dead space. The `remove` subcommand therefore compacts the bundle afterwards, so -//! "removed" means the bytes are actually gone from the file. `compact` is the standalone form, -//! useful after many appends (each of which leaves a superseded directory behind). +//! Removal goes through [`remove_assets_in_place`], which drops the directory entries and +//! reclaims their bytes as one operation, so "removed" means the bytes are actually gone from +//! the file — and a failure partway leaves the bundle untouched, assets still present. `compact` +//! is the standalone reclamation form, useful after many appends (each of which leaves a +//! superseded directory behind). use super::args::{CompactArgs, RemoveArgs}; -use crate::io::bundle::compact::{compact_bundle_in_place, Compaction}; -use crate::io::bundle::writer::BendlAppender; -use std::fs::OpenOptions; +use crate::io::bundle::compact::{compact_bundle_in_place, remove_assets_in_place, Compaction}; fn describe(kind: Compaction) -> &'static str { match kind { @@ -19,25 +18,9 @@ fn describe(kind: Compaction) -> &'static str { } pub(super) fn run_remove(args: RemoveArgs) -> Result<(), String> { - let file = OpenOptions::new() - .read(true) - .write(true) - .open(&args.input) - .map_err(|e| format!("failed to open {:?} for read+write: {e}", args.input))?; - let mut appender = - BendlAppender::open(file).map_err(|e| format!("failed to open appender: {e}"))?; - for name in &args.assets { - appender - .remove_asset(name) - .map_err(|e| format!("failed to remove asset: {e}"))?; - } - appender - .commit() - .map_err(|e| format!("failed to commit removal: {e}"))?; - - // Removal only rewrites the directory; compact so the payload bytes are actually gone. - let kind = compact_bundle_in_place(&args.input) - .map_err(|e| format!("failed to compact bundle after removal: {e}"))?; + let names: Vec<&str> = args.assets.iter().map(String::as_str).collect(); + let kind = remove_assets_in_place(&args.input, &names) + .map_err(|e| format!("failed to remove asset(s): {e}"))?; eprintln!( "Removed {} asset(s) from {:?} and compacted it ({})", args.assets.len(), diff --git a/ben/src/io/bundle/compact.rs b/ben/src/io/bundle/compact.rs index 6df3a40..77bc800 100644 --- a/ben/src/io/bundle/compact.rs +++ b/ben/src/io/bundle/compact.rs @@ -21,6 +21,7 @@ //! //! Both strategies preserve the stream's wire format (BEN or XBEN) as-is. +use std::collections::HashSet; use std::fs::{self, File, OpenOptions}; use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; use std::path::Path; @@ -84,6 +85,22 @@ fn add_preserved( /// Asset storage compression is normalized to the writer's default policy. Returns the /// destination writer on success. pub fn compact_bundle(reader: &mut BendlReader, out: W) -> Result +where + R: Read + Seek, + W: Write + Seek, +{ + compact_bundle_excluding(reader, out, &HashSet::new()) +} + +/// [`compact_bundle`], skipping every asset whose name is in `exclude`. +/// +/// Excluded assets are never read, so a removal also succeeds when the asset being removed is +/// itself corrupt. +fn compact_bundle_excluding( + reader: &mut BendlReader, + out: W, + exclude: &HashSet<&str>, +) -> Result where R: Read + Seek, W: Write + Seek, @@ -98,10 +115,14 @@ where .assignment_format_typed() .unwrap_or(AssignmentFormat::Ben); - // Read every asset's decoded payload up front (each read borrows the reader exclusively). + // Read every surviving asset's decoded payload up front (each read borrows the reader + // exclusively). let entries: Vec<_> = reader.assets().to_vec(); let mut assets = Vec::with_capacity(entries.len()); for entry in &entries { + if exclude.contains(entry.name.as_str()) { + continue; + } let payload = reader.asset_bytes(entry).map_err(io::Error::other)?; assets.push(PreservedAsset { asset_type: entry.asset_type, @@ -330,6 +351,31 @@ fn finalize_tail( /// streams the whole bundle through verified readers into a temp file and atomically swaps it /// over `path`. On any error the original file is left untouched. pub fn compact_bundle_in_place(path: &Path) -> Result { + compact_in_place_excluding(path, &[]) +} + +/// Remove the named assets from the bundle at `path` and compact it, as one operation. +/// +/// The removal and the compaction commit together: the directory that drops the names is the +/// same one the rewrite publishes, so no intermediate state ever exists in which an asset is +/// unreferenced but its bytes remain — and on any error the original file is left untouched, +/// with every asset still present for a retry. Unknown names are rejected up front, before any +/// byte of the file changes. The assets being removed are never read, so removal also succeeds +/// when the asset being removed is itself corrupt. +/// +/// `names` may repeat (duplicates collapse). An empty `names` is plain +/// [`compact_bundle_in_place`]. +pub fn remove_assets_in_place( + path: &Path, + names: &[&str], +) -> Result { + compact_in_place_excluding(path, names) +} + +fn compact_in_place_excluding( + path: &Path, + remove: &[&str], +) -> Result { // Parse and validate through the reader so malformed bundles are rejected up front. let file = File::open(path)?; let file_len = file.metadata()?.len(); @@ -345,14 +391,26 @@ pub fn compact_bundle_in_place(path: &Path) -> Result = remove.iter().copied().collect(); + for name in &remove { + if reader.find_asset_by_name(name).is_none() { + return Err(BendlWriteError::UnknownAssetName((*name).to_string())); + } + } let mut header = *reader.header(); - let entries: Vec = reader.assets().to_vec(); + let entries: Vec = reader + .assets() + .iter() + .filter(|e| !remove.contains(e.name.as_str())) + .cloned() + .collect(); drop(reader); let mut file = OpenOptions::new().read(true).write(true).open(path)?; if let Some(plan) = plan_tail(&mut file, &header, &entries)? { // Already compact? Then the directory sits right at its planned offset and the file ends - // right after it — nothing to do. + // right after it — nothing to do. (Unreachable with removals: dropping an entry always + // shrinks the directory, so the planned layout cannot match the current one.) let eof = file.seek(SeekFrom::End(0))?; if header.directory_offset == plan.directory_offset && eof == plan.file_len { return Ok(Compaction::None); @@ -362,7 +420,7 @@ pub fn compact_bundle_in_place(path: &Path) -> Result Result = (|| { let out = BufWriter::new(File::create(&tmp)?); - let out = compact_bundle(&mut reader, out)?; + let out = compact_bundle_excluding(&mut reader, out, &remove)?; out.into_inner() .map_err(|e| io::Error::other(e.to_string()))? .sync_all()?; diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index 475c7a7..b726b59 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -23,7 +23,7 @@ pub mod writer; #[cfg(test)] mod tests; -pub use compact::{compact_bundle, compact_bundle_in_place, Compaction}; +pub use compact::{compact_bundle, compact_bundle_in_place, remove_assets_in_place, Compaction}; pub use error::{BendlReadError, ChecksumError, ChecksumTarget}; pub use reader::{BendlReader, BendlVerifiedStreamReader, BundleValidationError}; pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; diff --git a/ben/src/io/bundle/tests/compact.rs b/ben/src/io/bundle/tests/compact.rs index 42815fc..dd2b128 100644 --- a/ben/src/io/bundle/tests/compact.rs +++ b/ben/src/io/bundle/tests/compact.rs @@ -3,7 +3,9 @@ use std::io::BufReader; use std::path::PathBuf; use super::writer::build_base_bundle; -use crate::io::bundle::compact::{compact_bundle_in_place, plan_tail, stage_tail, Compaction}; +use crate::io::bundle::compact::{ + compact_bundle_in_place, plan_tail, remove_assets_in_place, stage_tail, Compaction, +}; use crate::io::bundle::reader::BendlReader; use crate::io::bundle::writer::{AddAssetOptions, BendlAppender}; @@ -127,3 +129,147 @@ fn crash_after_stage_leaves_consistent_bundle_and_recompaction_recovers() { assert!(reader.find_asset_by_name("metadata.json").is_some()); assert_eq!(compact_bundle_in_place(&path).unwrap(), Compaction::None); } + +#[test] +fn remove_assets_in_place_drops_post_stream_assets_via_tail_path() { + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "remove-tail"); + let path = tmp.0.clone(); + + let open_rw = || { + OpenOptions::new() + .read(true) + .write(true) + .open(&path) + .unwrap() + }; + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender + .add_custom_asset("a.bin", &[1u8; 300], AddAssetOptions::defaults().raw()) + .unwrap(); + appender + .add_custom_asset("b.bin", &[2u8; 300], AddAssetOptions::defaults().raw()) + .unwrap(); + appender.commit().unwrap(); + + // One call removes both: drop + reclaim commit together, tail-only. + assert_eq!( + remove_assets_in_place(&path, &["a.bin", "b.bin"]).unwrap(), + Compaction::TailRewrite + ); + + let mut reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("a.bin").is_none()); + assert!(reader.find_asset_by_name("b.bin").is_none()); + assert!(reader.find_asset_by_name("metadata.json").is_some()); + reader.verify_all_asset_checksums().unwrap(); + drop(reader); + assert_eq!(compact_bundle_in_place(&path).unwrap(), Compaction::None); +} + +#[test] +fn remove_assets_in_place_rejects_unknown_name_without_touching_the_file() { + // Unknown names fail the whole batch up front — including any valid names beside them — so + // a caller never has to guess which removals landed. + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "remove-unknown"); + let before = fs::read(&tmp.0).unwrap(); + + let err = remove_assets_in_place(&tmp.0, &["metadata.json", "missing.bin"]).unwrap_err(); + assert!( + err.to_string().contains("no asset named"), + "unexpected error: {err}" + ); + assert_eq!(fs::read(&tmp.0).unwrap(), before); + let reader = BendlReader::open(BufReader::new(File::open(&tmp.0).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("metadata.json").is_some()); +} + +#[test] +fn remove_assets_in_place_failure_mid_rewrite_leaves_asset_present() { + // The non-atomicity regression: removal used to commit its directory drop *before* the + // compaction ran, so a failed compaction left the asset unreachable (a retry raised + // "no asset named") with its dead bytes still in the file. Fused, a mid-rewrite failure — + // here a corrupt surviving asset detected by verify-on-touch — leaves the file + // byte-identical and the asset still present for a retry. + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "remove-atomic"); + let path = tmp.0.clone(); + + let open_rw = || { + OpenOptions::new() + .read(true) + .write(true) + .open(&path) + .unwrap() + }; + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender + .add_custom_asset("extra.bin", &[7u8; 300], AddAssetOptions::defaults().raw()) + .unwrap(); + appender.commit().unwrap(); + + // Corrupt the survivor's payload on disk (its stored checksum now mismatches). + { + use std::io::{Seek, SeekFrom, Write}; + let reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + let offset = reader + .find_asset_by_name("extra.bin") + .unwrap() + .payload_offset; + drop(reader); + let mut f = open_rw(); + f.seek(SeekFrom::Start(offset)).unwrap(); + f.write_all(&[0xFF]).unwrap(); + } + + let before = fs::read(&path).unwrap(); + // Removing the pre-stream metadata forces the full rewrite, which reads every survivor. + let err = remove_assets_in_place(&path, &["metadata.json"]).unwrap_err(); + assert!( + err.to_string().to_lowercase().contains("checksum"), + "unexpected error: {err}" + ); + assert_eq!(fs::read(&path).unwrap(), before, "file must be untouched"); + let reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + assert!( + reader.find_asset_by_name("metadata.json").is_some(), + "a failed removal must leave the asset present" + ); +} + +#[test] +fn remove_assets_in_place_can_remove_a_corrupt_asset() { + // The asset being removed is never read, so removal is the way *out* of a corrupt-asset + // situation, not blocked by it. + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "remove-corrupt"); + let path = tmp.0.clone(); + + let open_rw = || { + OpenOptions::new() + .read(true) + .write(true) + .open(&path) + .unwrap() + }; + let mut appender = BendlAppender::open(open_rw()).unwrap(); + appender + .add_custom_asset("bad.bin", &[9u8; 300], AddAssetOptions::defaults().raw()) + .unwrap(); + appender.commit().unwrap(); + { + use std::io::{Seek, SeekFrom, Write}; + let reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + let offset = reader.find_asset_by_name("bad.bin").unwrap().payload_offset; + drop(reader); + let mut f = open_rw(); + f.seek(SeekFrom::Start(offset)).unwrap(); + f.write_all(&[0xFF]).unwrap(); + } + + remove_assets_in_place(&path, &["bad.bin"]).unwrap(); + let mut reader = BendlReader::open(BufReader::new(File::open(&path).unwrap())).unwrap(); + assert!(reader.find_asset_by_name("bad.bin").is_none()); + reader.verify_all_asset_checksums().unwrap(); +} From 3ff79c2477721056c5fd37fd645d212a99961c5d Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:07:45 -0600 Subject: [PATCH 174/221] remove large memory buffering of tail payloads --- ben/src/io/bundle/compact.rs | 110 +++++++++++++++++++---------- ben/src/io/bundle/tests/compact.rs | 24 ++++++- 2 files changed, 96 insertions(+), 38 deletions(-) diff --git a/ben/src/io/bundle/compact.rs b/ben/src/io/bundle/compact.rs index 77bc800..beab9fa 100644 --- a/ben/src/io/bundle/compact.rs +++ b/ben/src/io/bundle/compact.rs @@ -150,11 +150,25 @@ where } } -/// The post-stream tail to rebuild: surviving appended assets (raw on-disk bytes, so their -/// storage form and checksums carry over unchanged) followed by the new directory. +/// One survivor payload's source range, to be copied (raw on-disk bytes, so storage form and +/// checksums carry over unchanged) into the rebuilt tail. +struct PayloadMove { + /// Byte offset of the payload in the current file. + src: u64, + /// Payload length in bytes. + len: u64, +} + +/// The post-stream tail to rebuild: surviving appended assets followed by the new directory. +/// +/// Planning is pure arithmetic over the directory — no payload byte is read until a rewrite +/// actually executes, and the rewrite itself copies file-to-file through a fixed-size buffer, +/// so tail compaction needs no payload-sized memory. pub(super) struct PlannedTail { - /// Concatenated raw survivor payload bytes, in final layout order. - payloads: Vec, + /// Survivor payload source ranges, in final layout order. + moves: Vec, + /// Total survivor payload bytes (the moves' lengths summed). + payloads_len: u64, /// All surviving entries (pre-stream entries unchanged, survivors at their final offsets). final_entries: Vec, /// The encoded form of `final_entries`. @@ -173,7 +187,6 @@ pub(super) struct PlannedTail { /// `[HEADER_SIZE, stream_offset)` exactly and every other live payload sits at or beyond the /// stream end. Returns `None` when dead bytes exist before the stream end (full rewrite needed). pub(super) fn plan_tail( - file: &mut File, header: &BendlHeader, entries: &[BendlDirectoryEntry], ) -> Result, BendlWriteError> { @@ -211,18 +224,17 @@ pub(super) fn plan_tail( return Ok(None); } - // Read survivors' raw on-disk bytes and lay them out from the stream end. The allocation is - // bounded: extent validation before planning guarantees every payload range lies within the - // file. + // Lay the survivors out from the stream end — arithmetic only, no payload reads. Extent + // validation before planning guarantees every source range lies within the file. post.sort_by_key(|e| e.payload_offset); - let mut payloads = Vec::new(); + let mut moves = Vec::with_capacity(post.len()); let mut final_entries: Vec = pre.iter().map(|e| (*e).clone()).collect(); let mut offset = stream_end; for entry in &post { - let mut payload = vec![0u8; entry.payload_len as usize]; - file.seek(SeekFrom::Start(entry.payload_offset))?; - file.read_exact(&mut payload)?; - payloads.extend_from_slice(&payload); + moves.push(PayloadMove { + src: entry.payload_offset, + len: entry.payload_len, + }); let mut moved = (*entry).clone(); moved.payload_offset = offset; final_entries.push(moved); @@ -230,12 +242,14 @@ pub(super) fn plan_tail( .checked_add(entry.payload_len) .ok_or_else(|| io::Error::other("payload range overflowed"))?; } + let payloads_len = offset - stream_end; let directory_offset = offset; let final_directory_bytes = encode_directory(&final_entries)?; let directory_len = final_directory_bytes.len() as u64; Ok(Some(PlannedTail { - payloads, + moves, + payloads_len, final_entries, final_directory_bytes, directory_offset, @@ -246,6 +260,24 @@ pub(super) fn plan_tail( })) } +/// Copy `len` bytes within `file` from `src` to `dst` through a fixed-size buffer. +/// +/// The caller is responsible for ensuring the ranges don't overlap in a way that would read +/// already-overwritten bytes; both call sites here copy between disjoint regions. +fn copy_within(file: &mut File, src: u64, dst: u64, len: u64) -> io::Result<()> { + let mut buf = vec![0u8; 64 * 1024]; + let mut done = 0u64; + while done < len { + let chunk = buf.len().min((len - done) as usize); + file.seek(SeekFrom::Start(src + done))?; + file.read_exact(&mut buf[..chunk])?; + file.seek(SeekFrom::Start(dst + done))?; + file.write_all(&buf[..chunk])?; + done += chunk as u64; + } + Ok(()) +} + /// Write `header` (patched with the given directory location) at offset 0 and sync. fn patch_header( file: &mut File, @@ -271,12 +303,13 @@ fn execute_tail( header: &mut BendlHeader, plan: &PlannedTail, ) -> Result<(), BendlWriteError> { - stage_tail(file, header, plan)?; - finalize_tail(file, header, plan) + let staged_base = stage_tail(file, header, plan)?; + finalize_tail(file, header, plan, staged_base) } -/// Phase 1: append the survivor payloads and a *staged* directory — one whose entries point at -/// those appended copies — at the current EOF, then patch the header to adopt it. +/// Phase 1: copy the survivor payloads and write a *staged* directory — one whose entries point +/// at those appended copies — at the current EOF, then patch the header to adopt it. Returns the +/// EOF the staging started at (the staged payload base). /// /// Every write is append-only and the staged directory references only bytes that already exist /// (the live prefix plus the appended copies), so a crash anywhere up to and including the header @@ -286,9 +319,8 @@ pub(super) fn stage_tail( file: &mut File, header: &mut BendlHeader, plan: &PlannedTail, -) -> Result<(), BendlWriteError> { - let payloads_len = plan.payloads.len() as u64; - let block_start = plan.directory_offset - payloads_len; +) -> Result { + let block_start = plan.directory_offset - plan.payloads_len; let eof = file.seek(SeekFrom::End(0))?; debug_assert!( @@ -313,29 +345,35 @@ pub(super) fn stage_tail( "staged and final directories must encode to the same length" ); - file.write_all(&plan.payloads)?; + let mut dst = eof; + for mv in &plan.moves { + copy_within(file, mv.src, dst, mv.len)?; + dst += mv.len; + } + file.seek(SeekFrom::Start(eof + plan.payloads_len))?; file.write_all(&staged_directory_bytes)?; file.sync_data()?; - patch_header(file, header, eof + payloads_len, plan.directory_len) - .map_err(BendlWriteError::Io) + patch_header(file, header, eof + plan.payloads_len, plan.directory_len)?; + Ok(eof) } -/// Phase 2: write the payloads and the final directory at the stream end, patch the header to -/// the final directory, and truncate the staged tail away. +/// Phase 2: copy the payloads down from the staged region to the stream end, write the final +/// directory, patch the header to it, and truncate the staged tail away. /// /// Every byte this touches is dead under the staged state: the staged directory references only /// the live prefix (which ends at the stream end) and the staged copies at or beyond the old EOF, -/// and the final tail never extends past the old EOF. The truncate runs only after the final -/// header patch is synced. +/// and the final tail never extends past the old EOF — so the source and destination of the copy +/// are disjoint. The truncate runs only after the final header patch is synced. fn finalize_tail( file: &mut File, header: &mut BendlHeader, plan: &PlannedTail, + staged_base: u64, ) -> Result<(), BendlWriteError> { - let block_start = plan.directory_offset - plan.payloads.len() as u64; + let block_start = plan.directory_offset - plan.payloads_len; - file.seek(SeekFrom::Start(block_start))?; - file.write_all(&plan.payloads)?; + copy_within(file, staged_base, block_start, plan.payloads_len)?; + file.seek(SeekFrom::Start(plan.directory_offset))?; file.write_all(&plan.final_directory_bytes)?; file.sync_data()?; patch_header(file, header, plan.directory_offset, plan.directory_len)?; @@ -406,19 +444,19 @@ fn compact_in_place_excluding( .collect(); drop(reader); - let mut file = OpenOptions::new().read(true).write(true).open(path)?; - if let Some(plan) = plan_tail(&mut file, &header, &entries)? { + // Planning is pure directory arithmetic, so the already-compact case is decided here — + // before the file is even opened for writing, and without reading a single payload byte. + if let Some(plan) = plan_tail(&header, &entries)? { // Already compact? Then the directory sits right at its planned offset and the file ends // right after it — nothing to do. (Unreachable with removals: dropping an entry always // shrinks the directory, so the planned layout cannot match the current one.) - let eof = file.seek(SeekFrom::End(0))?; - if header.directory_offset == plan.directory_offset && eof == plan.file_len { + if header.directory_offset == plan.directory_offset && file_len == plan.file_len { return Ok(Compaction::None); } + let mut file = OpenOptions::new().read(true).write(true).open(path)?; execute_tail(&mut file, &mut header, &plan)?; return Ok(Compaction::TailRewrite); } - drop(file); // Dead space before the stream end: full rewrite through a temp file. let file = File::open(path)?; diff --git a/ben/src/io/bundle/tests/compact.rs b/ben/src/io/bundle/tests/compact.rs index dd2b128..0221b73 100644 --- a/ben/src/io/bundle/tests/compact.rs +++ b/ben/src/io/bundle/tests/compact.rs @@ -99,10 +99,10 @@ fn crash_after_stage_leaves_consistent_bundle_and_recompaction_recovers() { let mut header = *reader.header(); let entries = reader.assets().to_vec(); drop(reader); - let mut file = open_rw(); - let plan = plan_tail(&mut file, &header, &entries) + let plan = plan_tail(&header, &entries) .unwrap() .expect("post-stream dead space must be tail-compactable"); + let mut file = open_rw(); stage_tail(&mut file, &mut header, &plan).unwrap(); } @@ -273,3 +273,23 @@ fn remove_assets_in_place_can_remove_a_corrupt_asset() { assert!(reader.find_asset_by_name("bad.bin").is_none()); reader.verify_all_asset_checksums().unwrap(); } + +#[test] +fn already_compact_bundle_is_recognized_without_write_access() { + // The already-compact decision is pure directory arithmetic: no payload byte is read and + // the file is never opened for writing, so even a read-only bundle reports + // Compaction::None instead of failing at a read-write open. + let (bytes, _) = build_base_bundle(); + let tmp = temp_bundle(&bytes, "readonly-none"); + let mut perms = fs::metadata(&tmp.0).unwrap().permissions(); + perms.set_readonly(true); + fs::set_permissions(&tmp.0, perms).unwrap(); + + assert_eq!(compact_bundle_in_place(&tmp.0).unwrap(), Compaction::None); + + // Restore writability so the drop cleanup can remove the file on every platform. + let mut perms = fs::metadata(&tmp.0).unwrap().permissions(); + #[allow(clippy::permissions_set_readonly_false)] + perms.set_readonly(false); + fs::set_permissions(&tmp.0, perms).unwrap(); +} From 33ce37c7a8686db788728211ad9156a5b9ed8f10 Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:18:43 -0600 Subject: [PATCH 175/221] power loss safety because I'm paranoid --- ben/src/io/bundle/mod.rs | 2 +- ben/src/io/bundle/tests/writer.rs | 88 +++++++++++++++++++++++++++++++ ben/src/io/bundle/writer.rs | 52 +++++++++++++++--- ben/tests/test_stress_edges.rs | 7 +++ 4 files changed, 142 insertions(+), 7 deletions(-) diff --git a/ben/src/io/bundle/mod.rs b/ben/src/io/bundle/mod.rs index b726b59..aeb6692 100644 --- a/ben/src/io/bundle/mod.rs +++ b/ben/src/io/bundle/mod.rs @@ -26,4 +26,4 @@ mod tests; pub use compact::{compact_bundle, compact_bundle_in_place, remove_assets_in_place, Compaction}; pub use error::{BendlReadError, ChecksumError, ChecksumTarget}; pub use reader::{BendlReader, BendlVerifiedStreamReader, BundleValidationError}; -pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter}; +pub use writer::{AddAssetOptions, BendlStreamSession, BendlWriteError, BendlWriter, SyncData}; diff --git a/ben/src/io/bundle/tests/writer.rs b/ben/src/io/bundle/tests/writer.rs index db63ea9..1c60499 100644 --- a/ben/src/io/bundle/tests/writer.rs +++ b/ben/src/io/bundle/tests/writer.rs @@ -2113,3 +2113,91 @@ fn appender_duplicate_name_after_singleton_check_leaves_appender_usable() { ) .unwrap(); } + +/// Records the order of writes (by starting offset) and sync barriers, for asserting the +/// durability ordering of in-place mutations. +#[derive(Debug)] +enum LoggedOp { + Write { pos: u64 }, + Sync, +} + +struct SyncLog { + inner: Cursor>, + ops: Vec, +} + +impl Read for SyncLog { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.inner.read(buf) + } +} + +impl Seek for SyncLog { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { + self.inner.seek(pos) + } +} + +impl Write for SyncLog { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.ops.push(LoggedOp::Write { + pos: self.inner.position(), + }); + self.inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +impl crate::io::bundle::writer::SyncData for SyncLog { + fn sync_data(&mut self) -> std::io::Result<()> { + self.ops.push(LoggedOp::Sync); + Ok(()) + } +} + +#[test] +fn append_commit_syncs_directory_before_the_header_patch_and_header_after() { + // The appender mutates an existing good bundle in place. OS writeback may persist writes in + // any order, so the new tail (payloads + directory) must hit a sync barrier before the + // header is patched to reference it, and the patched header must be synced before commit + // reports success. Without the first barrier, power loss could leave the header pointing at + // unwritten directory bytes — the previous bundle unrecoverable. + let (bundle, _) = build_base_bundle(); + let log = SyncLog { + inner: Cursor::new(bundle), + ops: Vec::new(), + }; + let mut appender = BendlAppender::open(log).unwrap(); + appender + .add_custom_asset("x.bin", &[1u8; 64], AddAssetOptions::defaults().raw()) + .unwrap(); + let log = appender.commit().unwrap(); + + let header_write = log + .ops + .iter() + .position(|op| matches!(op, LoggedOp::Write { pos: 0 })) + .expect("commit must patch the header at offset 0"); + let last_tail_write = log.ops[..header_write] + .iter() + .rposition(|op| matches!(op, LoggedOp::Write { .. })) + .expect("commit must write the tail before the header"); + assert!( + log.ops[last_tail_write..header_write] + .iter() + .any(|op| matches!(op, LoggedOp::Sync)), + "the new tail must be synced before the header patch: {:?}", + log.ops + ); + assert!( + log.ops[header_write..] + .iter() + .any(|op| matches!(op, LoggedOp::Sync)), + "the patched header must be synced before commit returns: {:?}", + log.ops + ); +} diff --git a/ben/src/io/bundle/writer.rs b/ben/src/io/bundle/writer.rs index 79e60a6..09154b7 100644 --- a/ben/src/io/bundle/writer.rs +++ b/ben/src/io/bundle/writer.rs @@ -24,11 +24,38 @@ //! offset, directory length, and `finalized` flag. use std::collections::HashSet; -use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::fs::File; +use std::io::{self, Cursor, Read, Seek, SeekFrom, Write}; use thiserror::Error; use xz2::write::XzEncoder; +/// A write sink that can order its writes durably: everything written before a +/// [`sync_data`](SyncData::sync_data) call must reach stable storage before anything written +/// after it takes effect. Backing files map this to [`File::sync_data`]; in-memory sinks are +/// trivially ordered and no-op. +/// +/// [`BendlAppender::commit`] requires this because it mutates an existing good bundle in place: +/// without a barrier between writing the new directory and patching the header, OS writeback may +/// persist the patched header first, and a power loss would leave it pointing at unwritten bytes +/// with the old directory pointer already gone. +pub trait SyncData { + /// Flush buffers and force every earlier write to stable storage. + fn sync_data(&mut self) -> io::Result<()>; +} + +impl SyncData for File { + fn sync_data(&mut self) -> io::Result<()> { + File::sync_data(self) + } +} + +impl SyncData for Cursor { + fn sync_data(&mut self) -> io::Result<()> { + Ok(()) + } +} + use super::format::{ default_compresses, encode_directory, read_directory, standardized_name_for, AssignmentFormat, BendlDirectoryEntry, BendlFormatError, BendlHeader, KnownAssetKind, ASSET_FLAG_CHECKSUM, @@ -775,8 +802,15 @@ impl BendlAppender { /// file mutation in one append-only burst: seek to old EOF, write new payloads, write a new /// directory, and patch the header. /// - /// If compression fails, the file is left unchanged. - pub fn commit(mut self) -> Result { + /// If compression fails, the file is left unchanged. The mutation is ordered durably: the new + /// payloads and directory are synced to stable storage before the header is patched to point + /// at them, and the patched header is synced before `commit` returns — so a crash or power + /// loss at any point leaves either the previous bundle (with at most trailing orphaned bytes) + /// or the fully appended one. + pub fn commit(mut self) -> Result + where + W: SyncData, + { // If nothing was enqueued or removed, commit is a no-op — return the file untouched. if self.pending.is_empty() && !self.removed_any { return Ok(self.inner); @@ -787,8 +821,7 @@ impl BendlAppender { let encoded = self.prepare_pending_assets()?; // Phase 2: append-only file mutation. Until the final header patch, the old header still - // points at the old directory, which remains intact. A crash before the patch leaves the - // previous bundle readable with trailing orphaned bytes. + // points at the old directory, which remains intact. let old_directory_end = self .header .directory_offset @@ -827,12 +860,19 @@ impl BendlAppender { self.inner.write_all(&directory_bytes)?; let new_directory_len = directory_bytes.len() as u64; - // Patch the header. + // The new tail must be durable before the header can reference it: OS writeback is free + // to reorder, and a power loss that persisted the patched header before the directory + // bytes would leave it pointing at garbage with the old directory pointer already gone. + self.inner.flush()?; + self.inner.sync_data()?; + + // Patch the header, and sync so a returned commit is durable. self.header.directory_offset = new_directory_offset; self.header.directory_len = new_directory_len; self.inner.seek(SeekFrom::Start(0))?; self.header.write_to(&mut self.inner)?; self.inner.flush()?; + self.inner.sync_data()?; Ok(self.inner) } diff --git a/ben/tests/test_stress_edges.rs b/ben/tests/test_stress_edges.rs index e907afb..30dc4b3 100644 --- a/ben/tests/test_stress_edges.rs +++ b/ben/tests/test_stress_edges.rs @@ -117,6 +117,13 @@ impl Read for HeaderPatchCrashCursor { } } +impl binary_ensemble::io::bundle::SyncData for HeaderPatchCrashCursor { + fn sync_data(&mut self) -> std::io::Result<()> { + // In-memory test double: writes are trivially ordered. + Ok(()) + } +} + impl Write for HeaderPatchCrashCursor { fn write(&mut self, buf: &[u8]) -> std::io::Result { let mut state = self.state.borrow_mut(); From 39c91d8eecb888a94a079a8ba78512a63b3a508f Mon Sep 17 00:00:00 2001 From: peterrrock2 <27579114+peterrrock2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:28:45 -0600 Subject: [PATCH 176/221] better file swap dicipline --- ben-py/binary_ensemble/bundle.py | 24 ++---- ben-py/src/common.rs | 120 ++++++++++++++++++++++++++++- ben-py/src/compact.rs | 7 +- ben-py/src/decode/py_funcs.rs | 11 ++- ben-py/src/encode/py_funcs.rs | 13 +++- ben-py/src/recompress.rs | 13 ++-- ben-py/src/relabel.rs | 7 +- ben-py/tests/test_compact.py | 41 ++++++++++ ben/src/io/bundle/compact.rs | 34 ++++++-- ben/src/io/bundle/tests/compact.rs | 20 +++++ 10 files changed, 246 insertions(+), 44 deletions(-) diff --git a/ben-py/binary_ensemble/bundle.py b/ben-py/binary_ensemble/bundle.py index 651d68e..5be953a 100644 --- a/ben-py/binary_ensemble/bundle.py +++ b/ben-py/binary_ensemble/bundle.py @@ -25,7 +25,6 @@ import json import os -import tempfile from collections.abc import Callable from typing import TYPE_CHECKING, Literal, cast, overload @@ -63,28 +62,19 @@ def _atomic_or_out( path: StrPath, out_file: StrPath | None, overwrite: bool, - suffix: str = ".bendl", ) -> None: - """Shared in-place-swap / out_file dispatch for whole-bundle transforms. + """Shared in-place / out_file dispatch for whole-bundle transforms. - ``transform(src, dst, overwrite)`` writes the result. ``out_file=None`` means in place: the - result is written to a temp file and atomically swapped over ``path``. ``overwrite`` governs - an existing ``out_file`` (the in-place swap always replaces ``path``). + ``transform(src, dst, overwrite)`` writes the result. The ``_core`` bindings own the swap + discipline: the destination is written via a uniquely named temp file that inherits an + existing destination's permissions, fsynced, and atomically renamed into place — so a + destination is never visible half-written and an error leaves it exactly as it was. + ``out_file=None`` means in place: the transform's destination is ``path`` itself. """ if out_file is not None: transform(path, out_file, overwrite) return - - directory = os.path.dirname(os.path.abspath(os.fspath(path))) - fd, tmp = tempfile.mkstemp(suffix=suffix, dir=directory) - os.close(fd) - try: - transform(path, tmp, True) - os.replace(tmp, path) - except BaseException: - if os.path.exists(tmp): - os.remove(tmp) - raise + transform(path, path, True) def _coerce_asset_payload(payload: object, content_type: str) -> bytes: diff --git a/ben-py/src/common.rs b/ben-py/src/common.rs index 841f597..5678c0a 100644 --- a/ben-py/src/common.rs +++ b/ben-py/src/common.rs @@ -2,9 +2,10 @@ use binary_ensemble::BenVariant; use pyo3::exceptions::{PyException, PyIOError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyByteArray, PyBytes, PyDict, PyList}; -use std::fs::File; +use std::fs::{self, File}; use std::io::{BufReader, BufWriter}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; pub fn parse_variant(variant: Option<&str>) -> PyResult { match variant { @@ -58,6 +59,121 @@ pub fn open_output(out_file: &PathBuf, overwrite: bool) -> PyResult