Skip to content

Commit a7be89a

Browse files
committed
rework schemes AGAIN
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 19403db commit a7be89a

27 files changed

Lines changed: 785 additions & 659 deletions

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-btrblocks/Cargo.toml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,5 @@ name = "compress_listview"
6464
harness = false
6565
test = false
6666

67-
[[bench]]
68-
name = "dict_encode"
69-
harness = false
70-
test = false
71-
72-
[[bench]]
73-
name = "stats_calc"
74-
harness = false
75-
test = false
76-
7767
[package.metadata.cargo-machete]
7868
ignored = ["getrandom_v03"]

vortex-btrblocks/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,11 @@ pub use builder::default_excluded;
6868
pub use canonical_compressor::BtrBlocksCompressor;
6969
pub use schemes::patches::compress_patches;
7070
pub use vortex_compressor::CascadingCompressor;
71-
pub use vortex_compressor::builtins::integer_dictionary_encode;
7271
pub use vortex_compressor::ctx::CompressorContext;
7372
pub use vortex_compressor::ctx::MAX_CASCADE;
7473
pub use vortex_compressor::scheme::Scheme;
7574
pub use vortex_compressor::scheme::SchemeExt;
7675
pub use vortex_compressor::scheme::SchemeId;
77-
pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling;
7876
pub use vortex_compressor::stats::ArrayAndStats;
7977
pub use vortex_compressor::stats::BoolStats;
8078
pub use vortex_compressor::stats::FloatStats;

vortex-btrblocks/src/schemes/decimal.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use vortex_array::ToCanonical;
1010
use vortex_array::arrays::PrimitiveArray;
1111
use vortex_array::arrays::decimal::narrowed_decimal;
1212
use vortex_array::dtype::DecimalType;
13+
use vortex_compressor::estimate::CompressionEstimate;
1314
use vortex_decimal_byte_parts::DecimalBytePartsArray;
1415
use vortex_error::VortexResult;
1516

@@ -42,12 +43,11 @@ impl Scheme for DecimalScheme {
4243

4344
fn expected_compression_ratio(
4445
&self,
45-
_compressor: &CascadingCompressor,
4646
_data: &mut ArrayAndStats,
4747
_ctx: CompressorContext,
48-
) -> VortexResult<f64> {
48+
) -> CompressionEstimate {
4949
// Decimal compression is almost always beneficial (narrowing + primitive compression).
50-
Ok(f64::MAX)
50+
CompressionEstimate::AlwaysUse
5151
}
5252

5353
fn compress(

vortex-btrblocks/src/schemes/float.rs

Lines changed: 33 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use vortex_array::Canonical;
1111
use vortex_array::IntoArray;
1212
use vortex_array::ToCanonical;
1313
use vortex_array::dtype::PType;
14+
use vortex_compressor::estimate::CompressionEstimate;
1415
use vortex_compressor::scheme::ChildSelection;
1516
use vortex_compressor::scheme::DescendantExclusion;
1617
use vortex_error::VortexResult;
@@ -25,7 +26,6 @@ use crate::CompressorContext;
2526
use crate::Scheme;
2627
use crate::SchemeExt;
2728
use crate::compress_patches;
28-
use crate::estimate_compression_ratio_with_sampling;
2929

3030
/// ALP (Adaptive Lossless floating-Point) encoding.
3131
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -70,22 +70,21 @@ impl Scheme for ALPScheme {
7070

7171
fn expected_compression_ratio(
7272
&self,
73-
compressor: &CascadingCompressor,
7473
data: &mut ArrayAndStats,
7574
ctx: CompressorContext,
76-
) -> VortexResult<f64> {
75+
) -> CompressionEstimate {
7776
// ALP encodes floats as integers. Without integer compression afterward, the encoded ints
7877
// are the same size as the original floats.
7978
if ctx.finished_cascading() {
80-
return Ok(0.0);
79+
return CompressionEstimate::Skip;
8180
}
8281

8382
// We don't support ALP for f16.
84-
if data.float_stats().source().ptype() == PType::F16 {
85-
return Ok(0.0);
83+
if data.array_as_primitive().ptype() == PType::F16 {
84+
return CompressionEstimate::Skip;
8685
}
8786

88-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
87+
CompressionEstimate::Sample
8988
}
9089

9190
fn compress(
@@ -94,9 +93,7 @@ impl Scheme for ALPScheme {
9493
data: &mut ArrayAndStats,
9594
ctx: CompressorContext,
9695
) -> VortexResult<ArrayRef> {
97-
let stats = data.float_stats();
98-
99-
let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?;
96+
let alp_encoded = alp_encode(data.array_as_primitive(), None)?;
10097

10198
// Compress the ALP ints.
10299
let compressed_alp_ints =
@@ -121,15 +118,15 @@ impl Scheme for ALPRDScheme {
121118

122119
fn expected_compression_ratio(
123120
&self,
124-
compressor: &CascadingCompressor,
125121
data: &mut ArrayAndStats,
126-
ctx: CompressorContext,
127-
) -> VortexResult<f64> {
128-
if data.float_stats().source().ptype() == PType::F16 {
129-
return Ok(0.0);
122+
_ctx: CompressorContext,
123+
) -> CompressionEstimate {
124+
// We don't support ALPRD for f16.
125+
if data.array_as_primitive().ptype() == PType::F16 {
126+
return CompressionEstimate::Skip;
130127
}
131128

132-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
129+
CompressionEstimate::Sample
133130
}
134131

135132
fn compress(
@@ -138,15 +135,15 @@ impl Scheme for ALPRDScheme {
138135
data: &mut ArrayAndStats,
139136
_ctx: CompressorContext,
140137
) -> VortexResult<ArrayRef> {
141-
let stats = data.float_stats();
138+
let primitive_array = data.array_as_primitive();
142139

143-
let encoder = match stats.source().ptype() {
144-
PType::F32 => RDEncoder::new(stats.source().as_slice::<f32>()),
145-
PType::F64 => RDEncoder::new(stats.source().as_slice::<f64>()),
140+
let encoder = match primitive_array.ptype() {
141+
PType::F32 => RDEncoder::new(primitive_array.as_slice::<f32>()),
142+
PType::F64 => RDEncoder::new(primitive_array.as_slice::<f64>()),
146143
ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"),
147144
};
148145

149-
let mut alp_rd = encoder.encode(stats.source());
146+
let mut alp_rd = encoder.encode(primitive_array);
150147

151148
let patches = alp_rd
152149
.left_parts_patches()
@@ -182,24 +179,19 @@ impl Scheme for NullDominatedSparseScheme {
182179

183180
fn expected_compression_ratio(
184181
&self,
185-
_compressor: &CascadingCompressor,
186182
data: &mut ArrayAndStats,
187183
_ctx: CompressorContext,
188-
) -> VortexResult<f64> {
184+
) -> CompressionEstimate {
185+
let len = data.array_len() as f64;
189186
let stats = data.float_stats();
190187

191-
if stats.value_count() == 0 {
192-
// All nulls should use ConstantScheme instead of this.
193-
return Ok(0.0);
194-
}
195-
196188
// If more than 90% of the values are null, this will compress well.
197-
if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 {
198-
return Ok(stats.source().len() as f64 / stats.value_count() as f64);
189+
if stats.null_count() as f64 / len > 0.9 {
190+
return CompressionEstimate::Ratio(len / stats.value_count() as f64);
199191
}
200192

201193
// Otherwise we don't go this route.
202-
Ok(0.0)
194+
CompressionEstimate::Skip
203195
}
204196

205197
fn compress(
@@ -208,10 +200,8 @@ impl Scheme for NullDominatedSparseScheme {
208200
data: &mut ArrayAndStats,
209201
ctx: CompressorContext,
210202
) -> VortexResult<ArrayRef> {
211-
let stats = data.float_stats();
212-
213203
// We pass None as we only run this pathway for NULL-dominated float arrays.
214-
let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?;
204+
let sparse_encoded = SparseArray::encode(data.array(), None)?;
215205

216206
if let Some(sparse) = sparse_encoded.as_opt::<Sparse>() {
217207
let indices = sparse.patches().indices().to_primitive().narrow()?;
@@ -241,15 +231,22 @@ impl Scheme for PcoScheme {
241231
is_float_primitive(canonical)
242232
}
243233

234+
fn expected_compression_ratio(
235+
&self,
236+
_data: &mut ArrayAndStats,
237+
_ctx: CompressorContext,
238+
) -> CompressionEstimate {
239+
CompressionEstimate::Sample
240+
}
241+
244242
fn compress(
245243
&self,
246244
_compressor: &CascadingCompressor,
247245
data: &mut ArrayAndStats,
248246
_ctx: CompressorContext,
249247
) -> VortexResult<ArrayRef> {
250-
let stats = data.float_stats();
251248
Ok(vortex_pco::PcoArray::from_primitive(
252-
stats.source(),
249+
data.array_as_primitive(),
253250
pco::DEFAULT_COMPRESSION_LEVEL,
254251
8192,
255252
)?

0 commit comments

Comments
 (0)