Skip to content

Commit 699092a

Browse files
Merge branch 'apache:main' into fix_limit_column_ordering_bug
2 parents 330e9c5 + 20434b0 commit 699092a

34 files changed

Lines changed: 2570 additions & 3080 deletions

File tree

Cargo.lock

Lines changed: 85 additions & 140 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 45 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@ repository = "https://github.com/apache/datafusion"
7979
# Define Minimum Supported Rust Version (MSRV)
8080
rust-version = "1.88.0"
8181
# Define DataFusion version
82-
version = "52.3.0"
82+
version = "53.0.0"
8383

8484
[workspace.dependencies]
8585
# We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -92,6 +92,13 @@ arrow = { version = "58.1.0", features = [
9292
"prettyprint",
9393
"chrono-tz",
9494
] }
95+
arrow-avro = { version = "58.1.0", default-features = false, features = [
96+
"deflate",
97+
"snappy",
98+
"zstd",
99+
"bzip2",
100+
"xz",
101+
] }
95102
arrow-buffer = { version = "58.1.0", default-features = false }
96103
arrow-flight = { version = "58.1.0", features = [
97104
"flight-sql-experimental",
@@ -109,43 +116,43 @@ chrono = { version = "0.4.44", default-features = false }
109116
criterion = "0.8"
110117
ctor = "0.6.3"
111118
dashmap = "6.0.1"
112-
datafusion = { path = "datafusion/core", version = "52.3.0", default-features = false }
113-
datafusion-catalog = { path = "datafusion/catalog", version = "52.3.0" }
114-
datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "52.3.0" }
115-
datafusion-common = { path = "datafusion/common", version = "52.3.0", default-features = false }
116-
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "52.3.0" }
117-
datafusion-datasource = { path = "datafusion/datasource", version = "52.3.0", default-features = false }
118-
datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "52.3.0", default-features = false }
119-
datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "52.3.0", default-features = false }
120-
datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "52.3.0", default-features = false }
121-
datafusion-datasource-json = { path = "datafusion/datasource-json", version = "52.3.0", default-features = false }
122-
datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "52.3.0", default-features = false }
123-
datafusion-doc = { path = "datafusion/doc", version = "52.3.0" }
124-
datafusion-execution = { path = "datafusion/execution", version = "52.3.0", default-features = false }
125-
datafusion-expr = { path = "datafusion/expr", version = "52.3.0", default-features = false }
126-
datafusion-expr-common = { path = "datafusion/expr-common", version = "52.3.0" }
127-
datafusion-ffi = { path = "datafusion/ffi", version = "52.3.0" }
128-
datafusion-functions = { path = "datafusion/functions", version = "52.3.0" }
129-
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "52.3.0" }
130-
datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "52.3.0" }
131-
datafusion-functions-nested = { path = "datafusion/functions-nested", version = "52.3.0", default-features = false }
132-
datafusion-functions-table = { path = "datafusion/functions-table", version = "52.3.0" }
133-
datafusion-functions-window = { path = "datafusion/functions-window", version = "52.3.0" }
134-
datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "52.3.0" }
135-
datafusion-macros = { path = "datafusion/macros", version = "52.3.0" }
136-
datafusion-optimizer = { path = "datafusion/optimizer", version = "52.3.0", default-features = false }
137-
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "52.3.0", default-features = false }
138-
datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "52.3.0", default-features = false }
139-
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "52.3.0", default-features = false }
140-
datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "52.3.0" }
141-
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "52.3.0" }
142-
datafusion-proto = { path = "datafusion/proto", version = "52.3.0" }
143-
datafusion-proto-common = { path = "datafusion/proto-common", version = "52.3.0" }
144-
datafusion-pruning = { path = "datafusion/pruning", version = "52.3.0" }
145-
datafusion-session = { path = "datafusion/session", version = "52.3.0" }
146-
datafusion-spark = { path = "datafusion/spark", version = "52.3.0" }
147-
datafusion-sql = { path = "datafusion/sql", version = "52.3.0" }
148-
datafusion-substrait = { path = "datafusion/substrait", version = "52.3.0" }
119+
datafusion = { path = "datafusion/core", version = "53.0.0", default-features = false }
120+
datafusion-catalog = { path = "datafusion/catalog", version = "53.0.0" }
121+
datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "53.0.0" }
122+
datafusion-common = { path = "datafusion/common", version = "53.0.0", default-features = false }
123+
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "53.0.0" }
124+
datafusion-datasource = { path = "datafusion/datasource", version = "53.0.0", default-features = false }
125+
datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "53.0.0", default-features = false }
126+
datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "53.0.0", default-features = false }
127+
datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "53.0.0", default-features = false }
128+
datafusion-datasource-json = { path = "datafusion/datasource-json", version = "53.0.0", default-features = false }
129+
datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "53.0.0", default-features = false }
130+
datafusion-doc = { path = "datafusion/doc", version = "53.0.0" }
131+
datafusion-execution = { path = "datafusion/execution", version = "53.0.0", default-features = false }
132+
datafusion-expr = { path = "datafusion/expr", version = "53.0.0", default-features = false }
133+
datafusion-expr-common = { path = "datafusion/expr-common", version = "53.0.0" }
134+
datafusion-ffi = { path = "datafusion/ffi", version = "53.0.0" }
135+
datafusion-functions = { path = "datafusion/functions", version = "53.0.0" }
136+
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "53.0.0" }
137+
datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "53.0.0" }
138+
datafusion-functions-nested = { path = "datafusion/functions-nested", version = "53.0.0", default-features = false }
139+
datafusion-functions-table = { path = "datafusion/functions-table", version = "53.0.0" }
140+
datafusion-functions-window = { path = "datafusion/functions-window", version = "53.0.0" }
141+
datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "53.0.0" }
142+
datafusion-macros = { path = "datafusion/macros", version = "53.0.0" }
143+
datafusion-optimizer = { path = "datafusion/optimizer", version = "53.0.0", default-features = false }
144+
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "53.0.0", default-features = false }
145+
datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "53.0.0", default-features = false }
146+
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "53.0.0", default-features = false }
147+
datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "53.0.0" }
148+
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "53.0.0" }
149+
datafusion-proto = { path = "datafusion/proto", version = "53.0.0" }
150+
datafusion-proto-common = { path = "datafusion/proto-common", version = "53.0.0" }
151+
datafusion-pruning = { path = "datafusion/pruning", version = "53.0.0" }
152+
datafusion-session = { path = "datafusion/session", version = "53.0.0" }
153+
datafusion-spark = { path = "datafusion/spark", version = "53.0.0" }
154+
datafusion-sql = { path = "datafusion/sql", version = "53.0.0" }
155+
datafusion-substrait = { path = "datafusion/substrait", version = "53.0.0" }
149156

150157
doc-comment = "0.3"
151158
env_logger = "0.11"

benchmarks/src/smj.rs

Lines changed: 47 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ use futures::StreamExt;
3939
#[derive(Debug, Args, Clone)]
4040
#[command(verbatim_doc_comment)]
4141
pub struct RunOpt {
42-
/// Query number (between 1 and 20). If not specified, runs all queries
42+
/// Query number (between 1 and 23). If not specified, runs all queries
4343
#[arg(short, long)]
4444
query: Option<usize>,
4545

@@ -410,6 +410,52 @@ const SMJ_QUERIES: &[&str] = &[
410410
FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
411411
GROUP BY t1_sorted.key
412412
"#,
413+
// Q21: INNER 10M x 10M | unique keys (1:1) | 50% join filter
414+
r#"
415+
WITH t1_sorted AS (
416+
SELECT value as key, value as data
417+
FROM range(10000000) ORDER BY value
418+
),
419+
t2_sorted AS (
420+
SELECT value as key, value as data
421+
FROM range(10000000) ORDER BY value
422+
)
423+
SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
424+
FROM t1_sorted JOIN t2_sorted
425+
ON t1_sorted.key = t2_sorted.key
426+
AND t1_sorted.data + t2_sorted.data < 10000000
427+
"#,
428+
// Q22: LEFT 10M x 10M | unique keys (1:1) | 50% join filter
429+
r#"
430+
WITH t1_sorted AS (
431+
SELECT value as key, value as data
432+
FROM range(10000000) ORDER BY value
433+
),
434+
t2_sorted AS (
435+
SELECT value as key, value as data
436+
FROM range(10000000) ORDER BY value
437+
)
438+
SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
439+
FROM t1_sorted LEFT JOIN t2_sorted
440+
ON t1_sorted.key = t2_sorted.key
441+
AND t1_sorted.data + t2_sorted.data < 10000000
442+
"#,
443+
// Q23: FULL 10M x 10M | unique keys (1:1) | 50% join filter
444+
r#"
445+
WITH t1_sorted AS (
446+
SELECT value as key, value as data
447+
FROM range(10000000) ORDER BY value
448+
),
449+
t2_sorted AS (
450+
SELECT value as key, value as data
451+
FROM range(10000000) ORDER BY value
452+
)
453+
SELECT t1_sorted.key as k1, t1_sorted.data as d1,
454+
t2_sorted.key as k2, t2_sorted.data as d2
455+
FROM t1_sorted FULL JOIN t2_sorted
456+
ON t1_sorted.key = t2_sorted.key
457+
AND t1_sorted.data + t2_sorted.data < 10000000
458+
"#,
413459
];
414460

415461
impl RunOpt {

datafusion/common/Cargo.toml

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ workspace = true
4141
name = "datafusion_common"
4242

4343
[features]
44-
avro = ["apache-avro"]
4544
backtrace = []
4645
parquet_encryption = [
4746
"parquet",
@@ -66,12 +65,6 @@ harness = false
6665
name = "stats_merge"
6766

6867
[dependencies]
69-
apache-avro = { workspace = true, features = [
70-
"bzip",
71-
"snappy",
72-
"xz",
73-
"zstandard",
74-
], optional = true }
7568
arrow = { workspace = true }
7669
arrow-ipc = { workspace = true }
7770
chrono = { workspace = true }

datafusion/common/src/error.rs

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,6 @@ use std::sync::Arc;
4848
use crate::utils::datafusion_strsim::normalized_levenshtein;
4949
use crate::utils::quote_identifier;
5050
use crate::{Column, DFSchema, Diagnostic, TableReference};
51-
#[cfg(feature = "avro")]
52-
use apache_avro::Error as AvroError;
5351
use arrow::error::ArrowError;
5452
#[cfg(feature = "parquet")]
5553
use parquet::errors::ParquetError;
@@ -76,9 +74,6 @@ pub enum DataFusionError {
7674
/// Error when reading / writing Parquet data.
7775
#[cfg(feature = "parquet")]
7876
ParquetError(Box<ParquetError>),
79-
/// Error when reading Avro data.
80-
#[cfg(feature = "avro")]
81-
AvroError(Box<AvroError>),
8277
/// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile)
8378
#[cfg(feature = "object_store")]
8479
ObjectStore(Box<object_store::Error>),
@@ -332,13 +327,6 @@ impl From<ParquetError> for DataFusionError {
332327
}
333328
}
334329

335-
#[cfg(feature = "avro")]
336-
impl From<AvroError> for DataFusionError {
337-
fn from(e: AvroError) -> Self {
338-
DataFusionError::AvroError(Box::new(e))
339-
}
340-
}
341-
342330
#[cfg(feature = "object_store")]
343331
impl From<object_store::Error> for DataFusionError {
344332
fn from(e: object_store::Error) -> Self {
@@ -389,8 +377,6 @@ impl Error for DataFusionError {
389377
DataFusionError::ArrowError(e, _) => Some(e.as_ref()),
390378
#[cfg(feature = "parquet")]
391379
DataFusionError::ParquetError(e) => Some(e.as_ref()),
392-
#[cfg(feature = "avro")]
393-
DataFusionError::AvroError(e) => Some(e.as_ref()),
394380
#[cfg(feature = "object_store")]
395381
DataFusionError::ObjectStore(e) => Some(e.as_ref()),
396382
DataFusionError::IoError(e) => Some(e),
@@ -520,8 +506,6 @@ impl DataFusionError {
520506
DataFusionError::ArrowError(_, _) => "Arrow error: ",
521507
#[cfg(feature = "parquet")]
522508
DataFusionError::ParquetError(_) => "Parquet error: ",
523-
#[cfg(feature = "avro")]
524-
DataFusionError::AvroError(_) => "Avro error: ",
525509
#[cfg(feature = "object_store")]
526510
DataFusionError::ObjectStore(_) => "Object Store error: ",
527511
DataFusionError::IoError(_) => "IO error: ",
@@ -561,8 +545,6 @@ impl DataFusionError {
561545
}
562546
#[cfg(feature = "parquet")]
563547
DataFusionError::ParquetError(ref desc) => Cow::Owned(desc.to_string()),
564-
#[cfg(feature = "avro")]
565-
DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()),
566548
DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()),
567549
#[cfg(feature = "sql")]
568550
DataFusionError::SQL(ref desc, ref backtrace) => {

datafusion/core/Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ nested_expressions = ["datafusion-functions-nested"]
4343
# This feature is deprecated. Use the `nested_expressions` feature instead.
4444
array_expressions = ["nested_expressions"]
4545
# Used to enable the avro format
46-
avro = ["datafusion-common/avro", "datafusion-datasource-avro"]
46+
avro = ["datafusion-datasource-avro"]
4747
backtrace = ["datafusion-common/backtrace"]
4848
compression = [
4949
"liblzma",
@@ -247,6 +247,11 @@ harness = false
247247
name = "parquet_struct_query"
248248
required-features = ["parquet"]
249249

250+
[[bench]]
251+
harness = false
252+
name = "parquet_struct_projection"
253+
required-features = ["parquet"]
254+
250255
[[bench]]
251256
harness = false
252257
name = "range_and_generate_series"

datafusion/core/benches/aggregate_query_sql.rs

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,39 @@ fn criterion_benchmark(c: &mut Criterion) {
295295
)
296296
})
297297
});
298+
299+
c.bench_function("string_agg_query_group_by_few_groups", |b| {
300+
b.iter(|| {
301+
query(
302+
ctx.clone(),
303+
&rt,
304+
"SELECT u64_narrow, string_agg(utf8, ',') \
305+
FROM t GROUP BY u64_narrow",
306+
)
307+
})
308+
});
309+
310+
c.bench_function("string_agg_query_group_by_mid_groups", |b| {
311+
b.iter(|| {
312+
query(
313+
ctx.clone(),
314+
&rt,
315+
"SELECT u64_mid, string_agg(utf8, ',') \
316+
FROM t GROUP BY u64_mid",
317+
)
318+
})
319+
});
320+
321+
c.bench_function("string_agg_query_group_by_many_groups", |b| {
322+
b.iter(|| {
323+
query(
324+
ctx.clone(),
325+
&rt,
326+
"SELECT u64_wide, string_agg(utf8, ',') \
327+
FROM t GROUP BY u64_wide",
328+
)
329+
})
330+
});
298331
}
299332

300333
criterion_group!(benches, criterion_benchmark);

0 commit comments

Comments
 (0)