diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/agg_mode.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/agg_mode.rs index d42545bd4c2ce..d7f0df7e62195 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/agg_mode.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/agg_mode.rs @@ -16,7 +16,7 @@ use datafusion::physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptim use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode}; use datafusion::physical_plan::expressions::Column; use datafusion::physical_plan::projection::ProjectionExec; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use datafusion_common::Result; #[derive(Clone, Copy, Debug, PartialEq)] @@ -38,14 +38,17 @@ pub(crate) fn physical_optimizer_rules_without_combine( } /// Applies aggregate mode stripping to a physical plan. +/// `has_topk`: when true and stripping to Partial, a Final/FinalPartitioned whose input has more than +/// one partition is replaced with PartialReduce so CSS partitions are merged by group key before the TopK sort truncates. 
pub(crate) fn apply_aggregate_mode( plan: Arc, mode: Mode, + has_topk: bool, ) -> Result> { match mode { Mode::Default => Ok(plan), - Mode::Partial => force_aggregate_mode(plan, AggregateMode::Partial), - Mode::Final => force_aggregate_mode(plan, AggregateMode::Final), + Mode::Partial => force_aggregate_mode(plan, AggregateMode::Partial, has_topk), + Mode::Final => force_aggregate_mode(plan, AggregateMode::Final, false), } } @@ -59,6 +62,7 @@ pub(crate) fn partial_aggregate_schema(plan: &Arc) -> Option< fn force_aggregate_mode( plan: Arc, target: AggregateMode, + has_topk: bool, ) -> Result> { if let Some(agg) = plan.downcast_ref::() { // Treat `FinalPartitioned` as `Final`: DataFusion picks `FinalPartitioned` for @@ -71,32 +75,47 @@ fn force_aggregate_mode( let new_children: Vec> = agg .children() .into_iter() - .map(|c| force_aggregate_mode(Arc::clone(c), target)) + .map(|c| force_aggregate_mode(Arc::clone(c), target, has_topk)) .collect::>()?; return plan.with_new_children(new_children); } // Mode mismatch — strip this node match target { AggregateMode::Partial => { - // Current node is Final; find the Partial subtree below + // Current node is Final/FinalPartitioned. + // When TopK is active and the input has multiple partitions (CSS), replace + // with PartialReduce instead of stripping. PartialReduce keeps agg.input() + // (RepartitionExec(Hash) → Partial(×N)) so CSS partitions are merged by + // group key before TopK truncation. Skip when input_partitions=1 — PartialReduce + // over a single partition is redundant and adds unnecessary overhead. 
+ if has_topk && agg.input().output_partitioning().partition_count() > 1 { + return Ok(Arc::new(AggregateExec::try_new( + AggregateMode::PartialReduce, + agg.group_expr().clone(), + agg.aggr_expr().to_vec(), + agg.filter_expr().to_vec(), + Arc::clone(agg.input()), + agg.input_schema(), + )?)); + } + // Normal path: strip Final, return Partial subtree if let Some(partial_subtree) = find_partial_input(Arc::clone(agg.input())) { return Ok(partial_subtree); } - // If no Partial found below, the input itself is the Partial Ok(Arc::clone(agg.input())) } AggregateMode::Final => { // Current node is Partial; skip it, return its child // (the Final above will keep itself) let child = agg.children()[0]; - force_aggregate_mode(Arc::clone(child), target) + force_aggregate_mode(Arc::clone(child), target, false) } _ => Ok(plan), } } else if plan.children().len() == 1 { // Single-input wrapper — recurse transparently. let old_child = Arc::clone(plan.children()[0]); - let new_child = force_aggregate_mode(old_child.clone(), target)?; + let new_child = force_aggregate_mode(old_child.clone(), target, has_topk)?; // DataFusion's ProjectionMapping::try_new asserts col.name() == input_schema.field(i).name(); // with_new_children triggers it. Remap columns to the post-strip schema so it passes. 
@@ -235,7 +254,7 @@ mod tests { plan_string(&plan) ); - let result = apply_aggregate_mode(plan, Mode::Partial).unwrap(); + let result = apply_aggregate_mode(plan, Mode::Partial, false).unwrap(); let result_modes = find_agg_modes(&result); assert!( result_modes.contains(&AggregateMode::Partial), @@ -253,7 +272,7 @@ mod tests { async fn test_strip_final_over_scan() { // Final(Partial(memtable)) → strip to Final only (Partial removed) let plan = make_agg_plan().await; - let result = apply_aggregate_mode(plan, Mode::Final).unwrap(); + let result = apply_aggregate_mode(plan, Mode::Final, false).unwrap(); let result_modes = find_agg_modes(&result); assert!( result_modes.contains(&AggregateMode::Final), @@ -276,13 +295,13 @@ mod tests { let modes = find_agg_modes(&plan); if modes.len() < 2 { // If optimizer collapsed it, just verify Mode::Partial works - let result = apply_aggregate_mode(plan, Mode::Partial).unwrap(); + let result = apply_aggregate_mode(plan, Mode::Partial, false).unwrap(); let result_modes = find_agg_modes(&result); assert!(!result_modes.contains(&AggregateMode::Final)); return; } - let result = apply_aggregate_mode(plan, Mode::Partial).unwrap(); + let result = apply_aggregate_mode(plan, Mode::Partial, false).unwrap(); let result_modes = find_agg_modes(&result); assert!( !result_modes.contains(&AggregateMode::Final), @@ -297,7 +316,7 @@ mod tests { // Final → CoalescePartitions → Partial → scan; strip to Final let plan = make_agg_plan().await; // The simple plan has CoalescePartitions between Final and Partial - let result = apply_aggregate_mode(plan, Mode::Final).unwrap(); + let result = apply_aggregate_mode(plan, Mode::Final, false).unwrap(); let result_modes = find_agg_modes(&result); assert!( !result_modes.contains(&AggregateMode::Partial), @@ -332,10 +351,35 @@ mod tests { assert!(display_before.contains("AggregateExec: mode=Final"), "expected Final in plan"); assert!(display_before.contains("AggregateExec: mode=Partial"), "expected Partial in 
plan"); - let stripped = apply_aggregate_mode(plan, Mode::Partial).unwrap(); + let stripped = apply_aggregate_mode(plan, Mode::Partial, false).unwrap(); let display_after = plan_string(&stripped); assert!(!display_after.contains("mode=Final"), "Final should be stripped"); assert!(display_after.contains("mode=Partial"), "Partial should remain"); } + /// When has_topk=true and the input has multiple partitions (CSS), Final/FinalPartitioned + /// must be replaced with PartialReduce rather than stripped, so the coordinator receives + /// correctly merged partial state instead of per-partition-truncated results. + #[tokio::test] + async fn test_apply_partial_with_topk_produces_partial_reduce() { + let plan = make_agg_plan_with_repartition().await; + let display_before = plan_string(&plan); + // With target_partitions=4 and GROUP BY, DF produces FinalPartitioned. + assert!( + display_before.contains("mode=FinalPartitioned") || display_before.contains("mode=Final"), + "expected Final/FinalPartitioned in multi-partition plan, got:\n{display_before}" + ); + + let result = apply_aggregate_mode(plan, Mode::Partial, true).unwrap(); + let modes = find_agg_modes(&result); + assert!( + modes.contains(&AggregateMode::PartialReduce), + "has_topk=true with multi-partition input must produce PartialReduce, got modes: {modes:?}" + ); + assert!( + !modes.contains(&AggregateMode::Final) && !modes.contains(&AggregateMode::FinalPartitioned), + "Final/FinalPartitioned must not remain after stripping" + ); + } + } diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs index 5ec148c0e8ff8..9d21b6d5f40ca 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs @@ -132,6 +132,7 @@ pub async fn execute_indexed_query( query_config: Arc::unwrap_or_clone(query_config), io_handle: 
tokio::runtime::Handle::current(), aggregate_mode: crate::agg_mode::Mode::Default, + has_topk: false, prepared_plan: None, phantom_reservation: None, }; @@ -1331,7 +1332,7 @@ async unsafe fn execute_indexed_with_context_inner( // Apply aggregate mode stripping when prepare_partial_plan was called (engine-native-merge). // This makes the indexed executor produce Binary HLL state (Partial) instead of Int64 (Final). let physical_plan = if aggregate_mode != crate::agg_mode::Mode::Default { - crate::agg_mode::apply_aggregate_mode(physical_plan, aggregate_mode)? + crate::agg_mode::apply_aggregate_mode(physical_plan, aggregate_mode, handle.has_topk)? } else { physical_plan }; diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/local_executor.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/local_executor.rs index a59e2ec56d28f..89756519380ed 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/local_executor.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/local_executor.rs @@ -230,6 +230,7 @@ impl LocalSession { let stripped = crate::agg_mode::apply_aggregate_mode( physical_plan, crate::agg_mode::Mode::Final, + false, )?; let target_schema = crate::schema_coerce::coerce_inferred_schema(stripped.schema()); diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs index 5f99b8ccbd06b..30f637c759bba 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs @@ -24,6 +24,7 @@ use datafusion::{ execution::memory_pool::MemoryPool, execution::runtime_env::RuntimeEnvBuilder, execution::SessionStateBuilder, + physical_plan::ExecutionPlan, prelude::*, }; use log::error; @@ -62,6 +63,10 @@ pub struct SessionContextHandle { pub io_handle: tokio::runtime::Handle, /// Aggregate execution mode for distributed partial/final 
stripping. pub(crate) aggregate_mode: crate::agg_mode::Mode, + /// True when the shard Substrait fragment contains a FetchRel (Sort+Limit = TopK). + /// Detected once in `create_session_context` from plan_bytes and reused in + /// `prepare_partial_plan` to apply PartialReduce for CSS correctness. + pub(crate) has_topk: bool, /// Pre-prepared physical plan (set by prepare_partial_plan / prepare_final_plan). pub(crate) prepared_plan: Option>, /// Phantom reservation holding pool capacity for untracked memory. @@ -199,8 +204,15 @@ pub async unsafe fn create_session_context( let phantom = phantom_reservation.map(|b| b.phantom_reservation); let mut config = SessionConfig::new(); + // Detect TopK once from the Substrait bytes: a FetchRel (Sort+Limit) in a partial-agg + // fragment means OpenSearchTopKRewriter fired. Stored on the handle so prepare_partial_plan + // can apply PartialReduce without re-scanning the physical plan. + let has_topk = has_partial_aggregate && substrait_has_fetch_rel(plan_bytes); config.options_mut().execution.parquet.pushdown_filters = query_config.listing_table_pushdown_filters; - if has_partial_aggregate { + // Disable DataFusion's adaptive skip-partial-aggregation when TopK is active. + // If DF abandons partial agg midstream, the partial state sent to the coordinator is + // incomplete — TopK sees wrong group counts and produces incorrect results. 
+ if has_topk { config.options_mut().execution.skip_partial_aggregation_probe_ratio_threshold = 1.0; } config.options_mut().execution.target_partitions = effective_partitions; @@ -378,6 +390,7 @@ pub async unsafe fn create_session_context( query_config, io_handle: tokio::runtime::Handle::current(), aggregate_mode: crate::agg_mode::Mode::Default, + has_topk, prepared_plan: None, phantom_reservation: phantom, }; @@ -448,13 +461,14 @@ pub async fn prepare_partial_plan( let logical_plan = from_substrait_plan(&handle.ctx.state(), &plan).await?; let dataframe = handle.ctx.execute_logical_plan(logical_plan).await?; let physical_plan = dataframe.create_physical_plan().await?; + // Strip first on the raw physical plan so `force_aggregate_mode(Partial)` can find the // Final/Partial pair without a RelabelExec wrapper at the root pre-empting the walk. // Then derive `target_schema` and wrap with RelabelExec from the stripped plan's actual // output (state-suffixed Binary for HLL Partial vs. Int64 cardinality for Final.evaluate) // — otherwise RelabelExec would carry the pre-strip type tag (e.g. Int64) and fail with // "non-bit-compatible types: Binary → Int64" when wrapping the stripped Partial. - let stripped = crate::agg_mode::apply_aggregate_mode(physical_plan, crate::agg_mode::Mode::Partial)?; + let stripped = crate::agg_mode::apply_aggregate_mode(physical_plan, crate::agg_mode::Mode::Partial, handle.has_topk)?; let target_schema = crate::schema_coerce::coerce_inferred_schema(stripped.schema()); let stripped = crate::relabel_exec::wrap_if_relabel_needed(stripped, target_schema)?; @@ -462,6 +476,62 @@ pub async fn prepare_partial_plan( Ok(()) } + +/// Returns true if the Substrait plan bytes contain a FetchRel (Sort+Limit node). +/// A FetchRel in a shard fragment means `OpenSearchTopKRewriter` inserted a per-shard +/// Sort+Limit — TopK is active. 
Used in `create_session_context` to detect TopK before +/// the DataFusion physical plan is built, so the result can be stored on the handle and +/// reused in `prepare_partial_plan` without re-scanning the physical plan. +/// +/// Single-shard (SINGLE aggregate mode) never has `has_partial_aggregate=true` so this +/// function is only called for multi-shard partial-aggregate fragments. +/// +/// # Upgrade path note +/// This detection avoids adding a new boolean field to the Java→Rust FFI surface +/// (which would break wire compatibility with older nodes during rolling upgrades — +/// old coordinators serialising `PartialAggregateInstructionNode` without the field +/// would be misread by new data nodes). The Substrait plan bytes are already part of +/// the existing wire contract and do not change format. +/// +/// TODO: Once AnalyticsCore supports a versioned flag/hint mechanism, replace this +/// Substrait scan with an explicit flag passed through the instruction pipeline. +/// That would be cleaner and avoid re-parsing the plan bytes, but requires a +/// backward-compatible flag delivery path that does not exist today. +fn substrait_has_fetch_rel(plan_bytes: &[u8]) -> bool { + use prost::Message; + use substrait::proto::rel::RelType; + + fn rel_has_fetch(rel: &substrait::proto::Rel) -> bool { + match rel.rel_type.as_ref() { + Some(RelType::Fetch(f)) => f.count_mode.is_some(), + Some(RelType::Sort(s)) => s.input.as_ref().map_or(false, |r| rel_has_fetch(r)), + Some(RelType::Project(p)) => p.input.as_ref().map_or(false, |r| rel_has_fetch(r)), + Some(RelType::Filter(f)) => f.input.as_ref().map_or(false, |r| rel_has_fetch(r)), + Some(RelType::Aggregate(a)) => a.input.as_ref().map_or(false, |r| rel_has_fetch(r)), + // TODO: enumerate remaining rel types explicitly and panic on unknown ones. 
+ Some(other) => { + native_bridge_common::log_debug!( + "substrait_has_fetch_rel: {:?} — no TopK fetch", + std::mem::discriminant(other) + ); + false + } + None => false, + } + } + + let Ok(plan) = substrait::proto::Plan::decode(plan_bytes) else { return false; }; + plan.relations.iter().any(|pr| { + match pr.rel_type.as_ref() { + Some(substrait::proto::plan_rel::RelType::Root(rr)) => { + rr.input.as_ref().map_or(false, |r| rel_has_fetch(r)) + } + Some(substrait::proto::plan_rel::RelType::Rel(r)) => rel_has_fetch(r), + None => false, + } + }) +} + /// Attempt to acquire a memory budget using cached parquet metadata. /// Returns None on cache miss or if the budget system is not configured. fn try_acquire_budget( @@ -679,6 +749,7 @@ mod tests { query_config: crate::datafusion_query_config::DatafusionQueryConfig::test_default(), io_handle: tokio::runtime::Handle::current(), aggregate_mode: Mode::Default, + has_topk: false, prepared_plan: None, phantom_reservation: None, }; @@ -832,28 +903,158 @@ mod tests { } #[test] - fn test_skip_partial_agg_disabled_when_has_partial_aggregate() { - // When has_partial_aggregate=true, skip_partial must be disabled (threshold=1.0) + fn test_skip_partial_agg_disabled_when_has_topk() { + // skip_partial must be disabled (1.0) when TopK is active — if DF abandons partial + // agg midstream the partial state is incomplete and TopK sees wrong group counts. 
let mut config = SessionConfig::new(); - let has_partial = true; - if has_partial { + let has_topk = true; + if has_topk { config.options_mut().execution.skip_partial_aggregation_probe_ratio_threshold = 1.0; } assert_eq!( config.options().execution.skip_partial_aggregation_probe_ratio_threshold, 1.0, - "skip_partial must be disabled (1.0) for multi-shard" + "skip_partial must be disabled (1.0) when TopK is active" ); } #[test] - fn test_skip_partial_agg_default_when_single_shard() { - // When has_partial_aggregate=false, skip_partial retains DF default (0.8) + fn test_skip_partial_agg_default_when_no_topk() { + // When has_topk=false, skip_partial retains DF default (0.8) — no perf regression + // for non-TopK multi-shard queries. let config = SessionConfig::new(); assert_eq!( config.options().execution.skip_partial_aggregation_probe_ratio_threshold, 0.8, - "single-shard must retain DF default threshold" + "non-TopK queries must retain DF default threshold" ); } + + #[test] + fn test_substrait_has_fetch_rel_empty() { + assert!(!substrait_has_fetch_rel(&[]), "empty bytes → false"); + } + + #[test] + fn test_substrait_has_fetch_rel_with_fetch() { + use prost::Message; + use substrait::proto::expression::literal::LiteralType; + use substrait::proto::expression::{Literal, RexType}; + use substrait::proto::rel::RelType; + use substrait::proto::{Expression, FetchRel, Plan, PlanRel, Rel, SortRel, fetch_rel, plan_rel}; + + // Build: FetchRel(count=10) wrapping SortRel — same as what DataFusion Substrait + // producer emits for Sort(fetch=10, ...) from OpenSearchTopKRewriter. 
+ let sort_rel = Box::new(Rel { + rel_type: Some(RelType::Sort(Box::new(SortRel { + common: None, + input: None, + sorts: vec![], + advanced_extension: None, + }))), + }); + let fetch_rel = Box::new(Rel { + rel_type: Some(RelType::Fetch(Box::new(FetchRel { + common: None, + input: Some(sort_rel), + offset_mode: None, + count_mode: Some(fetch_rel::CountMode::CountExpr(Box::new(Expression { + rex_type: Some(RexType::Literal(Literal { + nullable: false, + type_variation_reference: 0, + literal_type: Some(LiteralType::I64(10)), + })), + }))), + advanced_extension: None, + }))), + }); + let plan = Plan { + relations: vec![PlanRel { + rel_type: Some(plan_rel::RelType::Rel(*fetch_rel)), + }], + ..Default::default() + }; + let bytes = plan.encode_to_vec(); + assert!(substrait_has_fetch_rel(&bytes), "FetchRel(count=10) → true"); + } + + #[test] + fn test_substrait_has_fetch_rel_with_fetch_no_count_mode() { + use prost::Message; + use substrait::proto::rel::RelType; + use substrait::proto::{FetchRel, Plan, PlanRel, Rel, plan_rel}; + + // FetchRel exists but count_mode is None — not a real limit, should not trigger TopK. 
+ let fetch_rel = Box::new(Rel { + rel_type: Some(RelType::Fetch(Box::new(FetchRel { + common: None, + input: None, + offset_mode: None, + count_mode: None, + advanced_extension: None, + }))), + }); + let plan = Plan { + relations: vec![PlanRel { + rel_type: Some(plan_rel::RelType::Rel(*fetch_rel)), + }], + ..Default::default() + }; + let bytes = plan.encode_to_vec(); + assert!(!substrait_has_fetch_rel(&bytes), "FetchRel without count_mode → false"); + } + + #[test] + fn test_substrait_has_fetch_rel_without_fetch() { + use prost::Message; + use substrait::proto::rel::RelType; + use substrait::proto::{Plan, PlanRel, Rel, SortRel, plan_rel}; + + // Sort without fetch → no FetchRel → false + let sort_rel = Box::new(Rel { + rel_type: Some(RelType::Sort(Box::new(SortRel { + common: None, + input: None, + sorts: vec![], + advanced_extension: None, + }))), + }); + let plan = Plan { + relations: vec![PlanRel { + rel_type: Some(plan_rel::RelType::Rel(*sort_rel)), + }], + ..Default::default() + }; + let bytes = plan.encode_to_vec(); + assert!(!substrait_has_fetch_rel(&bytes), "SortRel without FetchRel → false"); + } + + /// A Join rel at the root — exercises the `Some(other)` arm that logs and returns false. + /// Shard fragments never have Join above a TopK FetchRel, so this correctly returns false. 
+ #[test] + fn test_substrait_has_fetch_rel_join_returns_false() { + use prost::Message; + use substrait::proto::rel::RelType; + use substrait::proto::{JoinRel, Plan, PlanRel, Rel, plan_rel}; + + let join_rel = Box::new(Rel { + rel_type: Some(RelType::Join(Box::new(JoinRel { + common: None, + left: None, + right: None, + r#type: 0, + expression: None, + post_join_filter: None, + advanced_extension: None, + }))), + }); + let plan = Plan { + relations: vec![PlanRel { + rel_type: Some(plan_rel::RelType::Rel(*join_rel)), + }], + ..Default::default() + }; + let bytes = plan.encode_to_vec(); + assert!(!substrait_has_fetch_rel(&bytes), "Join rel → false (no TopK in shard fragment with Join)"); + } } diff --git a/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchTopKRewriter.java b/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchTopKRewriter.java index 9c0c1d16d3d8a..be18e68c83e02 100644 --- a/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchTopKRewriter.java +++ b/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchTopKRewriter.java @@ -17,6 +17,7 @@ import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; import org.apache.calcite.sql.type.SqlTypeName; import org.opensearch.analytics.planner.PlannerContext; import org.opensearch.analytics.planner.rel.AggregateMode; @@ -229,8 +230,19 @@ private static PathToFinal findFinalAgg(RelNode node, OpenSearchProject seenProj if (node instanceof OpenSearchAggregate agg && agg.getMode() == AggregateMode.FINAL) { return new PathToFinal(seenProject, agg); } - if (node instanceof OpenSearchProject proj && seenProject == null) { - return findFinalAgg(proj.getInput(), proj); + // Anything between the Sort and the FINAL that consumes its full grouped 
output makes + // the pushdown unsafe — refuse to match at all. + // TODO: nested stats — re-enable once TopK oversampling factor is an execution hint + // so the inner agg can over-fetch enough groups for outer-agg correctness. + if (node instanceof OpenSearchAggregate) return null; // nested stats + if (node instanceof OpenSearchProject proj) { + if (proj.getProjects().stream().anyMatch(RexOver::containsOver)) return null; // window fn + // Capture the first Project for sort-key remapping; pass through subsequent Projects. + // Only the first Project (seenProject) is used for collation remapping in rewrite() — + // subsequent plain-column Projects are transparent. rewrite() then validates each sort + // field maps through seenProject as a RexInputRef; computed expressions (AVG division, + // etc.) cause rewrite() to bail, so they are safely rejected even if passed through here. + return findFinalAgg(proj.getInput(), seenProject == null ? proj : seenProject); } if (node.getInputs().size() == 1) return findFinalAgg(node.getInputs().get(0), seenProject); return null; diff --git a/sandbox/plugins/analytics-engine/src/test/java/org/opensearch/analytics/planner/TopKRewriterPlanShapeTests.java b/sandbox/plugins/analytics-engine/src/test/java/org/opensearch/analytics/planner/TopKRewriterPlanShapeTests.java index 184ddbd5b1456..73796cbb24a39 100644 --- a/sandbox/plugins/analytics-engine/src/test/java/org/opensearch/analytics/planner/TopKRewriterPlanShapeTests.java +++ b/sandbox/plugins/analytics-engine/src/test/java/org/opensearch/analytics/planner/TopKRewriterPlanShapeTests.java @@ -485,6 +485,57 @@ public void testRewrite_pplShape_sortByGroupKey_remapsCorrectly() { ); } + // ── Detection: chained stats (nested aggregation) must NOT get TopK ───────── + + /** + * PPL: {@code stats count() as c by X, Y | stats sum(c) as total by X | sort - total | head 5} + * The outer aggregate's PARTIAL input subtree contains another aggregate, so TopK must bail. 
+ * TopK on the inner agg would truncate (X, Y) groups before the outer sum sees all of them, + * producing catastrophically wrong totals. + */ + public void testDetection_chainedStats_topKBails() { + RelOptTable table = mockTable("test_index", "status", "size"); + RelNode scan = stubScan(table); + + // Inner agg: count() by (status, size) + LogicalAggregate innerAgg = LogicalAggregate.create(scan, List.of(), ImmutableBitSet.of(0, 1), null, List.of(countStarCall())); + + // Outer agg: sum(count) by status — groups over the inner agg result + LogicalAggregate outerAgg = LogicalAggregate.create( + innerAgg, + List.of(), + ImmutableBitSet.of(0), + null, + List.of( + AggregateCall.create( + SqlStdOperatorTable.SUM, + false, + false, + false, + List.of(), + List.of(2), + -1, + null, + RelCollations.EMPTY, + typeFactory.createSqlType(SqlTypeName.BIGINT), + "total" + ) + ) + ); + + // Sort on total DESC, head 5 + RelNode sort = LogicalSort.create( + outerAgg, + RelCollations.of(new RelFieldCollation(1, RelFieldCollation.Direction.DESCENDING)), + null, + rexBuilder.makeLiteral(5, typeFactory.createSqlType(SqlTypeName.INTEGER), true) + ); + + RelNode result = runPlanner(sort, contextWithOversampling(2.0)); + String plan = RelOptUtil.toString(result); + assertEquals("chained stats — TopK must not insert a shard Sort", 0, countShardSortsBelowER(plan)); + } + // ── Detection: AVG does NOT get TopK (reduce decomposition inserts computed Project) ── /** AVG is decomposed into SUM/COUNT with a divide Project — rewriter bails. */ @@ -504,10 +555,11 @@ public void testDetection_avgByGroup_noTopK() { } /** - * Multiple adjacent Projects between Sort and Aggregate: if PROJECT_MERGE is ever removed, - * the rewriter should still work (captures only the first Project, skips remapping for the - * second). This test verifies TopK still fires — sort key passes through un-remapped since - * the second Project is not captured. 
+ * Multiple adjacent Projects between Sort and Aggregate: PROJECT_MERGE collapses them during + * RBO so TopK normally fires. If for any reason two projects survive (PROJECT_MERGE removed or + * blocked), the rewriter passes through the additional plain-column Projects and remaps the sort + * key via the first Project only; it bails only when a Project contains a window function. + * This test verifies that TopK still fires when multiple plain-column projects are present. */ public void testDetection_multipleProjects_topKStillFires() { RelOptTable table = mockTable("test_index", "status", "size"); @@ -538,7 +590,10 @@ public void testDetection_multipleProjects_topKStillFires() { RelNode result = runPlanner(sort, contextWithOversampling(2.0)); String plan = RelOptUtil.toString(result); long sortCount = plan.lines().filter(l -> l.contains("OpenSearchSort")).count(); - assertTrue("TopK should still fire with multiple projects (PROJECT_MERGE collapses them)", sortCount >= 2); + // PROJECT_MERGE collapses the two adjacent identity projects, so TopK fires. + // Even without PROJECT_MERGE, the rewriter passes through multiple plain-column projects + // and validates the sort key at the first seenProject — TopK still fires correctly. + assertTrue("TopK should fire with multiple plain-column projects", sortCount >= 2); } /** Computed expression (literal) in Project between Sort and Aggregate — rewriter bails. 
*/ diff --git a/sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/TopKCssCorrectnessIT.java b/sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/TopKCssCorrectnessIT.java new file mode 100644 index 0000000000000..5f3936a684eac --- /dev/null +++ b/sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/TopKCssCorrectnessIT.java @@ -0,0 +1,299 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analytics.qa; + +import org.opensearch.client.Request; +import org.opensearch.client.Response; + +import java.util.List; +import java.util.Map; + +/** + * Regression tests for TopK correctness when concurrent segment search (CSS) is active. + * + *

Before the PartialReduce fix, CSS caused each intra-shard partition to independently + * truncate to the TopK fetch limit before the coordinator merge, producing wrong counts. + * Each test runs the same query with CSS off (reference) and CSS on (subject) and asserts + * the results are identical. + * + *

Covers 13 aggregate shapes identified by Aniketh Jain across count, sum, avg, min/max, + * distinct_count, stddev/variance, percentile, offset, scalar agg, and permutation variants. + */ +@SuppressWarnings("unchecked") +public class TopKCssCorrectnessIT extends AnalyticsRestTestCase { + + private static volatile boolean provisioned = false; + private static final String INDEX = "parquet_hits"; + + private void ensureProvisioned() throws Exception { + if (!provisioned) { + // MULTI_SEGMENT (2 segments/shard) + low oversampling makes the CSS truncation + // bug reproducible on the local test cluster — each CSS partition independently + // truncates to a very small fetch limit, producing wrong results without the fix. + DatasetProvisioner.provision(client(), ClickBenchTestHelper.DATASET, 2, DatasetProvisioner.SegmentLayout.MULTI_SEGMENT); + Request req = new Request("PUT", "/_cluster/settings"); + req.setJsonEntity( + "{\"persistent\":{\"analytics.shard_bucket_oversampling_factor\": 0.1}}" + ); + client().performRequest(req); + provisioned = true; + } + } + + // ── case-01: multi-key, count/sum/avg, != filter ────────────────────────── + + public void testCase01_multiKeyCountSumAvg_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | where SearchPhrase != ''" + + " | stats count() as c, sum(IsRefresh), avg(ResolutionWidth)" + + " by SearchEngineID, ClientIP" + + " | sort - c, SearchEngineID, ClientIP | head 10" + ); + } + + // ── case-02: single-key count ──────────────────────────────────────────── + + public void testCase02_singleKeyCount_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats count() as c by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── case-03: distinct_count (HLL) ──────────────────────────────────────── + + public void testCase03_distinctCount_cssMatchesNoCss() throws Exception { + 
ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats distinct_count(ClientIP) as dc by SearchEngineID" + + " | sort - dc, SearchEngineID | head 2" + ); + } + + // ── case-04: stddev / variance ─────────────────────────────────────────── + + public void testCase04_stddevVariance_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats stddev_samp(ResolutionWidth) as sd," + + " var_samp(ResolutionWidth) as vs," + + " var_pop(ResolutionWidth) as vp" + + " by SearchEngineID | sort SearchEngineID | head 10" + ); + } + + // ── case-05: scalar aggregate (no group-by, no TopK) ───────────────────── + + public void testCase05_scalarSums_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats sum(ResolutionWidth)," + + " sum(ResolutionWidth+1)," + + " sum(ResolutionWidth+2)," + + " count()" + ); + } + + // ── case-06: offset + limit ─────────────────────────────────────────────── + + public void testCase06_offsetLimit_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats count() as c by SearchEngineID" + + " | sort - c, SearchEngineID | head 2 from 1" + ); + } + + // ── case-07: min / max ──────────────────────────────────────────────────── + + public void testCase07_minMax_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats min(ResolutionWidth) as mn," + + " max(ResolutionWidth) as mx," + + " count() as c by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── case-08: avg + sum ──────────────────────────────────────────────────── + + public void testCase08_avgSum_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + // Sort by SearchEngineID (deterministic key, not count) to avoid tie-breaking flakiness. 
+ assertCssMatchesNoCss( + "source = " + INDEX + + " | stats avg(ResolutionWidth) as a," + + " sum(ResolutionWidth) as s," + + " count() as c by SearchEngineID" + + " | sort SearchEngineID | head 5" + ); + } + + // ── case-09a: agg permutation (count, sum, avg, min, max) ──────────────── + + public void testCase09a_permutation1_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats count() as c," + + " sum(IsRefresh) as si," + + " avg(ResolutionWidth) as a," + + " min(ResolutionWidth) as mn," + + " max(ResolutionWidth) as mx by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── case-09b: agg permutation (max, avg, count, min, sum) ──────────────── + + public void testCase09b_permutation2_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats max(ResolutionWidth) as mx," + + " avg(ResolutionWidth) as a," + + " count() as c," + + " min(ResolutionWidth) as mn," + + " sum(IsRefresh) as si by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── case-09c: agg permutation (avg, min, sum, max, count) ──────────────── + + public void testCase09c_permutation3_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats avg(ResolutionWidth) as a," + + " min(ResolutionWidth) as mn," + + " sum(IsRefresh) as si," + + " max(ResolutionWidth) as mx," + + " count() as c by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── case-10: no aliases ─────────────────────────────────────────────────── + + public void testCase10_noAliases_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats count(), sum(ResolutionWidth)," + + " avg(ResolutionWidth)," + + " min(ResolutionWidth)," + + " max(ResolutionWidth) by SearchEngineID" + + " | sort SearchEngineID | head 
5" + ); + } + + // ── case-11: many aggs on same column ──────────────────────────────────── + + public void testCase11_manyAggsOnSameColumn_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats sum(ResolutionWidth)," + + " avg(ResolutionWidth)," + + " min(ResolutionWidth)," + + " max(ResolutionWidth)," + + " count(ResolutionWidth) by SearchEngineID" + + " | sort SearchEngineID | head 5" + ); + } + + // ── case-12: percentile ─────────────────────────────────────────────────── + + public void testCase12_percentile_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats percentile(ResolutionWidth, 50) as p50," + + " percentile(ResolutionWidth, 95) as p95 by SearchEngineID" + + " | sort SearchEngineID | head 5" + ); + } + + // ── case-13: mixed split + non-split (count/sum + percentile) ──────────── + + public void testCase13_mixedSplitAndNonSplit_cssMatchesNoCss() throws Exception { + ensureProvisioned(); + assertCssMatchesNoCss( + "source = " + INDEX + + " | stats count() as c," + + " sum(ResolutionWidth) as s," + + " percentile(ResolutionWidth, 50) as p50 by SearchEngineID" + + " | sort - c, SearchEngineID | head 2" + ); + } + + // ── Helpers ─────────────────────────────────────────────────────────────── + + /** + * Runs {@code ppl} with CSS off, then with CSS on (4 slices), and asserts the + * result rows are identical. Restores CSS-off after the check. 
+ */ + private void assertCssMatchesNoCss(String ppl) throws Exception { + setCss("none", 0); + List> reference = rowsOf(executePPL(ppl)); + + setCss("all", 4); + List> withCss = rowsOf(executePPL(ppl)); + + assertEquals( + "CSS result differs from no-CSS reference for query: " + ppl, + reference, + withCss + ); + + setCss("none", 0); + } + + private void setCss(String mode, int sliceCount) throws Exception { + Request req = new Request("PUT", "/_cluster/settings"); + if (sliceCount > 0) { + req.setJsonEntity( + "{\"transient\":{\"search.concurrent_segment_search.mode\":\"" + + mode + + "\",\"search.concurrent.max_slice_count\":" + + sliceCount + + "}}" + ); + } else { + req.setJsonEntity( + "{\"transient\":{\"search.concurrent_segment_search.mode\":\"" + mode + "\"}}" + ); + } + client().performRequest(req); + } + + private Map executePPL(String ppl) throws Exception { + Request request = new Request("POST", "/_analytics/ppl"); + request.setJsonEntity("{\"query\": \"" + ppl + "\"}"); + Response response = client().performRequest(request); + return entityAsMap(response); + } + + private List> rowsOf(Map result) { + List rows = (List) result.get("rows"); + assertNotNull("response must have rows, got: " + result.keySet(), rows); + return (List>) rows; + } +} diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q10.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q10.plan.yaml index 6170ced6eb4fd..6a429ac754da8 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q10.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q10.plan.yaml @@ -30,12 +30,22 @@ plans: OpenSearchAggregate(group=[{0}], sum(AdvEngineID)=[SUM($1)], c=[SUM($2)], $f3=[SUM($3)], $f4=[SUM($4)], dc(UserID)=[APPROX_COUNT_DISTINCT($5)], mode=[FINAL], viableBackends=[[datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], 
exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[RegionID@0 as RegionID, sum(.AdvEngineID)[sum]@1 as sum(AdvEngineID), count(Int64(1))[count]@2 as c, sum(.ResolutionWidth)[sum]@3 as $f3, count(.ResolutionWidth)[count]@4 as $f4, approx_distinct(.UserID)[hll_registers]@5 as dc(UserID)] + SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, RegionID@0 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, RegionID@0 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[RegionID@0 as RegionID], aggr=[sum(.AdvEngineID), count(Int64(1)), sum(.ResolutionWidth), count(.ResolutionWidth), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[sum(.AdvEngineID), count(Int64(1)), sum(.ResolutionWidth), count(.ResolutionWidth), approx_distinct(.UserID)] + DataSourceExec: file_groups={}, projection=[RegionID, AdvEngineID, ResolutionWidth, UserID], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[RegionID@0 as RegionID, sum(.AdvEngineID)[sum]@1 as sum(AdvEngineID), count(Int64(1))[count]@2 as c, sum(.ResolutionWidth)[sum]@3 as $f3, count(.ResolutionWidth)[count]@4 as $f4, approx_distinct(.UserID)[hll_registers]@5 as dc(UserID)] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, RegionID@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, RegionID@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[sum(.AdvEngineID), count(Int64(1)), sum(.ResolutionWidth), count(.ResolutionWidth), approx_distinct(.UserID)] - DataSourceExec: file_groups={}, projection=[RegionID, AdvEngineID, ResolutionWidth, UserID], file_type=parquet + AggregateExec: 
mode=PartialReduce, gby=[RegionID@0 as RegionID], aggr=[sum(.AdvEngineID), count(Int64(1)), sum(.ResolutionWidth), count(.ResolutionWidth), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[sum(.AdvEngineID), count(Int64(1)), sum(.ResolutionWidth), count(.ResolutionWidth), approx_distinct(.UserID)] + DataSourceExec: file_groups={}, projection=[RegionID, AdvEngineID, ResolutionWidth, UserID], file_type=parquet prod1s: post_cbo: | OpenSearchSort(sort0=[$1], sort1=[$4], dir0=[DESC-nulls-last], dir1=[ASC-nulls-first], fetch=[10000], viableBackends=[[datafusion]]) diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q11.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q11.plan.yaml index 541366637d238..f9e3a34107fd0 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q11.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q11.plan.yaml @@ -39,19 +39,23 @@ plans: SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], preserve_partitioning=[true] ProjectionExec: expr=[MobilePhoneModel@0 as MobilePhoneModel, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] - FilterExec: MobilePhoneModel@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != 
, pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] + AggregateExec: mode=PartialReduce, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([MobilePhoneModel@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + FilterExec: MobilePhoneModel@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] shard_physical_nseg: | ProjectionExec: expr=[MobilePhoneModel@0 as MobilePhoneModel, approx_distinct(.UserID)@1 as u] SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], preserve_partitioning=[true] ProjectionExec: expr=[MobilePhoneModel@0 as MobilePhoneModel, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] - FilterExec: MobilePhoneModel@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != 
MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] + AggregateExec: mode=PartialReduce, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([MobilePhoneModel@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + FilterExec: MobilePhoneModel@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] prod1s: post_cbo: | OpenSearchSort(sort0=[$0], dir0=[DESC-nulls-last], fetch=[10000], viableBackends=[[datafusion]]) diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q12.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q12.plan.yaml index 936e2ca60afa4..be472def9d44e 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q12.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q12.plan.yaml @@ -39,19 +39,23 @@ plans: SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@3 DESC NULLS LAST, MobilePhone@0 ASC, MobilePhoneModel@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@3 DESC NULLS LAST, MobilePhone@0 ASC, MobilePhoneModel@1 ASC], preserve_partitioning=[true] ProjectionExec: expr=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel, approx_distinct(.UserID)[hll_registers]@2 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@2) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - 
AggregateExec: mode=Partial, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] - FilterExec: MobilePhoneModel@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[MobilePhone, MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] + AggregateExec: mode=PartialReduce, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([MobilePhone@0, MobilePhoneModel@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + FilterExec: MobilePhoneModel@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[MobilePhone, MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] shard_physical_nseg: | ProjectionExec: expr=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel, approx_distinct(.UserID)@2 as u] SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@3 DESC NULLS LAST, MobilePhone@0 ASC, MobilePhoneModel@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@3 DESC NULLS LAST, MobilePhone@0 ASC, MobilePhoneModel@1 ASC], preserve_partitioning=[true] ProjectionExec: expr=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel, approx_distinct(.UserID)[hll_registers]@2 as 
approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@2) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] - FilterExec: MobilePhoneModel@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[MobilePhone, MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] + AggregateExec: mode=PartialReduce, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([MobilePhone@0, MobilePhoneModel@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[approx_distinct(.UserID)] + FilterExec: MobilePhoneModel@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[MobilePhone, MobilePhoneModel, UserID], file_type=parquet, predicate=MobilePhoneModel@33 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 != OR != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()] prod1s: post_cbo: | OpenSearchSort(sort0=[$0], sort1=[$1], sort2=[$2], dir0=[DESC-nulls-last], dir1=[ASC-nulls-first], dir2=[ASC-nulls-first], fetch=[10000], viableBackends=[[datafusion]]) diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q13.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q13.plan.yaml index d6c5e1f3183fd..55c166f8b6f69 100644 --- 
a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q13.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q13.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: SearchPhrase@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: SearchPhrase@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: SearchPhrase@0 != - RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: SearchPhrase@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, SearchPhrase@1 as SearchPhrase] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q14.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q14.plan.yaml index 7c51d6d91369e..d6a98b957524e 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q14.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q14.plan.yaml @@ -39,19 +39,23 @@ plans: SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], preserve_partitioning=[true] ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, 
approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] - FilterExec: SearchPhrase@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[SearchPhrase, UserID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] + FilterExec: SearchPhrase@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchPhrase, UserID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, approx_distinct(.UserID)@1 as u] SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST], preserve_partitioning=[true] ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] - 
FilterExec: SearchPhrase@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[SearchPhrase, UserID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[approx_distinct(.UserID)] + FilterExec: SearchPhrase@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchPhrase, UserID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] prod1s: post_cbo: | OpenSearchSort(sort0=[$0], dir0=[DESC-nulls-last], fetch=[10000], viableBackends=[[datafusion]]) diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q15.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q15.plan.yaml index a98419f77dc43..c49bb90836312 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q15.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q15.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as c] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, SearchPhrase@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] - AggregateExec: 
mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: SearchPhrase@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchEngineID@0, SearchPhrase@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: SearchPhrase@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as c] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, SearchPhrase@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: SearchPhrase@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , 
pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchEngineID@0, SearchPhrase@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: SearchPhrase@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, SearchEngineID@1 as SearchEngineID, SearchPhrase@2 as SearchPhrase] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST, SearchEngineID@1 ASC, SearchPhrase@2 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q16.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q16.plan.yaml index 821b0852f7ebf..b7e3bbf32f926 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q16.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q16.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0}], count()=[SUM($1)], mode=[FINAL], viableBackends=[[lucene, datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[UserID@0 as UserID, 
count(Int64(1))[count]@1 as count()] + SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, UserID@0 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, UserID@0 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[UserID@0 as UserID, count(Int64(1))[count]@1 as count()] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, UserID@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, UserID@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[UserID@0 as UserID], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[UserID], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.count())@0 as count(), UserID@1 as UserID] SortPreservingMergeExec: [sum(input-0.count())@0 DESC NULLS LAST, UserID@1 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q17.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q17.plan.yaml index da84469453510..3130f1842d8d0 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q17.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q17.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0, 1}], count()=[SUM($2)], 
mode=[FINAL], viableBackends=[[lucene, datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as count()] + SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, UserID@0 ASC, SearchPhrase@1 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, UserID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID, SearchPhrase], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as count()] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, UserID@0 ASC, SearchPhrase@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, UserID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[UserID, SearchPhrase], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, 
projection=[UserID, SearchPhrase], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.count())@0 as count(), UserID@1 as UserID, SearchPhrase@2 as SearchPhrase] SortPreservingMergeExec: [sum(input-0.count())@0 DESC NULLS LAST, UserID@1 ASC, SearchPhrase@2 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q18.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q18.plan.yaml index 2ed82535c2792..6f107ca7318d3 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q18.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q18.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0, 1}], count()=[SUM($2)], mode=[FINAL], viableBackends=[[lucene, datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as count()] + SortPreservingMergeExec: [UserID@0 ASC, SearchPhrase@1 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[UserID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID, SearchPhrase], file_type=parquet, predicate=DynamicFilter [ ] + shard_physical_nseg: | ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))[count]@2 as count()] SortPreservingMergeExec: [UserID@0 ASC, 
SearchPhrase@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[UserID@0 ASC, SearchPhrase@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[UserID, SearchPhrase], file_type=parquet, predicate=DynamicFilter [ ], pruning_predicate=, required_guarantees=[] + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID, SearchPhrase], file_type=parquet, predicate=DynamicFilter [ ] coord_physical: | ProjectionExec: expr=[sum(input-0.count())@0 as count(), UserID@1 as UserID, SearchPhrase@2 as SearchPhrase] SortPreservingMergeExec: [UserID@1 ASC, SearchPhrase@2 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q19.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q19.plan.yaml index 10bdd10241338..8c458adde5771 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q19.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q19.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0, 1, 2}], count()=[SUM($3)], mode=[FINAL], viableBackends=[[datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as m, SearchPhrase@2 as SearchPhrase, count(Int64(1))[count]@3 as count()] + 
SortPreservingMergeExec: [count(Int64(1))@3 DESC NULLS LAST, UserID@0 ASC, opensearch_extract(Utf8("minute"),.EventTime)@1 ASC, SearchPhrase@2 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@3 DESC NULLS LAST, UserID@0 ASC, opensearch_extract(Utf8("minute"),.EventTime)@1 ASC, SearchPhrase@2 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, opensearch_extract(Utf8("minute"),.EventTime)@1, SearchPhrase@2], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID, opensearch_extract(minute, CAST(EventTime@18 AS Timestamp(µs))) as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as m, SearchPhrase@2 as SearchPhrase, count(Int64(1))[count]@3 as count()] SortPreservingMergeExec: [count(Int64(1))@3 DESC NULLS LAST, UserID@0 ASC, opensearch_extract(Utf8("minute"),.EventTime)@1 ASC, SearchPhrase@2 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@3 DESC NULLS LAST, UserID@0 ASC, opensearch_extract(Utf8("minute"),.EventTime)@1 ASC, SearchPhrase@2 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[UserID, opensearch_extract(minute, CAST(EventTime@18 AS Timestamp(µs))) as 
opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([UserID@0, opensearch_extract(Utf8("minute"),.EventTime)@1, SearchPhrase@2], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[UserID@0 as UserID, opensearch_extract(Utf8("minute"),.EventTime)@1 as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[UserID, opensearch_extract(minute, CAST(EventTime@18 AS Timestamp(µs))) as opensearch_extract(Utf8("minute"),.EventTime), SearchPhrase], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.count())@0 as count(), UserID@1 as UserID, m@2 as m, SearchPhrase@3 as SearchPhrase] SortPreservingMergeExec: [sum(input-0.count())@0 DESC NULLS LAST, UserID@1 ASC, m@2 ASC, SearchPhrase@3 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q22.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q22.plan.yaml index fb073fdd2f80a..5f6df8d5e5e84 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q22.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q22.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: URL@1 ILIKE %google% AND SearchPhrase@0 != , projection=[SearchPhrase@0] - RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[SearchPhrase, URL], file_type=parquet, predicate=URL@27 ILIKE %google% AND SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: URL@1 ILIKE %google% AND SearchPhrase@0 != , projection=[SearchPhrase@0] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchPhrase, URL], file_type=parquet, predicate=URL@27 ILIKE %google% AND SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] - FilterExec: URL@1 ILIKE %google% AND SearchPhrase@0 != , projection=[SearchPhrase@0] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[SearchPhrase, URL], file_type=parquet, predicate=URL@27 ILIKE %google% AND SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], 
aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))] + FilterExec: URL@1 ILIKE %google% AND SearchPhrase@0 != , projection=[SearchPhrase@0] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchPhrase, URL], file_type=parquet, predicate=URL@27 ILIKE %google% AND SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, SearchPhrase@1 as SearchPhrase] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q23.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q23.plan.yaml index 365b4fd20fcc8..a7a168c652a60 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q23.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q23.plan.yaml @@ -2,7 +2,7 @@ # Compound predicate on parquet DataSourceExec with grouped count+dc(HLL) and TopK. 
query: q23 ppl_file: q23.ppl -applies: [prod2s] +applies: [prod2s, prod1s] plans: prod2s: post_cbo: | @@ -34,15 +34,61 @@ plans: ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c, approx_distinct(.UserID)[hll_registers]@2 as dc(UserID)] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] - FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | 
ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))[count]@1 as c, approx_distinct(.UserID)[hll_registers]@2 as dc(UserID)] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] - FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + prod1s: + post_cbo: | + OpenSearchSort(sort0=[$0], dir0=[DESC-nulls-last], fetch=[10000], viableBackends=[[datafusion]]) + OpenSearchSort(sort0=[$0], 
dir0=[DESC-nulls-last], fetch=[10], viableBackends=[[datafusion]]) + OpenSearchProject(c=[$1], dc(UserID)=[$2], SearchPhrase=[$0], viableBackends=[[datafusion]]) + OpenSearchAggregate(group=[{0}], c=[COUNT()], dc(UserID)=[APPROX_COUNT_DISTINCT($1)], mode=[SINGLE], viableBackends=[[datafusion]]) + OpenSearchProject(SearchPhrase=[$74], UserID=[$97], viableBackends=[[datafusion]]) + OpenSearchFilter(condition=[AND(ANNOTATED_PREDICATE(id=0, backends=[datafusion], ILIKE($83, '%Google%', '\')), ANNOTATED_PREDICATE(id=1, backends=[datafusion], <>($74, '')), NOT(ANNOTATED_PREDICATE(id=2, backends=[datafusion], ILIKE($85, '%.google.%', '\'))))], viableBackends=[[datafusion]]) + OpenSearchTableScan(table=[[]], viableBackends=[[lucene, datafusion]]) + fragment: | + [SHARD_FRAGMENT chosen_backend=datafusion tree_shape=NONE] + OpenSearchSort(sort0=[$0], dir0=[DESC-nulls-last], fetch=[10000], viableBackends=[[datafusion]]) + OpenSearchSort(sort0=[$0], dir0=[DESC-nulls-last], fetch=[10], viableBackends=[[datafusion]]) + OpenSearchProject(c=[$1], dc(UserID)=[$2], SearchPhrase=[$0], viableBackends=[[datafusion]]) + OpenSearchAggregate(group=[{0}], c=[COUNT()], dc(UserID)=[APPROX_COUNT_DISTINCT($1)], mode=[SINGLE], viableBackends=[[datafusion]]) + OpenSearchProject(SearchPhrase=[$74], UserID=[$97], viableBackends=[[datafusion]]) + OpenSearchFilter(condition=[AND(ANNOTATED_PREDICATE(id=0, backends=[datafusion], ILIKE($83, '%Google%', '\')), ANNOTATED_PREDICATE(id=1, backends=[datafusion], <>($74, '')), NOT(ANNOTATED_PREDICATE(id=2, backends=[datafusion], ILIKE($85, '%.google.%', '\'))))], viableBackends=[[datafusion]]) + OpenSearchTableScan(table=[[]], viableBackends=[[lucene, datafusion]]) + shard_physical_1seg: | + RelabelExec: schema=Schema { fields: [Field { name: "c", data_type: Int64 }, Field { name: "dc(UserID)", data_type: Int64, nullable: true }, Field { name: "SearchPhrase", data_type: Utf8View, nullable: true }], metadata: {} } + ProjectionExec: expr=[count(Int64(1))@0 as 
c, approx_distinct(.UserID)@1 as dc(UserID), SearchPhrase@2 as SearchPhrase] + SortPreservingMergeExec: [count(Int64(1))@0 DESC NULLS LAST], fetch=10 + SortExec: TopK(fetch=10), expr=[count(Int64(1))@0 DESC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[count(Int64(1))@1 as count(Int64(1)), approx_distinct(.UserID)@2 as approx_distinct(.UserID), SearchPhrase@0 as SearchPhrase] + AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + shard_physical_nseg: | + RelabelExec: schema=Schema { fields: [Field { name: "c", data_type: Int64 }, Field { name: "dc(UserID)", data_type: Int64, nullable: true }, Field { name: "SearchPhrase", data_type: Utf8View, nullable: true }], metadata: {} } + ProjectionExec: expr=[count(Int64(1))@0 as c, approx_distinct(.UserID)@1 as dc(UserID), SearchPhrase@2 as SearchPhrase] + SortPreservingMergeExec: [count(Int64(1))@0 DESC NULLS LAST], fetch=10 + SortExec: TopK(fetch=10), expr=[count(Int64(1))@0 DESC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[count(Int64(1))@1 as count(Int64(1)), approx_distinct(.UserID)@2 as approx_distinct(.UserID), SearchPhrase@0 as SearchPhrase] + AggregateExec: 
mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1)), approx_distinct(.UserID)] + FilterExec: Title@1 ILIKE %Google% AND SearchPhrase@0 != AND URL@2 NOT ILIKE %.google.%, projection=[SearchPhrase@0, UserID@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[SearchPhrase, Title, URL, UserID], file_type=parquet, predicate=Title@101 ILIKE %Google% AND SearchPhrase@63 != AND URL@27 NOT ILIKE %.google.%, pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q28.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q28.plan.yaml index 3bb10ef913a8e..6a0325faf4c97 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q28.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q28.plan.yaml @@ -38,20 +38,24 @@ plans: ProjectionExec: expr=[CounterID@0 as CounterID, sum(character_length(.URL))[sum]@1 as $f1, count(character_length(.URL))[count]@2 as $f2, count(Int64(1))[count]@3 as c] SortPreservingMergeExec: [sum(character_length(.URL))@1 DESC NULLS LAST], fetch=75 SortExec: TopK(fetch=75), expr=[sum(character_length(.URL))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] - ProjectionExec: expr=[CounterID@0 as CounterID, character_length(URL@1) as character_length(.URL)] - FilterExec: URL@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), 
input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, URL], file_type=parquet, predicate=URL@27 != , pruning_predicate=URL_null_count@2 != row_count@3 AND (URL_min@0 != OR != URL_max@1), required_guarantees=[URL not in ()] + AggregateExec: mode=PartialReduce, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] + RepartitionExec: partitioning=Hash([CounterID@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] + ProjectionExec: expr=[CounterID@0 as CounterID, character_length(URL@1) as character_length(.URL)] + FilterExec: URL@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, URL], file_type=parquet, predicate=URL@27 != , pruning_predicate=URL_null_count@2 != row_count@3 AND (URL_min@0 != OR != URL_max@1), required_guarantees=[URL not in ()] shard_physical_nseg: | ProjectionExec: expr=[CounterID@0 as CounterID, sum(character_length(.URL))[sum]@1 as $f1, count(character_length(.URL))[count]@2 as $f2, count(Int64(1))[count]@3 as c] SortPreservingMergeExec: [sum(character_length(.URL))@1 DESC NULLS LAST], fetch=75 SortExec: TopK(fetch=75), expr=[sum(character_length(.URL))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] - ProjectionExec: expr=[CounterID@0 as CounterID, character_length(URL@1) as character_length(.URL)] - FilterExec: URL@1 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, URL], file_type=parquet, predicate=URL@27 != , pruning_predicate=URL_null_count@2 != row_count@3 AND (URL_min@0 != OR != URL_max@1), required_guarantees=[URL not in ()] + 
AggregateExec: mode=PartialReduce, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] + RepartitionExec: partitioning=Hash([CounterID@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[CounterID@0 as CounterID], aggr=[sum(character_length(.URL)), count(character_length(.URL)), count(Int64(1))] + ProjectionExec: expr=[CounterID@0 as CounterID, character_length(URL@1) as character_length(.URL)] + FilterExec: URL@1 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, URL], file_type=parquet, predicate=URL@27 != , pruning_predicate=URL_null_count@2 != row_count@3 AND (URL_min@0 != OR != URL_max@1), required_guarantees=[URL not in ()] coord_physical: | ProjectionExec: expr=[CASE WHEN sum(input-0.$f2) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f1) / sum(input-0.$f2) END@0 as l, sum(input-0.c)@1 as c, CounterID@2 as CounterID] SortPreservingMergeExec: [CASE WHEN sum(input-0.$f2) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f1) / sum(input-0.$f2) END@0 DESC NULLS LAST], fetch=25 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q29.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q29.plan.yaml index 090a6fb1dbd12..1a6d7c0b81c89 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q29.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q29.plan.yaml @@ -38,20 +38,24 @@ plans: ProjectionExec: expr=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as k, sum(character_length(.Referer))[sum]@1 as $f1, count(character_length(.Referer))[count]@2 as $f2, count(Int64(1))[count]@3 as c, min(.Referer)[value]@4 as min(Referer)] SortPreservingMergeExec: [sum(character_length(.Referer))@1 DESC NULLS LAST], fetch=75 SortExec: TopK(fetch=75), 
expr=[sum(character_length(.Referer))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] - ProjectionExec: expr=[regexp_replace(Referer@0, ^https?://(?:www\.)?([^/]+)/.*$, ${1}, g) as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g")), Referer@0 as Referer, character_length(Referer@0) as character_length(.Referer)] - FilterExec: Referer@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[Referer], file_type=parquet, predicate=Referer@100 != , pruning_predicate=Referer_null_count@2 != row_count@3 AND (Referer_min@0 != OR != Referer_max@1), required_guarantees=[Referer not in ()] + AggregateExec: mode=PartialReduce, gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] + RepartitionExec: partitioning=Hash([regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] + ProjectionExec: expr=[regexp_replace(Referer@0, ^https?://(?:www\.)?([^/]+)/.*$, ${1}, g) as 
regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g")), Referer@0 as Referer, character_length(Referer@0) as character_length(.Referer)] + FilterExec: Referer@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[Referer], file_type=parquet, predicate=Referer@100 != , pruning_predicate=Referer_null_count@2 != row_count@3 AND (Referer_min@0 != OR != Referer_max@1), required_guarantees=[Referer not in ()] shard_physical_nseg: | ProjectionExec: expr=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as k, sum(character_length(.Referer))[sum]@1 as $f1, count(character_length(.Referer))[count]@2 as $f2, count(Int64(1))[count]@3 as c, min(.Referer)[value]@4 as min(Referer)] SortPreservingMergeExec: [sum(character_length(.Referer))@1 DESC NULLS LAST], fetch=75 SortExec: TopK(fetch=75), expr=[sum(character_length(.Referer))@1 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] - ProjectionExec: expr=[regexp_replace(Referer@0, ^https?://(?:www\.)?([^/]+)/.*$, ${1}, g) as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g")), Referer@0 as Referer, character_length(Referer@0) as character_length(.Referer)] - FilterExec: Referer@0 != - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[Referer], file_type=parquet, predicate=Referer@100 != , pruning_predicate=Referer_null_count@2 != row_count@3 AND (Referer_min@0 != OR != Referer_max@1), required_guarantees=[Referer not in ()] + AggregateExec: mode=PartialReduce, 
gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] + RepartitionExec: partitioning=Hash([regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))@0 as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g"))], aggr=[sum(character_length(.Referer)), count(character_length(.Referer)), count(Int64(1)), min(.Referer)] + ProjectionExec: expr=[regexp_replace(Referer@0, ^https?://(?:www\.)?([^/]+)/.*$, ${1}, g) as regexp_replace(.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("${1}"),Utf8("g")), Referer@0 as Referer, character_length(Referer@0) as character_length(.Referer)] + FilterExec: Referer@0 != + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[Referer], file_type=parquet, predicate=Referer@100 != , pruning_predicate=Referer_null_count@2 != row_count@3 AND (Referer_min@0 != OR != Referer_max@1), required_guarantees=[Referer not in ()] coord_physical: | ProjectionExec: expr=[CASE WHEN sum(input-0.$f2) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f1) / sum(input-0.$f2) END@0 as l, sum(input-0.c)@1 as c, min(input-0.min(Referer))@2 as min(Referer), k@3 as k] SortPreservingMergeExec: [CASE WHEN sum(input-0.$f2) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f1) / sum(input-0.$f2) END@0 DESC NULLS LAST], fetch=25 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q31.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q31.plan.yaml index a0030b3e6d5f8..bf513cd933359 100644 --- 
a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q31.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q31.plan.yaml @@ -36,18 +36,22 @@ plans: ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, ClientIP@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, ClientIP@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] - FilterExec: SearchPhrase@4 != , projection=[SearchEngineID@3, ClientIP@0, IsRefresh@1, ResolutionWidth@2] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([SearchEngineID@0, ClientIP@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + FilterExec: SearchPhrase@4 != , projection=[SearchEngineID@3, ClientIP@0, IsRefresh@1, ResolutionWidth@2] + RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, ClientIP@1 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, SearchEngineID@0 ASC, ClientIP@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] - FilterExec: SearchPhrase@4 != , projection=[SearchEngineID@3, ClientIP@0, IsRefresh@1, ResolutionWidth@2] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([SearchEngineID@0, ClientIP@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), 
sum(.ResolutionWidth), count(.ResolutionWidth)] + FilterExec: SearchPhrase@4 != , projection=[SearchEngineID@3, ClientIP@0, IsRefresh@1, ResolutionWidth@2] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, sum(input-0.sum(IsRefresh))@1 as sum(IsRefresh), CASE WHEN sum(input-0.$f5) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f4) / sum(input-0.$f5) END@2 as avg(ResolutionWidth), SearchEngineID@3 as SearchEngineID, ClientIP@4 as ClientIP] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST, SearchEngineID@3 ASC, ClientIP@4 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q32.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q32.plan.yaml index 6195dc4984ff1..c22ecb2044843 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q32.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q32.plan.yaml @@ -36,18 +36,22 @@ plans: ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] - FilterExec: SearchPhrase@3 != , 
projection=[WatchID@4, ClientIP@0, IsRefresh@1, ResolutionWidth@2] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchPhrase, WatchID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + FilterExec: SearchPhrase@3 != , projection=[WatchID@4, ClientIP@0, IsRefresh@1, ResolutionWidth@2] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchPhrase, WatchID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] shard_physical_nseg: | ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] - FilterExec: SearchPhrase@3 != , 
projection=[WatchID@4, ClientIP@0, IsRefresh@1, ResolutionWidth@2] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchPhrase, WatchID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] + AggregateExec: mode=PartialReduce, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + FilterExec: SearchPhrase@3 != , projection=[WatchID@4, ClientIP@0, IsRefresh@1, ResolutionWidth@2] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchPhrase, WatchID], file_type=parquet, predicate=SearchPhrase@63 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 != OR != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, sum(input-0.sum(IsRefresh))@1 as sum(IsRefresh), CASE WHEN sum(input-0.$f5) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f4) / sum(input-0.$f5) END@2 as avg(ResolutionWidth), WatchID@3 as WatchID, ClientIP@4 as ClientIP] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q33.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q33.plan.yaml index 4c173f915aacb..39d406f24edec 100644 --- 
a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q33.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q33.plan.yaml @@ -30,12 +30,22 @@ plans: OpenSearchAggregate(group=[{0, 1}], c=[SUM($2)], sum(IsRefresh)=[SUM($3)], $f4=[SUM($4)], $f5=[SUM($5)], mode=[FINAL], viableBackends=[[datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] + SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, WatchID@0 ASC, ClientIP@1 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST, WatchID@0 ASC, ClientIP@1 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + DataSourceExec: file_groups={}, projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))[count]@2 as c, sum(.IsRefresh)[sum]@3 as sum(IsRefresh), sum(.ResolutionWidth)[sum]@4 as $f4, count(.ResolutionWidth)[count]@5 as $f5] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, WatchID@0 ASC, ClientIP@1 ASC], fetch=30 SortExec: TopK(fetch=30), 
expr=[count(Int64(1))@2 DESC NULLS LAST, WatchID@0 ASC, ClientIP@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] - DataSourceExec: file_groups={}, projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(.IsRefresh), sum(.ResolutionWidth), count(.ResolutionWidth)] + DataSourceExec: file_groups={}, projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, sum(input-0.sum(IsRefresh))@1 as sum(IsRefresh), CASE WHEN sum(input-0.$f5) = Int64(0) THEN Float64(NULL) ELSE sum(input-0.$f4) / sum(input-0.$f5) END@2 as avg(ResolutionWidth), WatchID@3 as WatchID, ClientIP@4 as ClientIP] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST, WatchID@3 ASC, ClientIP@4 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q34.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q34.plan.yaml index 6c4266fdd2dd6..f5a3106abd076 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q34.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q34.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0}], c=[SUM($1)], mode=[FINAL], viableBackends=[[lucene, datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) 
OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as c] + SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[URL], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as c] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[URL], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[URL], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, URL@1 as URL] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST, URL@1 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q35.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q35.plan.yaml index 77cc0b79c710d..a19b87863992f 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q35.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q35.plan.yaml @@ -28,12 +28,22 @@ plans: 
OpenSearchAggregate(group=[{0, 1}], c=[SUM($2)], mode=[FINAL], viableBackends=[[lucene, datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[Int32(1)@0 as const, URL@1 as URL, count(Int64(1))[count]@2 as c] + SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[Int32(1)@0 as Int32(1), URL@1 as URL], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0]) + RepartitionExec: partitioning=Hash([Int32(1)@0, URL@1], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[Int32(1)@0 as Int32(1), URL@1 as URL], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0]) + DataSourceExec: file_groups={}, projection=[1 as Int32(1), URL], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[Int32(1)@0 as const, URL@1 as URL, count(Int64(1))[count]@2 as c] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@2 DESC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[Int32(1)@0 as Int32(1), URL@1 as URL], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0]) - DataSourceExec: file_groups={}, projection=[1 as Int32(1), URL], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[Int32(1)@0 as Int32(1), URL@1 as URL], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0]) + RepartitionExec: partitioning=Hash([Int32(1)@0, URL@1], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[Int32(1)@0 as Int32(1), URL@1 as URL], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0]) + DataSourceExec: file_groups={}, projection=[1 as Int32(1), URL], file_type=parquet 
coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, Int32(1)@1 as const, URL@2 as URL] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q36.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q36.plan.yaml index ec6db780ecd6f..12b831f4b5d4f 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q36.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q36.plan.yaml @@ -28,12 +28,22 @@ plans: OpenSearchAggregate(group=[{0, 1, 2, 3}], c=[SUM($4)], mode=[FINAL], viableBackends=[[datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as ClientIP - 1, .ClientIP - Int32(2)@2 as ClientIP - 2, .ClientIP - Int32(3)@3 as ClientIP - 3, count(Int64(1))[count]@4 as c] + SortPreservingMergeExec: [count(Int64(1))@4 DESC NULLS LAST, ClientIP@0 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[count(Int64(1))@4 DESC NULLS LAST, ClientIP@0 ASC], preserve_partitioning=[true] + AggregateExec: mode=PartialReduce, gby=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as .ClientIP - Int32(1), .ClientIP - Int32(2)@2 as .ClientIP - Int32(2), .ClientIP - Int32(3)@3 as .ClientIP - Int32(3)], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([ClientIP@0, .ClientIP - Int32(1)@1, .ClientIP - Int32(2)@2, .ClientIP - Int32(3)@3], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as .ClientIP - Int32(1), .ClientIP - Int32(2)@2 as .ClientIP - Int32(2), .ClientIP - Int32(3)@3 as .ClientIP - Int32(3)], aggr=[count(Int64(1))] + DataSourceExec: 
file_groups={}, projection=[ClientIP, ClientIP@79 - 1 as .ClientIP - Int32(1), ClientIP@79 - 2 as .ClientIP - Int32(2), ClientIP@79 - 3 as .ClientIP - Int32(3)], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as ClientIP - 1, .ClientIP - Int32(2)@2 as ClientIP - 2, .ClientIP - Int32(3)@3 as ClientIP - 3, count(Int64(1))[count]@4 as c] SortPreservingMergeExec: [count(Int64(1))@4 DESC NULLS LAST, ClientIP@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@4 DESC NULLS LAST, ClientIP@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as .ClientIP - Int32(1), .ClientIP - Int32(2)@2 as .ClientIP - Int32(2), .ClientIP - Int32(3)@3 as .ClientIP - Int32(3)], aggr=[count(Int64(1))] - DataSourceExec: file_groups={}, projection=[ClientIP, ClientIP@79 - 1 as .ClientIP - Int32(1), ClientIP@79 - 2 as .ClientIP - Int32(2), ClientIP@79 - 3 as .ClientIP - Int32(3)], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as .ClientIP - Int32(1), .ClientIP - Int32(2)@2 as .ClientIP - Int32(2), .ClientIP - Int32(3)@3 as .ClientIP - Int32(3)], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([ClientIP@0, .ClientIP - Int32(1)@1, .ClientIP - Int32(2)@2, .ClientIP - Int32(3)@3], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[ClientIP@0 as ClientIP, .ClientIP - Int32(1)@1 as .ClientIP - Int32(1), .ClientIP - Int32(2)@2 as .ClientIP - Int32(2), .ClientIP - Int32(3)@3 as .ClientIP - Int32(3)], aggr=[count(Int64(1))] + DataSourceExec: file_groups={}, projection=[ClientIP, ClientIP@79 - 1 as .ClientIP - Int32(1), ClientIP@79 - 2 as .ClientIP - Int32(2), ClientIP@79 - 3 as .ClientIP - Int32(3)], file_type=parquet coord_physical: | ProjectionExec: expr=[sum(input-0.c)@0 as c, ClientIP@1 as ClientIP, ClientIP - 1@2 as ClientIP - 1, ClientIP - 2@3 as ClientIP - 2, ClientIP - 3@4 
as ClientIP - 3] SortPreservingMergeExec: [sum(input-0.c)@0 DESC NULLS LAST, ClientIP@1 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q37.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q37.plan.yaml index dcfa7ed65d4ba..c1426e00eb1c9 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q37.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q37.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND URL@4 != , projection=[URL@4] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND URL@27 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND URL_null_count@15 != row_count@3 AND (URL_min@13 != OR != URL_max@14), required_guarantees=[CounterID in (62), 
DontCountHits in (0), IsRefresh in (0), URL not in ()] + AggregateExec: mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND URL@4 != , projection=[URL@4] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND URL@27 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND URL_null_count@15 != row_count@3 AND (URL_min@13 != OR != URL_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URL not in ()] shard_physical_nseg: | ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND URL@4 != , projection=[URL@4] - RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND URL@27 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND URL_null_count@15 != row_count@3 AND (URL_min@13 != OR != URL_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URL not in ()] + AggregateExec: mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND URL@4 != , projection=[URL@4] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND URL@27 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND 
DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND URL_null_count@15 != row_count@3 AND (URL_min@13 != OR != URL_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URL not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, URL@1 as URL] SortPreservingMergeExec: [sum(input-0.PageViews)@0 DESC NULLS LAST, URL@1 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q38.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q38.plan.yaml index 4f3def2cc61f6..19844344bf357 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q38.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q38.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[Title@0 as Title, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, Title@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, Title@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[Title@0 as Title], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND Title@4 != , projection=[Title@4] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, Title], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND Title@101 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND 
EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND Title_null_count@15 != row_count@3 AND (Title_min@13 != OR != Title_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), Title not in ()] + AggregateExec: mode=PartialReduce, gby=[Title@0 as Title], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([Title@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[Title@0 as Title], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND Title@4 != , projection=[Title@4] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, Title], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND Title@101 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND Title_null_count@15 != row_count@3 AND (Title_min@13 != OR != Title_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), Title not in ()] shard_physical_nseg: | ProjectionExec: expr=[Title@0 as 
Title, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, Title@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[count(Int64(1))@1 DESC NULLS LAST, Title@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[Title@0 as Title], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND Title@4 != , projection=[Title@4] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, Title], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND Title@101 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND Title_null_count@15 != row_count@3 AND (Title_min@13 != OR != Title_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), Title not in ()] + AggregateExec: mode=PartialReduce, gby=[Title@0 as Title], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([Title@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[Title@0 as Title], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND DontCountHits@1 = 0 AND IsRefresh@3 = 0 AND Title@4 != , projection=[Title@4] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: 
file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, Title], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND DontCountHits@43 = 0 AND IsRefresh@74 = 0 AND Title@101 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND Title_null_count@15 != row_count@3 AND (Title_min@13 != OR != Title_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), Title not in ()] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, Title@1 as Title] SortPreservingMergeExec: [sum(input-0.PageViews)@0 DESC NULLS LAST, Title@1 ASC], fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q39.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q39.plan.yaml index c05744ac30d98..34756cc0ac24b 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q39.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q39.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@4 = 0 AND 
IsLink@3 != 0 AND IsDownload@2 = 0, projection=[URL@5] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsDownload, IsLink, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND IsLink@49 != 0 AND IsDownload@36 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND IsLink_null_count@12 != row_count@3 AND (IsLink_min@10 != 0 OR 0 != IsLink_max@11) AND IsDownload_null_count@15 != row_count@3 AND IsDownload_min@13 <= 0 AND 0 <= IsDownload_max@14, required_guarantees=[CounterID in (62), IsDownload in (0), IsLink not in (0), IsRefresh in (0)] + AggregateExec: mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@4 = 0 AND IsLink@3 != 0 AND IsDownload@2 = 0, projection=[URL@5] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsDownload, IsLink, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND IsLink@49 != 0 AND IsDownload@36 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND 
EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND IsLink_null_count@12 != row_count@3 AND (IsLink_min@10 != 0 OR 0 != IsLink_max@11) AND IsDownload_null_count@15 != row_count@3 AND IsDownload_min@13 <= 0 AND 0 <= IsDownload_max@14, required_guarantees=[CounterID in (62), IsDownload in (0), IsLink not in (0), IsRefresh in (0)] shard_physical_nseg: | ProjectionExec: expr=[URL@0 as URL, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@1 DESC NULLS LAST, URL@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@4 = 0 AND IsLink@3 != 0 AND IsDownload@2 = 0, projection=[URL@5] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsDownload, IsLink, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND IsLink@49 != 0 AND IsDownload@36 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND IsLink_null_count@12 != row_count@3 AND (IsLink_min@10 != 0 OR 0 != IsLink_max@11) AND IsDownload_null_count@15 != row_count@3 AND IsDownload_min@13 <= 0 AND 0 <= IsDownload_max@14, required_guarantees=[CounterID in (62), IsDownload in (0), IsLink not in (0), IsRefresh in (0)] + AggregateExec: 
mode=PartialReduce, gby=[URL@0 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@4 = 0 AND IsLink@3 != 0 AND IsDownload@2 = 0, projection=[URL@5] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsDownload, IsLink, IsRefresh, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND IsLink@49 != 0 AND IsDownload@36 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND IsLink_null_count@12 != row_count@3 AND (IsLink_min@10 != 0 OR 0 != IsLink_max@11) AND IsDownload_null_count@15 != row_count@3 AND IsDownload_min@13 <= 0 AND 0 <= IsDownload_max@14, required_guarantees=[CounterID in (62), IsDownload in (0), IsLink not in (0), IsRefresh in (0)] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, URL@1 as URL] GlobalLimitExec: skip=5, fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q40.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q40.plan.yaml index 504fc7ef167b9..52dbe24503e3f 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q40.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q40.plan.yaml @@ -34,20 +34,24 @@ plans: ProjectionExec: expr=[TraficSourceID@0 as 
TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as Src, URL@4 as Dst, count(Int64(1))[count]@5 as PageViews] SortPreservingMergeExec: [count(Int64(1))@5 DESC NULLS LAST, TraficSourceID@0 ASC, SearchEngineID@1 ASC, AdvEngineID@2 ASC, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 ASC, URL@4 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@5 DESC NULLS LAST, TraficSourceID@0 ASC, SearchEngineID@1 ASC, AdvEngineID@2 ASC, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 ASC, URL@4 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] - ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN SearchEngineID@1 = 0 AND AdvEngineID@2 = 0 THEN Referer@3 ELSE END as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL] - FilterExec: CounterID@1 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0, projection=[TraficSourceID@6, SearchEngineID@5, AdvEngineID@0, Referer@4, URL@7] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[AdvEngineID, CounterID, EventDate, IsRefresh, Referer, SearchEngineID, TraficSourceID, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 
= 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8, required_guarantees=[CounterID in (62), IsRefresh in (0)] + AggregateExec: mode=PartialReduce, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([TraficSourceID@0, SearchEngineID@1, AdvEngineID@2, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3, URL@4], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] + ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN SearchEngineID@1 = 0 AND AdvEngineID@2 = 0 THEN Referer@3 ELSE END as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL] + FilterExec: CounterID@1 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0, projection=[TraficSourceID@6, SearchEngineID@5, AdvEngineID@0, Referer@4, URL@7] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + 
DataSourceExec: file_groups={}, projection=[AdvEngineID, CounterID, EventDate, IsRefresh, Referer, SearchEngineID, TraficSourceID, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8, required_guarantees=[CounterID in (62), IsRefresh in (0)] shard_physical_nseg: | ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as Src, URL@4 as Dst, count(Int64(1))[count]@5 as PageViews] SortPreservingMergeExec: [count(Int64(1))@5 DESC NULLS LAST, TraficSourceID@0 ASC, SearchEngineID@1 ASC, AdvEngineID@2 ASC, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 ASC, URL@4 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@5 DESC NULLS LAST, TraficSourceID@0 ASC, SearchEngineID@1 ASC, AdvEngineID@2 ASC, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 ASC, URL@4 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] - ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, 
AdvEngineID@2 as AdvEngineID, CASE WHEN SearchEngineID@1 = 0 AND AdvEngineID@2 = 0 THEN Referer@3 ELSE END as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL] - FilterExec: CounterID@1 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0, projection=[TraficSourceID@6, SearchEngineID@5, AdvEngineID@0, Referer@4, URL@7] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[AdvEngineID, CounterID, EventDate, IsRefresh, Referer, SearchEngineID, TraficSourceID, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8, required_guarantees=[CounterID in (62), IsRefresh in (0)] + AggregateExec: mode=PartialReduce, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([TraficSourceID@0, SearchEngineID@1, AdvEngineID@2, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END@3, URL@4], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN 
.Referer ELSE Utf8("") END@3 as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))] + ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN SearchEngineID@1 = 0 AND AdvEngineID@2 = 0 THEN Referer@3 ELSE END as CASE WHEN .SearchEngineID = Int32(0) AND .AdvEngineID = Int32(0) THEN .Referer ELSE Utf8("") END, URL@4 as URL] + FilterExec: CounterID@1 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0, projection=[TraficSourceID@6, SearchEngineID@5, AdvEngineID@0, Referer@4, URL@7] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[AdvEngineID, CounterID, EventDate, IsRefresh, Referer, SearchEngineID, TraficSourceID, URL], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8, required_guarantees=[CounterID in (62), IsRefresh in (0)] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, TraficSourceID@1 as TraficSourceID, SearchEngineID@2 as SearchEngineID, AdvEngineID@3 as AdvEngineID, Src@4 as Src, Dst@5 as Dst] GlobalLimitExec: skip=5, fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q41.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q41.plan.yaml index 05583d0830b46..6de64fa9aabc0 100644 --- 
a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q41.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q41.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[URLHash@0 as URLHash, EventDate@1 as EventDate, count(Int64(1))[count]@2 as PageViews] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, URLHash@0 ASC, EventDate@1 ASC], fetch=36 SortExec: TopK(fetch=36), expr=[count(Int64(1))@2 DESC NULLS LAST, URLHash@0 ASC, EventDate@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@2 = 0 AND (TraficSourceID@4 = -1 OR TraficSourceID@4 = 6) AND RefererHash@3 = 3594120000172545465, projection=[URLHash@5, EventDate@1] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsRefresh, RefererHash, TraficSourceID, URLHash], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND (TraficSourceID@13 = -1 OR TraficSourceID@13 = 6) AND RefererHash@12 = 3594120000172545465, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND (TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= -1 AND -1 <= TraficSourceID_max@11 OR TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= 6 AND 6 <= TraficSourceID_max@11) AND RefererHash_null_count@15 != row_count@3 AND RefererHash_min@13 <= 3594120000172545465 AND 
3594120000172545465 <= RefererHash_max@14, required_guarantees=[CounterID in (62), IsRefresh in (0), RefererHash in (3594120000172545465), TraficSourceID in (-1, 6)] + AggregateExec: mode=PartialReduce, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URLHash@0, EventDate@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@2 = 0 AND (TraficSourceID@4 = -1 OR TraficSourceID@4 = 6) AND RefererHash@3 = 3594120000172545465, projection=[URLHash@5, EventDate@1] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsRefresh, RefererHash, TraficSourceID, URLHash], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND (TraficSourceID@13 = -1 OR TraficSourceID@13 = 6) AND RefererHash@12 = 3594120000172545465, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND (TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= -1 AND -1 <= TraficSourceID_max@11 OR TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= 6 AND 6 <= TraficSourceID_max@11) AND RefererHash_null_count@15 != row_count@3 AND RefererHash_min@13 <= 3594120000172545465 AND 3594120000172545465 <= RefererHash_max@14, required_guarantees=[CounterID in (62), IsRefresh in (0), RefererHash in (3594120000172545465), TraficSourceID in (-1, 6)] 
shard_physical_nseg: | ProjectionExec: expr=[URLHash@0 as URLHash, EventDate@1 as EventDate, count(Int64(1))[count]@2 as PageViews] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, URLHash@0 ASC, EventDate@1 ASC], fetch=36 SortExec: TopK(fetch=36), expr=[count(Int64(1))@2 DESC NULLS LAST, URLHash@0 ASC, EventDate@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@2 = 0 AND (TraficSourceID@4 = -1 OR TraficSourceID@4 = 6) AND RefererHash@3 = 3594120000172545465, projection=[URLHash@5, EventDate@1] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsRefresh, RefererHash, TraficSourceID, URLHash], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND (TraficSourceID@13 = -1 OR TraficSourceID@13 = 6) AND RefererHash@12 = 3594120000172545465, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND (TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= -1 AND -1 <= TraficSourceID_max@11 OR TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= 6 AND 6 <= TraficSourceID_max@11) AND RefererHash_null_count@15 != row_count@3 AND RefererHash_min@13 <= 3594120000172545465 AND 3594120000172545465 <= RefererHash_max@14, required_guarantees=[CounterID in (62), IsRefresh in (0), RefererHash in (3594120000172545465), TraficSourceID in (-1, 6)] + AggregateExec: 
mode=PartialReduce, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([URLHash@0, EventDate@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@1 >= 1372636800000 AND EventDate@1 <= 1375228800000 AND IsRefresh@2 = 0 AND (TraficSourceID@4 = -1 OR TraficSourceID@4 = 6) AND RefererHash@3 = 3594120000172545465, projection=[URLHash@5, EventDate@1] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, EventDate, IsRefresh, RefererHash, TraficSourceID, URLHash], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND (TraficSourceID@13 = -1 OR TraficSourceID@13 = 6) AND RefererHash@12 = 3594120000172545465, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND (TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= -1 AND -1 <= TraficSourceID_max@11 OR TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= 6 AND 6 <= TraficSourceID_max@11) AND RefererHash_null_count@15 != row_count@3 AND RefererHash_min@13 <= 3594120000172545465 AND 3594120000172545465 <= RefererHash_max@14, required_guarantees=[CounterID in (62), IsRefresh in (0), RefererHash in (3594120000172545465), TraficSourceID in (-1, 6)] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, URLHash@1 as URLHash, EventDate@2 as EventDate] GlobalLimitExec: skip=2, fetch=10 diff --git 
a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q42.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q42.plan.yaml index f0d7442406edd..2083105e1ede4 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q42.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q42.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight, count(Int64(1))[count]@2 as PageViews] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, WindowClientWidth@0 ASC, WindowClientHeight@1 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@2 DESC NULLS LAST, WindowClientWidth@0 ASC, WindowClientHeight@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0 AND DontCountHits@1 = 0 AND URLHash@4 = 2868770270353813622, projection=[WindowClientWidth@6, WindowClientHeight@5] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URLHash, WindowClientHeight, WindowClientWidth], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND URLHash@26 = 2868770270353813622, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 
AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11 AND URLHash_null_count@15 != row_count@3 AND URLHash_min@13 <= 2868770270353813622 AND 2868770270353813622 <= URLHash_max@14, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URLHash in (2868770270353813622)] + AggregateExec: mode=PartialReduce, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([WindowClientWidth@0, WindowClientHeight@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0 AND DontCountHits@1 = 0 AND URLHash@4 = 2868770270353813622, projection=[WindowClientWidth@6, WindowClientHeight@5] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URLHash, WindowClientHeight, WindowClientWidth], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND URLHash@26 = 2868770270353813622, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11 AND URLHash_null_count@15 != row_count@3 AND URLHash_min@13 <= 2868770270353813622 AND 2868770270353813622 <= URLHash_max@14, 
required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URLHash in (2868770270353813622)] shard_physical_nseg: | ProjectionExec: expr=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight, count(Int64(1))[count]@2 as PageViews] SortPreservingMergeExec: [count(Int64(1))@2 DESC NULLS LAST, WindowClientWidth@0 ASC, WindowClientHeight@1 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[count(Int64(1))@2 DESC NULLS LAST, WindowClientWidth@0 ASC, WindowClientHeight@1 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0 AND DontCountHits@1 = 0 AND URLHash@4 = 2868770270353813622, projection=[WindowClientWidth@6, WindowClientHeight@5] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URLHash, WindowClientHeight, WindowClientWidth], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND URLHash@26 = 2868770270353813622, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11 AND URLHash_null_count@15 != row_count@3 AND URLHash_min@13 <= 2868770270353813622 AND 2868770270353813622 <= URLHash_max@14, required_guarantees=[CounterID in (62), DontCountHits in (0), 
IsRefresh in (0), URLHash in (2868770270353813622)] + AggregateExec: mode=PartialReduce, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([WindowClientWidth@0, WindowClientHeight@1], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1372636800000 AND EventDate@2 <= 1375228800000 AND IsRefresh@3 = 0 AND DontCountHits@1 = 0 AND URLHash@4 = 2868770270353813622, projection=[WindowClientWidth@6, WindowClientHeight@5] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, IsRefresh, URLHash, WindowClientHeight, WindowClientWidth], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1372636800000 AND EventDate@0 <= 1375228800000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND URLHash@26 = 2868770270353813622, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1372636800000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1375228800000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11 AND URLHash_null_count@15 != row_count@3 AND URLHash_min@13 <= 2868770270353813622 AND 2868770270353813622 <= URLHash_max@14, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URLHash in (2868770270353813622)] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, WindowClientWidth@1 as WindowClientWidth, WindowClientHeight@2 as WindowClientHeight] GlobalLimitExec: skip=5, 
fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q43.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q43.plan.yaml index ff47d0a295934..fa82fdefd7984 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q43.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q43.plan.yaml @@ -34,20 +34,24 @@ plans: ProjectionExec: expr=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as M, count(Int64(1))[count]@1 as PageViews] SortPreservingMergeExec: [date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] - ProjectionExec: expr=[date_format(CAST(EventTime@0 AS Timestamp(µs)), %Y-%m-%d %H:%i:00) as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1373760000000 AND EventDate@2 <= 1373846400000 AND IsRefresh@4 = 0 AND DontCountHits@1 = 0, projection=[EventTime@3] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, EventTime, IsRefresh], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1373760000000 AND EventDate@0 <= 1373846400000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND DynamicFilter [ ], pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1373760000000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1373846400000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND 
DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0)] + AggregateExec: mode=PartialReduce, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] + ProjectionExec: expr=[date_format(CAST(EventTime@0 AS Timestamp(µs)), %Y-%m-%d %H:%i:00) as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1373760000000 AND EventDate@2 <= 1373846400000 AND IsRefresh@4 = 0 AND DontCountHits@1 = 0, projection=[EventTime@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, EventTime, IsRefresh], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1373760000000 AND EventDate@0 <= 1373846400000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND DynamicFilter [ ], pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1373760000000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1373846400000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0)] shard_physical_nseg: | ProjectionExec: expr=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as M, count(Int64(1))[count]@1 as PageViews] 
SortPreservingMergeExec: [date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 ASC], fetch=45 SortExec: TopK(fetch=45), expr=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] - ProjectionExec: expr=[date_format(CAST(EventTime@0 AS Timestamp(µs)), %Y-%m-%d %H:%i:00) as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))] - FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1373760000000 AND EventDate@2 <= 1373846400000 AND IsRefresh@4 = 0 AND DontCountHits@1 = 0, projection=[EventTime@3] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, EventTime, IsRefresh], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1373760000000 AND EventDate@0 <= 1373846400000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND DynamicFilter [ ], pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1373760000000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1373846400000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0)] + AggregateExec: mode=PartialReduce, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))@0 as 
date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))], aggr=[count(Int64(1))] + ProjectionExec: expr=[date_format(CAST(EventTime@0 AS Timestamp(µs)), %Y-%m-%d %H:%i:00) as date_format(.EventTime,Utf8("%Y-%m-%d %H:%i:00"))] + FilterExec: CounterID@0 = 62 AND EventDate@2 >= 1373760000000 AND EventDate@2 <= 1373846400000 AND IsRefresh@4 = 0 AND DontCountHits@1 = 0, projection=[EventTime@3] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[CounterID, DontCountHits, EventDate, EventTime, IsRefresh], file_type=parquet, predicate=CounterID@107 = 62 AND EventDate@0 >= 1373760000000 AND EventDate@0 <= 1373846400000 AND IsRefresh@74 = 0 AND DontCountHits@43 = 0 AND DynamicFilter [ ], pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND EventDate_max@4 >= 1373760000000 AND EventDate_null_count@5 != row_count@3 AND EventDate_min@6 <= 1373846400000 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0)] coord_physical: | ProjectionExec: expr=[sum(input-0.PageViews)@0 as PageViews, M@1 as M] GlobalLimitExec: skip=5, fetch=10 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q8.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q8.plan.yaml index a4e1ed1ae7ec5..b411ccfe5f8c3 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q8.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q8.plan.yaml @@ -34,18 +34,22 @@ plans: ProjectionExec: expr=[AdvEngineID@0 as AdvEngineID, count(Int64(1))[count]@1 as count()] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS 
LAST, AdvEngineID@0 ASC], fetch=30000 SortExec: TopK(fetch=30000), expr=[count(Int64(1))@1 DESC NULLS LAST, AdvEngineID@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] - FilterExec: AdvEngineID@0 != 0 - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: file_groups={}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@20 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), required_guarantees=[AdvEngineID not in (0)] + AggregateExec: mode=PartialReduce, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([AdvEngineID@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] + FilterExec: AdvEngineID@0 != 0 + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: file_groups={}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@20 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), required_guarantees=[AdvEngineID not in (0)] shard_physical_nseg: | ProjectionExec: expr=[AdvEngineID@0 as AdvEngineID, count(Int64(1))[count]@1 as count()] SortPreservingMergeExec: [count(Int64(1))@1 DESC NULLS LAST, AdvEngineID@0 ASC], fetch=30000 SortExec: TopK(fetch=30000), expr=[count(Int64(1))@1 DESC NULLS LAST, AdvEngineID@0 ASC], preserve_partitioning=[true] - AggregateExec: mode=Partial, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] - FilterExec: AdvEngineID@0 != 0 - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@20 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), 
required_guarantees=[AdvEngineID not in (0)] + AggregateExec: mode=PartialReduce, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] + RepartitionExec: partitioning=Hash([AdvEngineID@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))] + FilterExec: AdvEngineID@0 != 0 + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@20 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), required_guarantees=[AdvEngineID not in (0)] coord_physical: | ProjectionExec: expr=[sum(input-0.count())@0 as count(), AdvEngineID@1 as AdvEngineID] SortPreservingMergeExec: [sum(input-0.count())@0 DESC NULLS LAST, AdvEngineID@1 ASC], fetch=10000 diff --git a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q9.plan.yaml b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q9.plan.yaml index 87d1370c7f4f9..7e305e292799a 100644 --- a/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q9.plan.yaml +++ b/sandbox/qa/analytics-engine-rest/src/test/resources/planshape/clickbench/q9.plan.yaml @@ -32,13 +32,24 @@ plans: OpenSearchAggregate(group=[{0}], u=[APPROX_COUNT_DISTINCT($1)], mode=[FINAL], viableBackends=[[datafusion]]) OpenSearchExchangeReducer(viableBackends=[[datafusion]], exchange=[ExchangeInfo[distributionType=SINGLETON, partitionKeyIndices=[]]]) OpenSearchStageInputScan(childStageId=[0], viableBackends=[[datafusion]]) - shard_physical: | + shard_physical_1seg: | + ProjectionExec: expr=[RegionID@0 as RegionID, approx_distinct(.UserID)@1 as u] + SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST, RegionID@0 ASC], fetch=30 + SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 
DESC NULLS LAST, RegionID@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[RegionID@0 as RegionID, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] + AggregateExec: mode=PartialReduce, gby=[RegionID@0 as RegionID], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=1 + AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[approx_distinct(.UserID)] + DataSourceExec: file_groups={}, projection=[RegionID, UserID], file_type=parquet + shard_physical_nseg: | ProjectionExec: expr=[RegionID@0 as RegionID, approx_distinct(.UserID)@1 as u] SortPreservingMergeExec: [reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST, RegionID@0 ASC], fetch=30 SortExec: TopK(fetch=30), expr=[reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))@2 DESC NULLS LAST, RegionID@0 ASC], preserve_partitioning=[true] ProjectionExec: expr=[RegionID@0 as RegionID, approx_distinct(.UserID)[hll_registers]@1 as approx_distinct(.UserID), reduce_eval(approx_distinct, approx_distinct(.UserID)[hll_registers]@1) as reduce_eval(Utf8("approx_distinct"),approx_distinct(.UserID))] - AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[approx_distinct(.UserID)] - DataSourceExec: file_groups={}, projection=[RegionID, UserID], file_type=parquet + AggregateExec: mode=PartialReduce, gby=[RegionID@0 as RegionID], aggr=[approx_distinct(.UserID)] + RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=2 + AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[approx_distinct(.UserID)] + DataSourceExec: file_groups={}, projection=[RegionID, UserID], file_type=parquet prod1s: post_cbo: | OpenSearchSort(sort0=[$0], sort1=[$1], dir0=[DESC-nulls-last], dir1=[ASC-nulls-first], fetch=[10000], viableBackends=[[datafusion]])