Closed · Xuanwo closed this issue 1 month ago
I found some similar issues and discussions that might be relevant to integrating Databend PushDown with Iceberg Rust's scan planning:

- COPY INTO statements [1]
- iceberg-rust [3]

To integrate Databend PushDown with Iceberg Rust's scan planning, you can use the `PushDownInfo` struct and the `push_downs` method from the `PhysicalPlanBuilder` implementation. Here is a simplified example of how you might achieve this:
1. Define the `PushDownInfo`: this struct carries the information needed for push-down operations, such as projections, filters, and prewhere conditions (a simplified sketch of its shape follows this list).
2. Implement the `push_downs` method: this method constructs the `PushDownInfo` from the scan plan and the table schema.
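For orientation, here is roughly the shape of `PushDownInfo` implied by the snippet below. This is a hedged sketch, not the real definition: the actual struct in `databend_common_catalog::plan` carries additional fields (limits, index hints, and so on) and may name or wrap these differently.

```rust
use databend_common_catalog::plan::{Filters, PrewhereInfo, Projection};
use databend_common_expression::RemoteExpr;

// Sketch of PushDownInfo limited to the fields the example below fills in;
// types are inferred from how the snippet constructs them and are approximate.
pub struct PushDownInfo {
    pub projection: Projection,              // columns the scan must read
    pub output_columns: Option<Projection>,  // output set when virtual columns exist
    pub push_down_filter: Option<Filters>,   // filter plus its inversion for pruning
    pub prewhere_info: Option<PrewhereInfo>, // prewhere evaluation plan
    pub order_by: Option<Vec<(RemoteExpr<String>, bool, bool)>>, // (expr, asc, nulls_first)
    pub is_deterministic: bool,              // whether the filter is deterministic
}
```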
Here is a code snippet that demonstrates how the `PushDownInfo` is assembled during scan planning (the Iceberg side is covered further below):
```rust
use std::collections::HashSet;

use databend_common_catalog::plan::PushDownInfo;
use databend_common_expression::TableSchema;

use crate::plans::Scan;
use crate::PhysicalPlanBuilder;

// Additional imports this snippet relies on; the exact paths may differ
// between Databend versions.
use databend_common_catalog::plan::{Filters, PrewhereInfo};
use databend_common_exception::{ErrorCode, Result};
use databend_common_expression::type_check::check_function;
use databend_common_expression::types::DataType;
use databend_common_expression::{ConstantFolder, RemoteExpr};
use databend_common_functions::BUILTIN_FUNCTIONS;

use crate::executor::cast_expr_to_non_null_boolean;
use crate::plans::{FunctionCall, ScalarExpr};
use crate::{BaseTableColumn, ColumnEntry, DerivedColumn, TableInternalColumn, VirtualColumn};

impl PhysicalPlanBuilder {
    fn push_downs(
        &self,
        scan: &Scan,
        table_schema: &TableSchema,
        has_inner_column: bool,
        has_virtual_column: bool,
    ) -> Result<PushDownInfo> {
        let metadata = self.metadata.read().clone();

        // Projection push-down: the set of columns the scan must read.
        let projection = Self::build_projection(
            &metadata,
            table_schema,
            scan.columns.iter(),
            has_inner_column,
            true,
            true,
            true,
        );

        // With virtual columns, the output projection differs from the
        // read projection.
        let output_columns = if has_virtual_column {
            Some(Self::build_projection(
                &metadata,
                table_schema,
                scan.columns.iter(),
                has_inner_column,
                true,
                false,
                true,
            ))
        } else {
            None
        };

        // Filter push-down: type-check every predicate, fold them into a
        // single `and_filters` expression, and keep the inverted filter
        // for range pruning.
        let mut is_deterministic = true;
        let push_down_filter = scan
            .push_down_predicates
            .as_ref()
            .filter(|p| !p.is_empty())
            .map(|predicates: &Vec<ScalarExpr>| -> Result<Filters> {
                let predicates = predicates
                    .iter()
                    .map(|p| {
                        Ok(p.as_raw_expr()
                            .type_check(&metadata)?
                            .project_column_ref(|col| col.column_name.clone()))
                    })
                    .collect::<Result<Vec<_>>>()?;

                // `try_reduce` is nightly-only; Databend builds on nightly Rust.
                let expr = predicates
                    .into_iter()
                    .try_reduce(|lhs, rhs| {
                        check_function(None, "and_filters", &[], &[lhs, rhs], &BUILTIN_FUNCTIONS)
                    })?
                    .unwrap();

                let expr = cast_expr_to_non_null_boolean(expr)?;
                let (expr, _) = ConstantFolder::fold(&expr, &self.func_ctx, &BUILTIN_FUNCTIONS);
                is_deterministic = expr.is_deterministic(&BUILTIN_FUNCTIONS);

                let inverted_filter =
                    check_function(None, "not", &[], &[expr.clone()], &BUILTIN_FUNCTIONS)?;

                Ok(Filters {
                    filter: expr.as_remote_expr(),
                    inverted_filter: inverted_filter.as_remote_expr(),
                })
            })
            .transpose()?;

        // Prewhere push-down: evaluate cheap predicates first, then fetch
        // the remaining columns only for rows that pass.
        let prewhere_info = scan
            .prewhere
            .as_ref()
            .map(|prewhere| -> Result<PrewhereInfo> {
                let remain_columns = scan
                    .columns
                    .difference(&prewhere.prewhere_columns)
                    .copied()
                    .collect::<HashSet<usize>>();

                let output_columns = Self::build_projection(
                    &metadata,
                    table_schema,
                    prewhere.output_columns.iter(),
                    has_inner_column,
                    true,
                    false,
                    true,
                );
                let prewhere_columns = Self::build_projection(
                    &metadata,
                    table_schema,
                    prewhere.prewhere_columns.iter(),
                    has_inner_column,
                    true,
                    true,
                    true,
                );
                let remain_columns = Self::build_projection(
                    &metadata,
                    table_schema,
                    remain_columns.iter(),
                    has_inner_column,
                    true,
                    true,
                    true,
                );

                let predicate = prewhere
                    .predicates
                    .iter()
                    .cloned()
                    .reduce(|lhs, rhs| {
                        ScalarExpr::FunctionCall(FunctionCall {
                            span: None,
                            func_name: "and_filters".to_string(),
                            params: vec![],
                            arguments: vec![lhs, rhs],
                        })
                    })
                    .expect("there should be at least one predicate in prewhere");

                let filter = cast_expr_to_non_null_boolean(
                    predicate
                        .as_raw_expr()
                        .type_check(&metadata)?
                        .project_column_ref(|col| col.column_name.clone()),
                )?;
                let filter = filter.as_remote_expr();
                let virtual_columns = self.build_virtual_columns(&prewhere.prewhere_columns);

                Ok::<PrewhereInfo, ErrorCode>(PrewhereInfo {
                    output_columns,
                    prewhere_columns,
                    remain_columns,
                    filter,
                    virtual_columns,
                })
            })
            .transpose()?;

        // Order-by push-down: resolve each sort item to a column reference.
        let order_by = scan
            .order_by
            .clone()
            .map(|items| {
                items
                    .into_iter()
                    .map(|item| {
                        let metadata = self.metadata.read();
                        let column = metadata.column(item.index);
                        let (name, data_type) = match column {
                            ColumnEntry::BaseTableColumn(BaseTableColumn {
                                column_name,
                                data_type,
                                ..
                            }) => (column_name.clone(), DataType::from(data_type)),
                            ColumnEntry::DerivedColumn(DerivedColumn {
                                alias, data_type, ..
                            }) => (alias.clone(), data_type.clone()),
                            ColumnEntry::InternalColumn(TableInternalColumn {
                                internal_column,
                                ..
                            }) => (
                                internal_column.column_name().to_owned(),
                                internal_column.data_type(),
                            ),
                            ColumnEntry::VirtualColumn(VirtualColumn {
                                column_name,
                                data_type,
                                ..
                            }) => (column_name.clone(), DataType::from(data_type)),
                        };
                        // A sort item is already a plain column reference.
                        let scalar = RemoteExpr::ColumnRef {
                            span: None,
                            id: name.clone(),
                            data_type,
                            display_name: name,
                        };
                        Ok((scalar, item.asc, item.nulls_first))
                    })
                    .collect::<Result<Vec<_>>>()
            })
            .transpose()?;

        Ok(PushDownInfo {
            projection,
            output_columns,
            push_down_filter,
            prewhere_info,
            order_by,
            is_deterministic,
        })
    }
}
```
This code snippet shows how to build the `PushDownInfo` struct and feed it into the scan planning process. You can adapt this approach to the specific requirements of your Iceberg Rust integration.
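One concrete bridging step is resolving the pushed-down projection back to field names that iceberg-rust's scan builder can accept. A minimal sketch, assuming `Projection::Columns` carries positional indices into the table schema; the `projected_field_names` helper is hypothetical, and inner-column (nested) projections are not translated here:

```rust
use databend_common_catalog::plan::Projection;
use databend_common_expression::TableSchema;

// Hypothetical helper: map a pushed-down projection to column names that
// can be handed to iceberg-rust's `select`.
fn projected_field_names(projection: &Projection, schema: &TableSchema) -> Vec<String> {
    match projection {
        Projection::Columns(indices) => indices
            .iter()
            .map(|&i| schema.fields()[i].name().clone())
            .collect(),
        // Nested projections are skipped for brevity; fall back to reading
        // every top-level column.
        Projection::InnerColumns(_) => {
            schema.fields().iter().map(|f| f.name().clone()).collect()
        }
    }
}
```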
Additionally, you can refer to the `IcebergTable` implementation in `src/query/storages/iceberg/src/table.rs` for more context on how Iceberg tables are managed and how push-downs are utilized in Databend [4][5].
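On the Iceberg side, iceberg-rust already exposes scan planning, so the pushed-down projection and filter can be handed to it instead of reimplementing pruning. A minimal sketch, assuming the `iceberg` crate's `TableScan` builder API (`select`, `with_filter`, `plan_files`), with a hand-built predicate standing in for one translated from Databend's `Filters`:

```rust
use futures::TryStreamExt;
use iceberg::expr::Reference;
use iceberg::spec::Datum;
use iceberg::table::Table;
use iceberg::Result;

// Push projection and filter into iceberg-rust's scan planning, letting it
// prune manifests and data files before any data is read.
async fn plan_iceberg_scan(table: &Table) -> Result<()> {
    // Stand-in for a predicate translated from Databend's push-down filter,
    // here the equivalent of `c1 < 100`.
    let predicate = Reference::new("c1").less_than(Datum::long(100));

    let scan = table
        .scan()
        .select(["c1", "c2"]) // projection push-down
        .with_filter(predicate) // filter push-down
        .build()?;

    // Each FileScanTask identifies one data file that survived pruning.
    let mut tasks = scan.plan_files().await?;
    while let Some(task) = tasks.try_next().await? {
        println!("will read: {}", task.data_file_path());
    }
    Ok(())
}
```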
Summary
Iceberg Rust has implemented scan planning; let's integrate with it directly instead.
Tasks