use fnv::FnvHashSet;
use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor};
use crate::expr::{BoundPredicate, BoundReference};
use crate::spec::{DataFile, Datum, PrimitiveLiteral};
use crate::{Error, ErrorKind};
/// Largest number of literals in an `IN` predicate that is still compared against
/// the bounds; larger sets are reported as possibly matching without being checked.
const IN_PREDICATE_LIMIT: usize = 200;
/// Result returned when rows in the data file might match the predicate.
const ROWS_MIGHT_MATCH: crate::Result<bool> = Ok(true);
/// Result returned when the metrics show that no row in the data file can match.
const ROWS_CANNOT_MATCH: crate::Result<bool> = Ok(false);
/// Evaluates a bound predicate against the column metrics of a [`DataFile`]
/// (value, null and NaN counts plus lower and upper bounds) to decide whether
/// the file might contain matching rows.
pub(crate) struct InclusiveMetricsEvaluator<'a> {
    /// Data file whose metrics are consulted during evaluation.
    data_file: &'a DataFile,
}
impl<'a> InclusiveMetricsEvaluator<'a> {
    /// Creates an evaluator that reads metrics from `data_file`.
    fn new(data_file: &'a DataFile) -> Self {
        InclusiveMetricsEvaluator { data_file }
    }

    /// Evaluates `filter` against the metrics of `data_file`.
    ///
    /// Returns `Ok(true)` when rows in the file might match the predicate and
    /// `Ok(false)` when the metrics rule out any match. Unless
    /// `include_empty_files` is set, a file whose `record_count` is zero is
    /// reported as non-matching without visiting the predicate.
    ///
    /// # Errors
    ///
    /// Returns any error raised while visiting `filter`.
    pub(crate) fn eval(
        filter: &'a BoundPredicate,
        data_file: &'a DataFile,
        include_empty_files: bool,
    ) -> crate::Result<bool> {
        if !include_empty_files && data_file.record_count == 0 {
            return ROWS_CANNOT_MATCH;
        }

        let mut evaluator = Self::new(data_file);
        visit(&mut evaluator, filter)
    }

    /// Recorded NaN count for `field_id`, if the file has one.
    fn nan_count(&self, field_id: i32) -> Option<&u64> {
        self.data_file.nan_value_counts.get(&field_id)
    }

    /// Recorded null count for `field_id`, if the file has one.
    fn null_count(&self, field_id: i32) -> Option<&u64> {
        self.data_file.null_value_counts.get(&field_id)
    }

    /// Recorded value count for `field_id`, if the file has one.
    fn value_count(&self, field_id: i32) -> Option<&u64> {
        self.data_file.value_counts.get(&field_id)
    }

    /// Recorded lower bound for `field_id`, if the file has one.
    fn lower_bound(&self, field_id: i32) -> Option<&Datum> {
        self.data_file.lower_bounds.get(&field_id)
    }

    /// Recorded upper bound for `field_id`, if the file has one.
    fn upper_bound(&self, field_id: i32) -> Option<&Datum> {
        self.data_file.upper_bounds.get(&field_id)
    }

    /// True when a NaN count is recorded for the field and equals its recorded
    /// value count. A missing NaN count always yields `false`.
    fn contains_nans_only(&self, field_id: i32) -> bool {
        let nan_count = self.nan_count(field_id);
        let value_count = self.value_count(field_id);

        nan_count.is_some() && nan_count == value_count
    }

    /// True when a null count is recorded for the field and equals its recorded
    /// value count. A missing null count always yields `false`.
    fn contains_nulls_only(&self, field_id: i32) -> bool {
        let null_count = self.null_count(field_id);
        let value_count = self.value_count(field_id);

        null_count.is_some() && null_count == value_count
    }

    /// True unless the recorded null count for the field is zero; a missing
    /// count is treated as "may contain nulls".
    fn may_contain_null(&self, field_id: i32) -> bool {
        match self.null_count(field_id) {
            Some(&null_count) => null_count > 0,
            None => true,
        }
    }

    /// Shared implementation of the `<`, `<=`, `>` and `>=` predicates.
    ///
    /// `cmp_fn` is invoked as `cmp_fn(bound, datum)`, where `bound` is the
    /// field's lower bound when `use_lower_bound` is true and its upper bound
    /// otherwise. Rows might match when the comparison holds, when no bound is
    /// recorded, or when either the literal or the bound is NaN.
    fn visit_inequality(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        cmp_fn: fn(&Datum, &Datum) -> bool,
        use_lower_bound: bool,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        if datum.is_nan() {
            // A NaN literal cannot be compared meaningfully against the bounds.
            return ROWS_MIGHT_MATCH;
        }

        let bound = if use_lower_bound {
            self.lower_bound(field_id)
        } else {
            self.upper_bound(field_id)
        };

        let Some(bound) = bound else {
            // Without a recorded bound nothing can be ruled out.
            return ROWS_MIGHT_MATCH;
        };

        if bound.is_nan() {
            // A NaN bound is unreliable and must not be used to prune the file;
            // `eq` and `r#in` treat NaN bounds the same way.
            return ROWS_MIGHT_MATCH;
        }

        if cmp_fn(bound, datum) {
            ROWS_MIGHT_MATCH
        } else {
            ROWS_CANNOT_MATCH
        }
    }
}
impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
    type T = bool;

    /// An always-true predicate might match rows in any file.
    fn always_true(&mut self) -> crate::Result<bool> {
        ROWS_MIGHT_MATCH
    }

    /// An always-false predicate cannot match any row.
    fn always_false(&mut self) -> crate::Result<bool> {
        ROWS_CANNOT_MATCH
    }

    /// A conjunction might match only when both operands might match.
    fn and(&mut self, lhs: bool, rhs: bool) -> crate::Result<bool> {
        Ok(lhs && rhs)
    }

    /// A disjunction might match when either operand might match.
    fn or(&mut self, lhs: bool, rhs: bool) -> crate::Result<bool> {
        Ok(lhs || rhs)
    }

    /// Negates the result computed for the inner predicate.
    ///
    /// NOTE(review): `true` only means "rows might match", so negating it can
    /// report "cannot match" for a file that also holds non-matching rows.
    /// Presumably callers rewrite `NOT` away before evaluation — confirm.
    fn not(&mut self, inner: bool) -> crate::Result<bool> {
        Ok(!inner)
    }

    /// `column IS NULL`: ruled out only when the recorded null count is zero.
    fn is_null(
        &mut self,
        reference: &BoundReference,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        match self.null_count(field_id) {
            Some(&0) => ROWS_CANNOT_MATCH,
            // A non-zero or missing count cannot rule nulls out.
            Some(_) | None => ROWS_MIGHT_MATCH,
        }
    }

    /// `column IS NOT NULL`: ruled out only when every recorded value is null.
    fn not_null(
        &mut self,
        reference: &BoundReference,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nulls_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        ROWS_MIGHT_MATCH
    }

    /// `column IS NAN`: ruled out when the recorded NaN count is zero or when
    /// the column holds only nulls.
    fn is_nan(
        &mut self,
        reference: &BoundReference,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        match self.nan_count(field_id) {
            Some(&0) => ROWS_CANNOT_MATCH,
            _ if self.contains_nulls_only(field_id) => ROWS_CANNOT_MATCH,
            _ => ROWS_MIGHT_MATCH,
        }
    }

    /// `column IS NOT NAN`: ruled out only when every recorded value is NaN.
    fn not_nan(
        &mut self,
        reference: &BoundReference,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nans_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        ROWS_MIGHT_MATCH
    }

    /// `column < datum`: compares the lower bound against `datum` with `<`.
    fn less_than(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        self.visit_inequality(reference, datum, PartialOrd::lt, true)
    }

    /// `column <= datum`: compares the lower bound against `datum` with `<=`.
    fn less_than_or_eq(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        self.visit_inequality(reference, datum, PartialOrd::le, true)
    }

    /// `column > datum`: compares the upper bound against `datum` with `>`.
    fn greater_than(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        self.visit_inequality(reference, datum, PartialOrd::gt, false)
    }

    /// `column >= datum`: compares the upper bound against `datum` with `>=`.
    fn greater_than_or_eq(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        self.visit_inequality(reference, datum, PartialOrd::ge, false)
    }

    /// `column == datum`: ruled out when the column holds only nulls or only
    /// NaNs, or when `datum` lies outside the recorded (non-NaN) bounds.
    fn eq(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        if let Some(lower_bound) = self.lower_bound(field_id) {
            if lower_bound.is_nan() {
                // A NaN bound cannot be used to rule the file out.
                return ROWS_MIGHT_MATCH;
            } else if lower_bound.gt(datum) {
                return ROWS_CANNOT_MATCH;
            }
        }

        if let Some(upper_bound) = self.upper_bound(field_id) {
            if upper_bound.is_nan() {
                // A NaN bound cannot be used to rule the file out.
                return ROWS_MIGHT_MATCH;
            } else if upper_bound.lt(datum) {
                return ROWS_CANNOT_MATCH;
            }
        }

        ROWS_MIGHT_MATCH
    }

    /// `column != datum`: never ruled out from the metrics.
    fn not_eq(
        &mut self,
        _reference: &BoundReference,
        _datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        ROWS_MIGHT_MATCH
    }

    /// `column STARTS WITH datum`: ruled out when the column holds only nulls
    /// or when the prefix sorts outside the bounds truncated to its length.
    ///
    /// # Errors
    ///
    /// Returns an `Unexpected` error when the literal or a recorded bound is
    /// not a string.
    fn starts_with(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nulls_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        let PrimitiveLiteral::String(prefix) = datum.literal() else {
            return Err(Error::new(
                ErrorKind::Unexpected,
                "Cannot use StartsWith operator on non-string values",
            ));
        };

        // Bounds are truncated to at most this many characters before being
        // compared with the prefix.
        let prefix_len = prefix.chars().count();

        if let Some(lower_bound) = self.lower_bound(field_id) {
            let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else {
                return Err(Error::new(
                    ErrorKind::Unexpected,
                    "Cannot use StartsWith operator on non-string lower_bound value",
                ));
            };

            let truncated_lower_bound = lower_bound.chars().take(prefix_len).collect::<String>();
            if prefix < &truncated_lower_bound {
                return ROWS_CANNOT_MATCH;
            }
        }

        if let Some(upper_bound) = self.upper_bound(field_id) {
            let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else {
                return Err(Error::new(
                    ErrorKind::Unexpected,
                    "Cannot use StartsWith operator on non-string upper_bound value",
                ));
            };

            let truncated_upper_bound = upper_bound.chars().take(prefix_len).collect::<String>();
            if prefix > &truncated_upper_bound {
                return ROWS_CANNOT_MATCH;
            }
        }

        ROWS_MIGHT_MATCH
    }

    /// `column NOT STARTS WITH datum`: ruled out only when the column cannot
    /// hold nulls and both recorded bounds start with the prefix.
    ///
    /// # Errors
    ///
    /// Returns an `Unexpected` error when the literal or an inspected bound is
    /// not a string.
    fn not_starts_with(
        &mut self,
        reference: &BoundReference,
        datum: &Datum,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        // A column that may hold nulls is never ruled out here.
        if self.may_contain_null(field_id) {
            return ROWS_MIGHT_MATCH;
        }

        let PrimitiveLiteral::String(prefix) = datum.literal() else {
            return Err(Error::new(
                ErrorKind::Unexpected,
                "Cannot use NotStartsWith operator on non-string values",
            ));
        };

        let Some(lower_bound) = self.lower_bound(field_id) else {
            return ROWS_MIGHT_MATCH;
        };

        let PrimitiveLiteral::String(lower_bound_str) = lower_bound.literal() else {
            return Err(Error::new(
                ErrorKind::Unexpected,
                "Cannot use NotStartsWith operator on non-string lower_bound value",
            ));
        };

        // The upper bound is only inspected once the lower bound is known to
        // start with the prefix.
        if !lower_bound_str.starts_with(prefix.as_str()) {
            return ROWS_MIGHT_MATCH;
        }

        let Some(upper_bound) = self.upper_bound(field_id) else {
            return ROWS_MIGHT_MATCH;
        };

        let PrimitiveLiteral::String(upper_bound_str) = upper_bound.literal() else {
            return Err(Error::new(
                ErrorKind::Unexpected,
                "Cannot use NotStartsWith operator on non-string upper_bound value",
            ));
        };

        if upper_bound_str.starts_with(prefix.as_str()) {
            // Both bounds start with the prefix.
            return ROWS_CANNOT_MATCH;
        }

        ROWS_MIGHT_MATCH
    }

    /// `column IN literals`: ruled out when the column holds only nulls or only
    /// NaNs, or when every literal lies outside the recorded (non-NaN) bounds.
    /// Sets larger than `IN_PREDICATE_LIMIT` are not checked against the bounds.
    fn r#in(
        &mut self,
        reference: &BoundReference,
        literals: &FnvHashSet<Datum>,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        let field_id = reference.field().id;

        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
            return ROWS_CANNOT_MATCH;
        }

        if literals.len() > IN_PREDICATE_LIMIT {
            // Too many literals to be worth comparing against the bounds.
            return ROWS_MIGHT_MATCH;
        }

        if let Some(lower_bound) = self.lower_bound(field_id) {
            if lower_bound.is_nan() {
                // A NaN bound cannot be used to rule the file out.
                return ROWS_MIGHT_MATCH;
            }

            // Every literal is below the lower bound.
            if !literals.iter().any(|datum| datum.ge(lower_bound)) {
                return ROWS_CANNOT_MATCH;
            }
        }

        if let Some(upper_bound) = self.upper_bound(field_id) {
            if upper_bound.is_nan() {
                // A NaN bound cannot be used to rule the file out.
                return ROWS_MIGHT_MATCH;
            }

            // Every literal is above the upper bound.
            if !literals.iter().any(|datum| datum.le(upper_bound)) {
                return ROWS_CANNOT_MATCH;
            }
        }

        ROWS_MIGHT_MATCH
    }

    /// `column NOT IN literals`: never ruled out from the metrics.
    fn not_in(
        &mut self,
        _reference: &BoundReference,
        _literals: &FnvHashSet<Datum>,
        _predicate: &BoundPredicate,
    ) -> crate::Result<bool> {
        ROWS_MIGHT_MATCH
    }
}
#[cfg(test)]
mod test {
use std::collections::HashMap;
use std::ops::Not;
use std::sync::Arc;
use fnv::FnvHashSet;
use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator;
use crate::expr::PredicateOperator::{
Eq, GreaterThan, GreaterThanOrEq, In, IsNan, IsNull, LessThan, LessThanOrEq, NotEq, NotIn,
NotNan, NotNull, NotStartsWith, StartsWith,
};
use crate::expr::{
BinaryExpression, Bind, BoundPredicate, Predicate, Reference, SetExpression,
UnaryExpression,
};
use crate::spec::{
DataContentType, DataFile, DataFileFormat, Datum, NestedField, PartitionSpec,
PartitionSpecRef, PrimitiveType, Schema, SchemaRef, Struct, Transform, Type,
UnboundPartitionField,
};
// Value the assertions below treat as the lower bound of the `id` column
// (their messages compare against 30); the fixtures are defined outside this view.
const INT_MIN_VALUE: i32 = 30;
// Value the assertions below treat as the upper bound of the `id` column
// (their messages compare against 79).
const INT_MAX_VALUE: i32 = 79;
#[test]
fn test_data_file_no_partitions() {
    // Only the schema half of the fixture is needed to bind the predicate.
    let (_partition_spec, schema) = create_test_partition_spec();
    let always_true = Predicate::AlwaysTrue.bind(schema.clone(), false).unwrap();

    // The third argument of `eval` is `include_empty_files`.
    let include_empty_files = false;
    let file = create_test_data_file();

    let might_match =
        InclusiveMetricsEvaluator::eval(&always_true, &file, include_empty_files).unwrap();
    assert!(might_match);
}
#[test]
fn test_all_nulls() {
    // Evaluates `predicate` against test file 1 and compares the read/skip
    // decision with `expected`; `message` is reported on mismatch.
    let check = |predicate: &BoundPredicate, expected: bool, message: &str| {
        let result =
            InclusiveMetricsEvaluator::eval(predicate, &get_test_file_1(), true).unwrap();
        assert!(result == expected, "{}", message);
    };

    check(
        &not_null("all_nulls"),
        false,
        "Should skip: no non-null value in all null column",
    );
    check(
        &less_than("all_nulls", "a"),
        false,
        "Should skip: LessThan on an all null column",
    );
    check(
        &less_than_or_equal("all_nulls", "a"),
        false,
        "Should skip: LessThanOrEqual on an all null column",
    );
    check(
        &greater_than("all_nulls", "a"),
        false,
        "Should skip: GreaterThan on an all null column",
    );
    check(
        &greater_than_or_equal("all_nulls", "a"),
        false,
        "Should skip: GreaterThanOrEqual on an all null column",
    );
    check(
        &equal("all_nulls", "a"),
        false,
        "Should skip: Equal on an all null column",
    );
    check(
        &starts_with("all_nulls", "a"),
        false,
        "Should skip: StartsWith on an all null column",
    );
    check(
        &not_starts_with("all_nulls", "a"),
        true,
        "Should read: NotStartsWith on an all null column",
    );
    check(
        &not_null("some_nulls"),
        true,
        "Should read: col with some nulls could contain a non-null value",
    );
    check(
        &not_null("no_nulls"),
        true,
        "Should read: col with all nulls contains a non-null value",
    );
}
#[test]
fn test_no_nulls() {
    // (column, expected decision, failure message) for `is_null(column)` on test file 1.
    let cases = [
        (
            "all_nulls",
            true,
            "Should read: col with all nulls contains a non-null value",
        ),
        (
            "some_nulls",
            true,
            "Should read: col with some nulls could contain a non-null value",
        ),
        (
            "no_nulls",
            false,
            "Should skip: col with no nulls can't contains a non-null value",
        ),
    ];

    for (column, expected, message) in cases {
        let result =
            InclusiveMetricsEvaluator::eval(&is_null(column), &get_test_file_1(), true).unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_is_nan() {
    // (column, expected decision, failure message) for `is_nan(column)` on test file 1.
    let cases = [
        (
            "all_nans",
            true,
            "Should read: col with all nans must contains a nan value",
        ),
        (
            "some_nans",
            true,
            "Should read: col with some nans could contains a nan value",
        ),
        (
            "no_nans",
            false,
            "Should skip: col with no nans can't contains a nan value",
        ),
        (
            "all_nulls_double",
            false,
            "Should skip: col with no nans can't contains a nan value",
        ),
        (
            "no_nan_stats",
            true,
            "Should read: no guarantee col is nan-free without nan stats",
        ),
        (
            "all_nans_v1_stats",
            true,
            "Should read: col with all nans must contains a nan value",
        ),
        (
            "nan_and_null_only",
            true,
            "Should read: col with nans and nulls must contain a nan value",
        ),
    ];

    for (column, expected, message) in cases {
        let result =
            InclusiveMetricsEvaluator::eval(&is_nan(column), &get_test_file_1(), true).unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_not_nan() {
    // (column, expected decision, failure message) for `not_nan(column)` on test file 1.
    // The failure messages are kept verbatim from the original assertions.
    let cases = [
        (
            "all_nans",
            false,
            "Should read: col with all nans must contains a nan value",
        ),
        (
            "some_nans",
            true,
            "Should read: col with some nans could contains a nan value",
        ),
        (
            "no_nans",
            true,
            "Should read: col with no nans might contains a non-nan value",
        ),
        (
            "all_nulls_double",
            true,
            "Should read: col with no nans can't contains a nan value",
        ),
        (
            "no_nan_stats",
            true,
            "Should read: no guarantee col is nan-free without nan stats",
        ),
        (
            "all_nans_v1_stats",
            true,
            "Should read: col with all nans must contains a nan value",
        ),
        (
            "nan_and_null_only",
            true,
            "Should read: col with nans and nulls may contain a non-nan value",
        ),
    ];

    for (column, expected, message) in cases {
        let result =
            InclusiveMetricsEvaluator::eval(&not_nan(column), &get_test_file_1(), true).unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_required_column() {
    // `not_null` on the `required` column must keep the file.
    let result =
        InclusiveMetricsEvaluator::eval(&not_null("required"), &get_test_file_1(), true)
            .unwrap();
    assert!(result, "Should read: required columns are always non-null");

    // `is_null` on the `required` column must skip the file.
    let result =
        InclusiveMetricsEvaluator::eval(&is_null("required"), &get_test_file_1(), true)
            .unwrap();
    assert!(!result, "Should skip: required columns are always non-null");
}
#[test]
#[should_panic]
fn test_missing_column() {
    // The test passes when anything below panics; the predicate is built first,
    // exactly as in the single-expression form.
    let predicate = less_than("missing", "a");
    let data_file = get_test_file_1();
    let _outcome = InclusiveMetricsEvaluator::eval(&predicate, &data_file, true);
}
#[test]
fn test_missing_stats() {
    let file_without_stats = create_test_data_file();

    // Every predicate targets the `no_stats` column and is expected to keep the file.
    let predicates = [
        less_than_int("no_stats", 5),
        less_than_or_equal_int("no_stats", 30),
        equal_int("no_stats", 70),
        greater_than_int("no_stats", 78),
        greater_than_or_equal_int("no_stats", 90),
        not_equal_int("no_stats", 101),
        is_null("no_stats"),
        not_null("no_stats"),
    ];

    for predicate in &predicates {
        let might_match =
            InclusiveMetricsEvaluator::eval(predicate, &file_without_stats, true).unwrap();
        assert!(
            might_match,
            "Should read if stats are missing for {:?}",
            predicate
        );
    }
}
#[test]
fn test_zero_record_file() {
    // NOTE(review): assumes this fixture reports `record_count == 0`; it is
    // defined outside this view — confirm.
    let zero_records_datafile = create_zero_records_data_file();

    let expressions = [
        less_than_int("no_stats", 5),
        less_than_or_equal_int("no_stats", 30),
        equal_int("no_stats", 70),
        greater_than_int("no_stats", 78),
        greater_than_or_equal_int("no_stats", 90),
        not_equal_int("no_stats", 101),
        is_null("no_stats"),
        not_null("no_stats"),
    ];

    for expression in expressions {
        // With `include_empty_files == false`, `eval` short-circuits on a zero
        // record count before visiting the predicate.
        let result =
            InclusiveMetricsEvaluator::eval(&expression, &zero_records_datafile, false).unwrap();
        assert!(
            !result,
            "Should skip if data file has zero records (expression: {:?})",
            &expression
        );

        // With `include_empty_files == true`, the predicate is evaluated against
        // the file's metrics as usual.
        let result =
            InclusiveMetricsEvaluator::eval(&expression, &zero_records_datafile, true).unwrap();
        assert!(
            result,
            "Should read if empty files are included (expression: {:?})",
            &expression
        );
    }
}
#[test]
fn test_not() {
    // The inner `id < 5` evaluates to "cannot match", so its negation reads the file.
    let result = InclusiveMetricsEvaluator::eval(
        &not_less_than_int("id", INT_MIN_VALUE - 25),
        &get_test_file_1(),
        true,
    )
    .unwrap();
    assert!(result, "Should read: not(false)");

    // The inner `id > 5` evaluates to "might match", so its negation skips the file.
    let result = InclusiveMetricsEvaluator::eval(
        &not_greater_than_int("id", INT_MIN_VALUE - 25),
        &get_test_file_1(),
        true,
    )
    .unwrap();
    assert!(!result, "Should skip: not(true)");
}
#[test]
fn test_and() {
    // Builds the unbound predicate `id <op> value`.
    let id_cmp = |op, value: i32| {
        Predicate::Binary(BinaryExpression::new(
            op,
            Reference::new("id"),
            Datum::int(value),
        ))
    };
    // Binds `lhs AND rhs` against the test schema and evaluates it on test file 1.
    let eval_and = |lhs: Predicate, rhs: Predicate| {
        let schema = create_test_schema();
        let bound_pred = lhs.and(rhs).bind(schema.clone(), true).unwrap();
        InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap()
    };

    let result = eval_and(
        id_cmp(LessThan, INT_MIN_VALUE - 25),
        id_cmp(GreaterThanOrEq, INT_MIN_VALUE - 30),
    );
    assert!(!result, "Should skip: and(false, true)");

    let result = eval_and(
        id_cmp(LessThan, INT_MIN_VALUE - 25),
        id_cmp(GreaterThanOrEq, INT_MAX_VALUE + 1),
    );
    assert!(!result, "Should skip: and(false, false)");

    let result = eval_and(
        id_cmp(GreaterThan, INT_MIN_VALUE - 25),
        id_cmp(LessThanOrEq, INT_MIN_VALUE),
    );
    assert!(result, "Should read: and(true, true)");
}
#[test]
fn test_or() {
    // Builds the unbound predicate `id <op> value`.
    let id_cmp = |op, value: i32| {
        Predicate::Binary(BinaryExpression::new(
            op,
            Reference::new("id"),
            Datum::int(value),
        ))
    };
    // Binds `lhs OR rhs` against the test schema and evaluates it on test file 1.
    let eval_or = |lhs: Predicate, rhs: Predicate| {
        let schema = create_test_schema();
        let bound_pred = lhs.or(rhs).bind(schema.clone(), true).unwrap();
        InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap()
    };

    let result = eval_or(
        id_cmp(LessThan, INT_MIN_VALUE - 25),
        id_cmp(GreaterThanOrEq, INT_MIN_VALUE - 30),
    );
    assert!(result, "Should read: or(false, true)");

    let result = eval_or(
        id_cmp(LessThan, INT_MIN_VALUE - 25),
        id_cmp(GreaterThanOrEq, INT_MAX_VALUE + 1),
    );
    assert!(!result, "Should skip: or(false, false)");
}
#[test]
fn test_integer_lt() {
    // (literal, expected decision, failure message) for `id < literal` on test file 1.
    let cases = [
        (
            INT_MIN_VALUE - 25,
            false,
            "Should skip: id range below lower bound (5 < 30)",
        ),
        (
            INT_MIN_VALUE,
            false,
            "Should skip: id range below lower bound (30 is not < 30)",
        ),
        (INT_MIN_VALUE + 1, true, "Should read: one possible id"),
        (INT_MAX_VALUE, true, "Should read: many possible ids"),
    ];

    for (literal, expected, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &less_than_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_integer_lt_eq() {
    // (literal, expected decision, failure message) for `id <= literal` on test file 1.
    let cases = [
        (
            INT_MIN_VALUE - 25,
            false,
            "Should skip: id range below lower bound (5 < 30)",
        ),
        (
            INT_MIN_VALUE - 1,
            false,
            "Should skip: id range below lower bound (29 < 30)",
        ),
        (INT_MIN_VALUE, true, "Should read: one possible id"),
        (INT_MAX_VALUE, true, "Should read: many possible ids"),
    ];

    for (literal, expected, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &less_than_or_equal_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_integer_gt() {
    // (literal, expected decision, failure message) for `id > literal` on test file 1.
    let cases = [
        (
            INT_MAX_VALUE + 6,
            false,
            "Should skip: id range above upper bound (85 > 79)",
        ),
        (
            INT_MAX_VALUE,
            false,
            "Should skip: id range above upper bound (79 is not > 79)",
        ),
        (INT_MAX_VALUE - 1, true, "Should read: one possible id"),
        (INT_MAX_VALUE - 4, true, "Should read: many possible ids"),
    ];

    for (literal, expected, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &greater_than_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_integer_gt_eq() {
    // (literal, expected decision, failure message) for `id >= literal` on test file 1.
    let cases = [
        (
            INT_MAX_VALUE + 6,
            false,
            "Should skip: id range above upper bound (85 < 79)",
        ),
        (
            INT_MAX_VALUE + 1,
            false,
            "Should skip: id range above upper bound (80 > 79)",
        ),
        (INT_MAX_VALUE, true, "Should read: one possible id"),
        (INT_MAX_VALUE - 4, true, "Should read: many possible ids"),
    ];

    for (literal, expected, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &greater_than_or_equal_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_integer_eq() {
    // (literal, expected decision, failure message) for `id == literal` on test file 1.
    let cases = [
        (INT_MIN_VALUE - 25, false, "Should skip: id below lower bound"),
        (INT_MIN_VALUE - 1, false, "Should skip: id below lower bound"),
        (INT_MIN_VALUE, true, "Should read: id equal to lower bound"),
        (
            INT_MAX_VALUE - 4,
            true,
            "Should read: id between lower and upper bounds",
        ),
        (INT_MAX_VALUE, true, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, false, "Should skip: id above upper bound"),
        (INT_MAX_VALUE + 6, false, "Should skip: id above upper bound"),
    ];

    for (literal, expected, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &equal_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result == expected, "{}", message);
    }
}
#[test]
fn test_integer_not_eq() {
    // (literal, failure message) for `id != literal` on test file 1; every case
    // is expected to keep the file.
    let cases = [
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (
            INT_MAX_VALUE - 4,
            "Should read: id between lower and upper bounds",
        ),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    ];

    for (literal, message) in cases {
        let result = InclusiveMetricsEvaluator::eval(
            &not_equal_int("id", literal),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(result, "{}", message);
    }
}
#[test]
#[should_panic]
fn test_case_sensitive_integer_not_eq_rewritten() {
    // The test passes when anything below panics; the predicate on the
    // upper-cased column name "ID" is built first, as in the original expression.
    let predicate = equal_int_not("ID", 5);
    let data_file = get_test_file_1();
    let _outcome = InclusiveMetricsEvaluator::eval(&predicate, &data_file, true).unwrap();
}
#[test]
fn test_string_starts_with() {
    // Evaluates `predicate` against `data_file` and compares the read/skip
    // decision with `expected`; `message` is reported on mismatch.
    let check =
        |predicate: &BoundPredicate, data_file: &DataFile, expected: bool, message: &str| {
            let result = InclusiveMetricsEvaluator::eval(predicate, data_file, true).unwrap();
            assert!(result == expected, "{}", message);
        };

    check(
        &starts_with("required", "a"),
        &get_test_file_1(),
        true,
        "Should read: no stats",
    );
    check(
        &starts_with("required", "a"),
        &get_test_file_2(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "aa"),
        &get_test_file_2(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "aaa"),
        &get_test_file_2(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "1s"),
        &get_test_file_3(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "1str1x"),
        &get_test_file_3(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "ff"),
        &get_test_file_4(),
        true,
        "Should read: range matches",
    );
    check(
        &starts_with("required", "aB"),
        &get_test_file_2(),
        false,
        "Should skip: range does not match",
    );
    check(
        &starts_with("required", "dWX"),
        &get_test_file_2(),
        false,
        "Should skip: range does not match",
    );
    check(
        &starts_with("required", "5"),
        &get_test_file_3(),
        false,
        "Should skip: range does not match",
    );
    check(
        &starts_with("required", "3str3x"),
        &get_test_file_3(),
        false,
        "Should skip: range does not match",
    );
    check(
        &starts_with("some_empty", "房东整租霍"),
        &get_test_file_1(),
        true,
        "Should read: range does matches",
    );
    check(
        &starts_with("all_nulls", ""),
        &get_test_file_1(),
        false,
        "Should skip: range does not match",
    );

    let above_max = "イロハニボ";
    check(
        &starts_with("required", above_max),
        &get_test_file_4(),
        false,
        "Should skip: range does not match",
    );
}
#[test]
fn test_string_not_starts_with() {
    // Evaluates `predicate` against `data_file`; every case in this test is
    // expected to keep the file, and `message` is reported otherwise.
    let check_reads = |predicate: &BoundPredicate, data_file: &DataFile, message: &str| {
        let result = InclusiveMetricsEvaluator::eval(predicate, data_file, true).unwrap();
        assert!(result, "{}", message);
    };

    check_reads(
        &not_starts_with("required", "a"),
        &get_test_file_1(),
        "Should read: no stats",
    );
    check_reads(
        &not_starts_with("required", "a"),
        &get_test_file_2(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "aa"),
        &get_test_file_2(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "aaa"),
        &get_test_file_2(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "1s"),
        &get_test_file_3(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "1str1x"),
        &get_test_file_3(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "ff"),
        &get_test_file_4(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "aB"),
        &get_test_file_2(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "dWX"),
        &get_test_file_2(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "5"),
        &get_test_file_3(),
        "Should read: range matches",
    );
    check_reads(
        &not_starts_with("required", "3str3x"),
        &get_test_file_3(),
        "Should read: range matches",
    );

    let above_max = "イロハニホヘト";
    check_reads(
        &not_starts_with("required", above_max),
        &get_test_file_4(),
        "Should read: range matches",
    );
}
#[test]
fn test_integer_in() {
    // Evaluates `predicate` against test file 1 and compares the read/skip
    // decision with `expected`; `message` is reported on mismatch.
    let check = |predicate: &BoundPredicate, expected: bool, message: &str| {
        let result =
            InclusiveMetricsEvaluator::eval(predicate, &get_test_file_1(), true).unwrap();
        assert!(result == expected, "{}", message);
    };

    check(
        &r#in_int("id", &[INT_MIN_VALUE - 25, INT_MIN_VALUE - 24]),
        false,
        "Should skip: id below lower bound (5 < 30, 6 < 30)",
    );
    check(
        &r#in_int("id", &[INT_MIN_VALUE - 2, INT_MIN_VALUE - 1]),
        false,
        "Should skip: id below lower bound (28 < 30, 29 < 30)",
    );
    check(
        &r#in_int("id", &[INT_MIN_VALUE - 1, INT_MIN_VALUE]),
        true,
        "Should read: id equal to lower bound (30 == 30)",
    );
    check(
        &r#in_int("id", &[INT_MAX_VALUE - 4, INT_MAX_VALUE - 3]),
        true,
        "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)",
    );
    check(
        &r#in_int("id", &[INT_MAX_VALUE, INT_MAX_VALUE + 1]),
        true,
        "Should read: id equal to upper bound (79 == 79)",
    );
    check(
        &r#in_int("id", &[INT_MAX_VALUE + 1, INT_MAX_VALUE + 2]),
        false,
        "Should skip: id above upper bound (80 > 79, 81 > 79)",
    );
    check(
        &r#in_int("id", &[INT_MAX_VALUE + 6, INT_MAX_VALUE + 7]),
        false,
        "Should skip: id above upper bound (85 > 79, 86 > 79)",
    );

    check(
        &r#in_str("all_nulls", &["abc", "def"]),
        false,
        "Should skip: in on all nulls column",
    );
    check(
        &r#in_str("some_nulls", &["abc", "def"]),
        true,
        "Should read: in on some nulls column",
    );
    check(
        &r#in_str("no_nulls", &["abc", "def"]),
        true,
        "Should read: in on no nulls column",
    );

    // 401 literals, all below the values used as the `id` lower bound elsewhere in these tests.
    let ids = (-400..=0).collect::<Vec<_>>();
    check(
        &r#in_int("id", &ids),
        true,
        "Should read: number of items in In expression greater than threshold",
    );
}
/// Checks `NOT IN` predicates against the metrics of `get_test_file_1`.
///
/// Every case here expects the evaluator to report that rows might match.
#[test]
fn test_integer_not_in() {
    // (excluded ids, failure message)
    let id_cases = vec![
        (
            [INT_MIN_VALUE - 25, INT_MIN_VALUE - 24],
            "Should read: id below lower bound (5 < 30, 6 < 30)",
        ),
        (
            [INT_MIN_VALUE - 2, INT_MIN_VALUE - 1],
            "Should read: id below lower bound (28 < 30, 29 < 30)",
        ),
        (
            [INT_MIN_VALUE - 1, INT_MIN_VALUE],
            "Should read: id equal to lower bound (30 == 30)",
        ),
        (
            [INT_MAX_VALUE - 4, INT_MAX_VALUE - 3],
            "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)",
        ),
        (
            [INT_MAX_VALUE, INT_MAX_VALUE + 1],
            "Should read: id equal to upper bound (79 == 79)",
        ),
        (
            [INT_MAX_VALUE + 1, INT_MAX_VALUE + 2],
            "Should read: id above upper bound (80 > 79, 81 > 79)",
        ),
        (
            [INT_MAX_VALUE + 6, INT_MAX_VALUE + 7],
            "Should read: id above upper bound (85 > 79, 86 > 79)",
        ),
    ];
    for (excluded, message) in id_cases {
        let outcome = InclusiveMetricsEvaluator::eval(
            &not_in_int("id", &excluded),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(outcome, "{}", message);
    }

    // (string column, failure message)
    let string_cases = vec![
        ("all_nulls", "Should read: NotIn on all nulls column"),
        ("some_nulls", "Should read: NotIn on some nulls column"),
        ("no_nulls", "Should read: NotIn on no nulls column"),
    ];
    for (column, message) in string_cases {
        let outcome = InclusiveMetricsEvaluator::eval(
            &not_in_str(column, &["abc", "def"]),
            &get_test_file_1(),
            true,
        )
        .unwrap();
        assert!(outcome, "{}", message);
    }
}
/// Builds a one-column schema and a partition spec over that column.
///
/// The schema holds a single optional float field `a` (id 1). The spec has
/// spec id 1 and one identity-transform partition field, also named `a`,
/// sourced from that column. Panics if either builder rejects its input.
fn create_test_partition_spec() -> (PartitionSpecRef, SchemaRef) {
    let float_column = NestedField::optional(1, "a", Type::Primitive(PrimitiveType::Float));
    let schema_ref = Arc::new(
        Schema::builder()
            .with_fields(vec![Arc::new(float_column)])
            .build()
            .unwrap(),
    );

    let identity_partition = UnboundPartitionField::builder()
        .source_id(1)
        .name("a".to_string())
        .field_id(1)
        .transform(Transform::Identity)
        .build();

    // The schema handle is returned as well, so the spec builder gets its own clone.
    let spec = PartitionSpec::builder(schema_ref.clone())
        .with_spec_id(1)
        .add_unbound_fields(vec![identity_partition])
        .unwrap()
        .build()
        .unwrap();

    (Arc::new(spec), schema_ref)
}
/// Builds a `NotNull` predicate on column `reference`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn not_null(reference: &str) -> BoundPredicate {
    let filter = Predicate::Unary(UnaryExpression::new(NotNull, Reference::new(reference)));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds an `IsNull` predicate on column `reference`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn is_null(reference: &str) -> BoundPredicate {
    let filter = Predicate::Unary(UnaryExpression::new(IsNull, Reference::new(reference)));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds a `NotNan` predicate on column `reference`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn not_nan(reference: &str) -> BoundPredicate {
    let filter = Predicate::Unary(UnaryExpression::new(NotNan, Reference::new(reference)));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds an `IsNan` predicate on column `reference`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn is_nan(reference: &str) -> BoundPredicate {
    let filter = Predicate::Unary(UnaryExpression::new(IsNan, Reference::new(reference)));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference < str_literal` (string literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn less_than(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        LessThan,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference <= str_literal` (string literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn less_than_or_equal(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        LessThanOrEq,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference > str_literal` (string literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn greater_than(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        GreaterThan,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference >= str_literal` (string literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn greater_than_or_equal(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        GreaterThanOrEq,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference == str_literal` (string literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn equal(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        Eq,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference < int_literal` (int literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn less_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        LessThan,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds the negation of `reference < int_literal`, bound to the test schema.
///
/// The negation is expressed by calling `.not()` on a `LessThan` predicate
/// before binding. Panics if the predicate cannot be bound.
fn not_less_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        LessThan,
        Reference::new(reference),
        Datum::int(int_literal),
    ))
    .not();
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference <= int_literal` (int literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn less_than_or_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        LessThanOrEq,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference > int_literal` (int literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn greater_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        GreaterThan,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds the negation of `reference > int_literal`, bound to the test schema.
///
/// The negation is expressed by calling `.not()` on a `GreaterThan` predicate
/// before binding. Panics if the predicate cannot be bound.
fn not_greater_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        GreaterThan,
        Reference::new(reference),
        Datum::int(int_literal),
    ))
    .not();
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference >= int_literal` (int literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn greater_than_or_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        GreaterThanOrEq,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference == int_literal` (int literal), bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        Eq,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds the negation of `reference == int_literal`, bound to the test schema.
///
/// Unlike `not_equal_int`, which uses the `NotEq` operator directly, this
/// helper calls `.not()` on an `Eq` predicate before binding. Panics if the
/// predicate cannot be bound.
fn equal_int_not(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        Eq,
        Reference::new(reference),
        Datum::int(int_literal),
    ))
    .not();
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds `reference != int_literal` using the `NotEq` operator, bound to the
/// test schema.
///
/// Panics if the predicate cannot be bound.
fn not_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        NotEq,
        Reference::new(reference),
        Datum::int(int_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds a `StartsWith` predicate testing column `reference` against the
/// prefix `str_literal`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn starts_with(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        StartsWith,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds a `NotStartsWith` predicate testing column `reference` against the
/// prefix `str_literal`, bound to the test schema.
///
/// Panics if the predicate cannot be bound.
fn not_starts_with(reference: &str, str_literal: &str) -> BoundPredicate {
    let filter = Predicate::Binary(BinaryExpression::new(
        NotStartsWith,
        Reference::new(reference),
        Datum::string(str_literal),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds an `In` predicate matching column `reference` against the given int
/// literals, bound to the test schema.
///
/// The literals are collected into a set before the expression is built.
/// Panics if the predicate cannot be bound.
fn in_int(reference: &str, int_literals: &[i32]) -> BoundPredicate {
    let filter = Predicate::Set(SetExpression::new(
        In,
        Reference::new(reference),
        FnvHashSet::from_iter(int_literals.iter().map(|&lit| Datum::int(lit))),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds an `In` predicate matching column `reference` against the given
/// string literals, bound to the test schema.
///
/// The literals are collected into a set before the expression is built.
/// Panics if the predicate cannot be bound.
fn in_str(reference: &str, str_literals: &[&str]) -> BoundPredicate {
    let filter = Predicate::Set(SetExpression::new(
        In,
        Reference::new(reference),
        FnvHashSet::from_iter(str_literals.iter().map(Datum::string)),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds a `NotIn` predicate excluding the given int literals from column
/// `reference`, bound to the test schema.
///
/// The literals are collected into a set before the expression is built.
/// Panics if the predicate cannot be bound.
fn not_in_int(reference: &str, int_literals: &[i32]) -> BoundPredicate {
    let filter = Predicate::Set(SetExpression::new(
        NotIn,
        Reference::new(reference),
        FnvHashSet::from_iter(int_literals.iter().map(|&lit| Datum::int(lit))),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
/// Builds a `NotIn` predicate excluding the given string literals from column
/// `reference`, bound to the test schema.
///
/// The literals are collected into a set before the expression is built.
/// Panics if the predicate cannot be bound.
fn not_in_str(reference: &str, str_literals: &[&str]) -> BoundPredicate {
    let filter = Predicate::Set(SetExpression::new(
        NotIn,
        Reference::new(reference),
        FnvHashSet::from_iter(str_literals.iter().map(Datum::string)),
    ));
    // The schema handle is needed only for binding, so move it instead of cloning it.
    filter.bind(create_test_schema(), true).unwrap()
}
fn create_test_schema() -> Arc<Schema> {
let table_schema = Schema::builder()
.with_fields(vec![
Arc::new(NestedField::required(
1,
"id",
Type::Primitive(PrimitiveType::Int),
)),
Arc::new(NestedField::optional(
2,
"no_stats",
Type::Primitive(PrimitiveType::Int),
)),
Arc::new(NestedField::required(
3,
"required",
Type::Primitive(PrimitiveType::String),
)),
Arc::new(NestedField::optional(
4,
"all_nulls",
Type::Primitive(PrimitiveType::String),
)),
Arc::new(NestedField::optional(
5,
"some_nulls",
Type::Primitive(PrimitiveType::String),
)),
Arc::new(NestedField::optional(
6,
"no_nulls",
Type::Primitive(PrimitiveType::String),
)),
Arc::new(NestedField::optional(
7,
"all_nans",
Type::Primitive(PrimitiveType::Double),
)),
Arc::new(NestedField::optional(
8,
"some_nans",
Type::Primitive(PrimitiveType::Float),
)),
Arc::new(NestedField::optional(
9,
"no_nans",
Type::Primitive(PrimitiveType::Float),
)),
Arc::new(NestedField::optional(
10,
"all_nulls_double",
Type::Primitive(PrimitiveType::Double),
)),
Arc::new(NestedField::optional(
11,
"all_nans_v1_stats",
Type::Primitive(PrimitiveType::Float),
)),
Arc::new(NestedField::optional(
12,
"nan_and_null_only",
Type::Primitive(PrimitiveType::Double),
)),
Arc::new(NestedField::optional(
13,
"no_nan_stats",
Type::Primitive(PrimitiveType::Double),
)),
Arc::new(NestedField::optional(
14,
"some_empty",
Type::Primitive(PrimitiveType::String),
)),
])
.build()
.unwrap();
Arc::new(table_schema)
}
fn create_test_data_file() -> DataFile {
DataFile {
content: DataContentType::Data,
file_path: "/test/path".to_string(),
file_format: DataFileFormat::Parquet,
partition: Struct::empty(),
record_count: 10,
file_size_in_bytes: 10,
column_sizes: Default::default(),
value_counts: Default::default(),
null_value_counts: Default::default(),
nan_value_counts: Default::default(),
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
equality_ids: vec![],
sort_order_id: None,
}
}
fn create_zero_records_data_file() -> DataFile {
DataFile {
content: DataContentType::Data,
file_path: "/test/path".to_string(),
file_format: DataFileFormat::Parquet,
partition: Struct::empty(),
record_count: 0,
file_size_in_bytes: 10,
column_sizes: Default::default(),
value_counts: Default::default(),
null_value_counts: Default::default(),
nan_value_counts: Default::default(),
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
equality_ids: vec![],
sort_order_id: None,
}
}
/// Returns the main metrics fixture: a 50-record Parquet file with value,
/// null and NaN counts plus bounds for a subset of the test-schema columns.
///
/// Map keys are field ids from `create_test_schema`.
fn get_test_file_1() -> DataFile {
    DataFile {
        // File identity.
        file_path: String::from("/test/path"),
        file_format: DataFileFormat::Parquet,
        content: DataContentType::Data,
        partition: Struct::empty(),
        record_count: 50,
        file_size_in_bytes: 10,
        column_sizes: Default::default(),
        // Field ids 4 through 14 each report 50 values, matching the record count.
        value_counts: (4..=14).map(|field_id| (field_id, 50)).collect(),
        // Fields 4 and 10 are entirely null; 5 and 12 partly; 6, 11 and 14 not at all.
        null_value_counts: HashMap::from([
            (4, 50),
            (5, 10),
            (6, 0),
            (10, 50),
            (11, 0),
            (12, 1),
            (14, 0),
        ]),
        // Field 7 is entirely NaN, field 8 partly, field 9 has none.
        nan_value_counts: HashMap::from([(7, 50), (8, 10), (9, 0)]),
        // Fields 11 and 12 carry NaN as both bounds; field 14 starts at the empty string.
        lower_bounds: HashMap::from([
            (1, Datum::int(INT_MIN_VALUE)),
            (11, Datum::float(f32::NAN)),
            (12, Datum::double(f64::NAN)),
            (14, Datum::string("")),
        ]),
        upper_bounds: HashMap::from([
            (1, Datum::int(INT_MAX_VALUE)),
            (11, Datum::float(f32::NAN)),
            (12, Datum::double(f64::NAN)),
            (14, Datum::string("房东整租霍营小区二层两居室")),
        ]),
        // Remaining metadata is unset.
        key_metadata: None,
        sort_order_id: None,
        split_offsets: Vec::new(),
        equality_ids: Vec::new(),
    }
}
/// Returns a 50-record fixture whose only metrics are for field 3 (the
/// `required` string column): 20 values, a null count of 2, and bounds
/// `"aa"` to `"dC"`.
fn get_test_file_2() -> DataFile {
    // The single field id that has metrics in this fixture.
    let string_field = 3;
    DataFile {
        file_path: String::from("file_2.avro"),
        file_format: DataFileFormat::Parquet,
        content: DataContentType::Data,
        partition: Struct::empty(),
        record_count: 50,
        file_size_in_bytes: 10,
        column_sizes: Default::default(),
        value_counts: HashMap::from([(string_field, 20)]),
        null_value_counts: HashMap::from([(string_field, 2)]),
        nan_value_counts: HashMap::new(),
        lower_bounds: HashMap::from([(string_field, Datum::string("aa"))]),
        upper_bounds: HashMap::from([(string_field, Datum::string("dC"))]),
        key_metadata: None,
        sort_order_id: None,
        split_offsets: Vec::new(),
        equality_ids: Vec::new(),
    }
}
/// Returns a 50-record fixture whose only metrics are for field 3 (the
/// `required` string column): 20 values, a null count of 2, and bounds
/// `"1str1"` to `"3str3"`.
fn get_test_file_3() -> DataFile {
    // The single field id that has metrics in this fixture.
    let string_field = 3;
    DataFile {
        file_path: String::from("file_3.avro"),
        file_format: DataFileFormat::Parquet,
        content: DataContentType::Data,
        partition: Struct::empty(),
        record_count: 50,
        file_size_in_bytes: 10,
        column_sizes: Default::default(),
        value_counts: HashMap::from([(string_field, 20)]),
        null_value_counts: HashMap::from([(string_field, 2)]),
        nan_value_counts: HashMap::new(),
        lower_bounds: HashMap::from([(string_field, Datum::string("1str1"))]),
        upper_bounds: HashMap::from([(string_field, Datum::string("3str3"))]),
        key_metadata: None,
        sort_order_id: None,
        split_offsets: Vec::new(),
        equality_ids: Vec::new(),
    }
}
/// Returns a 50-record fixture whose only metrics are for field 3 (the
/// `required` string column): 20 values, a null count of 2, a lower bound of
/// `"abc"` and a non-ASCII upper bound.
fn get_test_file_4() -> DataFile {
    // The single field id that has metrics in this fixture.
    let string_field = 3;
    DataFile {
        file_path: String::from("file_4.avro"),
        file_format: DataFileFormat::Parquet,
        content: DataContentType::Data,
        partition: Struct::empty(),
        record_count: 50,
        file_size_in_bytes: 10,
        column_sizes: Default::default(),
        value_counts: HashMap::from([(string_field, 20)]),
        null_value_counts: HashMap::from([(string_field, 2)]),
        nan_value_counts: HashMap::new(),
        lower_bounds: HashMap::from([(string_field, Datum::string("abc"))]),
        upper_bounds: HashMap::from([(string_field, Datum::string("イロハニホヘト"))]),
        key_metadata: None,
        sort_order_id: None,
        split_offsets: Vec::new(),
        equality_ids: Vec::new(),
    }
}
}