iceberg/expr/visitors/
inclusive_metrics_evaluator.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use fnv::FnvHashSet;
19
20use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit};
21use crate::expr::{BoundPredicate, BoundReference};
22use crate::spec::{DataFile, Datum, PrimitiveLiteral};
23use crate::{Error, ErrorKind};
24
/// Largest number of literals for which `r#in` compares the set against the
/// column bounds; larger sets are not evaluated and are reported as possibly matching.
const IN_PREDICATE_LIMIT: usize = 200;
/// Evaluation result meaning the data file may contain rows that satisfy the predicate.
const ROWS_MIGHT_MATCH: crate::Result<bool> = Ok(true);
/// Evaluation result meaning the data file cannot contain rows that satisfy the predicate.
const ROWS_CANNOT_MATCH: crate::Result<bool> = Ok(false);
28
/// Predicate visitor that decides, from the metrics stored on a single
/// [`DataFile`] (value/null/NaN counts and lower/upper bounds), whether the
/// file might contain rows matching a bound predicate.
pub(crate) struct InclusiveMetricsEvaluator<'a> {
    /// Data file whose metrics are consulted during evaluation.
    data_file: &'a DataFile,
}
32
33impl<'a> InclusiveMetricsEvaluator<'a> {
34    fn new(data_file: &'a DataFile) -> Self {
35        InclusiveMetricsEvaluator { data_file }
36    }
37
38    /// Evaluate this `InclusiveMetricsEvaluator`'s filter predicate against the
39    /// provided [`DataFile`]'s metrics. Used by [`TableScan`] to
40    /// see if this `DataFile` contains data that could match
41    /// the scan's filter.
42    pub(crate) fn eval(
43        filter: &'a BoundPredicate,
44        data_file: &'a DataFile,
45        include_empty_files: bool,
46    ) -> crate::Result<bool> {
47        if !include_empty_files && data_file.record_count == 0 {
48            return ROWS_CANNOT_MATCH;
49        }
50
51        let mut evaluator = Self::new(data_file);
52        visit(&mut evaluator, filter)
53    }
54
55    fn nan_count(&self, field_id: i32) -> Option<&u64> {
56        self.data_file.nan_value_counts.get(&field_id)
57    }
58
59    fn null_count(&self, field_id: i32) -> Option<&u64> {
60        self.data_file.null_value_counts.get(&field_id)
61    }
62
63    fn value_count(&self, field_id: i32) -> Option<&u64> {
64        self.data_file.value_counts.get(&field_id)
65    }
66
67    fn lower_bound(&self, field_id: i32) -> Option<&Datum> {
68        self.data_file.lower_bounds.get(&field_id)
69    }
70
71    fn upper_bound(&self, field_id: i32) -> Option<&Datum> {
72        self.data_file.upper_bounds.get(&field_id)
73    }
74
75    fn contains_nans_only(&self, field_id: i32) -> bool {
76        let nan_count = self.nan_count(field_id);
77        let value_count = self.value_count(field_id);
78
79        nan_count.is_some() && nan_count == value_count
80    }
81
82    fn contains_nulls_only(&self, field_id: i32) -> bool {
83        let null_count = self.null_count(field_id);
84        let value_count = self.value_count(field_id);
85
86        null_count.is_some() && null_count == value_count
87    }
88
89    fn may_contain_null(&self, field_id: i32) -> bool {
90        if let Some(&null_count) = self.null_count(field_id) {
91            null_count > 0
92        } else {
93            true
94        }
95    }
96
97    fn visit_inequality(
98        &mut self,
99        reference: &BoundReference,
100        datum: &Datum,
101        cmp_fn: fn(&Datum, &Datum) -> bool,
102        use_lower_bound: bool,
103    ) -> crate::Result<bool> {
104        let field_id = reference.field().id;
105
106        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
107            return ROWS_CANNOT_MATCH;
108        }
109
110        if datum.is_nan() {
111            // NaN indicates unreliable bounds.
112            // See the InclusiveMetricsEvaluator docs for more.
113            return ROWS_MIGHT_MATCH;
114        }
115
116        let bound = if use_lower_bound {
117            self.lower_bound(field_id)
118        } else {
119            self.upper_bound(field_id)
120        };
121
122        if let Some(bound) = bound {
123            if cmp_fn(bound, datum) {
124                return ROWS_MIGHT_MATCH;
125            }
126
127            return ROWS_CANNOT_MATCH;
128        }
129
130        ROWS_MIGHT_MATCH
131    }
132}
133
134impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
135    type T = bool;
136
137    fn always_true(&mut self) -> crate::Result<bool> {
138        ROWS_MIGHT_MATCH
139    }
140
141    fn always_false(&mut self) -> crate::Result<bool> {
142        ROWS_CANNOT_MATCH
143    }
144
145    fn and(&mut self, lhs: bool, rhs: bool) -> crate::Result<bool> {
146        Ok(lhs && rhs)
147    }
148
149    fn or(&mut self, lhs: bool, rhs: bool) -> crate::Result<bool> {
150        Ok(lhs || rhs)
151    }
152
153    fn not(&mut self, inner: bool) -> crate::Result<bool> {
154        Ok(!inner)
155    }
156
157    fn is_null(
158        &mut self,
159        reference: &BoundReference,
160        _predicate: &BoundPredicate,
161    ) -> crate::Result<bool> {
162        let field_id = reference.field().id;
163
164        match self.null_count(field_id) {
165            Some(&0) => ROWS_CANNOT_MATCH,
166            Some(_) => ROWS_MIGHT_MATCH,
167            None => ROWS_MIGHT_MATCH,
168        }
169    }
170
171    fn not_null(
172        &mut self,
173        reference: &BoundReference,
174        _predicate: &BoundPredicate,
175    ) -> crate::Result<bool> {
176        let field_id = reference.field().id;
177
178        if self.contains_nulls_only(field_id) {
179            return ROWS_CANNOT_MATCH;
180        }
181
182        ROWS_MIGHT_MATCH
183    }
184
185    fn is_nan(
186        &mut self,
187        reference: &BoundReference,
188        _predicate: &BoundPredicate,
189    ) -> crate::Result<bool> {
190        let field_id = reference.field().id;
191
192        match self.nan_count(field_id) {
193            Some(&0) => ROWS_CANNOT_MATCH,
194            _ if self.contains_nulls_only(field_id) => ROWS_CANNOT_MATCH,
195            _ => ROWS_MIGHT_MATCH,
196        }
197    }
198
199    fn not_nan(
200        &mut self,
201        reference: &BoundReference,
202        _predicate: &BoundPredicate,
203    ) -> crate::Result<bool> {
204        let field_id = reference.field().id;
205
206        if self.contains_nans_only(field_id) {
207            return ROWS_CANNOT_MATCH;
208        }
209
210        ROWS_MIGHT_MATCH
211    }
212
213    fn less_than(
214        &mut self,
215        reference: &BoundReference,
216        datum: &Datum,
217        _predicate: &BoundPredicate,
218    ) -> crate::Result<bool> {
219        self.visit_inequality(reference, datum, PartialOrd::lt, true)
220    }
221
222    fn less_than_or_eq(
223        &mut self,
224        reference: &BoundReference,
225        datum: &Datum,
226        _predicate: &BoundPredicate,
227    ) -> crate::Result<bool> {
228        self.visit_inequality(reference, datum, PartialOrd::le, true)
229    }
230
231    fn greater_than(
232        &mut self,
233        reference: &BoundReference,
234        datum: &Datum,
235        _predicate: &BoundPredicate,
236    ) -> crate::Result<bool> {
237        self.visit_inequality(reference, datum, PartialOrd::gt, false)
238    }
239
240    fn greater_than_or_eq(
241        &mut self,
242        reference: &BoundReference,
243        datum: &Datum,
244        _predicate: &BoundPredicate,
245    ) -> crate::Result<bool> {
246        self.visit_inequality(reference, datum, PartialOrd::ge, false)
247    }
248
249    fn eq(
250        &mut self,
251        reference: &BoundReference,
252        datum: &Datum,
253        _predicate: &BoundPredicate,
254    ) -> crate::Result<bool> {
255        let field_id = reference.field().id;
256
257        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
258            return ROWS_CANNOT_MATCH;
259        }
260
261        if let Some(lower_bound) = self.lower_bound(field_id) {
262            if lower_bound.is_nan() {
263                // NaN indicates unreliable bounds.
264                // See the InclusiveMetricsEvaluator docs for more.
265                return ROWS_MIGHT_MATCH;
266            } else if lower_bound.gt(datum) {
267                return ROWS_CANNOT_MATCH;
268            }
269        }
270
271        if let Some(upper_bound) = self.upper_bound(field_id) {
272            if upper_bound.is_nan() {
273                // NaN indicates unreliable bounds.
274                // See the InclusiveMetricsEvaluator docs for more.
275                return ROWS_MIGHT_MATCH;
276            } else if upper_bound.lt(datum) {
277                return ROWS_CANNOT_MATCH;
278            }
279        }
280
281        ROWS_MIGHT_MATCH
282    }
283
284    fn not_eq(
285        &mut self,
286        _reference: &BoundReference,
287        _datum: &Datum,
288        _predicate: &BoundPredicate,
289    ) -> crate::Result<bool> {
290        // Because the bounds are not necessarily a min or max value,
291        // this cannot be answered using them. notEq(col, X) with (X, Y)
292        // doesn't guarantee that X is a value in col.
293        ROWS_MIGHT_MATCH
294    }
295
296    fn starts_with(
297        &mut self,
298        reference: &BoundReference,
299        datum: &Datum,
300        _predicate: &BoundPredicate,
301    ) -> crate::Result<bool> {
302        let field_id = reference.field().id;
303
304        if self.contains_nulls_only(field_id) {
305            return ROWS_CANNOT_MATCH;
306        }
307
308        let PrimitiveLiteral::String(datum) = datum.literal() else {
309            return Err(Error::new(
310                ErrorKind::Unexpected,
311                "Cannot use StartsWith operator on non-string values",
312            ));
313        };
314
315        if let Some(lower_bound) = self.lower_bound(field_id) {
316            let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else {
317                return Err(Error::new(
318                    ErrorKind::Unexpected,
319                    "Cannot use StartsWith operator on non-string lower_bound value",
320                ));
321            };
322
323            let prefix_length = lower_bound.chars().count().min(datum.chars().count());
324
325            // truncate lower bound so that its length
326            // is not greater than the length of prefix
327            let truncated_lower_bound = lower_bound.chars().take(prefix_length).collect::<String>();
328            if datum < &truncated_lower_bound {
329                return ROWS_CANNOT_MATCH;
330            }
331        }
332
333        if let Some(upper_bound) = self.upper_bound(field_id) {
334            let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else {
335                return Err(Error::new(
336                    ErrorKind::Unexpected,
337                    "Cannot use StartsWith operator on non-string upper_bound value",
338                ));
339            };
340
341            let prefix_length = upper_bound.chars().count().min(datum.chars().count());
342
343            // truncate upper bound so that its length
344            // is not greater than the length of prefix
345            let truncated_upper_bound = upper_bound.chars().take(prefix_length).collect::<String>();
346            if datum > &truncated_upper_bound {
347                return ROWS_CANNOT_MATCH;
348            }
349        }
350
351        ROWS_MIGHT_MATCH
352    }
353
354    fn not_starts_with(
355        &mut self,
356        reference: &BoundReference,
357        datum: &Datum,
358        _predicate: &BoundPredicate,
359    ) -> crate::Result<bool> {
360        let field_id = reference.field().id;
361
362        if self.may_contain_null(field_id) {
363            return ROWS_MIGHT_MATCH;
364        }
365
366        // notStartsWith will match unless all values must start with the prefix.
367        // This happens when the lower and upper bounds both start with the prefix.
368
369        let PrimitiveLiteral::String(prefix) = datum.literal() else {
370            return Err(Error::new(
371                ErrorKind::Unexpected,
372                "Cannot use StartsWith operator on non-string values",
373            ));
374        };
375
376        let Some(lower_bound) = self.lower_bound(field_id) else {
377            return ROWS_MIGHT_MATCH;
378        };
379
380        let PrimitiveLiteral::String(lower_bound_str) = lower_bound.literal() else {
381            return Err(Error::new(
382                ErrorKind::Unexpected,
383                "Cannot use NotStartsWith operator on non-string lower_bound value",
384            ));
385        };
386
387        if lower_bound_str < prefix {
388            // if lower is shorter than the prefix then lower doesn't start with the prefix
389            return ROWS_MIGHT_MATCH;
390        }
391
392        let prefix_len = prefix.chars().count();
393
394        if lower_bound_str.chars().take(prefix_len).collect::<String>() == *prefix {
395            // lower bound matches the prefix
396
397            let Some(upper_bound) = self.upper_bound(field_id) else {
398                return ROWS_MIGHT_MATCH;
399            };
400
401            let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else {
402                return Err(Error::new(
403                    ErrorKind::Unexpected,
404                    "Cannot use NotStartsWith operator on non-string upper_bound value",
405                ));
406            };
407
408            // if upper is shorter than the prefix then upper can't start with the prefix
409            if upper_bound.chars().count() < prefix_len {
410                return ROWS_MIGHT_MATCH;
411            }
412
413            if upper_bound.chars().take(prefix_len).collect::<String>() == *prefix {
414                // both bounds match the prefix, so all rows must match the
415                // prefix and therefore do not satisfy the predicate
416                return ROWS_CANNOT_MATCH;
417            }
418        }
419
420        ROWS_MIGHT_MATCH
421    }
422
423    fn r#in(
424        &mut self,
425        reference: &BoundReference,
426        literals: &FnvHashSet<Datum>,
427        _predicate: &BoundPredicate,
428    ) -> crate::Result<bool> {
429        let field_id = reference.field().id;
430
431        if self.contains_nulls_only(field_id) || self.contains_nans_only(field_id) {
432            return ROWS_CANNOT_MATCH;
433        }
434
435        if literals.len() > IN_PREDICATE_LIMIT {
436            // skip evaluating the predicate if the number of values is too big
437            return ROWS_MIGHT_MATCH;
438        }
439
440        if let Some(lower_bound) = self.lower_bound(field_id) {
441            if lower_bound.is_nan() {
442                // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
443                return ROWS_MIGHT_MATCH;
444            }
445
446            if !literals.iter().any(|datum| datum.ge(lower_bound)) {
447                // if all values are less than lower bound, rows cannot match.
448                return ROWS_CANNOT_MATCH;
449            }
450        }
451
452        if let Some(upper_bound) = self.upper_bound(field_id) {
453            if upper_bound.is_nan() {
454                // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
455                return ROWS_MIGHT_MATCH;
456            }
457
458            if !literals.iter().any(|datum| datum.le(upper_bound)) {
459                // if all values are greater than upper bound, rows cannot match.
460                return ROWS_CANNOT_MATCH;
461            }
462        }
463
464        ROWS_MIGHT_MATCH
465    }
466
467    fn not_in(
468        &mut self,
469        _reference: &BoundReference,
470        _literals: &FnvHashSet<Datum>,
471        _predicate: &BoundPredicate,
472    ) -> crate::Result<bool> {
473        // Because the bounds are not necessarily a min or max value,
474        // this cannot be answered using them. notIn(col, {X, ...})
475        // with (X, Y) doesn't guarantee that X is a value in col.
476        ROWS_MIGHT_MATCH
477    }
478}
479
480#[cfg(test)]
481mod test {
482    use std::collections::HashMap;
483    use std::ops::Not;
484    use std::sync::Arc;
485
486    use fnv::FnvHashSet;
487
488    use crate::expr::PredicateOperator::{
489        Eq, GreaterThan, GreaterThanOrEq, In, IsNan, IsNull, LessThan, LessThanOrEq, NotEq, NotIn,
490        NotNan, NotNull, NotStartsWith, StartsWith,
491    };
492    use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator;
493    use crate::expr::{
494        BinaryExpression, Bind, BoundPredicate, Predicate, Reference, SetExpression,
495        UnaryExpression,
496    };
497    use crate::spec::{
498        DataContentType, DataFile, DataFileFormat, Datum, NestedField, PartitionSpec,
499        PartitionSpecRef, PrimitiveType, Schema, SchemaRef, Struct, Transform, Type,
500        UnboundPartitionField,
501    };
502
    // Reference values the integer tests offset their literals from.
    // NOTE(review): presumably the lower/upper bounds stored for the `id`
    // column of the test data file; confirm against `get_test_file_1`.
    const INT_MIN_VALUE: i32 = 30;
    const INT_MAX_VALUE: i32 = 79;
505
506    #[test]
507    fn test_data_file_no_partitions() {
508        let (_partition_spec_ref, schema_ref) = create_test_partition_spec();
509
510        let partition_filter = Predicate::AlwaysTrue
511            .bind(schema_ref.clone(), false)
512            .unwrap();
513
514        let case_sensitive = false;
515
516        let data_file = create_test_data_file();
517
518        let result =
519            InclusiveMetricsEvaluator::eval(&partition_filter, &data_file, case_sensitive).unwrap();
520
521        assert!(result);
522    }
523
524    #[test]
525    fn test_all_nulls() {
526        let result =
527            InclusiveMetricsEvaluator::eval(&not_null("all_nulls"), &get_test_file_1(), true)
528                .unwrap();
529        assert!(!result, "Should skip: no non-null value in all null column");
530
531        let result =
532            InclusiveMetricsEvaluator::eval(&less_than("all_nulls", "a"), &get_test_file_1(), true)
533                .unwrap();
534        assert!(!result, "Should skip: LessThan on an all null column");
535
536        let result = InclusiveMetricsEvaluator::eval(
537            &less_than_or_equal("all_nulls", "a"),
538            &get_test_file_1(),
539            true,
540        )
541        .unwrap();
542        assert!(
543            !result,
544            "Should skip: LessThanOrEqual on an all null column"
545        );
546
547        let result = InclusiveMetricsEvaluator::eval(
548            &greater_than("all_nulls", "a"),
549            &get_test_file_1(),
550            true,
551        )
552        .unwrap();
553        assert!(!result, "Should skip: GreaterThan on an all null column");
554
555        let result = InclusiveMetricsEvaluator::eval(
556            &greater_than_or_equal("all_nulls", "a"),
557            &get_test_file_1(),
558            true,
559        )
560        .unwrap();
561        assert!(
562            !result,
563            "Should skip: GreaterThanOrEqual on an all null column"
564        );
565
566        let result =
567            InclusiveMetricsEvaluator::eval(&equal("all_nulls", "a"), &get_test_file_1(), true)
568                .unwrap();
569        assert!(!result, "Should skip: Equal on an all null column");
570
571        let result = InclusiveMetricsEvaluator::eval(
572            &starts_with("all_nulls", "a"),
573            &get_test_file_1(),
574            true,
575        )
576        .unwrap();
577        assert!(!result, "Should skip: StartsWith on an all null column");
578
579        let result = InclusiveMetricsEvaluator::eval(
580            &not_starts_with("all_nulls", "a"),
581            &get_test_file_1(),
582            true,
583        )
584        .unwrap();
585        assert!(result, "Should read: NotStartsWith on an all null column");
586
587        let result =
588            InclusiveMetricsEvaluator::eval(&not_null("some_nulls"), &get_test_file_1(), true)
589                .unwrap();
590        assert!(
591            result,
592            "Should read: col with some nulls could contain a non-null value"
593        );
594
595        let result =
596            InclusiveMetricsEvaluator::eval(&not_null("no_nulls"), &get_test_file_1(), true)
597                .unwrap();
598        assert!(
599            result,
600            "Should read: col with all nulls contains a non-null value"
601        );
602    }
603
604    #[test]
605    fn test_no_nulls() {
606        let result =
607            InclusiveMetricsEvaluator::eval(&is_null("all_nulls"), &get_test_file_1(), true)
608                .unwrap();
609        assert!(
610            result,
611            "Should read: col with all nulls contains a non-null value"
612        );
613
614        let result =
615            InclusiveMetricsEvaluator::eval(&is_null("some_nulls"), &get_test_file_1(), true)
616                .unwrap();
617        assert!(
618            result,
619            "Should read: col with some nulls could contain a non-null value"
620        );
621
622        let result =
623            InclusiveMetricsEvaluator::eval(&is_null("no_nulls"), &get_test_file_1(), true)
624                .unwrap();
625        assert!(
626            !result,
627            "Should skip: col with no nulls can't contains a non-null value"
628        );
629    }
630
631    #[test]
632    fn test_is_nan() {
633        let result =
634            InclusiveMetricsEvaluator::eval(&is_nan("all_nans"), &get_test_file_1(), true).unwrap();
635        assert!(
636            result,
637            "Should read: col with all nans must contains a nan value"
638        );
639
640        let result =
641            InclusiveMetricsEvaluator::eval(&is_nan("some_nans"), &get_test_file_1(), true)
642                .unwrap();
643        assert!(
644            result,
645            "Should read: col with some nans could contains a nan value"
646        );
647
648        let result =
649            InclusiveMetricsEvaluator::eval(&is_nan("no_nans"), &get_test_file_1(), true).unwrap();
650        assert!(
651            !result,
652            "Should skip: col with no nans can't contains a nan value"
653        );
654
655        let result =
656            InclusiveMetricsEvaluator::eval(&is_nan("all_nulls_double"), &get_test_file_1(), true)
657                .unwrap();
658        assert!(
659            !result,
660            "Should skip: col with no nans can't contains a nan value"
661        );
662
663        let result =
664            InclusiveMetricsEvaluator::eval(&is_nan("no_nan_stats"), &get_test_file_1(), true)
665                .unwrap();
666        assert!(
667            result,
668            "Should read: no guarantee col is nan-free without nan stats"
669        );
670
671        let result =
672            InclusiveMetricsEvaluator::eval(&is_nan("all_nans_v1_stats"), &get_test_file_1(), true)
673                .unwrap();
674        assert!(
675            result,
676            "Should read: col with all nans must contains a nan value"
677        );
678
679        let result =
680            InclusiveMetricsEvaluator::eval(&is_nan("nan_and_null_only"), &get_test_file_1(), true)
681                .unwrap();
682        assert!(
683            result,
684            "Should read: col with nans and nulls must contain a nan value"
685        );
686    }
687
688    #[test]
689    fn test_not_nan() {
690        let result =
691            InclusiveMetricsEvaluator::eval(&not_nan("all_nans"), &get_test_file_1(), true)
692                .unwrap();
693        assert!(
694            !result,
695            "Should read: col with all nans must contains a nan value"
696        );
697
698        let result =
699            InclusiveMetricsEvaluator::eval(&not_nan("some_nans"), &get_test_file_1(), true)
700                .unwrap();
701        assert!(
702            result,
703            "Should read: col with some nans could contains a nan value"
704        );
705
706        let result =
707            InclusiveMetricsEvaluator::eval(&not_nan("no_nans"), &get_test_file_1(), true).unwrap();
708        assert!(
709            result,
710            "Should read: col with no nans might contains a non-nan value"
711        );
712
713        let result =
714            InclusiveMetricsEvaluator::eval(&not_nan("all_nulls_double"), &get_test_file_1(), true)
715                .unwrap();
716        assert!(
717            result,
718            "Should read: col with no nans can't contains a nan value"
719        );
720
721        let result =
722            InclusiveMetricsEvaluator::eval(&not_nan("no_nan_stats"), &get_test_file_1(), true)
723                .unwrap();
724        assert!(
725            result,
726            "Should read: no guarantee col is nan-free without nan stats"
727        );
728
729        let result = InclusiveMetricsEvaluator::eval(
730            &not_nan("all_nans_v1_stats"),
731            &get_test_file_1(),
732            true,
733        )
734        .unwrap();
735        assert!(
736            result,
737            "Should read: col with all nans must contains a nan value"
738        );
739
740        let result = InclusiveMetricsEvaluator::eval(
741            &not_nan("nan_and_null_only"),
742            &get_test_file_1(),
743            true,
744        )
745        .unwrap();
746        assert!(
747            result,
748            "Should read: col with nans and nulls may contain a non-nan value"
749        );
750    }
751
752    #[test]
753    fn test_required_column() {
754        let result =
755            InclusiveMetricsEvaluator::eval(&not_null("required"), &get_test_file_1(), true)
756                .unwrap();
757        assert!(result, "Should read: required columns are always non-null");
758
759        let result =
760            InclusiveMetricsEvaluator::eval(&is_null("required"), &get_test_file_1(), true)
761                .unwrap();
762        assert!(!result, "Should skip: required columns are always non-null");
763    }
764
765    #[test]
766    #[should_panic]
767    fn test_missing_column() {
768        let _result =
769            InclusiveMetricsEvaluator::eval(&less_than("missing", "a"), &get_test_file_1(), true);
770    }
771
772    #[test]
773    fn test_missing_stats() {
774        let missing_stats_datafile = create_test_data_file();
775
776        let expressions = [
777            less_than_int("no_stats", 5),
778            less_than_or_equal_int("no_stats", 30),
779            equal_int("no_stats", 70),
780            greater_than_int("no_stats", 78),
781            greater_than_or_equal_int("no_stats", 90),
782            not_equal_int("no_stats", 101),
783            is_null("no_stats"),
784            not_null("no_stats"),
785            // is_nan("no_stats"),
786            // not_nan("no_stats"),
787        ];
788
789        for expression in expressions {
790            let result =
791                InclusiveMetricsEvaluator::eval(&expression, &missing_stats_datafile, true)
792                    .unwrap();
793
794            assert!(
795                result,
796                "Should read if stats are missing for {:?}",
797                &expression
798            );
799        }
800    }
801
802    #[test]
803    fn test_zero_record_file() {
804        let zero_records_datafile = create_zero_records_data_file();
805
806        let expressions = [
807            less_than_int("no_stats", 5),
808            less_than_or_equal_int("no_stats", 30),
809            equal_int("no_stats", 70),
810            greater_than_int("no_stats", 78),
811            greater_than_or_equal_int("no_stats", 90),
812            not_equal_int("no_stats", 101),
813            is_null("no_stats"),
814            not_null("no_stats"),
815            // is_nan("no_stats"),
816            // not_nan("no_stats"),
817        ];
818
819        for expression in expressions {
820            let result =
821                InclusiveMetricsEvaluator::eval(&expression, &zero_records_datafile, true).unwrap();
822
823            assert!(
824                result,
825                "Should skip if data file has zero records (expression: {:?})",
826                &expression
827            );
828        }
829    }
830
831    #[test]
832    fn test_not() {
833        // Not sure if we need a test for this, as we'd expect,
834        // as a precondition, that rewrite-not has already been applied.
835
836        let result = InclusiveMetricsEvaluator::eval(
837            &not_less_than_int("id", INT_MIN_VALUE - 25),
838            &get_test_file_1(),
839            true,
840        )
841        .unwrap();
842        assert!(result, "Should read: not(false)");
843
844        let result = InclusiveMetricsEvaluator::eval(
845            &not_greater_than_int("id", INT_MIN_VALUE - 25),
846            &get_test_file_1(),
847            true,
848        )
849        .unwrap();
850        assert!(!result, "Should skip: not(true)");
851    }
852
853    #[test]
854    fn test_and() {
855        let schema = create_test_schema();
856        let filter = Predicate::Binary(BinaryExpression::new(
857            LessThan,
858            Reference::new("id"),
859            Datum::int(INT_MIN_VALUE - 25),
860        ))
861        .and(Predicate::Binary(BinaryExpression::new(
862            GreaterThanOrEq,
863            Reference::new("id"),
864            Datum::int(INT_MIN_VALUE - 30),
865        )));
866
867        let bound_pred = filter.bind(schema.clone(), true).unwrap();
868
869        let result =
870            InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap();
871        assert!(!result, "Should skip: and(false, true)");
872
873        let schema = create_test_schema();
874        let filter = Predicate::Binary(BinaryExpression::new(
875            LessThan,
876            Reference::new("id"),
877            Datum::int(INT_MIN_VALUE - 25),
878        ))
879        .and(Predicate::Binary(BinaryExpression::new(
880            GreaterThanOrEq,
881            Reference::new("id"),
882            Datum::int(INT_MAX_VALUE + 1),
883        )));
884
885        let bound_pred = filter.bind(schema.clone(), true).unwrap();
886
887        let result =
888            InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap();
889        assert!(!result, "Should skip: and(false, false)");
890
891        let schema = create_test_schema();
892        let filter = Predicate::Binary(BinaryExpression::new(
893            GreaterThan,
894            Reference::new("id"),
895            Datum::int(INT_MIN_VALUE - 25),
896        ))
897        .and(Predicate::Binary(BinaryExpression::new(
898            LessThanOrEq,
899            Reference::new("id"),
900            Datum::int(INT_MIN_VALUE),
901        )));
902
903        let bound_pred = filter.bind(schema.clone(), true).unwrap();
904
905        let result =
906            InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap();
907        assert!(result, "Should read: and(true, true)");
908    }
909
910    #[test]
911    fn test_or() {
912        let schema = create_test_schema();
913        let filter = Predicate::Binary(BinaryExpression::new(
914            LessThan,
915            Reference::new("id"),
916            Datum::int(INT_MIN_VALUE - 25),
917        ))
918        .or(Predicate::Binary(BinaryExpression::new(
919            GreaterThanOrEq,
920            Reference::new("id"),
921            Datum::int(INT_MIN_VALUE - 30),
922        )));
923
924        let bound_pred = filter.bind(schema.clone(), true).unwrap();
925
926        let result =
927            InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap();
928        assert!(result, "Should read: or(false, true)");
929
930        let schema = create_test_schema();
931        let filter = Predicate::Binary(BinaryExpression::new(
932            LessThan,
933            Reference::new("id"),
934            Datum::int(INT_MIN_VALUE - 25),
935        ))
936        .or(Predicate::Binary(BinaryExpression::new(
937            GreaterThanOrEq,
938            Reference::new("id"),
939            Datum::int(INT_MAX_VALUE + 1),
940        )));
941
942        let bound_pred = filter.bind(schema.clone(), true).unwrap();
943
944        let result =
945            InclusiveMetricsEvaluator::eval(&bound_pred, &get_test_file_1(), true).unwrap();
946        assert!(!result, "Should skip: or(false, false)");
947    }
948
949    #[test]
950    fn test_integer_lt() {
951        let result = InclusiveMetricsEvaluator::eval(
952            &less_than_int("id", INT_MIN_VALUE - 25),
953            &get_test_file_1(),
954            true,
955        )
956        .unwrap();
957        assert!(!result, "Should skip: id range below lower bound (5 < 30)");
958
959        let result = InclusiveMetricsEvaluator::eval(
960            &less_than_int("id", INT_MIN_VALUE),
961            &get_test_file_1(),
962            true,
963        )
964        .unwrap();
965        assert!(
966            !result,
967            "Should skip: id range below lower bound (30 is not < 30)"
968        );
969
970        let result = InclusiveMetricsEvaluator::eval(
971            &less_than_int("id", INT_MIN_VALUE + 1),
972            &get_test_file_1(),
973            true,
974        )
975        .unwrap();
976        assert!(result, "Should read: one possible id");
977
978        let result = InclusiveMetricsEvaluator::eval(
979            &less_than_int("id", INT_MAX_VALUE),
980            &get_test_file_1(),
981            true,
982        )
983        .unwrap();
984        assert!(result, "Should read: many possible ids");
985    }
986
987    #[test]
988    fn test_integer_lt_eq() {
989        let result = InclusiveMetricsEvaluator::eval(
990            &less_than_or_equal_int("id", INT_MIN_VALUE - 25),
991            &get_test_file_1(),
992            true,
993        )
994        .unwrap();
995        assert!(!result, "Should skip: id range below lower bound (5 < 30)");
996
997        let result = InclusiveMetricsEvaluator::eval(
998            &less_than_or_equal_int("id", INT_MIN_VALUE - 1),
999            &get_test_file_1(),
1000            true,
1001        )
1002        .unwrap();
1003        assert!(!result, "Should skip: id range below lower bound (29 < 30)");
1004
1005        let result = InclusiveMetricsEvaluator::eval(
1006            &less_than_or_equal_int("id", INT_MIN_VALUE),
1007            &get_test_file_1(),
1008            true,
1009        )
1010        .unwrap();
1011        assert!(result, "Should read: one possible id");
1012
1013        let result = InclusiveMetricsEvaluator::eval(
1014            &less_than_or_equal_int("id", INT_MAX_VALUE),
1015            &get_test_file_1(),
1016            true,
1017        )
1018        .unwrap();
1019        assert!(result, "Should read: many possible ids");
1020    }
1021
1022    #[test]
1023    fn test_integer_gt() {
1024        let result = InclusiveMetricsEvaluator::eval(
1025            &greater_than_int("id", INT_MAX_VALUE + 6),
1026            &get_test_file_1(),
1027            true,
1028        )
1029        .unwrap();
1030        assert!(!result, "Should skip: id range above upper bound (85 > 79)");
1031
1032        let result = InclusiveMetricsEvaluator::eval(
1033            &greater_than_int("id", INT_MAX_VALUE),
1034            &get_test_file_1(),
1035            true,
1036        )
1037        .unwrap();
1038        assert!(
1039            !result,
1040            "Should skip: id range above upper bound (79 is not > 79)"
1041        );
1042
1043        let result = InclusiveMetricsEvaluator::eval(
1044            &greater_than_int("id", INT_MAX_VALUE - 1),
1045            &get_test_file_1(),
1046            true,
1047        )
1048        .unwrap();
1049        assert!(result, "Should read: one possible id");
1050
1051        let result = InclusiveMetricsEvaluator::eval(
1052            &greater_than_int("id", INT_MAX_VALUE - 4),
1053            &get_test_file_1(),
1054            true,
1055        )
1056        .unwrap();
1057        assert!(result, "Should read: many possible ids");
1058    }
1059
1060    #[test]
1061    fn test_integer_gt_eq() {
1062        let result = InclusiveMetricsEvaluator::eval(
1063            &greater_than_or_equal_int("id", INT_MAX_VALUE + 6),
1064            &get_test_file_1(),
1065            true,
1066        )
1067        .unwrap();
1068        assert!(!result, "Should skip: id range above upper bound (85 < 79)");
1069
1070        let result = InclusiveMetricsEvaluator::eval(
1071            &greater_than_or_equal_int("id", INT_MAX_VALUE + 1),
1072            &get_test_file_1(),
1073            true,
1074        )
1075        .unwrap();
1076        assert!(!result, "Should skip: id range above upper bound (80 > 79)");
1077
1078        let result = InclusiveMetricsEvaluator::eval(
1079            &greater_than_or_equal_int("id", INT_MAX_VALUE),
1080            &get_test_file_1(),
1081            true,
1082        )
1083        .unwrap();
1084        assert!(result, "Should read: one possible id");
1085
1086        let result = InclusiveMetricsEvaluator::eval(
1087            &greater_than_or_equal_int("id", INT_MAX_VALUE - 4),
1088            &get_test_file_1(),
1089            true,
1090        )
1091        .unwrap();
1092        assert!(result, "Should read: many possible ids");
1093    }
1094
1095    #[test]
1096    fn test_integer_eq() {
1097        let result = InclusiveMetricsEvaluator::eval(
1098            &equal_int("id", INT_MIN_VALUE - 25),
1099            &get_test_file_1(),
1100            true,
1101        )
1102        .unwrap();
1103        assert!(!result, "Should skip: id below lower bound");
1104
1105        let result = InclusiveMetricsEvaluator::eval(
1106            &equal_int("id", INT_MIN_VALUE - 1),
1107            &get_test_file_1(),
1108            true,
1109        )
1110        .unwrap();
1111        assert!(!result, "Should skip: id below lower bound");
1112
1113        let result = InclusiveMetricsEvaluator::eval(
1114            &equal_int("id", INT_MIN_VALUE),
1115            &get_test_file_1(),
1116            true,
1117        )
1118        .unwrap();
1119        assert!(result, "Should read: id equal to lower bound");
1120
1121        let result = InclusiveMetricsEvaluator::eval(
1122            &equal_int("id", INT_MAX_VALUE - 4),
1123            &get_test_file_1(),
1124            true,
1125        )
1126        .unwrap();
1127        assert!(result, "Should read: id between lower and upper bounds");
1128
1129        let result = InclusiveMetricsEvaluator::eval(
1130            &equal_int("id", INT_MAX_VALUE),
1131            &get_test_file_1(),
1132            true,
1133        )
1134        .unwrap();
1135        assert!(result, "Should read: id equal to upper bound");
1136
1137        let result = InclusiveMetricsEvaluator::eval(
1138            &equal_int("id", INT_MAX_VALUE + 1),
1139            &get_test_file_1(),
1140            true,
1141        )
1142        .unwrap();
1143        assert!(!result, "Should skip: id above upper bound");
1144
1145        let result = InclusiveMetricsEvaluator::eval(
1146            &equal_int("id", INT_MAX_VALUE + 6),
1147            &get_test_file_1(),
1148            true,
1149        )
1150        .unwrap();
1151        assert!(!result, "Should skip: id above upper bound");
1152    }
1153
1154    #[test]
1155    fn test_integer_not_eq() {
1156        let result = InclusiveMetricsEvaluator::eval(
1157            &not_equal_int("id", INT_MIN_VALUE - 25),
1158            &get_test_file_1(),
1159            true,
1160        )
1161        .unwrap();
1162        assert!(result, "Should read: id below lower bound");
1163
1164        let result = InclusiveMetricsEvaluator::eval(
1165            &not_equal_int("id", INT_MIN_VALUE - 1),
1166            &get_test_file_1(),
1167            true,
1168        )
1169        .unwrap();
1170        assert!(result, "Should read: id below lower bound");
1171
1172        let result = InclusiveMetricsEvaluator::eval(
1173            &not_equal_int("id", INT_MIN_VALUE),
1174            &get_test_file_1(),
1175            true,
1176        )
1177        .unwrap();
1178        assert!(result, "Should read: id equal to lower bound");
1179
1180        let result = InclusiveMetricsEvaluator::eval(
1181            &not_equal_int("id", INT_MAX_VALUE - 4),
1182            &get_test_file_1(),
1183            true,
1184        )
1185        .unwrap();
1186        assert!(result, "Should read: id between lower and upper bounds");
1187
1188        let result = InclusiveMetricsEvaluator::eval(
1189            &not_equal_int("id", INT_MAX_VALUE),
1190            &get_test_file_1(),
1191            true,
1192        )
1193        .unwrap();
1194        assert!(result, "Should read: id equal to upper bound");
1195
1196        let result = InclusiveMetricsEvaluator::eval(
1197            &not_equal_int("id", INT_MAX_VALUE + 1),
1198            &get_test_file_1(),
1199            true,
1200        )
1201        .unwrap();
1202        assert!(result, "Should read: id above upper bound");
1203
1204        let result = InclusiveMetricsEvaluator::eval(
1205            &not_equal_int("id", INT_MAX_VALUE + 6),
1206            &get_test_file_1(),
1207            true,
1208        )
1209        .unwrap();
1210        assert!(result, "Should read: id above upper bound");
1211    }
1212
1213    #[test]
1214    #[should_panic]
1215    fn test_case_sensitive_integer_not_eq_rewritten() {
1216        let _result =
1217            InclusiveMetricsEvaluator::eval(&equal_int_not("ID", 5), &get_test_file_1(), true)
1218                .unwrap();
1219    }
1220
1221    #[test]
1222    fn test_string_starts_with() {
1223        let result = InclusiveMetricsEvaluator::eval(
1224            &starts_with("required", "a"),
1225            &get_test_file_1(),
1226            true,
1227        )
1228        .unwrap();
1229        assert!(result, "Should read: no stats");
1230
1231        let result = InclusiveMetricsEvaluator::eval(
1232            &starts_with("required", "a"),
1233            &get_test_file_2(),
1234            true,
1235        )
1236        .unwrap();
1237        assert!(result, "Should read: range matches");
1238
1239        let result = InclusiveMetricsEvaluator::eval(
1240            &starts_with("required", "aa"),
1241            &get_test_file_2(),
1242            true,
1243        )
1244        .unwrap();
1245        assert!(result, "Should read: range matches");
1246
1247        let result = InclusiveMetricsEvaluator::eval(
1248            &starts_with("required", "aaa"),
1249            &get_test_file_2(),
1250            true,
1251        )
1252        .unwrap();
1253        assert!(result, "Should read: range matches");
1254
1255        let result = InclusiveMetricsEvaluator::eval(
1256            &starts_with("required", "1s"),
1257            &get_test_file_3(),
1258            true,
1259        )
1260        .unwrap();
1261        assert!(result, "Should read: range matches");
1262
1263        let result = InclusiveMetricsEvaluator::eval(
1264            &starts_with("required", "1str1x"),
1265            &get_test_file_3(),
1266            true,
1267        )
1268        .unwrap();
1269        assert!(result, "Should read: range matches");
1270
1271        let result = InclusiveMetricsEvaluator::eval(
1272            &starts_with("required", "ff"),
1273            &get_test_file_4(),
1274            true,
1275        )
1276        .unwrap();
1277        assert!(result, "Should read: range matches");
1278
1279        let result = InclusiveMetricsEvaluator::eval(
1280            &starts_with("required", "aB"),
1281            &get_test_file_2(),
1282            true,
1283        )
1284        .unwrap();
1285        assert!(!result, "Should skip: range does not match");
1286
1287        let result = InclusiveMetricsEvaluator::eval(
1288            &starts_with("required", "dWX"),
1289            &get_test_file_2(),
1290            true,
1291        )
1292        .unwrap();
1293        assert!(!result, "Should skip: range does not match");
1294
1295        let result = InclusiveMetricsEvaluator::eval(
1296            &starts_with("required", "5"),
1297            &get_test_file_3(),
1298            true,
1299        )
1300        .unwrap();
1301        assert!(!result, "Should skip: range does not match");
1302
1303        let result = InclusiveMetricsEvaluator::eval(
1304            &starts_with("required", "3str3x"),
1305            &get_test_file_3(),
1306            true,
1307        )
1308        .unwrap();
1309        assert!(!result, "Should skip: range does not match");
1310
1311        let result = InclusiveMetricsEvaluator::eval(
1312            &starts_with("some_empty", "房东整租霍"),
1313            &get_test_file_1(),
1314            true,
1315        )
1316        .unwrap();
1317        assert!(result, "Should read: range does matches");
1318
1319        let result = InclusiveMetricsEvaluator::eval(
1320            &starts_with("all_nulls", ""),
1321            &get_test_file_1(),
1322            true,
1323        )
1324        .unwrap();
1325        assert!(!result, "Should skip: range does not match");
1326
1327        // Note: This string has been created manually by taking
1328        // the string "イロハニホヘト", which is an upper bound in
1329        // the datafile returned by get_test_file_4(), truncating it
1330        // to four character, and then appending the "ボ" character,
1331        // which occupies the next code point after the 5th
1332        // character in the string above, "ホ".
1333        // In the Java implementation of Iceberg, this is done by
1334        // the `truncateStringMax` function, but we don't yet have
1335        // this implemented in iceberg-rust.
1336        let above_max = "イロハニボ";
1337
1338        let result = InclusiveMetricsEvaluator::eval(
1339            &starts_with("required", above_max),
1340            &get_test_file_4(),
1341            true,
1342        )
1343        .unwrap();
1344        assert!(!result, "Should skip: range does not match");
1345    }
1346
1347    #[test]
1348    fn test_string_not_starts_with() {
1349        let result = InclusiveMetricsEvaluator::eval(
1350            &not_starts_with("required", "a"),
1351            &get_test_file_1(),
1352            true,
1353        )
1354        .unwrap();
1355        assert!(result, "Should read: no stats");
1356
1357        let result = InclusiveMetricsEvaluator::eval(
1358            &not_starts_with("required", "a"),
1359            &get_test_file_2(),
1360            true,
1361        )
1362        .unwrap();
1363        assert!(result, "Should read: range matches");
1364
1365        let result = InclusiveMetricsEvaluator::eval(
1366            &not_starts_with("required", "aa"),
1367            &get_test_file_2(),
1368            true,
1369        )
1370        .unwrap();
1371        assert!(result, "Should read: range matches");
1372
1373        let result = InclusiveMetricsEvaluator::eval(
1374            &not_starts_with("required", "aaa"),
1375            &get_test_file_2(),
1376            true,
1377        )
1378        .unwrap();
1379        assert!(result, "Should read: range matches");
1380
1381        let result = InclusiveMetricsEvaluator::eval(
1382            &not_starts_with("required", "1s"),
1383            &get_test_file_3(),
1384            true,
1385        )
1386        .unwrap();
1387        assert!(result, "Should read: range matches");
1388
1389        let result = InclusiveMetricsEvaluator::eval(
1390            &not_starts_with("required", "1str1x"),
1391            &get_test_file_3(),
1392            true,
1393        )
1394        .unwrap();
1395        assert!(result, "Should read: range matches");
1396
1397        let result = InclusiveMetricsEvaluator::eval(
1398            &not_starts_with("required", "ff"),
1399            &get_test_file_4(),
1400            true,
1401        )
1402        .unwrap();
1403        assert!(result, "Should read: range matches");
1404
1405        let result = InclusiveMetricsEvaluator::eval(
1406            &not_starts_with("required", "aB"),
1407            &get_test_file_2(),
1408            true,
1409        )
1410        .unwrap();
1411        assert!(result, "Should read: range matches");
1412
1413        let result = InclusiveMetricsEvaluator::eval(
1414            &not_starts_with("required", "dWX"),
1415            &get_test_file_2(),
1416            true,
1417        )
1418        .unwrap();
1419        assert!(result, "Should read: range matches");
1420
1421        let result = InclusiveMetricsEvaluator::eval(
1422            &not_starts_with("required", "5"),
1423            &get_test_file_3(),
1424            true,
1425        )
1426        .unwrap();
1427        assert!(result, "Should read: range matches");
1428
1429        let result = InclusiveMetricsEvaluator::eval(
1430            &not_starts_with("required", "3str3x"),
1431            &get_test_file_3(),
1432            true,
1433        )
1434        .unwrap();
1435        assert!(result, "Should read: range matches");
1436
1437        let above_max = "イロハニホヘト";
1438        let result = InclusiveMetricsEvaluator::eval(
1439            &not_starts_with("required", above_max),
1440            &get_test_file_4(),
1441            true,
1442        )
1443        .unwrap();
1444        assert!(result, "Should read: range matches");
1445    }
1446
1447    #[test]
1448    fn test_integer_in() {
1449        let result = InclusiveMetricsEvaluator::eval(
1450            &r#in_int("id", &[INT_MIN_VALUE - 25, INT_MIN_VALUE - 24]),
1451            &get_test_file_1(),
1452            true,
1453        )
1454        .unwrap();
1455        assert!(
1456            !result,
1457            "Should skip: id below lower bound (5 < 30, 6 < 30)"
1458        );
1459
1460        let result = InclusiveMetricsEvaluator::eval(
1461            &r#in_int("id", &[INT_MIN_VALUE - 2, INT_MIN_VALUE - 1]),
1462            &get_test_file_1(),
1463            true,
1464        )
1465        .unwrap();
1466        assert!(
1467            !result,
1468            "Should skip: id below lower bound (28 < 30, 29 < 30)"
1469        );
1470
1471        let result = InclusiveMetricsEvaluator::eval(
1472            &r#in_int("id", &[INT_MIN_VALUE - 1, INT_MIN_VALUE]),
1473            &get_test_file_1(),
1474            true,
1475        )
1476        .unwrap();
1477        assert!(result, "Should read: id equal to lower bound (30 == 30)");
1478
1479        let result = InclusiveMetricsEvaluator::eval(
1480            &r#in_int("id", &[INT_MAX_VALUE - 4, INT_MAX_VALUE - 3]),
1481            &get_test_file_1(),
1482            true,
1483        )
1484        .unwrap();
1485        assert!(
1486            result,
1487            "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)"
1488        );
1489
1490        let result = InclusiveMetricsEvaluator::eval(
1491            &r#in_int("id", &[INT_MAX_VALUE, INT_MAX_VALUE + 1]),
1492            &get_test_file_1(),
1493            true,
1494        )
1495        .unwrap();
1496        assert!(result, "Should read: id equal to upper bound (79 == 79)");
1497
1498        let result = InclusiveMetricsEvaluator::eval(
1499            &r#in_int("id", &[INT_MAX_VALUE + 1, INT_MAX_VALUE + 2]),
1500            &get_test_file_1(),
1501            true,
1502        )
1503        .unwrap();
1504        assert!(
1505            !result,
1506            "Should skip: id above upper bound (80 > 79, 81 > 79)"
1507        );
1508
1509        let result = InclusiveMetricsEvaluator::eval(
1510            &r#in_int("id", &[INT_MAX_VALUE + 6, INT_MAX_VALUE + 7]),
1511            &get_test_file_1(),
1512            true,
1513        )
1514        .unwrap();
1515        assert!(
1516            !result,
1517            "Should skip: id above upper bound (85 > 79, 86 > 79)"
1518        );
1519
1520        let result = InclusiveMetricsEvaluator::eval(
1521            &r#in_str("all_nulls", &["abc", "def"]),
1522            &get_test_file_1(),
1523            true,
1524        )
1525        .unwrap();
1526        assert!(!result, "Should skip: in on all nulls column");
1527
1528        let result = InclusiveMetricsEvaluator::eval(
1529            &r#in_str("some_nulls", &["abc", "def"]),
1530            &get_test_file_1(),
1531            true,
1532        )
1533        .unwrap();
1534        assert!(result, "Should read: in on some nulls column");
1535
1536        let result = InclusiveMetricsEvaluator::eval(
1537            &r#in_str("no_nulls", &["abc", "def"]),
1538            &get_test_file_1(),
1539            true,
1540        )
1541        .unwrap();
1542        assert!(result, "Should read: in on no nulls column");
1543
1544        let ids = (-400..=0).collect::<Vec<_>>();
1545        let result =
1546            InclusiveMetricsEvaluator::eval(&r#in_int("id", &ids), &get_test_file_1(), true)
1547                .unwrap();
1548        assert!(
1549            result,
1550            "Should read: number of items in In expression greater than threshold"
1551        );
1552    }
1553
1554    #[test]
1555    fn test_integer_not_in() {
1556        let result = InclusiveMetricsEvaluator::eval(
1557            &r#not_in_int("id", &[INT_MIN_VALUE - 25, INT_MIN_VALUE - 24]),
1558            &get_test_file_1(),
1559            true,
1560        )
1561        .unwrap();
1562        assert!(result, "Should read: id below lower bound (5 < 30, 6 < 30)");
1563
1564        let result = InclusiveMetricsEvaluator::eval(
1565            &r#not_in_int("id", &[INT_MIN_VALUE - 2, INT_MIN_VALUE - 1]),
1566            &get_test_file_1(),
1567            true,
1568        )
1569        .unwrap();
1570        assert!(
1571            result,
1572            "Should read: id below lower bound (28 < 30, 29 < 30)"
1573        );
1574
1575        let result = InclusiveMetricsEvaluator::eval(
1576            &r#not_in_int("id", &[INT_MIN_VALUE - 1, INT_MIN_VALUE]),
1577            &get_test_file_1(),
1578            true,
1579        )
1580        .unwrap();
1581        assert!(result, "Should read: id equal to lower bound (30 == 30)");
1582
1583        let result = InclusiveMetricsEvaluator::eval(
1584            &r#not_in_int("id", &[INT_MAX_VALUE - 4, INT_MAX_VALUE - 3]),
1585            &get_test_file_1(),
1586            true,
1587        )
1588        .unwrap();
1589        assert!(
1590            result,
1591            "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)"
1592        );
1593
1594        let result = InclusiveMetricsEvaluator::eval(
1595            &r#not_in_int("id", &[INT_MAX_VALUE, INT_MAX_VALUE + 1]),
1596            &get_test_file_1(),
1597            true,
1598        )
1599        .unwrap();
1600        assert!(result, "Should read: id equal to upper bound (79 == 79)");
1601
1602        let result = InclusiveMetricsEvaluator::eval(
1603            &r#not_in_int("id", &[INT_MAX_VALUE + 1, INT_MAX_VALUE + 2]),
1604            &get_test_file_1(),
1605            true,
1606        )
1607        .unwrap();
1608        assert!(
1609            result,
1610            "Should read: id above upper bound (80 > 79, 81 > 79)"
1611        );
1612
1613        let result = InclusiveMetricsEvaluator::eval(
1614            &r#not_in_int("id", &[INT_MAX_VALUE + 6, INT_MAX_VALUE + 7]),
1615            &get_test_file_1(),
1616            true,
1617        )
1618        .unwrap();
1619        assert!(
1620            result,
1621            "Should read: id above upper bound (85 > 79, 86 > 79)"
1622        );
1623
1624        let result = InclusiveMetricsEvaluator::eval(
1625            &r#not_in_str("all_nulls", &["abc", "def"]),
1626            &get_test_file_1(),
1627            true,
1628        )
1629        .unwrap();
1630        assert!(result, "Should read: NotIn on all nulls column");
1631
1632        let result = InclusiveMetricsEvaluator::eval(
1633            &r#not_in_str("some_nulls", &["abc", "def"]),
1634            &get_test_file_1(),
1635            true,
1636        )
1637        .unwrap();
1638        assert!(result, "Should read: NotIn on some nulls column");
1639
1640        let result = InclusiveMetricsEvaluator::eval(
1641            &r#not_in_str("no_nulls", &["abc", "def"]),
1642            &get_test_file_1(),
1643            true,
1644        )
1645        .unwrap();
1646        assert!(result, "Should read: NotIn on no nulls column");
1647    }
1648
1649    fn create_test_partition_spec() -> (PartitionSpecRef, SchemaRef) {
1650        let table_schema = Schema::builder()
1651            .with_fields(vec![Arc::new(NestedField::optional(
1652                1,
1653                "a",
1654                Type::Primitive(PrimitiveType::Float),
1655            ))])
1656            .build()
1657            .unwrap();
1658        let table_schema_ref = Arc::new(table_schema);
1659
1660        let partition_spec = PartitionSpec::builder(table_schema_ref.clone())
1661            .with_spec_id(1)
1662            .add_unbound_fields(vec![
1663                UnboundPartitionField::builder()
1664                    .source_id(1)
1665                    .name("a".to_string())
1666                    .field_id(1)
1667                    .transform(Transform::Identity)
1668                    .build(),
1669            ])
1670            .unwrap()
1671            .build()
1672            .unwrap();
1673        (Arc::new(partition_spec), table_schema_ref)
1674    }
1675
1676    fn not_null(reference: &str) -> BoundPredicate {
1677        let schema = create_test_schema();
1678        let filter = Predicate::Unary(UnaryExpression::new(NotNull, Reference::new(reference)));
1679        filter.bind(schema.clone(), true).unwrap()
1680    }
1681
1682    fn is_null(reference: &str) -> BoundPredicate {
1683        let schema = create_test_schema();
1684        let filter = Predicate::Unary(UnaryExpression::new(IsNull, Reference::new(reference)));
1685        filter.bind(schema.clone(), true).unwrap()
1686    }
1687
1688    fn not_nan(reference: &str) -> BoundPredicate {
1689        let schema = create_test_schema();
1690        let filter = Predicate::Unary(UnaryExpression::new(NotNan, Reference::new(reference)));
1691        filter.bind(schema.clone(), true).unwrap()
1692    }
1693
1694    fn is_nan(reference: &str) -> BoundPredicate {
1695        let schema = create_test_schema();
1696        let filter = Predicate::Unary(UnaryExpression::new(IsNan, Reference::new(reference)));
1697        filter.bind(schema.clone(), true).unwrap()
1698    }
1699
1700    fn less_than(reference: &str, str_literal: &str) -> BoundPredicate {
1701        let schema = create_test_schema();
1702        let filter = Predicate::Binary(BinaryExpression::new(
1703            LessThan,
1704            Reference::new(reference),
1705            Datum::string(str_literal),
1706        ));
1707        filter.bind(schema.clone(), true).unwrap()
1708    }
1709
1710    fn less_than_or_equal(reference: &str, str_literal: &str) -> BoundPredicate {
1711        let schema = create_test_schema();
1712        let filter = Predicate::Binary(BinaryExpression::new(
1713            LessThanOrEq,
1714            Reference::new(reference),
1715            Datum::string(str_literal),
1716        ));
1717        filter.bind(schema.clone(), true).unwrap()
1718    }
1719
1720    fn greater_than(reference: &str, str_literal: &str) -> BoundPredicate {
1721        let schema = create_test_schema();
1722        let filter = Predicate::Binary(BinaryExpression::new(
1723            GreaterThan,
1724            Reference::new(reference),
1725            Datum::string(str_literal),
1726        ));
1727        filter.bind(schema.clone(), true).unwrap()
1728    }
1729
1730    fn greater_than_or_equal(reference: &str, str_literal: &str) -> BoundPredicate {
1731        let schema = create_test_schema();
1732        let filter = Predicate::Binary(BinaryExpression::new(
1733            GreaterThanOrEq,
1734            Reference::new(reference),
1735            Datum::string(str_literal),
1736        ));
1737        filter.bind(schema.clone(), true).unwrap()
1738    }
1739
1740    fn equal(reference: &str, str_literal: &str) -> BoundPredicate {
1741        let schema = create_test_schema();
1742        let filter = Predicate::Binary(BinaryExpression::new(
1743            Eq,
1744            Reference::new(reference),
1745            Datum::string(str_literal),
1746        ));
1747        filter.bind(schema.clone(), true).unwrap()
1748    }
1749
1750    fn less_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
1751        let schema = create_test_schema();
1752        let filter = Predicate::Binary(BinaryExpression::new(
1753            LessThan,
1754            Reference::new(reference),
1755            Datum::int(int_literal),
1756        ));
1757        filter.bind(schema.clone(), true).unwrap()
1758    }
1759
1760    fn not_less_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
1761        let schema = create_test_schema();
1762        let filter = Predicate::Binary(BinaryExpression::new(
1763            LessThan,
1764            Reference::new(reference),
1765            Datum::int(int_literal),
1766        ))
1767        .not();
1768        filter.bind(schema.clone(), true).unwrap()
1769    }
1770
1771    fn less_than_or_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
1772        let schema = create_test_schema();
1773        let filter = Predicate::Binary(BinaryExpression::new(
1774            LessThanOrEq,
1775            Reference::new(reference),
1776            Datum::int(int_literal),
1777        ));
1778        filter.bind(schema.clone(), true).unwrap()
1779    }
1780
1781    fn greater_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
1782        let schema = create_test_schema();
1783        let filter = Predicate::Binary(BinaryExpression::new(
1784            GreaterThan,
1785            Reference::new(reference),
1786            Datum::int(int_literal),
1787        ));
1788        filter.bind(schema.clone(), true).unwrap()
1789    }
1790
1791    fn not_greater_than_int(reference: &str, int_literal: i32) -> BoundPredicate {
1792        let schema = create_test_schema();
1793        let filter = Predicate::Binary(BinaryExpression::new(
1794            GreaterThan,
1795            Reference::new(reference),
1796            Datum::int(int_literal),
1797        ))
1798        .not();
1799        filter.bind(schema.clone(), true).unwrap()
1800    }
1801
1802    fn greater_than_or_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
1803        let schema = create_test_schema();
1804        let filter = Predicate::Binary(BinaryExpression::new(
1805            GreaterThanOrEq,
1806            Reference::new(reference),
1807            Datum::int(int_literal),
1808        ));
1809        filter.bind(schema.clone(), true).unwrap()
1810    }
1811
1812    fn equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
1813        let schema = create_test_schema();
1814        let filter = Predicate::Binary(BinaryExpression::new(
1815            Eq,
1816            Reference::new(reference),
1817            Datum::int(int_literal),
1818        ));
1819        filter.bind(schema.clone(), true).unwrap()
1820    }
1821
1822    fn equal_int_not(reference: &str, int_literal: i32) -> BoundPredicate {
1823        let schema = create_test_schema();
1824        let filter = Predicate::Binary(BinaryExpression::new(
1825            Eq,
1826            Reference::new(reference),
1827            Datum::int(int_literal),
1828        ))
1829        .not();
1830        filter.bind(schema.clone(), true).unwrap()
1831    }
1832
1833    fn not_equal_int(reference: &str, int_literal: i32) -> BoundPredicate {
1834        let schema = create_test_schema();
1835        let filter = Predicate::Binary(BinaryExpression::new(
1836            NotEq,
1837            Reference::new(reference),
1838            Datum::int(int_literal),
1839        ));
1840        filter.bind(schema.clone(), true).unwrap()
1841    }
1842
1843    fn starts_with(reference: &str, str_literal: &str) -> BoundPredicate {
1844        let schema = create_test_schema();
1845        let filter = Predicate::Binary(BinaryExpression::new(
1846            StartsWith,
1847            Reference::new(reference),
1848            Datum::string(str_literal),
1849        ));
1850        filter.bind(schema.clone(), true).unwrap()
1851    }
1852
1853    fn not_starts_with(reference: &str, str_literal: &str) -> BoundPredicate {
1854        let schema = create_test_schema();
1855        let filter = Predicate::Binary(BinaryExpression::new(
1856            NotStartsWith,
1857            Reference::new(reference),
1858            Datum::string(str_literal),
1859        ));
1860        filter.bind(schema.clone(), true).unwrap()
1861    }
1862
1863    fn in_int(reference: &str, int_literals: &[i32]) -> BoundPredicate {
1864        let schema = create_test_schema();
1865        let filter = Predicate::Set(SetExpression::new(
1866            In,
1867            Reference::new(reference),
1868            FnvHashSet::from_iter(int_literals.iter().map(|&lit| Datum::int(lit))),
1869        ));
1870        filter.bind(schema.clone(), true).unwrap()
1871    }
1872
1873    fn in_str(reference: &str, str_literals: &[&str]) -> BoundPredicate {
1874        let schema = create_test_schema();
1875        let filter = Predicate::Set(SetExpression::new(
1876            In,
1877            Reference::new(reference),
1878            FnvHashSet::from_iter(str_literals.iter().map(Datum::string)),
1879        ));
1880        filter.bind(schema.clone(), true).unwrap()
1881    }
1882
1883    fn not_in_int(reference: &str, int_literals: &[i32]) -> BoundPredicate {
1884        let schema = create_test_schema();
1885        let filter = Predicate::Set(SetExpression::new(
1886            NotIn,
1887            Reference::new(reference),
1888            FnvHashSet::from_iter(int_literals.iter().map(|&lit| Datum::int(lit))),
1889        ));
1890        filter.bind(schema.clone(), true).unwrap()
1891    }
1892
1893    fn not_in_str(reference: &str, str_literals: &[&str]) -> BoundPredicate {
1894        let schema = create_test_schema();
1895        let filter = Predicate::Set(SetExpression::new(
1896            NotIn,
1897            Reference::new(reference),
1898            FnvHashSet::from_iter(str_literals.iter().map(Datum::string)),
1899        ));
1900        filter.bind(schema.clone(), true).unwrap()
1901    }
1902
1903    fn create_test_schema() -> Arc<Schema> {
1904        let table_schema = Schema::builder()
1905            .with_fields(vec![
1906                Arc::new(NestedField::required(
1907                    1,
1908                    "id",
1909                    Type::Primitive(PrimitiveType::Int),
1910                )),
1911                Arc::new(NestedField::optional(
1912                    2,
1913                    "no_stats",
1914                    Type::Primitive(PrimitiveType::Int),
1915                )),
1916                Arc::new(NestedField::required(
1917                    3,
1918                    "required",
1919                    Type::Primitive(PrimitiveType::String),
1920                )),
1921                Arc::new(NestedField::optional(
1922                    4,
1923                    "all_nulls",
1924                    Type::Primitive(PrimitiveType::String),
1925                )),
1926                Arc::new(NestedField::optional(
1927                    5,
1928                    "some_nulls",
1929                    Type::Primitive(PrimitiveType::String),
1930                )),
1931                Arc::new(NestedField::optional(
1932                    6,
1933                    "no_nulls",
1934                    Type::Primitive(PrimitiveType::String),
1935                )),
1936                Arc::new(NestedField::optional(
1937                    7,
1938                    "all_nans",
1939                    Type::Primitive(PrimitiveType::Double),
1940                )),
1941                Arc::new(NestedField::optional(
1942                    8,
1943                    "some_nans",
1944                    Type::Primitive(PrimitiveType::Float),
1945                )),
1946                Arc::new(NestedField::optional(
1947                    9,
1948                    "no_nans",
1949                    Type::Primitive(PrimitiveType::Float),
1950                )),
1951                Arc::new(NestedField::optional(
1952                    10,
1953                    "all_nulls_double",
1954                    Type::Primitive(PrimitiveType::Double),
1955                )),
1956                Arc::new(NestedField::optional(
1957                    11,
1958                    "all_nans_v1_stats",
1959                    Type::Primitive(PrimitiveType::Float),
1960                )),
1961                Arc::new(NestedField::optional(
1962                    12,
1963                    "nan_and_null_only",
1964                    Type::Primitive(PrimitiveType::Double),
1965                )),
1966                Arc::new(NestedField::optional(
1967                    13,
1968                    "no_nan_stats",
1969                    Type::Primitive(PrimitiveType::Double),
1970                )),
1971                Arc::new(NestedField::optional(
1972                    14,
1973                    "some_empty",
1974                    Type::Primitive(PrimitiveType::String),
1975                )),
1976            ])
1977            .build()
1978            .unwrap();
1979
1980        Arc::new(table_schema)
1981    }
1982
1983    fn create_test_data_file() -> DataFile {
1984        DataFile {
1985            content: DataContentType::Data,
1986            file_path: "/test/path".to_string(),
1987            file_format: DataFileFormat::Parquet,
1988            partition: Struct::empty(),
1989            record_count: 10,
1990            file_size_in_bytes: 10,
1991            column_sizes: Default::default(),
1992            value_counts: Default::default(),
1993            null_value_counts: Default::default(),
1994            nan_value_counts: Default::default(),
1995            lower_bounds: Default::default(),
1996            upper_bounds: Default::default(),
1997            key_metadata: None,
1998            split_offsets: None,
1999            equality_ids: None,
2000            sort_order_id: None,
2001            partition_spec_id: 0,
2002            first_row_id: None,
2003            referenced_data_file: None,
2004            content_offset: None,
2005            content_size_in_bytes: None,
2006        }
2007    }
2008
2009    fn create_zero_records_data_file() -> DataFile {
2010        DataFile {
2011            content: DataContentType::Data,
2012            file_path: "/test/path".to_string(),
2013            file_format: DataFileFormat::Parquet,
2014            partition: Struct::empty(),
2015            record_count: 0,
2016            file_size_in_bytes: 10,
2017            column_sizes: Default::default(),
2018            value_counts: Default::default(),
2019            null_value_counts: Default::default(),
2020            nan_value_counts: Default::default(),
2021            lower_bounds: Default::default(),
2022            upper_bounds: Default::default(),
2023            key_metadata: None,
2024            split_offsets: None,
2025            equality_ids: None,
2026            sort_order_id: None,
2027            partition_spec_id: 0,
2028            first_row_id: None,
2029            referenced_data_file: None,
2030            content_offset: None,
2031            content_size_in_bytes: None,
2032        }
2033    }
2034
2035    fn get_test_file_1() -> DataFile {
2036        DataFile {
2037            content: DataContentType::Data,
2038            file_path: "/test/path".to_string(),
2039            file_format: DataFileFormat::Parquet,
2040            partition: Struct::empty(),
2041            record_count: 50,
2042            file_size_in_bytes: 10,
2043
2044            value_counts: HashMap::from([
2045                (4, 50),
2046                (5, 50),
2047                (6, 50),
2048                (7, 50),
2049                (8, 50),
2050                (9, 50),
2051                (10, 50),
2052                (11, 50),
2053                (12, 50),
2054                (13, 50),
2055                (14, 50),
2056            ]),
2057
2058            null_value_counts: HashMap::from([
2059                (4, 50),
2060                (5, 10),
2061                (6, 0),
2062                (10, 50),
2063                (11, 0),
2064                (12, 1),
2065                (14, 0),
2066            ]),
2067
2068            nan_value_counts: HashMap::from([(7, 50), (8, 10), (9, 0)]),
2069
2070            lower_bounds: HashMap::from([
2071                (1, Datum::int(INT_MIN_VALUE)),
2072                (11, Datum::float(f32::NAN)),
2073                (12, Datum::double(f64::NAN)),
2074                (14, Datum::string("")),
2075            ]),
2076
2077            upper_bounds: HashMap::from([
2078                (1, Datum::int(INT_MAX_VALUE)),
2079                (11, Datum::float(f32::NAN)),
2080                (12, Datum::double(f64::NAN)),
2081                (14, Datum::string("房东整租霍营小区二层两居室")),
2082            ]),
2083
2084            column_sizes: Default::default(),
2085            key_metadata: None,
2086            split_offsets: None,
2087            equality_ids: None,
2088            sort_order_id: None,
2089            partition_spec_id: 0,
2090            first_row_id: None,
2091            referenced_data_file: None,
2092            content_offset: None,
2093            content_size_in_bytes: None,
2094        }
2095    }
2096    fn get_test_file_2() -> DataFile {
2097        DataFile {
2098            content: DataContentType::Data,
2099            file_path: "file_2.avro".to_string(),
2100            file_format: DataFileFormat::Parquet,
2101            partition: Struct::empty(),
2102            record_count: 50,
2103            file_size_in_bytes: 10,
2104
2105            value_counts: HashMap::from([(3, 20)]),
2106
2107            null_value_counts: HashMap::from([(3, 2)]),
2108
2109            nan_value_counts: HashMap::default(),
2110
2111            lower_bounds: HashMap::from([(3, Datum::string("aa"))]),
2112
2113            upper_bounds: HashMap::from([(3, Datum::string("dC"))]),
2114
2115            column_sizes: Default::default(),
2116            key_metadata: None,
2117            split_offsets: None,
2118            equality_ids: None,
2119            sort_order_id: None,
2120            partition_spec_id: 0,
2121            first_row_id: None,
2122            referenced_data_file: None,
2123            content_offset: None,
2124            content_size_in_bytes: None,
2125        }
2126    }
2127
2128    fn get_test_file_3() -> DataFile {
2129        DataFile {
2130            content: DataContentType::Data,
2131            file_path: "file_3.avro".to_string(),
2132            file_format: DataFileFormat::Parquet,
2133            partition: Struct::empty(),
2134            record_count: 50,
2135            file_size_in_bytes: 10,
2136
2137            value_counts: HashMap::from([(3, 20)]),
2138
2139            null_value_counts: HashMap::from([(3, 2)]),
2140
2141            nan_value_counts: HashMap::default(),
2142
2143            lower_bounds: HashMap::from([(3, Datum::string("1str1"))]),
2144
2145            upper_bounds: HashMap::from([(3, Datum::string("3str3"))]),
2146
2147            column_sizes: Default::default(),
2148            key_metadata: None,
2149            split_offsets: None,
2150            equality_ids: None,
2151            sort_order_id: None,
2152            partition_spec_id: 0,
2153            first_row_id: None,
2154            referenced_data_file: None,
2155            content_offset: None,
2156            content_size_in_bytes: None,
2157        }
2158    }
2159
2160    fn get_test_file_4() -> DataFile {
2161        DataFile {
2162            content: DataContentType::Data,
2163            file_path: "file_4.avro".to_string(),
2164            file_format: DataFileFormat::Parquet,
2165            partition: Struct::empty(),
2166            record_count: 50,
2167            file_size_in_bytes: 10,
2168
2169            value_counts: HashMap::from([(3, 20)]),
2170
2171            null_value_counts: HashMap::from([(3, 2)]),
2172
2173            nan_value_counts: HashMap::default(),
2174
2175            lower_bounds: HashMap::from([(3, Datum::string("abc"))]),
2176
2177            upper_bounds: HashMap::from([(3, Datum::string("イロハニホヘト"))]),
2178
2179            column_sizes: Default::default(),
2180            key_metadata: None,
2181            split_offsets: None,
2182            equality_ids: None,
2183            sort_order_id: None,
2184            partition_spec_id: 0,
2185            first_row_id: None,
2186            referenced_data_file: None,
2187            content_offset: None,
2188            content_size_in_bytes: None,
2189        }
2190    }
2191}