iceberg/spec/schema/
index.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::utils::try_insert_field;
19use super::*;
20
21/// Creates a field id to field map.
22pub fn index_by_id(r#struct: &StructType) -> Result<HashMap<i32, NestedFieldRef>> {
23    struct IndexById(HashMap<i32, NestedFieldRef>);
24
25    impl SchemaVisitor for IndexById {
26        type T = ();
27
28        fn schema(&mut self, _schema: &Schema, _value: ()) -> Result<()> {
29            Ok(())
30        }
31
32        fn field(&mut self, field: &NestedFieldRef, _value: ()) -> Result<()> {
33            try_insert_field(&mut self.0, field.id, field.clone())
34        }
35
36        fn r#struct(&mut self, _struct: &StructType, _results: Vec<Self::T>) -> Result<Self::T> {
37            Ok(())
38        }
39
40        fn list(&mut self, list: &ListType, _value: Self::T) -> Result<Self::T> {
41            try_insert_field(
42                &mut self.0,
43                list.element_field.id,
44                list.element_field.clone(),
45            )
46        }
47
48        fn map(&mut self, map: &MapType, _key_value: Self::T, _value: Self::T) -> Result<Self::T> {
49            try_insert_field(&mut self.0, map.key_field.id, map.key_field.clone())?;
50            try_insert_field(&mut self.0, map.value_field.id, map.value_field.clone())
51        }
52
53        fn primitive(&mut self, _: &PrimitiveType) -> Result<Self::T> {
54            Ok(())
55        }
56    }
57
58    let mut index = IndexById(HashMap::new());
59    visit_struct(r#struct, &mut index)?;
60    Ok(index.0)
61}
62
63/// Creates a field id to parent field id map.
64pub fn index_parents(r#struct: &StructType) -> Result<HashMap<i32, i32>> {
65    struct IndexByParent {
66        parents: Vec<i32>,
67        result: HashMap<i32, i32>,
68    }
69
70    impl SchemaVisitor for IndexByParent {
71        type T = ();
72
73        fn before_struct_field(&mut self, field: &NestedFieldRef) -> Result<()> {
74            if let Some(parent) = self.parents.last().copied() {
75                self.result.insert(field.id, parent);
76            }
77            self.parents.push(field.id);
78            Ok(())
79        }
80
81        fn after_struct_field(&mut self, _field: &NestedFieldRef) -> Result<()> {
82            self.parents.pop();
83            Ok(())
84        }
85
86        fn before_list_element(&mut self, field: &NestedFieldRef) -> Result<()> {
87            if let Some(parent) = self.parents.last().copied() {
88                self.result.insert(field.id, parent);
89            }
90            self.parents.push(field.id);
91            Ok(())
92        }
93
94        fn after_list_element(&mut self, _field: &NestedFieldRef) -> Result<()> {
95            self.parents.pop();
96            Ok(())
97        }
98
99        fn before_map_key(&mut self, field: &NestedFieldRef) -> Result<()> {
100            if let Some(parent) = self.parents.last().copied() {
101                self.result.insert(field.id, parent);
102            }
103            self.parents.push(field.id);
104            Ok(())
105        }
106
107        fn after_map_key(&mut self, _field: &NestedFieldRef) -> Result<()> {
108            self.parents.pop();
109            Ok(())
110        }
111
112        fn before_map_value(&mut self, field: &NestedFieldRef) -> Result<()> {
113            if let Some(parent) = self.parents.last().copied() {
114                self.result.insert(field.id, parent);
115            }
116            self.parents.push(field.id);
117            Ok(())
118        }
119
120        fn after_map_value(&mut self, _field: &NestedFieldRef) -> Result<()> {
121            self.parents.pop();
122            Ok(())
123        }
124
125        fn schema(&mut self, _schema: &Schema, _value: Self::T) -> Result<Self::T> {
126            Ok(())
127        }
128
129        fn field(&mut self, _field: &NestedFieldRef, _value: Self::T) -> Result<Self::T> {
130            Ok(())
131        }
132
133        fn r#struct(&mut self, _struct: &StructType, _results: Vec<Self::T>) -> Result<Self::T> {
134            Ok(())
135        }
136
137        fn list(&mut self, _list: &ListType, _value: Self::T) -> Result<Self::T> {
138            Ok(())
139        }
140
141        fn map(&mut self, _map: &MapType, _key_value: Self::T, _value: Self::T) -> Result<Self::T> {
142            Ok(())
143        }
144
145        fn primitive(&mut self, _p: &PrimitiveType) -> Result<Self::T> {
146            Ok(())
147        }
148    }
149
150    let mut index = IndexByParent {
151        parents: vec![],
152        result: HashMap::new(),
153    };
154    visit_struct(r#struct, &mut index)?;
155    Ok(index.result)
156}
157
158#[derive(Default)]
159pub struct IndexByName {
160    // Maybe radix tree is better here?
161    name_to_id: HashMap<String, i32>,
162    short_name_to_id: HashMap<String, i32>,
163
164    field_names: Vec<String>,
165    short_field_names: Vec<String>,
166}
167
168impl IndexByName {
169    fn add_field(&mut self, name: &str, field_id: i32) -> Result<()> {
170        let full_name = self
171            .field_names
172            .iter()
173            .map(String::as_str)
174            .chain(vec![name])
175            .join(".");
176        if let Some(existing_field_id) = self.name_to_id.get(full_name.as_str()) {
177            return Err(Error::new(
178                ErrorKind::DataInvalid,
179                format!(
180                    "Invalid schema: multiple fields for name {full_name}: {field_id} and {existing_field_id}"
181                ),
182            ));
183        } else {
184            self.name_to_id.insert(full_name, field_id);
185        }
186
187        let full_short_name = self
188            .short_field_names
189            .iter()
190            .map(String::as_str)
191            .chain(vec![name])
192            .join(".");
193        self.short_name_to_id
194            .entry(full_short_name)
195            .or_insert_with(|| field_id);
196        Ok(())
197    }
198
199    /// Returns two indexes: full name to field id, and id to full name.
200    ///
201    /// In the first index, short names are returned.
202    /// In second index, short names are not returned.
203    pub fn indexes(mut self) -> (HashMap<String, i32>, HashMap<i32, String>) {
204        self.short_name_to_id.reserve(self.name_to_id.len());
205        for (name, id) in &self.name_to_id {
206            self.short_name_to_id.insert(name.clone(), *id);
207        }
208
209        let id_to_name = self.name_to_id.into_iter().map(|e| (e.1, e.0)).collect();
210        (self.short_name_to_id, id_to_name)
211    }
212}
213
214impl SchemaVisitor for IndexByName {
215    type T = ();
216
217    fn before_struct_field(&mut self, field: &NestedFieldRef) -> Result<()> {
218        self.field_names.push(field.name.to_string());
219        self.short_field_names.push(field.name.to_string());
220        Ok(())
221    }
222
223    fn after_struct_field(&mut self, _field: &NestedFieldRef) -> Result<()> {
224        self.field_names.pop();
225        self.short_field_names.pop();
226        Ok(())
227    }
228
229    fn before_list_element(&mut self, field: &NestedFieldRef) -> Result<()> {
230        self.field_names.push(field.name.clone());
231        if !field.field_type.is_struct() {
232            self.short_field_names.push(field.name.to_string());
233        }
234
235        Ok(())
236    }
237
238    fn after_list_element(&mut self, field: &NestedFieldRef) -> Result<()> {
239        self.field_names.pop();
240        if !field.field_type.is_struct() {
241            self.short_field_names.pop();
242        }
243
244        Ok(())
245    }
246
247    fn before_map_key(&mut self, field: &NestedFieldRef) -> Result<()> {
248        self.before_struct_field(field)
249    }
250
251    fn after_map_key(&mut self, field: &NestedFieldRef) -> Result<()> {
252        self.after_struct_field(field)
253    }
254
255    fn before_map_value(&mut self, field: &NestedFieldRef) -> Result<()> {
256        self.field_names.push(field.name.to_string());
257        if !field.field_type.is_struct() {
258            self.short_field_names.push(field.name.to_string());
259        }
260        Ok(())
261    }
262
263    fn after_map_value(&mut self, field: &NestedFieldRef) -> Result<()> {
264        self.field_names.pop();
265        if !field.field_type.is_struct() {
266            self.short_field_names.pop();
267        }
268
269        Ok(())
270    }
271
272    fn schema(&mut self, _schema: &Schema, _value: Self::T) -> Result<Self::T> {
273        Ok(())
274    }
275
276    fn field(&mut self, field: &NestedFieldRef, _value: Self::T) -> Result<Self::T> {
277        self.add_field(field.name.as_str(), field.id)
278    }
279
280    fn r#struct(&mut self, _struct: &StructType, _results: Vec<Self::T>) -> Result<Self::T> {
281        Ok(())
282    }
283
284    fn list(&mut self, list: &ListType, _value: Self::T) -> Result<Self::T> {
285        self.add_field(LIST_FIELD_NAME, list.element_field.id)
286    }
287
288    fn map(&mut self, map: &MapType, _key_value: Self::T, _value: Self::T) -> Result<Self::T> {
289        self.add_field(MAP_KEY_FIELD_NAME, map.key_field.id)?;
290        self.add_field(MAP_VALUE_FIELD_NAME, map.value_field.id)
291    }
292
293    fn primitive(&mut self, _p: &PrimitiveType) -> Result<Self::T> {
294        Ok(())
295    }
296}
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301    use crate::spec::schema::tests::table_schema_nested;
302
303    #[test]
304    fn test_index_parent() {
305        let schema = table_schema_nested();
306        let result = index_parents(&schema.r#struct).unwrap();
307        assert_eq!(result.get(&5).unwrap(), &4);
308        assert_eq!(result.get(&7).unwrap(), &6);
309        assert_eq!(result.get(&8).unwrap(), &6);
310        assert_eq!(result.get(&9).unwrap(), &8);
311        assert_eq!(result.get(&10).unwrap(), &8);
312        assert_eq!(result.get(&12).unwrap(), &11);
313        assert_eq!(result.get(&13).unwrap(), &12);
314        assert_eq!(result.get(&14).unwrap(), &12);
315        assert_eq!(result.get(&16).unwrap(), &15);
316        assert_eq!(result.get(&17).unwrap(), &15);
317    }
318}