iceberg_catalog_hms/
schema.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use hive_metastore::FieldSchema;
19use iceberg::spec::{PrimitiveType, Schema, SchemaVisitor, visit_schema};
20use iceberg::{Error, ErrorKind, Result};
21
22type HiveSchema = Vec<FieldSchema>;
23
24#[derive(Debug, Default)]
25pub(crate) struct HiveSchemaBuilder {
26    schema: HiveSchema,
27    depth: usize,
28}
29
30impl HiveSchemaBuilder {
31    /// Creates a new `HiveSchemaBuilder` from iceberg `Schema`
32    pub fn from_iceberg(schema: &Schema) -> Result<HiveSchemaBuilder> {
33        let mut builder = Self::default();
34        visit_schema(schema, &mut builder)?;
35        Ok(builder)
36    }
37
38    /// Returns the newly converted `HiveSchema`
39    pub fn build(self) -> HiveSchema {
40        self.schema
41    }
42
43    /// Check if is in `StructType` while traversing schema
44    fn is_inside_struct(&self) -> bool {
45        self.depth > 0
46    }
47}
48
49impl SchemaVisitor for HiveSchemaBuilder {
50    type T = String;
51
52    fn schema(
53        &mut self,
54        _schema: &iceberg::spec::Schema,
55        value: String,
56    ) -> iceberg::Result<String> {
57        Ok(value)
58    }
59
60    fn before_struct_field(
61        &mut self,
62        _field: &iceberg::spec::NestedFieldRef,
63    ) -> iceberg::Result<()> {
64        self.depth += 1;
65        Ok(())
66    }
67
68    fn r#struct(
69        &mut self,
70        r#_struct: &iceberg::spec::StructType,
71        results: Vec<String>,
72    ) -> iceberg::Result<String> {
73        Ok(format!("struct<{}>", results.join(", ")))
74    }
75
76    fn after_struct_field(
77        &mut self,
78        _field: &iceberg::spec::NestedFieldRef,
79    ) -> iceberg::Result<()> {
80        self.depth -= 1;
81        Ok(())
82    }
83
84    fn field(
85        &mut self,
86        field: &iceberg::spec::NestedFieldRef,
87        value: String,
88    ) -> iceberg::Result<String> {
89        if self.is_inside_struct() {
90            return Ok(format!("{}:{}", field.name, value));
91        }
92
93        self.schema.push(FieldSchema {
94            name: Some(field.name.clone().into()),
95            r#type: Some(value.clone().into()),
96            comment: field.doc.clone().map(|doc| doc.into()),
97        });
98
99        Ok(value)
100    }
101
102    fn list(&mut self, _list: &iceberg::spec::ListType, value: String) -> iceberg::Result<String> {
103        Ok(format!("array<{value}>"))
104    }
105
106    fn map(
107        &mut self,
108        _map: &iceberg::spec::MapType,
109        key_value: String,
110        value: String,
111    ) -> iceberg::Result<String> {
112        Ok(format!("map<{key_value},{value}>"))
113    }
114
115    fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result<String> {
116        let hive_type = match p {
117            PrimitiveType::Boolean => "boolean".to_string(),
118            PrimitiveType::Int => "int".to_string(),
119            PrimitiveType::Long => "bigint".to_string(),
120            PrimitiveType::Float => "float".to_string(),
121            PrimitiveType::Double => "double".to_string(),
122            PrimitiveType::Date => "date".to_string(),
123            PrimitiveType::Timestamp => "timestamp".to_string(),
124            PrimitiveType::TimestampNs => "timestamp_ns".to_string(),
125            PrimitiveType::TimestamptzNs => "timestamptz_ns".to_string(),
126            PrimitiveType::Time | PrimitiveType::String | PrimitiveType::Uuid => {
127                "string".to_string()
128            }
129            PrimitiveType::Binary | PrimitiveType::Fixed(_) => "binary".to_string(),
130            PrimitiveType::Decimal { precision, scale } => {
131                format!("decimal({precision},{scale})")
132            }
133            _ => {
134                return Err(Error::new(
135                    ErrorKind::FeatureUnsupported,
136                    "Conversion from 'Timestamptz' is not supported",
137                ));
138            }
139        };
140
141        Ok(hive_type)
142    }
143}
144
#[cfg(test)]
mod tests {
    use iceberg::Result;
    use iceberg::spec::Schema;

    use super::*;

    /// Parses `record` as an iceberg schema in JSON form and converts it to
    /// its Hive representation.
    fn convert(record: &str) -> Result<HiveSchema> {
        let schema = serde_json::from_str::<Schema>(record)?;
        let builder = HiveSchemaBuilder::from_iceberg(&schema)?;
        Ok(builder.build())
    }

    /// Builds the expected Hive column with the given name and type and no comment.
    fn column(name: &'static str, hive_type: &'static str) -> FieldSchema {
        FieldSchema {
            name: Some(name.into()),
            r#type: Some(hive_type.into()),
            comment: None,
        }
    }

    /// A map nested inside a map is rendered as nested `map<...>` types.
    #[test]
    fn test_schema_with_nested_maps() -> Result<()> {
        let record = r#"
            {
                "schema-id": 1,
                "type": "struct",
                "fields": [
                    {
                        "id": 1,
                        "name": "quux",
                        "required": true,
                        "type": {
                            "type": "map",
                            "key-id": 2,
                            "key": "string",
                            "value-id": 3,
                            "value-required": true,
                            "value": {
                                "type": "map",
                                "key-id": 4,
                                "key": "string",
                                "value-id": 5,
                                "value-required": true,
                                "value": "int"
                            }
                        }
                    }
                ]
            }
        "#;

        let actual = convert(record)?;
        let wanted = vec![column("quux", "map<string,map<string,int>>")];

        assert_eq!(actual, wanted);

        Ok(())
    }

    /// A struct used as a list element is rendered inside `array<...>`.
    #[test]
    fn test_schema_with_struct_inside_list() -> Result<()> {
        let record = r#"
        {
            "schema-id": 1,
            "type": "struct",
            "fields": [
                {
                    "id": 1,
                    "name": "location",
                    "required": true,
                    "type": {
                        "type": "list",
                        "element-id": 2,
                        "element-required": true,
                        "element": {
                            "type": "struct",
                            "fields": [
                                {
                                    "id": 3,
                                    "name": "latitude",
                                    "required": false,
                                    "type": "float"
                                },
                                {
                                    "id": 4,
                                    "name": "longitude",
                                    "required": false,
                                    "type": "float"
                                }
                            ]
                        }
                    }
                }
            ]
        }
        "#;

        let actual = convert(record)?;
        let wanted = vec![column(
            "location",
            "array<struct<latitude:float, longitude:float>>",
        )];

        assert_eq!(actual, wanted);

        Ok(())
    }

    /// A top-level struct field becomes a single column of type `struct<...>`.
    #[test]
    fn test_schema_with_structs() -> Result<()> {
        let record = r#"{
            "type": "struct",
            "schema-id": 1,
            "fields": [
                {
                    "id": 1,
                    "name": "person",
                    "required": true,
                    "type": {
                        "type": "struct",
                        "fields": [
                            {
                                "id": 2,
                                "name": "name",
                                "required": true,
                                "type": "string"
                            },
                            {
                                "id": 3,
                                "name": "age",
                                "required": false,
                                "type": "int"
                            }
                        ]
                    }
                }
            ]
        }"#;

        let actual = convert(record)?;
        let wanted = vec![column("person", "struct<name:string, age:int>")];

        assert_eq!(actual, wanted);

        Ok(())
    }

    /// Each primitive column maps to the expected Hive type name.
    #[test]
    fn test_schema_with_simple_fields() -> Result<()> {
        let record = r#"{
            "type": "struct",
            "schema-id": 1,
            "fields": [
                {
                    "id": 1,
                    "name": "c1",
                    "required": true,
                    "type": "boolean"
                },
                {
                    "id": 2,
                    "name": "c2",
                    "required": true,
                    "type": "int"
                },
                {
                    "id": 3,
                    "name": "c3",
                    "required": true,
                    "type": "long"
                },
                {
                    "id": 4,
                    "name": "c4",
                    "required": true,
                    "type": "float"
                },
                {
                    "id": 5,
                    "name": "c5",
                    "required": true,
                    "type": "double"
                },
                {
                    "id": 6,
                    "name": "c6",
                    "required": true,
                    "type": "decimal(2,2)"
                },
                {
                    "id": 7,
                    "name": "c7",
                    "required": true,
                    "type": "date"
                },
                {
                    "id": 8,
                    "name": "c8",
                    "required": true,
                    "type": "time"
                },
                {
                    "id": 9,
                    "name": "c9",
                    "required": true,
                    "type": "timestamp"
                },
                {
                    "id": 10,
                    "name": "c10",
                    "required": true,
                    "type": "string"
                },
                {
                    "id": 11,
                    "name": "c11",
                    "required": true,
                    "type": "uuid"
                },
                {
                    "id": 12,
                    "name": "c12",
                    "required": true,
                    "type": "fixed[4]"
                },
                {
                    "id": 13,
                    "name": "c13",
                    "required": true,
                    "type": "binary"
                }
            ]
        }"#;

        let actual = convert(record)?;

        // (column name, expected Hive type) in declaration order.
        let wanted: Vec<FieldSchema> = [
            ("c1", "boolean"),
            ("c2", "int"),
            ("c3", "bigint"),
            ("c4", "float"),
            ("c5", "double"),
            ("c6", "decimal(2,2)"),
            ("c7", "date"),
            ("c8", "string"),
            ("c9", "timestamp"),
            ("c10", "string"),
            ("c11", "string"),
            ("c12", "binary"),
            ("c13", "binary"),
        ]
        .into_iter()
        .map(|(name, hive_type)| column(name, hive_type))
        .collect();

        assert_eq!(actual, wanted);

        Ok(())
    }
}