iceberg/spec/
statistic_file.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Statistic Files for TableMetadata
19
20use std::collections::HashMap;
21
22use serde::{Deserialize, Serialize};
23
24#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "kebab-case")]
26/// Represents a statistics file
27pub struct StatisticsFile {
28    /// The snapshot id of the statistics file.
29    pub snapshot_id: i64,
30    /// Path of the statistics file
31    pub statistics_path: String,
32    /// File size in bytes
33    pub file_size_in_bytes: i64,
34    /// File footer size in bytes
35    pub file_footer_size_in_bytes: i64,
36    /// Base64-encoded implementation-specific key metadata for encryption.
37    #[serde(default, skip_serializing_if = "Option::is_none")]
38    pub key_metadata: Option<String>,
39    /// Blob metadata
40    pub blob_metadata: Vec<BlobMetadata>,
41}
42
43#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
44#[serde(rename_all = "kebab-case")]
45/// Represents a blob of metadata, which is a part of a statistics file
46pub struct BlobMetadata {
47    /// Type of the blob.
48    pub r#type: String,
49    /// Snapshot id of the blob.
50    pub snapshot_id: i64,
51    /// Sequence number of the blob.
52    pub sequence_number: i64,
53    /// Fields of the blob.
54    pub fields: Vec<i32>,
55    /// Properties of the blob.
56    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
57    pub properties: HashMap<String, String>,
58}
59
60#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
61#[serde(rename_all = "kebab-case")]
62/// Statistics file for a partition
63pub struct PartitionStatisticsFile {
64    /// The snapshot id of the statistics file.
65    pub snapshot_id: i64,
66    /// Path of the statistics file
67    pub statistics_path: String,
68    /// File size in bytes
69    pub file_size_in_bytes: i64,
70}
71
#[cfg(test)]
mod test {
    use std::fmt::Debug;

    use serde::de::DeserializeOwned;
    use serde_json::json;

    use super::*;

    /// Asserts that `json` and `expected` are the JSON and Rust forms of the
    /// same value.
    ///
    /// Three properties are checked, in order:
    /// 1. parsing `json` yields `expected`;
    /// 2. serializing the parsed value yields JSON equal to `json`, which
    ///    pins the emitted key names and the `skip_serializing_if` rules;
    /// 3. parsing the serialized output yields `expected` again.
    ///
    /// Because of (2), a fixture must omit optional keys exactly when the
    /// serializer would skip them.
    ///
    /// # Panics
    ///
    /// Panics if any (de)serialization step fails or any comparison differs.
    fn test_serde_json<T: Serialize + DeserializeOwned + PartialEq + Debug>(
        json: serde_json::Value,
        expected: T,
    ) {
        let json_str = json.to_string();
        let actual: T = serde_json::from_str(&json_str).expect("Failed to parse from json");
        assert_eq!(actual, expected, "Parsed value is not equal to expected");

        // Serialize once and reuse the text for both the wire-format check
        // and the round-trip check.
        let serialized_str = serde_json::to_string(&actual).expect("Failed to serialize to json");

        // Compare as `Value`s so key order and whitespace do not matter.
        let serialized: serde_json::Value = serde_json::from_str(&serialized_str)
            .expect("Failed to parse serialized json as a value");
        assert_eq!(
            serialized, json,
            "Serialized json is not equal to the input json"
        );

        let restored: T =
            serde_json::from_str(&serialized_str).expect("Failed to parse from serialized json");
        assert_eq!(
            restored, expected,
            "Parsed restored value is not equal to expected"
        );
    }

    /// A blob with a non-empty `properties` map keeps the map on the wire.
    #[test]
    fn test_blob_metadata_serde() {
        test_serde_json(
            json!({
                "type": "boring-type",
                "snapshot-id": 1940541653261589030i64,
                "sequence-number": 2,
                "fields": [
                        1
                ],
                "properties": {
                        "prop-key": "prop-value"
                }
            }),
            BlobMetadata {
                r#type: "boring-type".to_string(),
                snapshot_id: 1940541653261589030,
                sequence_number: 2,
                fields: vec![1],
                properties: vec![("prop-key".to_string(), "prop-value".to_string())]
                    .into_iter()
                    .collect(),
            },
        );
    }

    /// A blob without a `properties` key parses to an empty map, and the
    /// empty map is not written back.
    #[test]
    fn test_blob_metadata_serde_no_properties() {
        test_serde_json(
            json!({
                "type": "boring-type",
                "snapshot-id": 1940541653261589030i64,
                "sequence-number": 2,
                "fields": [
                        1
                ]
            }),
            BlobMetadata {
                r#type: "boring-type".to_string(),
                snapshot_id: 1940541653261589030,
                sequence_number: 2,
                fields: vec![1],
                properties: HashMap::new(),
            },
        );
    }

    /// A statistics file without `key-metadata` parses to `None`, and the
    /// nested blob is handled like a standalone one.
    #[test]
    fn test_statistics_file_serde() {
        test_serde_json(
            json!({
              "snapshot-id": 3055729675574597004i64,
              "statistics-path": "s3://a/b/stats.puffin",
              "file-size-in-bytes": 413,
              "file-footer-size-in-bytes": 42,
              "blob-metadata": [
                {
                  "type": "ndv",
                  "snapshot-id": 3055729675574597004i64,
                  "sequence-number": 1,
                  "fields": [1]
                }
              ]
            }),
            StatisticsFile {
                snapshot_id: 3055729675574597004i64,
                statistics_path: "s3://a/b/stats.puffin".to_string(),
                file_size_in_bytes: 413,
                file_footer_size_in_bytes: 42,
                key_metadata: None,
                blob_metadata: vec![BlobMetadata {
                    r#type: "ndv".to_string(),
                    snapshot_id: 3055729675574597004i64,
                    sequence_number: 1,
                    fields: vec![1],
                    properties: HashMap::new(),
                }],
            },
        );
    }

    /// A partition statistics file round-trips with its three required keys.
    #[test]
    fn test_partition_statistics_serde() {
        test_serde_json(
            json!({
              "snapshot-id": 3055729675574597004i64,
              "statistics-path": "s3://a/b/partition-stats.parquet",
              "file-size-in-bytes": 43
            }),
            PartitionStatisticsFile {
                snapshot_id: 3055729675574597004,
                statistics_path: "s3://a/b/partition-stats.parquet".to_string(),
                file_size_in_bytes: 43,
            },
        );
    }
}