Skip to content

Commit 05fa433

Browse files
author
Jan Kaul
committed
add function to obtain partition bounds of snapshot
1 parent d7849b0 commit 05fa433

File tree

3 files changed

+71
-2
lines changed

3 files changed

+71
-2
lines changed

datafusion_iceberg/src/table.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ fn fake_object_store_url(table_location_url: &str) -> Option<ObjectStoreUrl> {
318318
u.path()
319319
.as_bytes()
320320
.iter()
321-
.map(|b| format!("{:02x}", b))
321+
.map(|b| format!("{b:02x}"))
322322
.collect::<Vec<_>>()
323323
.join("")
324324
)))

iceberg-rust/src/table/manifest_list.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,75 @@ pub(crate) async fn read_snapshot<'metadata>(
144144
ManifestListReader::new(bytes, table_metadata)
145145
}
146146

147+
/// Computes the overall partition bounds for all data files in a snapshot.
148+
///
149+
/// This function reads the manifest list for a given snapshot and computes the
150+
/// bounding rectangle that encompasses all partition values across all manifests
151+
/// in the snapshot. It's useful for understanding the overall data distribution
152+
/// and for query optimization by determining which partitions contain data.
153+
///
154+
/// The function:
155+
/// 1. Fetches the manifest list file from object storage
156+
/// 2. Iterates through all manifest entries in the manifest list
157+
/// 3. For each manifest that has partition information, converts the partition
158+
/// summary to a rectangle and expands the overall bounds
159+
/// 4. Returns the combined bounding rectangle or None if no partitions are found
160+
///
161+
/// # Arguments
162+
/// * `snapshot` - The snapshot containing the manifest list location
163+
/// * `table_metadata` - Reference to the table metadata for format version info
164+
/// * `object_store` - The object store to read the manifest list file from
165+
///
166+
/// # Returns
167+
/// * `Result<Option<Rectangle>, Error>` - The bounding rectangle encompassing all
168+
/// partition values, or None if no partitions are found, or an error if the
169+
/// operation fails
170+
///
171+
/// # Errors
172+
/// Returns an error if:
173+
/// * The manifest list file cannot be read from storage
174+
/// * The manifest list format is invalid
175+
/// * The Avro reader cannot be created
176+
/// * Partition summary conversion fails
177+
///
178+
/// # Example Usage
179+
/// ```ignore
180+
/// let bounds = snapshot_partition_bounds(&snapshot, &table_metadata, object_store).await?;
181+
/// if let Some(rectangle) = bounds {
182+
/// println!("Partition bounds: {:?}", rectangle);
183+
/// } else {
184+
/// println!("No partition bounds found");
185+
/// }
186+
/// ```
187+
pub async fn snapshot_partition_bounds(
188+
snapshot: &Snapshot,
189+
table_metadata: &TableMetadata,
190+
object_store: Arc<dyn ObjectStore>,
191+
) -> Result<Option<Rectangle>, Error> {
192+
let bytes: Cursor<Vec<u8>> = Cursor::new(
193+
object_store
194+
.get(&strip_prefix(snapshot.manifest_list()).into())
195+
.await?
196+
.bytes()
197+
.await?
198+
.into(),
199+
);
200+
201+
ManifestListReader::new(bytes, table_metadata)?.try_fold(None::<Rectangle>, |acc, x| {
202+
if let Some(partitions) = x?.partitions {
203+
let rect = summary_to_rectangle(&partitions)?;
204+
if let Some(mut acc) = acc {
205+
acc.expand(&rect);
206+
Ok(Some(acc))
207+
} else {
208+
Ok(Some(rect))
209+
}
210+
} else {
211+
Ok(acc)
212+
}
213+
})
214+
}
215+
147216
/// A writer for Iceberg manifest list files that manages the creation and updating of manifest lists.
148217
///
149218
/// The ManifestListWriter is responsible for:

iceberg-rust/src/util/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use crate::error::Error;
1111
type Vec4<T> = SmallVec<[T; 4]>;
1212

1313
/// A pair of lower (`min`) and upper (`max`) bound vectors of `Value`s.
///
/// This hunk changes the visibility from `pub(crate)` to `pub`.
// NOTE(review): presumably `min` and `max` hold one entry per partition field
// and always have equal length — confirm against the methods on this type.
#[derive(Debug, Clone, PartialEq, Eq)]
14-
pub(crate) struct Rectangle {
14+
pub struct Rectangle {
1515
/// Lower bounds.
pub min: Vec4<Value>,
1616
/// Upper bounds.
pub max: Vec4<Value>,
1717
}

0 commit comments

Comments
 (0)