Skip to content

Commit 2275b73

Browse files
committed
check if column is only nulls
1 parent 93bdf79 commit 2275b73

File tree

1 file changed

+116
-110
lines changed

1 file changed

+116
-110
lines changed

iceberg-rust/src/file_format/parquet.rs

Lines changed: 116 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -79,11 +79,15 @@ pub fn parquet_to_datafile(
7979
.or_insert(row_group.num_rows());
8080

8181
if let Some(statistics) = column.statistics() {
82+
let mut only_nulls = false;
8283
null_value_counts
8384
.entry(id)
8485
.and_modify(|x| *x += statistics.null_count() as i64)
8586
.or_insert(statistics.null_count() as i64);
8687
if let Some(distinct_count) = statistics.distinct_count() {
88+
if statistics.null_count() == distinct_count {
89+
only_nulls = true
90+
}
8791
distinct_counts
8892
.entry(id)
8993
.and_modify(|x| *x += distinct_count as i64)
@@ -95,123 +99,125 @@ pub fn parquet_to_datafile(
9599
.ok_or_else(|| Error::Schema(column_name.to_string(), "".to_string()))?
96100
.field_type;
97101

98-
if let Type::Primitive(_) = &data_type {
99-
let new = Value::try_from_bytes(statistics.min_bytes(), data_type)?;
100-
match lower_bounds.entry(id) {
101-
Entry::Occupied(mut entry) => {
102-
let entry = entry.get_mut();
103-
match (&entry, &new) {
104-
(Value::Int(current), Value::Int(new_val)) => {
105-
if *current > *new_val {
106-
*entry = new
107-
}
108-
}
109-
(Value::LongInt(current), Value::LongInt(new_val)) => {
110-
if *current > *new_val {
111-
*entry = new
112-
}
113-
}
114-
(Value::Float(current), Value::Float(new_val)) => {
115-
if *current > *new_val {
116-
*entry = new
117-
}
118-
}
119-
(Value::Double(current), Value::Double(new_val)) => {
120-
if *current > *new_val {
121-
*entry = new
122-
}
123-
}
124-
(Value::Date(current), Value::Date(new_val)) => {
125-
if *current > *new_val {
126-
*entry = new
127-
}
102+
if !only_nulls {
103+
if let Type::Primitive(_) = &data_type {
104+
let new = Value::try_from_bytes(statistics.min_bytes(), data_type)?;
105+
match lower_bounds.entry(id) {
106+
Entry::Occupied(mut entry) => {
107+
let entry = entry.get_mut();
108+
match (&entry, &new) {
109+
(Value::Int(current), Value::Int(new_val)) => {
110+
if *current > *new_val {
111+
*entry = new
112+
}
113+
}
114+
(Value::LongInt(current), Value::LongInt(new_val)) => {
115+
if *current > *new_val {
116+
*entry = new
117+
}
118+
}
119+
(Value::Float(current), Value::Float(new_val)) => {
120+
if *current > *new_val {
121+
*entry = new
122+
}
123+
}
124+
(Value::Double(current), Value::Double(new_val)) => {
125+
if *current > *new_val {
126+
*entry = new
127+
}
128+
}
129+
(Value::Date(current), Value::Date(new_val)) => {
130+
if *current > *new_val {
131+
*entry = new
132+
}
133+
}
134+
(Value::Time(current), Value::Time(new_val)) => {
135+
if *current > *new_val {
136+
*entry = new
137+
}
138+
}
139+
(Value::Timestamp(current), Value::Timestamp(new_val)) => {
140+
if *current > *new_val {
141+
*entry = new
142+
}
143+
}
144+
(Value::TimestampTZ(current), Value::TimestampTZ(new_val)) => {
145+
if *current > *new_val {
146+
*entry = new
147+
}
148+
}
149+
_ => (),
128150
}
129-
(Value::Time(current), Value::Time(new_val)) => {
130-
if *current > *new_val {
131-
*entry = new
132-
}
133-
}
134-
(Value::Timestamp(current), Value::Timestamp(new_val)) => {
135-
if *current > *new_val {
136-
*entry = new
137-
}
138-
}
139-
(Value::TimestampTZ(current), Value::TimestampTZ(new_val)) => {
140-
if *current > *new_val {
141-
*entry = new
142-
}
143-
}
144-
_ => (),
151+
}
152+
Entry::Vacant(entry) => {
153+
entry.insert(new);
145154
}
146155
}
147-
Entry::Vacant(entry) => {
148-
entry.insert(new);
149-
}
150-
}
151-
let new = Value::try_from_bytes(statistics.max_bytes(), data_type)?;
152-
match upper_bounds.entry(id) {
153-
Entry::Occupied(mut entry) => {
154-
let entry = entry.get_mut();
155-
match (&entry, &new) {
156-
(Value::Int(current), Value::Int(new_val)) => {
157-
if *current < *new_val {
158-
*entry = new
159-
}
160-
}
161-
(Value::LongInt(current), Value::LongInt(new_val)) => {
162-
if *current < *new_val {
163-
*entry = new
164-
}
165-
}
166-
(Value::Float(current), Value::Float(new_val)) => {
167-
if *current < *new_val {
168-
*entry = new
169-
}
156+
let new = Value::try_from_bytes(statistics.max_bytes(), data_type)?;
157+
match upper_bounds.entry(id) {
158+
Entry::Occupied(mut entry) => {
159+
let entry = entry.get_mut();
160+
match (&entry, &new) {
161+
(Value::Int(current), Value::Int(new_val)) => {
162+
if *current < *new_val {
163+
*entry = new
164+
}
165+
}
166+
(Value::LongInt(current), Value::LongInt(new_val)) => {
167+
if *current < *new_val {
168+
*entry = new
169+
}
170+
}
171+
(Value::Float(current), Value::Float(new_val)) => {
172+
if *current < *new_val {
173+
*entry = new
174+
}
175+
}
176+
(Value::Double(current), Value::Double(new_val)) => {
177+
if *current < *new_val {
178+
*entry = new
179+
}
180+
}
181+
(Value::Date(current), Value::Date(new_val)) => {
182+
if *current < *new_val {
183+
*entry = new
184+
}
185+
}
186+
(Value::Time(current), Value::Time(new_val)) => {
187+
if *current < *new_val {
188+
*entry = new
189+
}
190+
}
191+
(Value::Timestamp(current), Value::Timestamp(new_val)) => {
192+
if *current < *new_val {
193+
*entry = new
194+
}
195+
}
196+
(Value::TimestampTZ(current), Value::TimestampTZ(new_val)) => {
197+
if *current < *new_val {
198+
*entry = new
199+
}
200+
}
201+
_ => (),
170202
}
171-
(Value::Double(current), Value::Double(new_val)) => {
172-
if *current < *new_val {
173-
*entry = new
174-
}
175-
}
176-
(Value::Date(current), Value::Date(new_val)) => {
177-
if *current < *new_val {
178-
*entry = new
179-
}
180-
}
181-
(Value::Time(current), Value::Time(new_val)) => {
182-
if *current < *new_val {
183-
*entry = new
184-
}
185-
}
186-
(Value::Timestamp(current), Value::Timestamp(new_val)) => {
187-
if *current < *new_val {
188-
*entry = new
189-
}
190-
}
191-
(Value::TimestampTZ(current), Value::TimestampTZ(new_val)) => {
192-
if *current < *new_val {
193-
*entry = new
194-
}
195-
}
196-
_ => (),
203+
}
204+
Entry::Vacant(entry) => {
205+
entry.insert(new);
197206
}
198207
}
199-
Entry::Vacant(entry) => {
200-
entry.insert(new);
201-
}
202-
}
203208

204-
if let Some(partition_value) = partition.get_mut(column_name) {
205-
if partition_value.is_none() {
206-
let transform = transforms
207-
.get(column_name)
208-
.ok_or_else(|| Error::InvalidFormat("transform".to_string()))?;
209-
let min = Value::try_from_bytes(statistics.min_bytes(), data_type)?
210-
.tranform(transform)?;
211-
let max = Value::try_from_bytes(statistics.max_bytes(), data_type)?
212-
.tranform(transform)?;
213-
if min == max {
214-
*partition_value = Some(min)
209+
if let Some(partition_value) = partition.get_mut(column_name) {
210+
if partition_value.is_none() {
211+
let transform = transforms
212+
.get(column_name)
213+
.ok_or_else(|| Error::InvalidFormat("transform".to_string()))?;
214+
let min = Value::try_from_bytes(statistics.min_bytes(), data_type)?
215+
.tranform(transform)?;
216+
let max = Value::try_from_bytes(statistics.max_bytes(), data_type)?
217+
.tranform(transform)?;
218+
if min == max {
219+
*partition_value = Some(min)
220+
}
215221
}
216222
}
217223
}

0 commit comments

Comments
 (0)