Skip to content

Commit 133fb4b

Browse files
committed
PARQUET-2249: Add IEEE-754 total order and nan count for floating types
1 parent 6e2f7bb commit 133fb4b

File tree

24 files changed

+2694
-131
lines changed

24 files changed

+2694
-131
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 47 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.io.api.Binary;
22+
import org.apache.parquet.schema.ColumnOrder;
23+
import org.apache.parquet.schema.Float16;
24+
import org.apache.parquet.schema.LogicalTypeAnnotation;
2225
import org.apache.parquet.schema.PrimitiveType;
2326
import org.apache.parquet.schema.Types;
2427

@@ -28,6 +31,7 @@ public class BinaryStatistics extends Statistics<Binary> {
2831
private static final PrimitiveType DEFAULT_FAKE_TYPE =
2932
Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("fake_binary_type");
3033

34+
private final boolean isFloat16;
3135
private Binary max;
3236
private Binary min;
3337

@@ -41,26 +45,51 @@ public BinaryStatistics() {
4145

4246
/**
 * Creates statistics for a column of the given primitive type.
 *
 * <p>Records whether the type carries the Float16 logical type annotation; the flag
 * is what later switches on NaN handling for this binary column.
 *
 * @param type the primitive type these statistics describe; passed to the superclass
 */
BinaryStatistics(PrimitiveType type) {
4347
super(type);
// The flag is final: it is derived once from the type's logical annotation.
48+
this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
49+
if (isFloat16) {
// NOTE(review): incrementNanCount is defined outside this view. Adding 0 presumably
// marks the NaN count as "tracked, currently zero" for Float16 columns - confirm.
50+
incrementNanCount(0);
51+
}
4452
}
4553

4654
/**
 * Copy constructor: builds statistics with the same type, Float16 flag, null count
 * and NaN count as {@code other}.
 *
 * <p>Min and max are carried over only when {@code other} reports a non-null value.
 *
 * @param other the statistics instance to copy from
 */
private BinaryStatistics(BinaryStatistics other) {
4755
super(other.type());
56+
this.isFloat16 = other.isFloat16;
4857
if (other.hasNonNullValue()) {
4958
initializeStats(other.min, other.max);
5059
}
5160
setNumNulls(other.getNumNulls());
// NOTE(review): the NaN count is transferred by incrementing rather than by a setter;
// this presumably relies on the new instance starting at zero/unset - confirm.
61+
incrementNanCount(other.getNanCount());
5262
}
5363

5464
/**
 * Folds a single value into the running min/max of this column.
 *
 * <p>For Float16 columns every NaN value is also counted. The first value seen
 * initializes both min and max (even if it is NaN). After that, when the column is
 * Float16 and its column order equals {@code ColumnOrder.ieee754TotalOrder()}, NaN
 * values never replace min/max, and a non-NaN value replaces a min/max that is
 * currently NaN. In every other case the value is compared using {@code comparator()}.
 *
 * <p>In this diff view, lines whose marker ends in {@code -} are the previous
 * implementation removed by the commit; lines ending in {@code +} are the new code.
 *
 * @param value the value to account for; it is copied before being stored
 */
@Override
5565
public void updateStats(Binary value) {
// NaN counting happens before any min/max handling and only for Float16 columns.
66+
if (isFloat16 && Float16.isNaN(value.get2BytesLittleEndian())) {
67+
incrementNanCount();
68+
}
5669
if (!this.hasNonNullValue()) {
// First value: it becomes both bounds and the statistics are marked as non-empty.
5770
min = value.copy();
5871
max = value.copy();
5972
this.markAsNotEmpty();
60-
} else if (comparator().compare(min, value) > 0) {
61-
min = value.copy();
62-
} else if (comparator().compare(max, value) < 0) {
63-
max = value.copy();
73+
} else {
74+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
// Total-order Float16 path: a NaN value is ignored for min/max purposes.
75+
if (!Float16.isNaN(value.get2BytesLittleEndian())) {
// A NaN bound (possible when the first value was NaN) is always replaced.
76+
if (Float16.isNaN(min.get2BytesLittleEndian())
77+
|| comparator().compare(min, value) > 0) {
78+
min = value.copy();
79+
}
// Independent "if" (not else-if): one value may update both min and max here.
80+
if (Float16.isNaN(max.get2BytesLittleEndian())
81+
|| comparator().compare(max, value) < 0) {
82+
max = value.copy();
83+
}
84+
}
// Return early so the generic comparison below is skipped on this path.
85+
return;
86+
}
87+
// Generic path: the else-if means a value updates at most one of min/max.
88+
if (comparator().compare(min, value) > 0) {
89+
min = value.copy();
90+
} else if (comparator().compare(max, value) < 0) {
91+
max = value.copy();
92+
}
6493
}
6594
}
6695

@@ -126,6 +155,20 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
126155
*/
127156
@Deprecated
128157
public void updateStats(Binary min_value, Binary max_value) {
158+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
159+
if (!Float16.isNaN(min_value.get2BytesLittleEndian())) {
160+
if (Float16.isNaN(min.get2BytesLittleEndian()) || comparator().compare(min, min_value) > 0) {
161+
min = min_value.copy();
162+
}
163+
}
164+
if (!Float16.isNaN(max_value.get2BytesLittleEndian())) {
165+
if (Float16.isNaN(max.get2BytesLittleEndian()) || comparator().compare(max, max_value) < 0) {
166+
max = max_value.copy();
167+
}
168+
}
169+
return;
170+
}
171+
129172
if (comparator().compare(min, min_value) > 0) {
130173
min = min_value.copy();
131174
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -41,6 +42,7 @@ public DoubleStatistics() {
4142

4243
/**
 * Creates statistics for a double column of the given primitive type.
 *
 * @param type the primitive type these statistics describe; passed to the superclass
 */
DoubleStatistics(PrimitiveType type) {
4344
super(type);
// NOTE(review): incrementNanCount is defined outside this view. Adding 0 presumably
// marks the NaN count as "tracked, currently zero" from construction - confirm.
45+
incrementNanCount(0);
4446
}
4547

4648
private DoubleStatistics(DoubleStatistics other) {
@@ -49,10 +51,14 @@ private DoubleStatistics(DoubleStatistics other) {
4951
initializeStats(other.min, other.max);
5052
}
5153
setNumNulls(other.getNumNulls());
54+
incrementNanCount(other.getNanCount());
5255
}
5356

5457
@Override
5558
public void updateStats(double value) {
59+
if (Double.isNaN(value)) {
60+
incrementNanCount();
61+
}
5662
if (!this.hasNonNullValue()) {
5763
initializeStats(value, value);
5864
} else {
@@ -98,6 +104,20 @@ public boolean isSmallerThan(long size) {
98104
}
99105

100106
public void updateStats(double min_value, double max_value) {
107+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
108+
if (!Double.isNaN(min_value)) {
109+
if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
110+
min = min_value;
111+
}
112+
}
113+
if (!Double.isNaN(max_value)) {
114+
if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
115+
max = max_value;
116+
}
117+
}
118+
return;
119+
}
120+
101121
if (comparator().compare(min, min_value) > 0) {
102122
min = min_value;
103123
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -42,6 +43,7 @@ public FloatStatistics() {
4243

4344
/**
 * Creates statistics for a float column of the given primitive type.
 *
 * @param type the primitive type these statistics describe; passed to the superclass
 */
FloatStatistics(PrimitiveType type) {
4445
super(type);
// NOTE(review): incrementNanCount is defined outside this view. Adding 0 presumably
// marks the NaN count as "tracked, currently zero" from construction - confirm.
46+
incrementNanCount(0);
4547
}
4648

4749
private FloatStatistics(FloatStatistics other) {
@@ -50,10 +52,14 @@ private FloatStatistics(FloatStatistics other) {
5052
initializeStats(other.min, other.max);
5153
}
5254
setNumNulls(other.getNumNulls());
55+
incrementNanCount(other.getNanCount());
5356
}
5457

5558
@Override
5659
public void updateStats(float value) {
60+
if (Float.isNaN(value)) {
61+
incrementNanCount();
62+
}
5763
if (!this.hasNonNullValue()) {
5864
initializeStats(value, value);
5965
} else {
@@ -99,6 +105,20 @@ public boolean isSmallerThan(long size) {
99105
}
100106

101107
public void updateStats(float min_value, float max_value) {
108+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
109+
if (!Float.isNaN(min_value)) {
110+
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
111+
min = min_value;
112+
}
113+
}
114+
if (!Float.isNaN(max_value)) {
115+
if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
116+
max = max_value;
117+
}
118+
}
119+
return;
120+
}
121+
102122
if (comparator().compare(min, min_value) > 0) {
103123
min = min_value;
104124
}

0 commit comments

Comments
 (0)