Skip to content

Commit e8bbfbb

Browse files
authored
GH-2956: Use avro SchemaBuilder API to convert record (#2957)
The avro schema builder API is cleaned and more stable. It decreases chance of using newly introduced avro API in case user run with legacy avro version As OPTIONAL converted fields sets null as default, increase consistency by using [] as default for REPEATED converted fields.
1 parent c928c4b commit e8bbfbb

File tree

2 files changed

+51
-47
lines changed

2 files changed

+51
-47
lines changed

parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import static java.util.Optional.empty;
2222
import static java.util.Optional.of;
23-
import static org.apache.avro.JsonProperties.NULL_VALUE;
2423
import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED;
2524
import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED_DEFAULT;
2625
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_FIXED_AS_INT96;
@@ -58,6 +57,7 @@
5857
import org.apache.avro.LogicalType;
5958
import org.apache.avro.LogicalTypes;
6059
import org.apache.avro.Schema;
60+
import org.apache.avro.SchemaBuilder;
6161
import org.apache.hadoop.conf.Configuration;
6262
import org.apache.parquet.conf.HadoopParquetConfiguration;
6363
import org.apache.parquet.conf.ParquetConfiguration;
@@ -296,21 +296,24 @@ Schema convert(GroupType parquetSchema) {
296296
}
297297

298298
private Schema convertFields(String name, List<Type> parquetFields, Map<String, Integer> names) {
299-
String ns = namespace(name, names);
300-
List<Schema.Field> fields = new ArrayList<Schema.Field>();
299+
SchemaBuilder.FieldAssembler<Schema> builder =
300+
SchemaBuilder.builder(namespace(name, names)).record(name).fields();
301301
for (Type parquetType : parquetFields) {
302302
Schema fieldSchema = convertField(parquetType, names);
303303
if (parquetType.isRepetition(REPEATED)) { // If a repeated field is ungrouped, treat as REQUIRED per spec
304-
fields.add(new Schema.Field(parquetType.getName(), Schema.createArray(fieldSchema)));
304+
builder.name(parquetType.getName())
305+
.type()
306+
.array()
307+
.items()
308+
.type(fieldSchema)
309+
.arrayDefault(new ArrayList<>());
305310
} else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
306-
fields.add(new Schema.Field(parquetType.getName(), optional(fieldSchema), null, NULL_VALUE));
311+
builder.name(parquetType.getName()).type().optional().type(fieldSchema);
307312
} else { // REQUIRED
308-
fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null, (Object) null));
313+
builder.name(parquetType.getName()).type(fieldSchema).noDefault();
309314
}
310315
}
311-
Schema schema = Schema.createRecord(name, null, ns, false);
312-
schema.setFields(fields);
313-
return schema;
316+
return builder.endRecord();
314317
}
315318

316319
private Schema convertField(final Type parquetType, Map<String, Integer> names) {

parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -72,43 +72,43 @@ public static void setupConf() {
7272
NEW_BEHAVIOR.setBoolean("parquet.avro.write-old-list-structure", false);
7373
}
7474

75-
public static final String ALL_PARQUET_SCHEMA =
76-
"message org.apache.parquet.avro.myrecord {\n" + " required boolean myboolean;\n"
77-
+ " required int32 myint;\n"
78-
+ " required int64 mylong;\n"
79-
+ " required float myfloat;\n"
80-
+ " required double mydouble;\n"
81-
+ " required binary mybytes;\n"
82-
+ " required binary mystring (UTF8);\n"
83-
+ " required group mynestedrecord {\n"
84-
+ " required int32 mynestedint;\n"
85-
+ " }\n"
86-
+ " required binary myenum (ENUM);\n"
87-
+ " required group myarray (LIST) {\n"
88-
+ " repeated int32 array;\n"
89-
+ " }\n"
90-
+ " optional group myoptionalarray (LIST) {\n"
91-
+ " repeated int32 array;\n"
92-
+ " }\n"
93-
+ " required group myarrayofoptional (LIST) {\n"
94-
+ " repeated group list {\n"
95-
+ " optional int32 element;\n"
96-
+ " }\n"
97-
+ " }\n"
98-
+ " required group myrecordarray (LIST) {\n"
99-
+ " repeated group array {\n"
100-
+ " required int32 a;\n"
101-
+ " required int32 b;\n"
102-
+ " }\n"
103-
+ " }\n"
104-
+ " required group mymap (MAP) {\n"
105-
+ " repeated group map (MAP_KEY_VALUE) {\n"
106-
+ " required binary key (UTF8);\n"
107-
+ " required int32 value;\n"
108-
+ " }\n"
109-
+ " }\n"
110-
+ " required fixed_len_byte_array(1) myfixed;\n"
111-
+ "}\n";
75+
public static final String ALL_PARQUET_SCHEMA = "message org.apache.parquet.avro.myrecord {\n"
76+
+ " required boolean myboolean;\n"
77+
+ " required int32 myint;\n"
78+
+ " required int64 mylong;\n"
79+
+ " required float myfloat;\n"
80+
+ " required double mydouble;\n"
81+
+ " required binary mybytes;\n"
82+
+ " required binary mystring (UTF8);\n"
83+
+ " required group mynestedrecord {\n"
84+
+ " required int32 mynestedint;\n"
85+
+ " }\n"
86+
+ " required binary myenum (ENUM);\n"
87+
+ " required group myarray (LIST) {\n"
88+
+ " repeated int32 array;\n"
89+
+ " }\n"
90+
+ " optional group myoptionalarray (LIST) {\n"
91+
+ " repeated int32 array;\n"
92+
+ " }\n"
93+
+ " required group myarrayofoptional (LIST) {\n"
94+
+ " repeated group list {\n"
95+
+ " optional int32 element;\n"
96+
+ " }\n"
97+
+ " }\n"
98+
+ " required group myrecordarray (LIST) {\n"
99+
+ " repeated group array {\n"
100+
+ " required int32 a;\n"
101+
+ " required int32 b;\n"
102+
+ " }\n"
103+
+ " }\n"
104+
+ " required group mymap (MAP) {\n"
105+
+ " repeated group map (MAP_KEY_VALUE) {\n"
106+
+ " required binary key (UTF8);\n"
107+
+ " required int32 value;\n"
108+
+ " }\n"
109+
+ " }\n"
110+
+ " required fixed_len_byte_array(1) myfixed;\n"
111+
+ "}\n";
112112

113113
private void testAvroToParquetConversion(Schema avroSchema, String schemaString) throws Exception {
114114
testAvroToParquetConversion(new Configuration(false), avroSchema, schemaString);
@@ -432,7 +432,8 @@ public void testConvertUngroupedRepeatedField() throws Exception {
432432
+ " \"name\": \"SchemaWithRepeatedField\","
433433
+ " \"fields\": [{"
434434
+ " \"name\": \"repeatedField\","
435-
+ " \"type\": {\"type\": \"array\",\"items\": \"int\"}"
435+
+ " \"type\": {\"type\": \"array\",\"items\": \"int\"},"
436+
+ " \"default\": []"
436437
+ " }]"
437438
+ "}"),
438439
"message SchemaWithRepeatedField { repeated int32 repeatedField; }");

0 commit comments

Comments
 (0)