Skip to content

Commit aa79155

Browse files
author
yuezhang
committed
Merge branch 'master' into rollbackfirstwhenautoclustering
2 parents a215aa2 + 48a3906 commit aa79155

30 files changed

Lines changed: 561 additions & 208 deletions

File tree

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
hoodie.insert.shuffle.parallelism=100
19+
hoodie.upsert.shuffle.parallelism=100
20+
hoodie.bulkinsert.shuffle.parallelism=100
21+
22+
hoodie.deltastreamer.source.test.num_partitions=100
23+
hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false
24+
hoodie.deltastreamer.source.test.max_unique_records=100000000
25+
hoodie.embed.timeline.server=false
26+
hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector
27+
28+
hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector
29+
hoodie.datasource.hive_sync.skip_ro_suffix=true
30+
31+
hoodie.datasource.write.recordkey.field=_row_key
32+
hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator
33+
hoodie.datasource.write.partitionpath.field=timestamp
34+
35+
hoodie.compact.inline.max.delta.commits=2
36+
37+
hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input
38+
hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc
39+
hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc
40+
hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP
41+
hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd
42+
43+
hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/
44+
hoodie.datasource.hive_sync.mode=jdbc
45+
hoodie.datasource.hive_sync.database=testdb
46+
hoodie.datasource.hive_sync.table=table1
47+
hoodie.datasource.hive_sync.assume_date_partitioning=false
48+
hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path
49+
hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
50+
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Use compact-test.properties for this yaml file.
18+
dag_name: mor-async-compact.yaml
19+
dag_rounds: 4
20+
dag_intermittent_delay_mins: 0
21+
dag_content:
22+
first_insert:
23+
config:
24+
record_size: 1000
25+
num_partitions_insert: 1
26+
repeat_count: 1
27+
num_records_insert: 10000
28+
type: InsertNode
29+
deps: none
30+
first_upsert:
31+
config:
32+
record_size: 1000
33+
num_partitions_insert: 1
34+
num_records_insert: 300
35+
repeat_count: 1
36+
num_records_upsert: 2000
37+
num_partitions_upsert: 1
38+
type: UpsertNode
39+
deps: first_insert
40+
second_upsert:
41+
config:
42+
record_size: 1000
43+
num_partitions_insert: 1
44+
num_records_insert: 300
45+
repeat_count: 1
46+
num_records_upsert: 2000
47+
num_partitions_upsert: 1
48+
type: UpsertNode
49+
deps: first_upsert
50+
third_upsert:
51+
config:
52+
record_size: 1000
53+
num_partitions_insert: 1
54+
num_records_insert: 300
55+
repeat_count: 1
56+
num_records_upsert: 2000
57+
num_partitions_upsert: 1
58+
type: UpsertNode
59+
deps: second_upsert
60+
first_validate:
61+
config:
62+
delete_input_data: false
63+
type: ValidateDatasetNode
64+
deps: third_upsert
65+
first_schedule_compact:
66+
config:
67+
type: ScheduleCompactNode
68+
deps: first_validate
69+
fourth_upsert:
70+
config:
71+
record_size: 1000
72+
num_partitions_insert: 1
73+
num_records_insert: 300
74+
repeat_count: 1
75+
num_records_upsert: 2000
76+
num_partitions_upsert: 1
77+
type: UpsertNode
78+
deps: first_schedule_compact
79+
fifth_upsert:
80+
config:
81+
record_size: 1000
82+
num_partitions_insert: 1
83+
num_records_insert: 300
84+
repeat_count: 1
85+
num_records_upsert: 2000
86+
num_partitions_upsert: 1
87+
type: UpsertNode
88+
deps: fourth_upsert
89+
second_insert:
90+
config:
91+
record_size: 1000
92+
num_partitions_insert: 1
93+
repeat_count: 1
94+
num_records_insert: 10000
95+
type: InsertNode
96+
deps: fifth_upsert
97+
sixth_upsert:
98+
config:
99+
record_size: 1000
100+
num_partitions_insert: 1
101+
num_records_insert: 300
102+
repeat_count: 1
103+
num_records_upsert: 2000
104+
num_partitions_upsert: 1
105+
type: UpsertNode
106+
deps: second_insert
107+
third_validate:
108+
config:
109+
delete_input_data: false
110+
type: ValidateDatasetNode
111+
deps: sixth_upsert
112+
first_compact:
113+
config:
114+
type: CompactNode
115+
deps: third_validate
116+
first_delete:
117+
config:
118+
num_partitions_delete: 1
119+
num_records_delete: 500
120+
type: DeleteNode
121+
deps: first_compact
122+
fifth_validate:
123+
config:
124+
delete_input_data: false
125+
type: ValidateDatasetNode
126+
deps: first_delete

docker/demo/config/test-suite/test.properties

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ hoodie.deltastreamer.source.test.max_unique_records=100000000
2525
hoodie.embed.timeline.server=false
2626
hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector
2727

28-
hoodie.insert.shuffle.parallelism=100
29-
hoodie.upsert.shuffle.parallelism=100
30-
hoodie.bulkinsert.shuffle.parallelism=100
31-
3228
hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector
3329
hoodie.datasource.hive_sync.skip_ro_suffix=true
3430

hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ public String showAllFileSlices(
119119

120120
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
121121
public String showLatestFileSlices(
122-
@CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition,
122+
@CliOption(key = {"partitionPath"}, help = "A valid partition path", mandatory = true) String partition,
123123
@CliOption(key = {"baseFileOnly"}, help = "Only display base file view",
124124
unspecifiedDefaultValue = "false") boolean baseFileOnly,
125125
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
148148
.defaultValue(false)
149149
.withDocumentation("Only applies if index type is HBASE. "
150150
+ "When an already existing record is upserted to a new partition compared to whats in storage, "
151-
+ "this config when set, will delete old record in old paritition "
151+
+ "this config when set, will delete old record in old partition "
152152
+ "and will insert it as new record in new partition.");
153153

154154
public static final ConfigProperty<Boolean> ROLLBACK_SYNC_ENABLE = ConfigProperty

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieConcatHandle.java renamed to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,16 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.hudi.io.storage;
19+
package org.apache.hudi.io;
2020

2121
import org.apache.hudi.common.engine.TaskContextSupplier;
2222
import org.apache.hudi.common.model.HoodieBaseFile;
23+
import org.apache.hudi.common.model.HoodieRecord;
24+
import org.apache.hudi.common.model.HoodieRecordLocation;
2325
import org.apache.hudi.common.model.HoodieRecordPayload;
2426
import org.apache.hudi.common.util.Option;
2527
import org.apache.hudi.config.HoodieWriteConfig;
2628
import org.apache.hudi.exception.HoodieUpsertException;
27-
import org.apache.hudi.io.HoodieMergeHandle;
2829
import org.apache.hudi.keygen.BaseKeyGenerator;
2930
import org.apache.hudi.keygen.KeyGenUtils;
3031
import org.apache.hudi.table.HoodieTable;
@@ -34,6 +35,7 @@
3435
import org.apache.log4j.Logger;
3536

3637
import java.io.IOException;
38+
import java.util.Collections;
3739
import java.util.Iterator;
3840
import java.util.Map;
3941

@@ -44,38 +46,44 @@
4446
* Simplified Logic:
4547
* For every existing record
4648
* Write the record as is
47-
* For all incoming records, write to file as is.
49+
* For all incoming records, write to file as is, without de-duplicating based on the record key.
4850
*
4951
* Illustration with simple data.
5052
* Incoming data:
51-
* rec1_2, rec4_2, rec5_1, rec6_1
53+
* rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
5254
* Existing data:
5355
* rec1_1, rec2_1, rec3_1, rec4_1
5456
*
5557
* For every existing record, write to storage as is.
5658
* => rec1_1, rec2_1, rec3_1 and rec4_1 is written to storage
5759
* Write all records from incoming set to storage
58-
* => rec1_2, rec4_2, rec5_1 and rec6_1
60+
* => rec1_2, rec1_3, rec4_2, rec5_1 and rec6_1
5961
*
6062
* Final snapshot in storage
61-
* rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec4_2, rec5_1, rec6_1
63+
* rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
6264
*
6365
* Users should ensure there are no duplicates when "insert" operation is used and if the respective config is enabled. So, above scenario should not
6466
* happen and every batch should have new records to be inserted. Above example is for illustration purposes only.
6567
*/
6668
public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieMergeHandle<T, I, K, O> {
6769

6870
private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);
71+
// a representation of incoming records that tolerates duplicate keys
72+
private final Iterator<HoodieRecord<T>> recordItr;
6973

70-
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr,
71-
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
72-
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
74+
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
75+
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
76+
TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
77+
super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
78+
this.recordItr = recordItr;
7379
}
7480

75-
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId,
76-
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
77-
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
81+
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable,
82+
Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
83+
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
84+
super(config, instantTime, hoodieTable, Collections.emptyMap(), partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
7885
Option.empty());
86+
this.recordItr = keyToNewRecords.values().iterator();
7987
}
8088

8189
/**
@@ -94,4 +102,17 @@ public void write(GenericRecord oldRecord) {
94102
}
95103
recordsWritten++;
96104
}
105+
106+
@Override
107+
protected void writeIncomingRecords() throws IOException {
108+
while (recordItr.hasNext()) {
109+
HoodieRecord<T> record = recordItr.next();
110+
if (needsUpdateLocation()) {
111+
record.unseal();
112+
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
113+
record.seal();
114+
}
115+
writeInsertRecord(record);
116+
}
117+
}
97118
}

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,18 @@ private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRe
257257
return writeRecord(hoodieRecord, indexedRecord);
258258
}
259259

260+
protected void writeInsertRecord(HoodieRecord<T> hoodieRecord) throws IOException {
261+
Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema;
262+
Option<IndexedRecord> insertRecord = hoodieRecord.getData().getInsertValue(schema, config.getProps());
263+
// just skip the ignored record
264+
if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) {
265+
return;
266+
}
267+
if (writeRecord(hoodieRecord, insertRecord)) {
268+
insertRecordsWritten++;
269+
}
270+
}
271+
260272
protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
261273
Option recordMetadata = hoodieRecord.getData().getMetadata();
262274
if (!partitionPath.equals(hoodieRecord.getPartitionPath())) {
@@ -340,28 +352,28 @@ public void write(GenericRecord oldRecord) {
340352
}
341353
}
342354

355+
protected void writeIncomingRecords() throws IOException {
356+
// write out any pending records (this can happen when inserts are turned into updates)
357+
Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
358+
? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
359+
while (newRecordsItr.hasNext()) {
360+
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
361+
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
362+
writeInsertRecord(hoodieRecord);
363+
}
364+
}
365+
}
366+
343367
@Override
344368
public List<WriteStatus> close() {
345369
try {
346-
// write out any pending records (this can happen when inserts are turned into updates)
347-
Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
348-
? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
349-
while (newRecordsItr.hasNext()) {
350-
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
351-
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
352-
Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema;
353-
Option<IndexedRecord> insertRecord =
354-
hoodieRecord.getData().getInsertValue(schema, config.getProps());
355-
// just skip the ignore record
356-
if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) {
357-
continue;
358-
}
359-
writeRecord(hoodieRecord, insertRecord);
360-
insertRecordsWritten++;
361-
}
362-
}
370+
writeIncomingRecords();
363371

364-
((ExternalSpillableMap) keyToNewRecords).close();
372+
if (keyToNewRecords instanceof ExternalSpillableMap) {
373+
((ExternalSpillableMap) keyToNewRecords).close();
374+
} else {
375+
keyToNewRecords.clear();
376+
}
365377
writtenRecordKeys.clear();
366378

367379
if (fileWriter != null) {

0 commit comments

Comments
 (0)