Skip to content

Commit feb7bcf

Browse files
wongxingjun authored and brightwon committed
[HUDI-4433] rebase last upstream master
[HUDI-4433] fix after rebase
1 parent 097e5a8 commit feb7bcf

3 files changed

Lines changed: 54 additions & 1 deletion

File tree

hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public class RepairsCommand {
6969
@ShellMethod(key = "repair deduplicate",
7070
value = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
7171
public String deduplicate(
72-
@ShellOption(value = {"--duplicatedPartitionPath"}, help = "Partition Path containing the duplicates")
72+
@ShellOption(value = {"--duplicatedPartitionPath"}, defaultValue = "", help = "Partition Path containing the duplicates")
7373
final String duplicatedPartitionPath,
7474
@ShellOption(value = {"--repairedOutputPath"}, help = "Location to place the repaired files")
7575
final String repairedOutputPath,

hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ public class ITTestRepairsCommand extends HoodieCLIIntegrationTestBase {
6969
private String duplicatedPartitionPath;
7070
private String duplicatedPartitionPathWithUpdates;
7171
private String duplicatedPartitionPathWithUpserts;
72+
private String duplicatedNoPartitionPath;
7273
private String repairedOutputPath;
7374

7475
private HoodieFileFormat fileFormat;
@@ -78,6 +79,7 @@ public void init() throws Exception {
7879
duplicatedPartitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH;
7980
duplicatedPartitionPathWithUpdates = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH;
8081
duplicatedPartitionPathWithUpserts = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH;
82+
duplicatedNoPartitionPath = HoodieTestDataGenerator.NO_PARTITION_PATH;
8183
repairedOutputPath = Paths.get(basePath, "tmp").toString();
8284

8385
HoodieCLI.conf = jsc.hadoopConfiguration();
@@ -135,6 +137,23 @@ public void init() throws Exception {
135137
.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords)
136138
.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "8", dupRecords);
137139

140+
// init cow table for non-partitioned table tests
141+
String cowNonPartitionedTablePath = Paths.get(basePath, "cow_table_non_partitioned").toString();
142+
143+
// Create cow table and connect
144+
new TableCommand().createTable(
145+
cowNonPartitionedTablePath, "cow_table_non_partitioned", HoodieTableType.COPY_ON_WRITE.name(),
146+
"", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload");
147+
148+
HoodieSparkWriteableTestTable cowNonPartitionedTable = HoodieSparkWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema);
149+
150+
cowNonPartitionedTable.addCommit("20160401010101")
151+
.withInserts(HoodieTestDataGenerator.NO_PARTITION_PATH, "1", hoodieRecords1)
152+
.getFileIdWithLogFile(HoodieTestDataGenerator.NO_PARTITION_PATH);
153+
154+
cowNonPartitionedTable.addCommit("20160401010202")
155+
.withInserts(HoodieTestDataGenerator.NO_PARTITION_PATH, "2", dupRecords);
156+
138157
fileFormat = metaClient.getTableConfig().getBaseFileFormat();
139158
}
140159

@@ -232,6 +251,39 @@ public void testDeduplicateWithUpserts(HoodieTableType tableType) throws IOExcep
232251
assertEquals(100, result.count());
233252
}
234253

254+
/**
255+
* Test case for dry run deduplicate on a non-partitioned dataset.
256+
*/
257+
@ParameterizedTest
258+
@EnumSource(value = HoodieTableType.class)
259+
public void testDeduplicateNoPartitionWithInserts(HoodieTableType tableType) throws IOException {
260+
String tablePath = Paths.get(basePath, "cow_table_non_partitioned").toString();
261+
connectTableAndReloadMetaClient(tablePath);
262+
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
263+
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
264+
fs.listStatus(new Path(Paths.get(tablePath, duplicatedNoPartitionPath).toString())));
265+
List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
266+
assertEquals(2, filteredStatuses.size(), "There should be 2 files.");
267+
268+
// Before deduplicate, the files contain 110 records in total
269+
String[] files = filteredStatuses.toArray(new String[0]);
270+
Dataset df = readFiles(files);
271+
assertEquals(110, df.count());
272+
273+
// use default value without specifying duplicatedPartitionPath
274+
String cmdStr = String.format("repair deduplicate --repairedOutputPath %s --sparkMaster %s",
275+
repairedOutputPath, "local");
276+
Object resultForCmd = shell.evaluate(() -> cmdStr);
277+
assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd));
278+
assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, resultForCmd.toString());
279+
280+
// After deduplicate, there are 100 records
281+
FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
282+
files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
283+
Dataset result = readFiles(files);
284+
assertEquals(100, result.count());
285+
}
286+
235287
/**
236288
* Test case for real run deduplicate.
237289
*/

hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
8989
// with default bloom filter with 60,000 entries and 0.000000001 FPRate
9090
public static final int BLOOM_FILTER_BYTES = 323495;
9191
private static Logger logger = LogManager.getLogger(HoodieTestDataGenerator.class);
92+
public static final String NO_PARTITION_PATH = "";
9293
public static final String DEFAULT_FIRST_PARTITION_PATH = "2016/03/15";
9394
public static final String DEFAULT_SECOND_PARTITION_PATH = "2015/03/16";
9495
public static final String DEFAULT_THIRD_PARTITION_PATH = "2015/03/17";

0 commit comments

Comments
 (0)