Skip to content

Commit cdf2657

Browse files
yihua authored and codope committed
[HUDI-4507] Improve file name extraction logic in metadata utils (#6250)
1 parent c647af5 commit cdf2657

3 files changed

Lines changed: 32 additions & 19 deletions

File tree

hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -615,6 +615,24 @@ public static Path getPartitionPath(Path basePath, String partitionPath) {
615615
return StringUtils.isNullOrEmpty(partitionPath) ? basePath : new Path(basePath, partitionPath);
616616
}
617617

618+
/**
619+
* Extracts the file name from the relative path based on the table base path. For example:
620+
* "/2022/07/29/file1.parquet", "/2022/07/29" -> "file1.parquet"
621+
* "2022/07/29/file2.parquet", "2022/07/29" -> "file2.parquet"
622+
* "/file3.parquet", "" -> "file3.parquet"
623+
* "file4.parquet", "" -> "file4.parquet"
624+
*
625+
* @param filePathWithPartition the relative file path based on the table base path.
626+
* @param partition the relative partition path. For partitioned table, `partition` contains the relative partition path;
627+
* for non-partitioned table, `partition` is empty
628+
* @return Extracted file name in String.
629+
*/
630+
public static String getFileName(String filePathWithPartition, String partition) {
631+
int offset = StringUtils.isNullOrEmpty(partition)
632+
? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
633+
return filePathWithPartition.substring(offset);
634+
}
635+
618636
/**
619637
* Get DFS full partition path (e.g. hdfs://ip-address:8020:/<absolute path>)
620638
*/

hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java

Lines changed: 4 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -325,16 +325,13 @@ public static List<HoodieRecord> convertMetadataToFilesPartitionRecords(HoodieCo
325325
return map;
326326
}
327327

328-
int offset = partition.equals(NON_PARTITIONED_NAME)
329-
? (pathWithPartition.startsWith("/") ? 1 : 0)
330-
: partition.length() + 1;
331-
String filename = pathWithPartition.substring(offset);
328+
String fileName = FSUtils.getFileName(pathWithPartition, partitionStatName);
332329

333330
// Since write-stats are coming in no particular order, if the same
334331
// file have previously been appended to w/in the txn, we simply pick max
335332
// of the sizes as reported after every write, since file-sizes are
336333
// monotonically increasing (ie file-size never goes down, unless deleted)
337-
map.merge(filename, stat.getFileSizeInBytes(), Math::max);
334+
map.merge(fileName, stat.getFileSizeInBytes(), Math::max);
338335

339336
return map;
340337
},
@@ -410,12 +407,7 @@ public static HoodieData<HoodieRecord> convertMetadataToBloomFilterRecords(
410407
return Collections.emptyListIterator();
411408
}
412409

413-
// For partitioned table, "partition" contains the relative partition path;
414-
// for non-partitioned table, "partition" is empty
415-
int offset = StringUtils.isNullOrEmpty(partition)
416-
? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
417-
418-
final String fileName = pathWithPartition.substring(offset);
410+
String fileName = FSUtils.getFileName(pathWithPartition, partition);
419411
if (!FSUtils.isBaseFile(new Path(fileName))) {
420412
return Collections.emptyListIterator();
421413
}
@@ -1162,13 +1154,8 @@ private static Stream<HoodieRecord> getColumnStatsRecords(String partitionPath,
11621154
HoodieTableMetaClient datasetMetaClient,
11631155
List<String> columnsToIndex,
11641156
boolean isDeleted) {
1165-
String partitionName = getPartitionIdentifier(partitionPath);
1166-
// NOTE: We have to chop leading "/" to make sure Hadoop does not treat it like
1167-
// absolute path
11681157
String filePartitionPath = filePath.startsWith("/") ? filePath.substring(1) : filePath;
1169-
String fileName = partitionName.equals(NON_PARTITIONED_NAME)
1170-
? filePartitionPath
1171-
: filePartitionPath.substring(partitionName.length() + 1);
1158+
String fileName = FSUtils.getFileName(filePath, partitionPath);
11721159

11731160
if (isDeleted) {
11741161
// TODO we should delete records instead of stubbing them

hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -326,9 +326,17 @@ public void testFileNameRelatedFunctions() throws Exception {
326326
Files.createFile(partitionPath.resolve(log3));
327327

328328
assertEquals(3, (int) FSUtils.getLatestLogVersion(FSUtils.getFs(basePath, new Configuration()),
329-
new Path(partitionPath.toString()), fileId, LOG_EXTENTION, instantTime).get().getLeft());
329+
new Path(partitionPath.toString()), fileId, LOG_EXTENTION, instantTime).get().getLeft());
330330
assertEquals(4, FSUtils.computeNextLogVersion(FSUtils.getFs(basePath, new Configuration()),
331-
new Path(partitionPath.toString()), fileId, LOG_EXTENTION, instantTime));
331+
new Path(partitionPath.toString()), fileId, LOG_EXTENTION, instantTime));
332+
}
333+
334+
@Test
335+
public void testGetFilename() {
336+
assertEquals("file1.parquet", FSUtils.getFileName("/2022/07/29/file1.parquet", "/2022/07/29"));
337+
assertEquals("file2.parquet", FSUtils.getFileName("2022/07/29/file2.parquet", "2022/07/29"));
338+
assertEquals("file3.parquet", FSUtils.getFileName("/file3.parquet", ""));
339+
assertEquals("file4.parquet", FSUtils.getFileName("file4.parquet", ""));
332340
}
333341

334342
private void prepareTestDirectory(FileSystem fileSystem, String rootDir) throws IOException {

0 commit comments

Comments
 (0)