Skip to content

Commit 85b146d

Browse files
authored
[HUDI-3985] Refactor DLASyncTool to support read hoodie table as spark datasource table (#5532)
1 parent c7576f7 commit 85b146d

26 files changed

Lines changed: 1281 additions & 974 deletions

File tree

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ import org.apache.hudi.common.table.HoodieTableConfig
2626
import org.apache.hudi.common.util.Option
2727
import org.apache.hudi.common.util.ValidationUtils.checkState
2828
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
29-
import org.apache.hudi.hive.util.ConfigUtils
3029
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
3130
import org.apache.hudi.keygen.constant.KeyGeneratorOptions
3231
import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
3332
import org.apache.hudi.sync.common.HoodieSyncConfig
33+
import org.apache.hudi.sync.common.util.ConfigUtils
3434
import org.apache.log4j.LogManager
3535
import org.apache.spark.sql.execution.datasources.{DataSourceUtils => SparkDataSourceUtils}
3636

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/DropHoodieTableCommand.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path
2121
import org.apache.hudi.client.common.HoodieSparkEngineContext
2222
import org.apache.hudi.common.fs.FSUtils
2323
import org.apache.hudi.common.model.HoodieTableType
24-
import org.apache.hudi.hive.util.ConfigUtils
24+
import org.apache.hudi.sync.common.util.ConfigUtils
2525
import org.apache.spark.sql._
2626
import org.apache.spark.sql.catalyst.TableIdentifier
2727
import org.apache.spark.sql.catalyst.catalog._

hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ import org.apache.hadoop.conf.Configuration
2121
import org.apache.hadoop.fs.Path
2222
import org.apache.hudi.DataSourceWriteOptions
2323
import org.apache.hudi.hive.HiveSyncConfig
24-
import org.apache.hudi.hive.util.ConfigUtils
2524
import org.apache.hudi.sql.InsertMode
2625
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, HoodieCatalogTable}
2726
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable.needFilterProps
27+
import org.apache.hudi.sync.common.util.ConfigUtils
2828
import org.apache.spark.sql.catalyst.plans.QueryPlan
2929
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
3030
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils

hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ package org.apache.spark.sql.hudi.catalog
2020

2121
import org.apache.hadoop.fs.Path
2222
import org.apache.hudi.exception.HoodieException
23-
import org.apache.hudi.hive.util.ConfigUtils
2423
import org.apache.hudi.sql.InsertMode
24+
import org.apache.hudi.sync.common.util.ConfigUtils
2525
import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport}
2626
import org.apache.spark.sql.HoodieSpark3SqlUtils.convertTransforms
2727
import org.apache.spark.sql.catalyst.TableIdentifier
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
<modelVersion>4.0.0</modelVersion>
2727

28-
<artifactId>hudi-dla-sync</artifactId>
28+
<artifactId>hudi-adb-sync</artifactId>
2929
<packaging>jar</packaging>
3030

3131
<properties>
File renamed without changes.
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hudi.sync.adb;
20+
21+
import org.apache.hudi.common.fs.FSUtils;
22+
import org.apache.hudi.common.table.timeline.HoodieTimeline;
23+
import org.apache.hudi.common.util.StringUtils;
24+
import org.apache.hudi.exception.HoodieException;
25+
import org.apache.hudi.hive.PartitionValueExtractor;
26+
import org.apache.hudi.hive.SchemaDifference;
27+
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
28+
29+
import org.apache.hadoop.fs.FileSystem;
30+
import org.apache.hadoop.fs.Path;
31+
32+
import java.util.ArrayList;
33+
import java.util.HashMap;
34+
import java.util.List;
35+
import java.util.Map;
36+
37+
public abstract class AbstractAdbSyncHoodieClient extends AbstractSyncHoodieClient {
38+
protected AdbSyncConfig adbSyncConfig;
39+
protected PartitionValueExtractor partitionValueExtractor;
40+
protected HoodieTimeline activeTimeline;
41+
42+
public AbstractAdbSyncHoodieClient(AdbSyncConfig syncConfig, FileSystem fs) {
43+
super(syncConfig.basePath, syncConfig.assumeDatePartitioning,
44+
syncConfig.useFileListingFromMetadata, false, fs);
45+
this.adbSyncConfig = syncConfig;
46+
final String clazz = adbSyncConfig.partitionValueExtractorClass;
47+
try {
48+
this.partitionValueExtractor = (PartitionValueExtractor) Class.forName(clazz).newInstance();
49+
} catch (Exception e) {
50+
throw new HoodieException("Fail to init PartitionValueExtractor class " + clazz, e);
51+
}
52+
53+
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
54+
}
55+
56+
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions,
57+
List<String> partitionStoragePartitions) {
58+
Map<String, String> paths = new HashMap<>();
59+
60+
for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
61+
List<String> partitionValues = entry.getKey();
62+
String fullTablePartitionPath = entry.getValue();
63+
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
64+
}
65+
List<PartitionEvent> events = new ArrayList<>();
66+
for (String storagePartition : partitionStoragePartitions) {
67+
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, storagePartition);
68+
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
69+
// Check if the partition values or if hdfs path is the same
70+
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
71+
if (adbSyncConfig.useHiveStylePartitioning) {
72+
String partition = String.join("/", storagePartitionValues);
73+
storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
74+
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
75+
}
76+
if (!storagePartitionValues.isEmpty()) {
77+
String storageValue = String.join(", ", storagePartitionValues);
78+
if (!paths.containsKey(storageValue)) {
79+
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
80+
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
81+
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
82+
}
83+
}
84+
}
85+
return events;
86+
}
87+
88+
public void close() {
89+
90+
}
91+
92+
public abstract Map<List<String>, String> scanTablePartitions(String tableName) throws Exception;
93+
94+
public abstract void updateTableDefinition(String tableName, SchemaDifference schemaDiff) throws Exception;
95+
96+
public abstract boolean databaseExists(String databaseName) throws Exception;
97+
98+
public abstract void createDatabase(String databaseName) throws Exception;
99+
100+
public abstract void dropTable(String tableName);
101+
102+
protected String getDatabasePath() {
103+
String dbLocation = adbSyncConfig.dbLocation;
104+
Path dbLocationPath;
105+
if (StringUtils.isNullOrEmpty(dbLocation)) {
106+
if (new Path(adbSyncConfig.basePath).isRoot()) {
107+
dbLocationPath = new Path(adbSyncConfig.basePath);
108+
} else {
109+
dbLocationPath = new Path(adbSyncConfig.basePath).getParent();
110+
}
111+
} else {
112+
dbLocationPath = new Path(dbLocation);
113+
}
114+
return generateAbsolutePathStr(dbLocationPath);
115+
}
116+
117+
protected String generateAbsolutePathStr(Path path) {
118+
String absolutePathStr = path.toString();
119+
if (path.toUri().getScheme() == null) {
120+
absolutePathStr = getDefaultFs() + absolutePathStr;
121+
}
122+
return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
123+
}
124+
125+
protected String getDefaultFs() {
126+
return fs.getConf().get("fs.defaultFS");
127+
}
128+
}

0 commit comments

Comments
 (0)