Skip to content

Commit 0fb8556

Browse files
s-sanjayJagmeet Bali
andauthored
Add ability to provide multi-region (global) data consistency across HMS in different regions (#2542)
[global-hive-sync-tool] Add a global hive sync tool to sync hudi table across clusters. Add a way to rollback the replicated time stamp if we fail to sync or if we partly sync Co-authored-by: Jagmeet Bali <jsbali@uber.com>
1 parent e64fe55 commit 0fb8556

27 files changed

Lines changed: 1731 additions & 71 deletions

hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@
2121
import org.apache.hudi.common.model.BootstrapFileMapping;
2222
import org.apache.hudi.common.model.HoodieFileGroupId;
2323
import org.apache.hudi.common.table.HoodieTableMetaClient;
24+
import org.apache.hudi.common.table.timeline.HoodieTimeline;
25+
import org.apache.hudi.common.util.ReflectionUtils;
2426

2527
import java.io.Serializable;
2628
import java.util.List;
2729
import java.util.Map;
28-
import org.apache.hudi.common.table.timeline.HoodieTimeline;
29-
import org.apache.hudi.common.util.ReflectionUtils;
3030

3131
/**
3232
* Bootstrap Index Interface.
@@ -161,6 +161,6 @@ public abstract void appendNextPartition(String partitionPath,
161161

162162
public static BootstrapIndex getBootstrapIndex(HoodieTableMetaClient metaClient) {
163163
return ((BootstrapIndex)(ReflectionUtils.loadClass(
164-
metaClient.getTableConfig().getBootstrapIndexClass(), metaClient)));
164+
metaClient.getTableConfig().getBootstrapIndexClass(), new Class[]{HoodieTableMetaClient.class}, metaClient)));
165165
}
166166
}

hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ public class HoodieTableMetaClient implements Serializable {
9898
private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build();
9999

100100
private HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad,
101-
ConsistencyGuardConfig consistencyGuardConfig, Option<TimelineLayoutVersion> layoutVersion,
102-
String payloadClassName) {
101+
ConsistencyGuardConfig consistencyGuardConfig, Option<TimelineLayoutVersion> layoutVersion,
102+
String payloadClassName) {
103103
LOG.info("Loading HoodieTableMetaClient from " + basePath);
104104
this.consistencyGuardConfig = consistencyGuardConfig;
105105
this.hadoopConf = new SerializableConfiguration(conf);

hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ public static Object loadClass(String clazz, Class<?>[] constructorArgTypes, Obj
8787
try {
8888
return getClass(clazz).getConstructor(constructorArgTypes).newInstance(constructorArgs);
8989
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
90-
throw new HoodieException("Unable to instantiate class ", e);
90+
throw new HoodieException("Unable to instantiate class " + clazz, e);
9191
}
9292
}
9393

hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@
3939
* InputPathHandler takes in a set of input paths and incremental tables list. Then, classifies the
4040
* input paths to incremental, snapshot paths and non-hoodie paths. This is then accessed later to
4141
* mutate the JobConf before processing incremental mode queries and snapshot queries.
42+
*
43+
* Note: We are adding jobConf of a mapreduce or spark job. The properties in the jobConf are of two
44+
* types: session properties and table properties from metastore. While session properties are common
45+
* for all the tables in a query, the table properties are unique per table, so there is no need to
46+
* check if it belongs to the table for which the path handler is now instantiated. The jobConf has
47+
* all table properties such as name, last modification time and so on which are unique to a table.
48+
* This class is written in such a way that it can handle multiple tables and properties unique to
49+
* a table, but for table-level properties such a check is not required.
4250
*/
4351
public class InputPathHandler {
4452

@@ -63,7 +71,6 @@ public InputPathHandler(Configuration conf, Path[] inputPaths, List<String> incr
6371
/**
6472
* Takes in the original InputPaths and classifies each of them into incremental, snapshot and
6573
* non-hoodie InputPaths. The logic is as follows:
66-
*
6774
* 1. Check if an inputPath starts with the same basepath as any of the metadata basepaths we know
6875
* 1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this
6976
* as incremental or snapshot - We can get the table name of this inputPath from the

hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@
1818

1919
package org.apache.hudi.hadoop.utils;
2020

21-
import org.apache.hadoop.fs.Path;
22-
import org.apache.hadoop.mapred.JobConf;
23-
import org.apache.hadoop.mapreduce.JobContext;
2421
import org.apache.hudi.common.table.HoodieTableMetaClient;
2522
import org.apache.hudi.common.table.timeline.HoodieTimeline;
2623
import org.apache.hudi.common.util.CollectionUtils;
2724
import org.apache.hudi.exception.HoodieIOException;
25+
26+
import org.apache.hadoop.fs.Path;
27+
import org.apache.hadoop.mapred.JobConf;
28+
import org.apache.hadoop.mapreduce.JobContext;
2829
import org.apache.log4j.LogManager;
2930
import org.apache.log4j.Logger;
3031

@@ -73,6 +74,7 @@ public class HoodieHiveUtils {
7374
public static final int MAX_COMMIT_ALL = -1;
7475
public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
7576
public static final Pattern HOODIE_CONSUME_MODE_PATTERN_STRING = Pattern.compile("hoodie\\.(.*)\\.consume\\.mode");
77+
public static final String GLOBALLY_CONSISTENT_READ_TIMESTAMP = "last_replication_timestamp";
7678

7779
public static boolean stopAtCompaction(JobContext job, String tableName) {
7880
String compactionPropName = String.format(HOODIE_STOP_AT_COMPACTION_PATTERN, tableName);

hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ public static List<FileStatus> filterFileStatusForSnapshotMode(JobConf job, Map<
442442
}
443443

444444
HoodieTimeline timeline = HoodieHiveUtils.getTableTimeline(metaClient.getTableConfig().getTableName(), job, metaClient);
445+
445446
HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
446447
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
447448
List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hudi.hadoop;
20+
21+
import org.apache.hudi.common.fs.FSUtils;
22+
import org.apache.hudi.exception.HoodieIOException;
23+
import org.apache.hudi.hadoop.testutils.InputFormatTestUtil;
24+
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
25+
26+
import org.apache.hadoop.fs.FileStatus;
27+
import org.junit.jupiter.api.Assertions;
28+
import org.junit.jupiter.api.BeforeEach;
29+
import org.junit.jupiter.api.Test;
30+
31+
import java.io.IOException;
32+
import java.util.Arrays;
33+
import java.util.List;
34+
35+
import static org.junit.jupiter.api.Assertions.assertEquals;
36+
import static org.junit.jupiter.api.Assertions.assertTrue;
37+
38+
public class TestGloballyConsistentTimeStampFilteringInputFormat
39+
extends TestHoodieParquetInputFormat {
40+
41+
@BeforeEach
42+
public void setUp() {
43+
super.setUp();
44+
}
45+
46+
@Test
47+
public void testInputFormatLoad() throws IOException {
48+
super.testInputFormatLoad();
49+
50+
// set filtering timestamp to 0; now the timeline won't have any commits.
51+
InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "0");
52+
53+
Assertions.assertThrows(HoodieIOException.class, () -> inputFormat.getSplits(jobConf, 10));
54+
Assertions.assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf));
55+
}
56+
57+
@Test
58+
public void testInputFormatUpdates() throws IOException {
59+
super.testInputFormatUpdates();
60+
61+
// set the globally replicated timestamp to 100 so only commit 100 is read and the update (commit 200) is ignored.
62+
InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "100");
63+
64+
FileStatus[] files = inputFormat.listStatus(jobConf);
65+
assertEquals(10, files.length);
66+
67+
ensureFilesInCommit("5 files have been updated to commit 200. but should get filtered out ",
68+
files,"200", 0);
69+
ensureFilesInCommit("We should see 10 files from commit 100 ", files, "100", 10);
70+
}
71+
72+
@Override
73+
public void testIncrementalSimple() throws IOException {
74+
// setting filtering timestamp to zero should not in any way alter the result of the test which
75+
// pulls in zero files due to incremental ts being the actual commit time
76+
jobConf.set(HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP, "0");
77+
super.testIncrementalSimple();
78+
}
79+
80+
@Override
81+
public void testIncrementalWithMultipleCommits() throws IOException {
82+
super.testIncrementalWithMultipleCommits();
83+
84+
// set globally replicated timestamp to 400 so commits 500 and 600 do not show up
85+
InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "400");
86+
InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtils.MAX_COMMIT_ALL);
87+
88+
FileStatus[] files = inputFormat.listStatus(jobConf);
89+
90+
assertEquals(
91+
5, files.length,"Pulling ALL commits from 100, should get us the 3 files from 400 commit, 1 file from 300 "
92+
+ "commit and 1 file from 200 commit");
93+
ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit",
94+
files, "400", 3);
95+
ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit",
96+
files, "300", 1);
97+
ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit",
98+
files, "200", 1);
99+
100+
List<String> commits = Arrays.asList("100", "200", "300", "400", "500", "600");
101+
for (int idx = 0; idx < commits.size(); ++idx) {
102+
for (int jdx = 0; jdx < commits.size(); ++jdx) {
103+
InputFormatTestUtil.setupIncremental(jobConf, commits.get(idx), HoodieHiveUtils.MAX_COMMIT_ALL);
104+
InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, commits.get(jdx));
105+
106+
files = inputFormat.listStatus(jobConf);
107+
108+
if (jdx <= idx) {
109+
assertEquals(0, files.length,"all commits should be filtered");
110+
} else {
111+
// only commits up to the timestamp are allowed
112+
for (FileStatus file : files) {
113+
String commitTs = FSUtils.getCommitTime(file.getPath().getName());
114+
assertTrue(commits.indexOf(commitTs) <= jdx);
115+
assertTrue(commits.indexOf(commitTs) > idx);
116+
}
117+
}
118+
}
119+
}
120+
}
121+
}

hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@
6565

6666
public class TestHoodieParquetInputFormat {
6767

68-
private HoodieParquetInputFormat inputFormat;
69-
private JobConf jobConf;
68+
protected HoodieParquetInputFormat inputFormat;
69+
protected JobConf jobConf;
7070
private final HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET;
7171
private final String baseFileExtension = baseFileFormat.getFileExtension();
7272

hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@
2323
import org.apache.hudi.common.table.HoodieTableConfig;
2424
import org.apache.hudi.common.table.HoodieTableMetaClient;
2525
import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
26+
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
2627

2728
import org.apache.hadoop.conf.Configuration;
2829
import org.apache.hadoop.fs.FileSystem;
2930
import org.apache.hadoop.fs.Path;
31+
import org.apache.hadoop.mapred.JobConf;
3032
import org.apache.hadoop.hdfs.DistributedFileSystem;
3133
import org.apache.hadoop.hdfs.MiniDFSCluster;
3234
import org.junit.jupiter.api.AfterAll;
@@ -169,6 +171,21 @@ public void testInputPathHandler() throws IOException {
169171
assertTrue(actualComparesToExpected(actualPaths, nonHoodiePaths));
170172
}
171173

174+
@Test
175+
public void testInputPathHandlerWithGloballyReplicatedTimeStamp() throws IOException {
176+
JobConf jobConf = new JobConf();
177+
jobConf.set(HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP, "1");
178+
inputPathHandler = new InputPathHandler(dfs.getConf(), inputPaths.toArray(
179+
new Path[inputPaths.size()]), incrementalTables);
180+
List<Path> actualPaths = inputPathHandler.getGroupedIncrementalPaths().values().stream()
181+
.flatMap(List::stream).collect(Collectors.toList());
182+
assertTrue(actualComparesToExpected(actualPaths, incrementalPaths));
183+
actualPaths = inputPathHandler.getSnapshotPaths();
184+
assertTrue(actualComparesToExpected(actualPaths, snapshotPaths));
185+
actualPaths = inputPathHandler.getNonHoodieInputPaths();
186+
assertTrue(actualComparesToExpected(actualPaths, nonHoodiePaths));
187+
}
188+
172189
private boolean actualComparesToExpected(List<Path> actualPaths, List<Path> expectedPaths) {
173190
if (actualPaths.size() != expectedPaths.size()) {
174191
return false;

hudi-sync/hudi-hive-sync/pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030

3131
<properties>
3232
<main.basedir>${project.parent.basedir}</main.basedir>
33+
34+
<jetty.version>7.6.0.v20120127</jetty.version>
3335
</properties>
3436

3537
<dependencies>
@@ -148,6 +150,14 @@
148150
<scope>test</scope>
149151
</dependency>
150152

153+
<!-- Needed for running HiveServer for Tests -->
154+
<dependency>
155+
<groupId>org.eclipse.jetty.aggregate</groupId>
156+
<artifactId>jetty-all</artifactId>
157+
<scope>test</scope>
158+
<version>${jetty.version}</version>
159+
</dependency>
160+
151161
<dependency>
152162
<groupId>org.junit.jupiter</groupId>
153163
<artifactId>junit-jupiter-api</artifactId>

0 commit comments

Comments
 (0)