Skip to content

Commit 6456bd3

Browse files
authored
[HUDI-4273] Support inline schedule clustering for Flink stream (#5890)
* [HUDI-4273] Support inline schedule clustering for Flink stream
* delete deprecated clustering plan strategy and add clustering ITTest
1 parent af9f090 commit 6456bd3

29 files changed

Lines changed: 1104 additions & 373 deletions

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,6 @@ public static String checkAndGetClusteringPlanStrategy(HoodieWriteConfig config)
7070
String sparkSizeBasedClassName = HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
7171
String sparkSelectedPartitionsClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkSelectedPartitionsClusteringPlanStrategy";
7272
String sparkRecentDaysClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy";
73-
String flinkSizeBasedClassName = HoodieClusteringConfig.FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
74-
String flinkSelectedPartitionsClassName = "org.apache.hudi.client.clustering.plan.strategy.FlinkSelectedPartitionsClusteringPlanStrategy";
75-
String flinkRecentDaysClassName = "org.apache.hudi.client.clustering.plan.strategy.FlinkRecentDaysClusteringPlanStrategy";
7673
String javaSelectedPartitionClassName = "org.apache.hudi.client.clustering.plan.strategy.JavaRecentDaysClusteringPlanStrategy";
7774
String javaSizeBasedClassName = HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
7875

@@ -85,14 +82,6 @@ public static String checkAndGetClusteringPlanStrategy(HoodieWriteConfig config)
8582
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());
8683
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
8784
return sparkSizeBasedClassName;
88-
} else if (flinkRecentDaysClassName.equals(className)) {
89-
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name());
90-
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.RECENT_DAYS.name()));
91-
return flinkSizeBasedClassName;
92-
} else if (flinkSelectedPartitionsClassName.equals(className)) {
93-
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());
94-
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
95-
return flinkSizeBasedClassName;
9685
} else if (javaSelectedPartitionClassName.equals(className)) {
9786
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name());
9887
LOG.warn(String.format(logStr, className, javaSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
@@ -173,7 +162,7 @@ protected Map<String, Double> buildMetrics(List<FileSlice> fileSlices) {
173162
return metrics;
174163
}
175164

176-
protected HoodieTable<T,I,K, O> getHoodieTable() {
165+
protected HoodieTable<T, I, K, O> getHoodieTable() {
177166
return this.hoodieTable;
178167
}
179168

hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ public Option<HoodieCompactionPlan> execute() {
7171
if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()
7272
&& !config.getFailedWritesCleanPolicy().isLazy()) {
7373
// TODO(yihua): this validation is removed for Java client used by kafka-connect. Need to revisit this.
74-
if (config.getEngineType() != EngineType.JAVA) {
74+
if (config.getEngineType() == EngineType.SPARK) {
7575
// if there are inflight writes, their instantTime must not be less than that of compaction instant time
7676
table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
7777
.ifPresent(earliestInflight -> ValidationUtils.checkArgument(

hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkRecentDaysClusteringPlanStrategy.java

Lines changed: 0 additions & 65 deletions
This file was deleted.

hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSelectedPartitionsClusteringPlanStrategy.java

Lines changed: 0 additions & 67 deletions
This file was deleted.

hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
package org.apache.hudi.configuration;
2020

21-
import org.apache.hudi.client.clustering.plan.strategy.FlinkRecentDaysClusteringPlanStrategy;
21+
import org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy;
2222
import org.apache.hudi.common.config.ConfigClassProperty;
2323
import org.apache.hudi.common.config.ConfigGroups;
2424
import org.apache.hudi.common.config.HoodieConfig;
@@ -45,6 +45,11 @@
4545
import java.util.Map;
4646
import java.util.Set;
4747

48+
import static org.apache.hudi.config.HoodieClusteringConfig.DAYBASED_LOOKBACK_PARTITIONS;
49+
import static org.apache.hudi.config.HoodieClusteringConfig.PARTITION_FILTER_BEGIN_PARTITION;
50+
import static org.apache.hudi.config.HoodieClusteringConfig.PARTITION_FILTER_END_PARTITION;
51+
import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST;
52+
4853
/**
4954
* Hoodie Flink config options.
5055
*
@@ -594,6 +599,12 @@ private FlinkOptions() {
594599
.defaultValue(false) // default false for pipeline
595600
.withDescription("Schedule the cluster plan, default false");
596601

602+
public static final ConfigOption<Boolean> CLUSTERING_ASYNC_ENABLED = ConfigOptions
603+
.key("clustering.async.enabled")
604+
.booleanType()
605+
.defaultValue(false) // default false for pipeline
606+
.withDescription("Async Clustering, default false");
607+
597608
public static final ConfigOption<Integer> CLUSTERING_DELTA_COMMITS = ConfigOptions
598609
.key("clustering.delta_commits")
599610
.intType()
@@ -615,11 +626,22 @@ private FlinkOptions() {
615626
public static final ConfigOption<String> CLUSTERING_PLAN_STRATEGY_CLASS = ConfigOptions
616627
.key("clustering.plan.strategy.class")
617628
.stringType()
618-
.defaultValue(FlinkRecentDaysClusteringPlanStrategy.class.getName())
629+
.defaultValue(FlinkSizeBasedClusteringPlanStrategy.class.getName())
619630
.withDescription("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
620631
+ "i.e select what file groups are being clustered. Default strategy, looks at the last N (determined by "
621632
+ CLUSTERING_TARGET_PARTITIONS.key() + ") day based partitions picks the small file slices within those partitions.");
622633

634+
public static final ConfigOption<String> CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME = ConfigOptions
635+
.key("clustering.plan.partition.filter.mode")
636+
.stringType()
637+
.defaultValue("NONE")
638+
.withDescription("Partition filter mode used in the creation of clustering plan. Available values are - "
639+
+ "NONE: do not filter table partition and thus the clustering plan will include all partitions that have clustering candidate."
640+
+ "RECENT_DAYS: keep a continuous range of partitions, worked together with configs '" + DAYBASED_LOOKBACK_PARTITIONS.key() + "' and '"
641+
+ PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST.key() + "."
642+
+ "SELECTED_PARTITIONS: keep partitions that are in the specified range ['" + PARTITION_FILTER_BEGIN_PARTITION.key() + "', '"
643+
+ PARTITION_FILTER_END_PARTITION.key() + "'].");
644+
623645
public static final ConfigOption<Integer> CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES = ConfigOptions
624646
.key("clustering.plan.strategy.target.file.max.bytes")
625647
.intType()
@@ -641,7 +663,7 @@ private FlinkOptions() {
641663
public static final ConfigOption<String> CLUSTERING_SORT_COLUMNS = ConfigOptions
642664
.key("clustering.plan.strategy.sort.columns")
643665
.stringType()
644-
.noDefaultValue()
666+
.defaultValue("")
645667
.withDescription("Columns to sort the data by when clustering");
646668

647669
public static final ConfigOption<Integer> CLUSTERING_MAX_NUM_GROUPS = ConfigOptions

hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import org.apache.hudi.common.model.DefaultHoodieRecordPayload;
2222
import org.apache.hudi.common.model.WriteOperationType;
23+
import org.apache.hudi.common.util.StringUtils;
2324
import org.apache.hudi.index.HoodieIndex;
2425
import org.apache.hudi.table.format.FilePathUtils;
2526

@@ -42,7 +43,10 @@ public static boolean insertClustering(Configuration conf) {
4243
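* Returns whether the writer is in append mode with given configuration {@code conf}: either a COW table insert with {@code FlinkOptions.INSERT_CLUSTER} disabled, or an insert operation with clustering scheduling enabled.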
4344
*/
4445
public static boolean isAppendMode(Configuration conf) {
45-
return isCowTable(conf) && isInsertOperation(conf) && !conf.getBoolean(FlinkOptions.INSERT_CLUSTER);
46+
// 1. inline clustering is supported for COW table;
47+
// 2. async clustering is supported for both COW and MOR table
48+
return isCowTable(conf) && isInsertOperation(conf) && !conf.getBoolean(FlinkOptions.INSERT_CLUSTER)
49+
|| needsScheduleClustering(conf);
4650
}
4751

4852
/**
@@ -115,4 +119,49 @@ public static boolean emitChangelog(Configuration conf) {
115119
return conf.getBoolean(FlinkOptions.READ_AS_STREAMING)
116120
&& conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED);
117121
}
122+
123+
/**
124+
* Returns whether async compaction is needed: the table is MOR and async compaction is enabled.
125+
*
126+
* @param conf The flink configuration.
127+
*/
128+
public static boolean needsAsyncCompaction(Configuration conf) {
129+
return OptionsResolver.isMorTable(conf)
130+
&& conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED);
131+
}
132+
133+
/**
134+
* Returns whether the compaction plan needs to be scheduled: the table is MOR and compaction scheduling is enabled.
135+
*
136+
* @param conf The flink configuration.
137+
*/
138+
public static boolean needsScheduleCompaction(Configuration conf) {
139+
return OptionsResolver.isMorTable(conf)
140+
&& conf.getBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED);
141+
}
142+
143+
/**
144+
* Returns whether async clustering is needed: the write operation is an insert and async clustering is enabled.
145+
*
146+
* @param conf The flink configuration.
147+
*/
148+
public static boolean needsAsyncClustering(Configuration conf) {
149+
return isInsertOperation(conf) && conf.getBoolean(FlinkOptions.CLUSTERING_ASYNC_ENABLED);
150+
}
151+
152+
/**
153+
* Returns whether there is need to schedule the clustering plan.
154+
*
155+
* @param conf The flink configuration.
156+
*/
157+
public static boolean needsScheduleClustering(Configuration conf) {
158+
return isInsertOperation(conf) && conf.getBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED);
159+
}
160+
161+
/**
162+
* Returns whether the clustering sort is enabled.
163+
*/
164+
public static boolean sortClusteringEnabled(Configuration conf) {
165+
return !StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.CLUSTERING_SORT_COLUMNS));
166+
}
118167
}

hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.apache.hudi.sink.meta.CkpMetadata;
3838
import org.apache.hudi.sink.utils.HiveSyncContext;
3939
import org.apache.hudi.sink.utils.NonThrownExecutor;
40+
import org.apache.hudi.util.ClusteringUtil;
4041
import org.apache.hudi.util.CompactionUtil;
4142
import org.apache.hudi.util.StreamerUtil;
4243

@@ -253,6 +254,11 @@ public void notifyCheckpointComplete(long checkpointId) {
253254
CompactionUtil.scheduleCompaction(metaClient, writeClient, tableState.isDeltaTimeCompaction, committed);
254255
}
255256

257+
if (tableState.scheduleClustering) {
258+
// if clustering scheduling is on, schedule the clustering plan
259+
ClusteringUtil.scheduleClustering(conf, writeClient, committed);
260+
}
261+
256262
if (committed) {
257263
// start new instant.
258264
startInstant();
@@ -607,6 +613,7 @@ private static class TableState implements Serializable {
607613
final String commitAction;
608614
final boolean isOverwrite;
609615
final boolean scheduleCompaction;
616+
final boolean scheduleClustering;
610617
final boolean syncHive;
611618
final boolean syncMetadata;
612619
final boolean isDeltaTimeCompaction;
@@ -616,7 +623,8 @@ private TableState(Configuration conf) {
616623
this.commitAction = CommitUtils.getCommitActionType(this.operationType,
617624
HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE).toUpperCase(Locale.ROOT)));
618625
this.isOverwrite = WriteOperationType.isOverwrite(this.operationType);
619-
this.scheduleCompaction = StreamerUtil.needsScheduleCompaction(conf);
626+
this.scheduleCompaction = OptionsResolver.needsScheduleCompaction(conf);
627+
this.scheduleClustering = OptionsResolver.needsScheduleClustering(conf);
620628
this.syncHive = conf.getBoolean(FlinkOptions.HIVE_SYNC_ENABLED);
621629
this.syncMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED);
622630
this.isDeltaTimeCompaction = OptionsResolver.isDeltaTimeCompaction(conf);

hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitEvent.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import java.util.List;
2525

2626
/**
27-
* Represents a commit event from the clustering task {@link ClusteringFunction}.
27+
* Represents a commit event from the clustering task {@link ClusteringOperator}.
2828
*/
2929
public class ClusteringCommitEvent implements Serializable {
3030
private static final long serialVersionUID = 1L;
@@ -51,6 +51,10 @@ public ClusteringCommitEvent(String instant, List<WriteStatus> writeStatuses, in
5151
this.taskID = taskID;
5252
}
5353

54+
public ClusteringCommitEvent(String instant, int taskID) {
55+
this(instant, null, taskID);
56+
}
57+
5458
public void setInstant(String instant) {
5559
this.instant = instant;
5660
}
@@ -74,4 +78,8 @@ public List<WriteStatus> getWriteStatuses() {
7478
public int getTaskID() {
7579
return taskID;
7680
}
81+
82+
public boolean isFailed() {
83+
return this.writeStatuses == null;
84+
}
7785
}

0 commit comments

Comments
 (0)