
Commit 016f101

Merge pull request #10621 from sundy-li/auto-cast-loading-parquet
feat(query): enable runtime cast transform in loading parquet files
2 parents a4ccd99 + 9e9833a commit 016f101

6 files changed: +159 -20 lines changed
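In effect, this change stops rejecting parquet files whose column types differ from the destination table and instead appends a runtime cast transform to the load pipeline. A minimal, self-contained sketch of the per-column cast-or-pass-through decision (simplified stand-in types, not Databend's real `Expr`/`DataType`):

// Sketch of the decision this PR introduces, with simplified stand-ins
// for Databend's Expr and DataType.
#[derive(Clone, Debug, PartialEq)]
enum DataType {
    Date,
    Timestamp,
    String,
}

#[derive(Debug)]
enum Expr {
    ColumnRef { id: usize, data_type: DataType },
    Cast { expr: Box<Expr>, dest_type: DataType },
}

// Read column `id` as-is when types already match, otherwise wrap the
// column reference in a cast to the destination type.
fn cast_if_needed(id: usize, from: DataType, to: DataType) -> Expr {
    let column = Expr::ColumnRef { id, data_type: from.clone() };
    if from == to {
        column
    } else {
        Expr::Cast { expr: Box::new(column), dest_type: to }
    }
}

fn main() {
    // A parquet DATE column loaded into a TIMESTAMP table now gets a cast
    // instead of a "parquet schema mismatch" error.
    println!("{:?}", cast_if_needed(0, DataType::Date, DataType::Timestamp));
    // Matching types pass through untouched.
    println!("{:?}", cast_if_needed(1, DataType::String, DataType::String));
}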

src/query/pipeline/sources/src/input_formats/impls/input_format_parquet.rs

Lines changed: 12 additions & 11 deletions
@@ -38,7 +38,8 @@ use common_arrow::read_columns_async;
 use common_exception::ErrorCode;
 use common_exception::Result;
 use common_expression::DataBlock;
-use common_expression::TableField;
+use common_expression::DataField;
+use common_expression::DataSchema;
 use common_expression::TableSchema;
 use common_expression::TableSchemaRef;
 use common_meta_app::principal::StageInfo;

@@ -362,7 +363,15 @@ impl BlockBuilderTrait for ParquetBlockBuilder {
     fn deserialize(&mut self, mut batch: Option<RowGroupInMemory>) -> Result<Vec<DataBlock>> {
         if let Some(rg) = batch.as_mut() {
             let chunk = rg.get_arrow_chunk()?;
-            let block = DataBlock::from_arrow_chunk(&chunk, &self.ctx.data_schema())?;
+
+            let fields: Vec<DataField> = rg
+                .fields_to_read
+                .iter()
+                .map(DataField::from)
+                .collect::<Vec<_>>();
+
+            let input_schema = DataSchema::new(fields);
+            let block = DataBlock::from_arrow_chunk(&chunk, &input_schema)?;

             let block_total_rows = block.num_rows();
             let num_rows_per_block = self.ctx.block_compact_thresholds.max_rows_per_block;

@@ -446,20 +455,12 @@ impl AligningStateTrait for AligningState {

 fn get_used_fields(fields: &Vec<Field>, schema: &TableSchemaRef) -> Result<Vec<Field>> {
     let mut read_fields = Vec::with_capacity(fields.len());
-    for (idx, f) in schema.fields().iter().enumerate() {
+    for f in schema.fields().iter() {
         if let Some(m) = fields
             .iter()
             .filter(|c| c.name.eq_ignore_ascii_case(f.name()))
             .last()
         {
-            let tf = TableField::from(m);
-            if tf.data_type().remove_nullable() != f.data_type().remove_nullable() {
-                return Err(ErrorCode::TableSchemaMismatch(format!(
-                    "parquet schema mismatch for field {}(start from 0), expect: {:?}, got {:?}",
-                    idx, f, tf
-                )));
-            }
-
             read_fields.push(m.clone());
         } else {
             return Err(ErrorCode::TableSchemaMismatch(format!(
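For reference, a self-contained sketch of the matching that `get_used_fields` still performs, with `(name, type)` string pairs standing in for the real arrow `Field` type: columns are matched by case-insensitive name with the last match winning, a missing column remains a hard error, and the removed per-field type check is simply gone.

// Sketch of the retained matching logic in get_used_fields.
fn get_used_fields(
    file_fields: &[(String, String)],
    table_field_names: &[String],
) -> Result<Vec<(String, String)>, String> {
    let mut read_fields = Vec::with_capacity(table_field_names.len());
    for name in table_field_names {
        // Case-insensitive name match; the last matching file field wins.
        match file_fields
            .iter()
            .filter(|(n, _)| n.eq_ignore_ascii_case(name))
            .last()
        {
            Some(m) => read_fields.push(m.clone()),
            // A column missing from the file is still a schema mismatch.
            None => return Err(format!("field {name} not found in parquet file")),
        }
    }
    Ok(read_fields)
}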

src/query/service/src/interpreters/interpreter_insert.rs

Lines changed: 37 additions & 2 deletions
@@ -16,6 +16,7 @@ use std::collections::VecDeque;
 use std::io::BufRead;
 use std::io::Cursor;
 use std::ops::Not;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Instant;

@@ -49,6 +50,7 @@ use common_formats::FastFieldDecoderValues;
 use common_io::cursor_ext::ReadBytesExt;
 use common_io::cursor_ext::ReadCheckPointExt;
 use common_meta_app::principal::FileFormatOptions;
+use common_meta_app::principal::StageFileFormatType;
 use common_meta_app::principal::StageInfo;
 use common_pipeline_core::Pipeline;
 use common_pipeline_sources::AsyncSource;

@@ -85,6 +87,7 @@ use crate::interpreters::common::append2table;
 use crate::interpreters::Interpreter;
 use crate::interpreters::InterpreterPtr;
 use crate::pipelines::processors::transforms::TransformAddConstColumns;
+use crate::pipelines::processors::transforms::TransformRuntimeCastSchema;
 use crate::pipelines::processors::TransformResortAddOn;
 use crate::pipelines::PipelineBuildResult;
 use crate::pipelines::SourcePipeBuilder;

@@ -370,17 +373,49 @@ impl Interpreter for InsertInterpreter {
                     1,
                 )?;
             }
-            InsertInputSource::StreamingWithFormat(_, _, input_context) => {
+            InsertInputSource::StreamingWithFormat(format, _, input_context) => {
                 let input_context = input_context.as_ref().expect("must success").clone();
                 input_context
                     .format
                     .exec_stream(input_context.clone(), &mut build_res.main_pipeline)?;
+
+                if Ok(StageFileFormatType::Parquet) == StageFileFormatType::from_str(format) {
+                    let dest_schema = plan.schema();
+                    let func_ctx = self.ctx.get_function_context()?;
+
+                    build_res.main_pipeline.add_transform(
+                        |transform_input_port, transform_output_port| {
+                            TransformRuntimeCastSchema::try_create(
+                                transform_input_port,
+                                transform_output_port,
+                                dest_schema.clone(),
+                                func_ctx,
+                            )
+                        },
+                    )?;
+                }
             }
-            InsertInputSource::StreamingWithFileFormat(_, _, input_context) => {
+            InsertInputSource::StreamingWithFileFormat(format_options, _, input_context) => {
                 let input_context = input_context.as_ref().expect("must success").clone();
                 input_context
                     .format
                     .exec_stream(input_context.clone(), &mut build_res.main_pipeline)?;
+
+                if StageFileFormatType::Parquet == format_options.format {
+                    let dest_schema = plan.schema();
+                    let func_ctx = self.ctx.get_function_context()?;
+
+                    build_res.main_pipeline.add_transform(
+                        |transform_input_port, transform_output_port| {
+                            TransformRuntimeCastSchema::try_create(
+                                transform_input_port,
+                                transform_output_port,
+                                dest_schema.clone(),
+                                func_ctx,
+                            )
+                        },
+                    )?;
+                }
             }
             InsertInputSource::Stage(opts) => {
                 tracing::info!("insert: from stage with options {:?}", opts);
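The gate above appends the cast transform only for parquet input; `StreamingWithFormat` carries the format as a string, hence the `FromStr` comparison. A simplified sketch of that gate (a hypothetical `FileFormat` enum stands in for `StageFileFormatType`):

use std::str::FromStr;

// Hypothetical stand-in for StageFileFormatType.
#[derive(Debug, PartialEq)]
enum FileFormat {
    Parquet,
    Csv,
}

impl FromStr for FileFormat {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_lowercase().as_str() {
            "parquet" => Ok(FileFormat::Parquet),
            "csv" => Ok(FileFormat::Csv),
            other => Err(format!("unknown format: {other}")),
        }
    }
}

// Mirrors `Ok(StageFileFormatType::Parquet) == StageFileFormatType::from_str(format)`:
// only a parquet source gets the runtime cast transform appended.
fn needs_runtime_cast(format: &str) -> bool {
    FileFormat::from_str(format) == Ok(FileFormat::Parquet)
}

fn main() {
    assert!(needs_runtime_cast("Parquet"));
    assert!(!needs_runtime_cast("CSV"));
}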

src/query/service/src/pipelines/processors/transforms/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,7 @@ mod transform_merge_block;
 mod transform_resort_addon;
 mod transform_right_join;
 mod transform_right_semi_anti_join;
+mod transform_runtime_cast_schema;
 mod transform_runtime_filter;

 pub use aggregator::build_partition_bucket;

@@ -84,6 +85,7 @@ pub use transform_right_join::RightJoinCompactor;
 pub use transform_right_join::TransformRightJoin;
 pub use transform_right_semi_anti_join::RightSemiAntiJoinCompactor;
 pub use transform_right_semi_anti_join::TransformRightSemiAntiJoin;
+pub use transform_runtime_cast_schema::TransformRuntimeCastSchema;
 pub use transform_runtime_filter::SinkRuntimeFilterSource;
 pub use transform_runtime_filter::TransformRuntimeFilter;
 pub use transform_sort_merge::SortMergeCompactor;
src/query/service/src/pipelines/processors/transforms/transform_runtime_cast_schema.rs

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+// Copyright 2021 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_exception::Result;
+use common_expression::BlockEntry;
+use common_expression::DataBlock;
+use common_expression::DataSchemaRef;
+use common_expression::Evaluator;
+use common_expression::Expr;
+use common_expression::FunctionContext;
+use common_functions::scalars::BUILTIN_FUNCTIONS;
+
+use crate::pipelines::processors::port::InputPort;
+use crate::pipelines::processors::port::OutputPort;
+use crate::pipelines::processors::processor::ProcessorPtr;
+use crate::pipelines::processors::transforms::transform::Transform;
+use crate::pipelines::processors::transforms::transform::Transformer;
+
+/// TransformRuntimeCastSchema is used to cast a block to the specified schema.
+/// Unlike `TransformCastSchema`, it is applied at runtime.
+pub struct TransformRuntimeCastSchema {
+    func_ctx: FunctionContext,
+    insert_schema: DataSchemaRef,
+}
+
+impl TransformRuntimeCastSchema
+where Self: Transform
+{
+    pub fn try_create(
+        input_port: Arc<InputPort>,
+        output_port: Arc<OutputPort>,
+        insert_schema: DataSchemaRef,
+        func_ctx: FunctionContext,
+    ) -> Result<ProcessorPtr> {
+        Ok(ProcessorPtr::create(Transformer::create(
+            input_port,
+            output_port,
+            Self {
+                func_ctx,
+                insert_schema,
+            },
+        )))
+    }
+}
+
+impl Transform for TransformRuntimeCastSchema {
+    const NAME: &'static str = "CastSchemaTransform";
+
+    fn transform(&mut self, data_block: DataBlock) -> Result<DataBlock> {
+        let exprs: Vec<Expr> = data_block
+            .columns()
+            .iter()
+            .zip(self.insert_schema.fields().iter().enumerate())
+            .map(|(from, (index, to))| {
+                let expr = Expr::ColumnRef {
+                    span: None,
+                    id: index,
+                    data_type: from.data_type.clone(),
+                    display_name: to.name().clone(),
+                };
+                if &from.data_type != to.data_type() {
+                    Expr::Cast {
+                        span: None,
+                        is_try: false,
+                        expr: Box::new(expr),
+                        dest_type: to.data_type().clone(),
+                    }
+                } else {
+                    expr
+                }
+            })
+            .collect();
+
+        let mut columns = Vec::with_capacity(exprs.len());
+        let evaluator = Evaluator::new(&data_block, self.func_ctx, &BUILTIN_FUNCTIONS);
+
+        for (field, expr) in self.insert_schema.fields().iter().zip(exprs.iter()) {
+            let value = evaluator.run(expr)?;
+            let column = BlockEntry {
+                data_type: field.data_type().clone(),
+                value,
+            };
+            columns.push(column);
+        }
+        Ok(DataBlock::new(columns, data_block.num_rows()))
+    }
+}

tests/suites/1_stateful/01_load/01_0000_streaming_load.result

Lines changed: 2 additions & 2 deletions
@@ -12,5 +12,5 @@
 198 2020.0 767
 --parquet less
 199 2020.0 769
---parquet mismatch schema
-1
+--parquet runtime cast schema
+199 2020.0 769

tests/suites/1_stateful/01_load/01_0000_streaming_load.sh

Lines changed: 6 additions & 5 deletions
@@ -87,12 +87,13 @@ echo "--parquet less"
 curl -s -H "insert_sql:insert into ontime_less file_format = (type = Parquet)" -F "upload=@/tmp/ontime_200.parquet" -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" > /dev/null 2>&1
 echo "select count(1), avg(Year), sum(DayOfWeek) from ontime_less;" | $MYSQL_CLIENT_CONNECT

-# load parquet with mismatch schema
-echo "--parquet mismatch schema"
-cat $CURDIR/../ddl/ontime.sql | sed 's/ontime/ontime_test_mismatch/g' | sed 's/DATE/VARCHAR/g' | $MYSQL_CLIENT_CONNECT
-curl -s -H "insert_sql:insert into ontime_test_mismatch file_format = (type = Parquet)" -F "upload=@/tmp/ontime_200.parquet" -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" | grep -c 'parquet schema mismatch'
+# load parquet with a mismatched schema, which will now be auto cast at runtime
+echo "--parquet runtime cast schema"
+cat $CURDIR/../ddl/ontime.sql | sed 's/ontime/ontime_test_schema_mismatch/g' | sed 's/DATE/TIMESTAMP/g' | $MYSQL_CLIENT_CONNECT
+curl -s -H "insert_sql:insert into ontime_test_schema_mismatch file_format = (type = Parquet)" -F "upload=@/tmp/ontime_200.parquet" -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" > /dev/null 2>&1
+echo "select count(1), avg(Year), sum(DayOfWeek) from ontime_test_schema_mismatch;" | $MYSQL_CLIENT_CONNECT


 echo "drop table ontime_streaming_load;" | $MYSQL_CLIENT_CONNECT
-echo "drop table ontime_test_mismatch;" | $MYSQL_CLIENT_CONNECT
+echo "drop table ontime_test_schema_mismatch;" | $MYSQL_CLIENT_CONNECT
 echo "drop table ontime_less;" | $MYSQL_CLIENT_CONNECT
