Skip to content

Commit ac9f248

Browse files
authored
[mineru op] fix typo and update all pdf pipelines to flash-mineru (#474)
1 parent 0946ffc commit ac9f248

File tree

9 files changed

+79
-32
lines changed

9 files changed

+79
-32
lines changed

dataflow/cli_funcs/pdf2model_pipeline/pdf_to_qa_pipeline.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
from dataflow.operators.knowledge_cleaning import (
77
KBCChunkGeneratorBatch,
8+
FileOrURLToMarkdownConverterLocal,
89
FileOrURLToMarkdownConverterFlash,
910
KBCTextCleanerBatch,
1011
KBCMultiHopQAGeneratorBatch,
@@ -29,11 +30,24 @@ def __init__(self, cache_base="./"):
2930
cache_type="json",
3031
)
3132

32-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
33+
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
3334
intermediate_dir=str(cache_path / ".cache"),
3435
mineru_backend="vlm-vllm-engine", # 可选 pipeline, vlm-vllm-engine, vlm-vllm-transformer, vlm-http-client
3536
)
3637

38+
# Faster backend by Flash-MinerU
39+
# See details in the commented example below.
40+
41+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
42+
# intermediate_dir="../example_data/KBCleaningPipeline/flash/",
43+
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
44+
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
45+
# batch_size=4, # batchsize per vllm worker
46+
# replicas=1, # num of vllm workers
47+
# num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
48+
# engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
49+
# )
50+
3751
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
3852
split_method="token",
3953
chunk_size=512,

dataflow/operators/knowledge_cleaning/generate/mineru_operators.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ def __init__(self,
334334
intermediate_dir: str = "intermediate",
335335
mineru_backend: str = "vlm-auto-engine",
336336
mineru_source: str = "local",
337-
mienru_model_path:str = None,
337+
mineru_model_path:str = None,
338338
mineru_download_model_type:str = "vlm"
339339
):
340340
"""
@@ -346,7 +346,7 @@ def __init__(self,
346346
"""
347347
super().__init__(intermediate_dir, mineru_backend)
348348
self.mineru_source = mineru_source
349-
self.mienru_model_path = mienru_model_path
349+
self.mineru_model_path = mineru_model_path
350350
self.mineru_download_model_type = mineru_download_model_type
351351

352352
@staticmethod
@@ -371,7 +371,7 @@ def get_desc(lang: str = "zh"):
371371
"- intermediate_dir: 中间产物目录(默认 intermediate)\n"
372372
"- mineru_backend: MinerU CLI 后端(默认 vlm-auto-engine;也可 pipeline / vlm-sglang-engine 等)\n"
373373
"- mineru_source: 模型来源(默认 local;对应 MINERU_MODEL_SOURCE)\n"
374-
"- mienru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
374+
"- mineru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
375375
"- mineru_download_model_type: 配置模型类型(默认 vlm)\n\n"
376376
"运行参数(run):\n"
377377
"- storage: DataFlowStorage,需包含 dataframe\n"
@@ -448,8 +448,8 @@ def _batch_parse_pdf_with_mineru(self, pdf_files: list):
448448
os.environ.setdefault("MINERU_MODEL_SOURCE", self.mineru_source)
449449

450450
# load local model and config corresponding files https://github.com/opendatalab/MinerU/blob/a12610fb3e9e24488fe3e76cd233ba88ec64bbaf/mineru/cli/models_download.py#L19
451-
if self.mienru_model_path != None:
452-
configure_model(self.mienru_model_path, self.mineru_download_model_type)
451+
if self.mineru_model_path != None:
452+
configure_model(self.mineru_model_path, self.mineru_download_model_type)
453453

454454
parsed_results = {}
455455
for item in pdf_files:

dataflow/statics/pipelines/api_pipelines/kbcleaning_pipeline.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,30 @@ def __init__(self):
2929
# ------------case1: use MinerU official API (by default) ------------
3030
# by default we use API provided by MinerU official
3131
# https://mineru.net/apiManage/docs
32-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
33-
intermediate_dir="../example_data/KBCleaningPipeline/API/",
34-
mineru_backend="vlm", # vlm or pipeline
35-
api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
36-
)
32+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
33+
# intermediate_dir="../example_data/KBCleaningPipeline/API/",
34+
# mineru_backend="vlm", # vlm or pipeline
35+
# api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
36+
# )
3737
# ------------case2: use Flash-MinerU inference locally with GPU ------------
3838
# https://github.com/OpenDCAI/Flash-MinerU
3939
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
4040
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
4141
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
4242
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
43-
engine_gpu_util_rate_to_ray_cap=0.5
43+
batch_size=4, # batchsize per vllm worker
44+
replicas=1, # num of vllm workers
45+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
46+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
4447
)
4548

4649
# ------------case3: use MinerU official inference locally (much slower than other two) ------------
47-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
48-
intermediate_dir="../example_data/KBCleaningPipeline/local/",
49-
mineru_backend="vlm-local-engine",
50-
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
51-
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
52-
)
50+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
51+
# intermediate_dir="../example_data/KBCleaningPipeline/local/",
52+
# mineru_backend="vlm-local-engine",
53+
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
54+
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
55+
# )
5356

5457
self.knowledge_cleaning_step2 = KBCChunkGenerator(
5558
split_method="token",

dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,15 @@ def __init__(self):
2727

2828
self.vqa_extract_prompt = QAExtractPrompt()
2929

30-
self.mineru_executor = FileOrURLToMarkdownConverterFlash(intermediate_dir = "intermediate", mineru_backend="vlm-vllm-engine")
30+
self.mineru_executor = FileOrURLToMarkdownConverterFlash(
31+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
32+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
33+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
34+
batch_size=4, # batchsize per vllm worker
35+
replicas=1, # num of vllm workers
36+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
37+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
38+
)
3139
self.input_formatter = MinerU2LLMInputOperator()
3240
self.vqa_extractor = ChunkedPromptedGenerator(
3341
llm_serving=self.llm_serving,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_batch_sglang.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
25-
2630
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
2731
split_method="token",
2832
chunk_size=512,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_batch_vllm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
2530

2631
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_sglang.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
25-
2630
self.knowledge_cleaning_step2 = KBCChunkGenerator(
2731
split_method="token",
2832
chunk_size=512,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_vllm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
2530

2631
self.knowledge_cleaning_step2 = KBCChunkGenerator(

dataflow/statics/playground/playground/kbcleaning_pipeline_vllm.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
lang="en",
24-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2529
)
2630

2731
self.knowledge_cleaning_step2 = KBCChunkGenerator(

0 commit comments

Comments
 (0)