Skip to content

Commit ac9f248

Browse files
authored
[mineru op] fix typo and update all pdf pipelines to flash-mineru (#474)
1 parent 0946ffc commit ac9f248

File tree

9 files changed

+79
-32
lines changed

9 files changed

+79
-32
lines changed

dataflow/cli_funcs/pdf2model_pipeline/pdf_to_qa_pipeline.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
from dataflow.operators.knowledge_cleaning import (
77
KBCChunkGeneratorBatch,
8+
FileOrURLToMarkdownConverterLocal,
89
FileOrURLToMarkdownConverterFlash,
910
KBCTextCleanerBatch,
1011
KBCMultiHopQAGeneratorBatch,
@@ -29,11 +30,24 @@ def __init__(self, cache_base="./"):
2930
cache_type="json",
3031
)
3132

32-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
33+
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
3334
intermediate_dir=str(cache_path / ".cache"),
3435
mineru_backend="vlm-vllm-engine", # 可选 pipeline, vlm-vllm-engine, vlm-vllm-transformer, vlm-http-client
3536
)
3637

38+
# Faster backend by Flash-MinerU
39+
# See details in the commented example below.
40+
41+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
42+
# intermediate_dir="../example_data/KBCleaningPipeline/flash/",
43+
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
44+
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
45+
# batch_size=4, # batchsize per vllm worker
46+
# replicas=1, # num of vllm workers
47+
# num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
48+
# engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
49+
# )
50+
3751
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
3852
split_method="token",
3953
chunk_size=512,

dataflow/operators/knowledge_cleaning/generate/mineru_operators.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ def __init__(self,
334334
intermediate_dir: str = "intermediate",
335335
mineru_backend: str = "vlm-auto-engine",
336336
mineru_source: str = "local",
337-
mienru_model_path:str = None,
337+
mineru_model_path:str = None,
338338
mineru_download_model_type:str = "vlm"
339339
):
340340
"""
@@ -346,7 +346,7 @@ def __init__(self,
346346
"""
347347
super().__init__(intermediate_dir, mineru_backend)
348348
self.mineru_source = mineru_source
349-
self.mienru_model_path = mienru_model_path
349+
self.mineru_model_path = mineru_model_path
350350
self.mineru_download_model_type = mineru_download_model_type
351351

352352
@staticmethod
@@ -371,7 +371,7 @@ def get_desc(lang: str = "zh"):
371371
"- intermediate_dir: 中间产物目录(默认 intermediate)\n"
372372
"- mineru_backend: MinerU CLI 后端(默认 vlm-auto-engine;也可 pipeline / vlm-sglang-engine 等)\n"
373373
"- mineru_source: 模型来源(默认 local;对应 MINERU_MODEL_SOURCE)\n"
374-
"- mienru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
374+
"- mineru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
375375
"- mineru_download_model_type: 配置模型类型(默认 vlm)\n\n"
376376
"运行参数(run):\n"
377377
"- storage: DataFlowStorage,需包含 dataframe\n"
@@ -448,8 +448,8 @@ def _batch_parse_pdf_with_mineru(self, pdf_files: list):
448448
os.environ.setdefault("MINERU_MODEL_SOURCE", self.mineru_source)
449449

450450
# load local model and config corresponding files https://github.com/opendatalab/MinerU/blob/a12610fb3e9e24488fe3e76cd233ba88ec64bbaf/mineru/cli/models_download.py#L19
451-
if self.mienru_model_path != None:
452-
configure_model(self.mienru_model_path, self.mineru_download_model_type)
451+
if self.mineru_model_path != None:
452+
configure_model(self.mineru_model_path, self.mineru_download_model_type)
453453

454454
parsed_results = {}
455455
for item in pdf_files:

dataflow/statics/pipelines/api_pipelines/kbcleaning_pipeline.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,30 @@ def __init__(self):
2929
# ------------case1: use MinerU official API (by default) ------------
3030
# by default we use API provided by MinerU official
3131
# https://mineru.net/apiManage/docs
32-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
33-
intermediate_dir="../example_data/KBCleaningPipeline/API/",
34-
mineru_backend="vlm", # vlm or pipeline
35-
api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
36-
)
32+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
33+
# intermediate_dir="../example_data/KBCleaningPipeline/API/",
34+
# mineru_backend="vlm", # vlm or pipeline
35+
# api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
36+
# )
3737
# ------------case2: use Flash-MinerU inference locally with GPU ------------
3838
# https://github.com/OpenDCAI/Flash-MinerU
3939
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
4040
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
4141
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
4242
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
43-
engine_gpu_util_rate_to_ray_cap=0.5
43+
batch_size=4, # batchsize per vllm worker
44+
replicas=1, # num of vllm workers
45+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
46+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
4447
)
4548

4649
# ------------case3: use MinerU official inference locally (much slower than other two) ------------
47-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
48-
intermediate_dir="../example_data/KBCleaningPipeline/local/",
49-
mineru_backend="vlm-local-engine",
50-
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
51-
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
52-
)
50+
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
51+
# intermediate_dir="../example_data/KBCleaningPipeline/local/",
52+
# mineru_backend="vlm-local-engine",
53+
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
54+
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
55+
# )
5356

5457
self.knowledge_cleaning_step2 = KBCChunkGenerator(
5558
split_method="token",

dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,15 @@ def __init__(self):
2727

2828
self.vqa_extract_prompt = QAExtractPrompt()
2929

30-
self.mineru_executor = FileOrURLToMarkdownConverterFlash(intermediate_dir = "intermediate", mineru_backend="vlm-vllm-engine")
30+
self.mineru_executor = FileOrURLToMarkdownConverterFlash(
31+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
32+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
33+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
34+
batch_size=4, # batchsize per vllm worker
35+
replicas=1, # num of vllm workers
36+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
37+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
38+
)
3139
self.input_formatter = MinerU2LLMInputOperator()
3240
self.vqa_extractor = ChunkedPromptedGenerator(
3341
llm_serving=self.llm_serving,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_batch_sglang.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
25-
2630
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
2731
split_method="token",
2832
chunk_size=512,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_batch_vllm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
2530

2631
self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_sglang.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
25-
2630
self.knowledge_cleaning_step2 = KBCChunkGenerator(
2731
split_method="token",
2832
chunk_size=512,

dataflow/statics/pipelines/gpu_pipelines/kbcleaning/kbcleaning_pipeline_vllm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2429
)
2530

2631
self.knowledge_cleaning_step2 = KBCChunkGenerator(

dataflow/statics/playground/playground/kbcleaning_pipeline_vllm.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ def __init__(self):
1919
)
2020

2121
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
22-
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
23-
lang="en",
24-
mineru_backend="vlm-vllm-engine",
22+
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
23+
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
24+
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
25+
batch_size=4, # batchsize per vllm worker
26+
replicas=1, # num of vllm workers
27+
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
28+
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
2529
)
2630

2731
self.knowledge_cleaning_step2 = KBCChunkGenerator(

0 commit comments

Comments
 (0)