diff --git a/backend.py b/backend.py
index 292dec41..78557e10 100644
--- a/backend.py
+++ b/backend.py
@@ -28,6 +28,14 @@
from utils.logger import logger
import ast
+# Import document parser
+try:
+ from utils.document_parser import get_parser
+ DOCUMENT_PARSER_AVAILABLE = True
+except ImportError as e:
+ DOCUMENT_PARSER_AVAILABLE = False
+ logger.warning(f"Document parser not available: {e}")
+
# Try to import GraphRAG components
try:
from models.constructor import kt_gen as constructor
@@ -262,15 +270,21 @@ async def websocket_endpoint(websocket: WebSocket, client_id: str):
async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
"""Upload files and prepare for graph construction"""
try:
- # Use original filename (without extension) as dataset name
- # If multiple files, use the first file's name
- main_file = files[0]
- original_name = os.path.splitext(main_file.filename)[0]
- # Clean filename to be filesystem-safe
- dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
- dataset_name = dataset_name.replace(' ', '_')
+ # Generate dataset name based on file count
+ if len(files) == 1:
+ # Single file: use its name
+ main_file = files[0]
+ original_name = os.path.splitext(main_file.filename)[0]
+ # Clean filename to be filesystem-safe
+ dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
+ dataset_name = dataset_name.replace(' ', '_')
+ else:
+ # Multiple files: create a descriptive name with date
+ from datetime import datetime
+ date_str = datetime.now().strftime("%Y%m%d")
+ dataset_name = f"{len(files)}files_{date_str}"
- # Add timestamp if dataset already exists
+ # Add counter if dataset already exists
base_name = dataset_name
counter = 1
while os.path.exists(f"data/uploaded/{dataset_name}"):
@@ -286,7 +300,15 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
corpus_data = []
skipped_files: List[str] = []
processed_count = 0
- allowed_extensions = {".txt", ".md", ".json"}
+ allowed_extensions = {".txt", ".md", ".json", ".pdf", ".docx", ".doc"}
+
+ # Initialize document parser if needed
+ doc_parser = None
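+ # get_parser() returns a shared module-level instance, so repeated
+ # uploads reuse one DocumentParser rather than creating one per request.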
+ if DOCUMENT_PARSER_AVAILABLE:
+ doc_parser = get_parser()
+
for i, file in enumerate(files):
file_path = os.path.join(upload_dir, file.filename)
with open(file_path, "wb") as buffer:
@@ -303,6 +323,35 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
continue
+
+ # Handle PDF and DOCX/DOC files with document parser
+ if ext in ['.pdf', '.docx', '.doc']:
+ if not doc_parser:
+ logger.warning(f"Document parser not available, skipping {file.filename}")
+ skipped_files.append(file.filename)
+ progress = 10 + (i + 1) * 80 // len(files)
+ await send_progress_update(client_id, "upload", progress, f"Skipped {file.filename} (parser unavailable)")
+ continue
+
+ try:
+ text = doc_parser.parse_file(file_path, ext)
+ if text and text.strip():
+ corpus_data.append({
+ "title": file.filename,
+ "text": text
+ })
+ processed_count += 1
+ await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Parsed {file.filename}")
+ else:
+ logger.warning(f"No text extracted from {file.filename}")
+ skipped_files.append(file.filename)
+ await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"No text in {file.filename}")
+ except Exception as e:
+ logger.error(f"Error parsing {file.filename}: {e}")
+ skipped_files.append(file.filename)
+ await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Failed to parse {file.filename}")
+ continue
+
# Treat plain text formats explicitly (.txt and .md)
if filename_lower.endswith(('.txt', '.md')):
text = decode_bytes_with_detection(content_bytes)
@@ -333,7 +382,7 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
# Ensure at least one valid file processed
if processed_count == 0:
- msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
+ msg = "No supported files were uploaded. Allowed: .txt, .md, .json, .pdf, .docx, .doc"
if skipped_files:
msg += f"; skipped: {', '.join(skipped_files)}"
await send_progress_update(client_id, "upload", 0, msg)
diff --git a/frontend/index.html b/frontend/index.html
index 14ec66c6..e271d371 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1089,7 +1089,7 @@
Upload Documents
Drag & Drop Files Here
or click to browse files
- Currently supports: .txt, .md, .json
+ Currently supports: .txt, .md, .json, .pdf, .docx, .doc
@@ -1213,7 +1213,7 @@ Answer:
uploadDocuments: 'Upload Documents',
uploadDragTitle: 'Drag & Drop Files Here',
uploadDragDesc: 'or click to browse files',
- uploadSupports: 'Currently supports: .txt, .md, .json',
+ uploadSupports: 'Currently supports: .txt, .md, .json, .pdf, .docx, .doc',
uploadSampleHeader: '📄 Sample Format (.json):',
uploadBtn: 'Upload Files',
clearUploadBtn: 'Clear',
@@ -1233,7 +1233,15 @@ Answer:
retrievalProgress: (found,cand) => `Retrieval progress: ${found} relevant nodes from ${cand} candidates.`,
answerStart: 'Synthesizing the final answer...',
qaCompleted: 'QA process completed!',
- completedBadge: '✔️ Completed'
+ completedBadge: '✔️ Completed',
+ datasetDocCountLabel: (count) => count === 1 ? ' (1 document)' : ` (${count} documents)`,
+ uploadSuccessMessage: (name, count) => {
+ const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || 'dataset';
+ if (typeof count === 'number' && count > 0) {
+ return `Dataset "${friendlyName}" uploaded successfully${count === 1 ? ' (1 document)' : ` (${count} documents)`}!`;
+ }
+ return `Dataset "${friendlyName}" uploaded successfully!`;
+ }
},
zh: {
processing: '正在处理你的问题…',
@@ -1249,7 +1257,7 @@ Answer:
uploadDocuments: '上传文档',
uploadDragTitle: '拖拽文件到这里',
uploadDragDesc: '或点击选择文件',
- uploadSupports: '当前支持:.md,.txt,.json',
+ uploadSupports: '当前支持:.txt、.md、.json、.pdf、.docx、.doc',
uploadSampleHeader: '📄 示例格式(.json):',
uploadBtn: '开始上传',
clearUploadBtn: '清空',
@@ -1269,7 +1277,15 @@ Answer:
retrievalProgress: (found,cand) => `检索进度:在 ${cand} 个候选中找到 ${found} 个相关节点。`,
answerStart: '正在综合生成最终答案…',
qaCompleted: '问答流程完成!',
- completedBadge: '✔️ 已完成'
+ completedBadge: '✔️ 已完成',
+ datasetDocCountLabel: (count) => count === 1 ? '(1篇文档)' : `(共${count}篇文档)`,
+ uploadSuccessMessage: (name, count) => {
+ const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || '数据集';
+ if (typeof count === 'number' && count > 0) {
+ return `数据集“${friendlyName}”上传成功${count === 1 ? '(1篇文档)' : `(共${count}篇文档)`}!`;
+ }
+ return `数据集“${friendlyName}”上传成功!`;
+ }
}
};
@@ -1311,6 +1327,11 @@ Answer:
// Processing text & badge (if visible)
const procText = document.getElementById('qaProcessingText'); if (procText) procText.textContent = t.processing;
const badge = document.getElementById('qaCompletionBadge'); if (badge && badge.style.display !== 'none') badge.textContent = t.completedBadge;
+
+ // Refresh dataset renderings to reflect language-specific labels
+ displayDatasets();
+ loadDatasetOptions('graphDataset');
+ loadDatasetOptions('qaDataset');
}
// API base URL
@@ -1452,6 +1473,20 @@ Answer:
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
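+ // Human-friendly dataset label: underscores become spaces and a localized
+ // document-count suffix is appended when docs_count is available.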
+ function formatDatasetDisplayName(dataset) {
+ if (!dataset) return '';
+ const rawName = (dataset.display_label || dataset.name || '').toString();
+ const cleanedName = rawName.replace(/_/g, ' ').trim() || (dataset.name || '');
+ const docCount = typeof dataset.docs_count === 'number' ? dataset.docs_count : null;
+ if (docCount && docCount > 0) {
+ const langConfig = i18n[currentLang] || i18n.en;
+ const suffixFn = langConfig?.datasetDocCountLabel;
+ const suffix = suffixFn ? suffixFn(docCount) : ` (${docCount} documents)`;
+ return `${cleanedName}${suffix}`;
+ }
+ return cleanedName;
+ }
+
async function uploadFiles() {
if (selectedFiles.length === 0) return;
@@ -1484,7 +1519,19 @@ Answer:
progressFill.style.width = '100%';
progressText.textContent = 'Upload completed!';
- showMessage('Files uploaded successfully!', 'success');
+ const uploadData = response?.data || {};
+ const datasetName = uploadData.dataset_name;
+ const filesCount = uploadData.files_count;
+ const langConfig = i18n[currentLang] || i18n.en;
+ if (langConfig?.uploadSuccessMessage) {
+ showMessage(langConfig.uploadSuccessMessage(datasetName, filesCount), 'success');
+ } else {
+ let successMessage = datasetName ? `Dataset "${datasetName}" uploaded successfully!` : 'Files uploaded successfully!';
+ if (typeof filesCount === 'number' && filesCount > 0) {
+ successMessage += filesCount === 1 ? ' (1 document)' : ` (${filesCount} documents)`;
+ }
+ showMessage(successMessage, 'success');
+ }
clearFiles();
refreshData();
@@ -1524,6 +1571,7 @@ Answer:
function displayDatasets() {
const container = document.getElementById('datasetsList');
+ if (!container) return;
if (datasets.length === 0) {
container.innerHTML = 'No datasets available. Upload some files to get started.';
return;
@@ -1532,7 +1580,7 @@ Answer:
container.innerHTML = datasets.map(dataset => `
- ${dataset.name}
+ ${formatDatasetDisplayName(dataset)}
(${dataset.type})
${dataset.type !== 'demo' ? `Schema: ${dataset.has_custom_schema ? 'custom' : 'default'}` : ''}
@@ -1790,10 +1838,19 @@
Answer:
function loadDatasetOptions(selectId) {
const select = document.getElementById(selectId);
+ if (!select) return;
+
const readyDatasets = datasets.filter(d => d.status === 'ready');
-
- select.innerHTML = '<option value="">Select a dataset...</option>' +
- readyDatasets.map(d => `<option value="${d.name}">${d.name}</option>`).join('');
+ const currentValue = select.value;
+ const langConfig = i18n[currentLang] || i18n.en;
+ const placeholder = langConfig?.selectDatasetPlaceholder || 'Select a dataset...';
+
+ select.innerHTML = `<option value="">${placeholder}</option>` +
+ readyDatasets.map(d => `<option value="${d.name}">${formatDatasetDisplayName(d)}</option>`).join('');
+
+ if (readyDatasets.some(d => d.name === currentValue)) {
+ select.value = currentValue;
+ }
}
async function loadGraphData() {
diff --git a/requirements.txt b/requirements.txt
index 797c2f9e..1710eb90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,13 +22,17 @@ spacy==3.7.5
tiktoken==0.9.0
json-repair==0.46.2
+# Document Parsing
+magic-pdf==1.3.12
+python-docx==1.1.2
+pypdf==4.2.0
+
# API & HTTP Clients
openai==1.102.0
requests==2.32.5
# Configuration & Utilities
PyYAML==6.0.2
-pydantic==2.5.0
nanoid==2.0.0
dotenv==0.9.9
python-dotenv==1.1.1
diff --git a/utils/document_parser.py b/utils/document_parser.py
new file mode 100644
index 00000000..79f604b5
--- /dev/null
+++ b/utils/document_parser.py
@@ -0,0 +1,235 @@
+"""
+Document Parser Utility
+Supports parsing PDF (MinerU, with PyMuPDF/pypdf fallbacks) and DOCX/DOC files (python-docx; legacy .doc is best-effort)
+"""
+
+import os
+import tempfile
+from typing import Optional
+
+from utils.logger import logger
+
+try:
+ from magic_pdf.data.dataset import PymuDocDataset
+ MINERU_AVAILABLE = True
+except ImportError as e:
+ MINERU_AVAILABLE = False
+ logger.warning(f"MinerU not available: {e}")
+
+try:
+ import fitz # type: ignore[attr-defined]
+ PYMUPDF_AVAILABLE = True
+except ImportError:
+ PYMUPDF_AVAILABLE = False
+ logger.warning("PyMuPDF (fitz) not available; PDF text extraction may be limited")
+
+try:
+ from docx import Document as DocxDocument
+ DOCX_AVAILABLE = True
+except ImportError:
+ DOCX_AVAILABLE = False
+ logger.warning("python-docx not available")
+
+
+class DocumentParser:
+ """Parse various document formats to extract text content"""
+
+ def __init__(self):
+ self.temp_dir = tempfile.mkdtemp(prefix="youtu_graphrag_")
+ logger.info(f"DocumentParser initialized with temp dir: {self.temp_dir}")
+
+ def parse_file(self, file_path: str, file_type: str) -> Optional[str]:
+ """
+ Parse a document file and extract text content
+
+ Args:
+ file_path: Path to the document file
+ file_type: File extension (.pdf, .docx, .doc)
+
+ Returns:
+ Extracted text content or None if parsing fails
+ """
+ file_type = file_type.lower()
+
+ try:
+ if file_type == '.pdf':
+ return self._parse_pdf(file_path)
+ elif file_type in ['.docx', '.doc']:
+ return self._parse_docx(file_path)
+ else:
+ logger.warning(f"Unsupported file type: {file_type}")
+ return None
+ except Exception as e:
+ logger.error(f"Error parsing {file_type} file: {e}")
+ return None
+
+ def _parse_pdf(self, pdf_path: str) -> Optional[str]:
+ """
+ Parse PDF using MinerU, falling back to PyMuPDF and then pypdf
+
+ Args:
+ pdf_path: Path to PDF file
+
+ Returns:
+ Extracted text content
+ """
+ if not (MINERU_AVAILABLE or PYMUPDF_AVAILABLE):
+ logger.warning(
+ "Neither MinerU nor PyMuPDF is installed; only the pypdf fallback will be attempted."
+ )
+
+ try:
+ with open(pdf_path, 'rb') as f:
+ pdf_bytes = f.read()
+ except Exception as e:
+ logger.error(f"Unable to read PDF file {pdf_path}: {e}")
+ return None
+
+ # -------- MinerU / PyMuPDF pipeline --------
+ if MINERU_AVAILABLE:
+ try:
+ dataset = PymuDocDataset(pdf_bytes, lang='auto')
+ text_parts: list[str] = []
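+ # Extract page by page; a failure on a single page is logged and
+ # skipped so one bad page cannot abort the whole document.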
+ for page_index in range(len(dataset)):
+ try:
+ page_doc = dataset.get_page(page_index).get_doc()
+ page_text = page_doc.get_text("text")
+ if page_text:
+ text_parts.append(page_text.strip())
+ except Exception as page_err:
+ logger.warning(
+ f"MinerU pipeline failed to extract page {page_index} of {pdf_path}: {page_err}"
+ )
+ continue
+
+ if text_parts:
+ extracted_text = '\n\n'.join(text_parts)
+ logger.info(
+ f"Successfully extracted {len(extracted_text)} chars from PDF via MinerU pipeline"
+ )
+ return extracted_text
+ except Exception as e:
+ logger.error(f"MinerU PDF parsing failed: {e}")
+
+ # -------- Direct PyMuPDF fallback --------
+ if PYMUPDF_AVAILABLE:
+ try:
+ text_parts: list[str] = []
+ with fitz.open(pdf_path) as doc: # type: ignore[attr-defined]
+ for page in doc:
+ page_text = page.get_text("text")
+ if page_text:
+ text_parts.append(page_text.strip())
+ if text_parts:
+ extracted_text = '\n\n'.join(text_parts)
+ logger.info(
+ f"Successfully extracted {len(extracted_text)} chars from PDF via PyMuPDF fallback"
+ )
+ return extracted_text
+ except Exception as e:
+ logger.error(f"PyMuPDF fallback failed: {e}")
+
+ # -------- PyPDF fallback --------
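+ # pypdf is imported lazily so it stays an optional dependency; a missing
+ # install raises ImportError, which is caught below and skips this path.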
+ try:
+ from pypdf import PdfReader # type: ignore
+
+ reader = PdfReader(pdf_path)
+ text_parts = []
+ for page_index, page in enumerate(reader.pages):
+ try:
+ page_text = page.extract_text()
+ except Exception as page_err:
+ logger.warning(
+ f"pypdf failed to extract page {page_index} of {pdf_path}: {page_err}"
+ )
+ continue
+ if page_text:
+ text_parts.append(page_text.strip())
+
+ if text_parts:
+ extracted_text = '\n\n'.join(text_parts)
+ logger.info(
+ f"Successfully extracted {len(extracted_text)} chars from PDF via pypdf fallback"
+ )
+ return extracted_text
+ except ImportError:
+ logger.debug("pypdf not installed; skipping pypdf fallback")
+ except Exception as e:
+ logger.error(f"pypdf fallback failed: {e}")
+
+ logger.error(f"Unable to extract text from PDF: {pdf_path}")
+ return None
+
+ def _parse_docx(self, docx_path: str) -> Optional[str]:
+ """
+ Parse DOCX using python-docx. Note that python-docx reads only the
+ OOXML (.docx) format; a legacy binary .doc file raises an exception
+ here and is reported as a parse failure.
+
+ Args:
+ docx_path: Path to DOCX/DOC file
+
+ Returns:
+ Extracted text content
+ """
+ if not DOCX_AVAILABLE:
+ logger.error("python-docx is not installed. Cannot parse DOCX/DOC files.")
+ return None
+
+ try:
+ doc = DocxDocument(docx_path)
+ text_parts = []
+
+ # Extract paragraphs
+ for para in doc.paragraphs:
+ if para.text.strip():
+ text_parts.append(para.text)
+
+ # Extract tables
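+ # Each row is flattened into a single ' | '-separated line so that
+ # tabular content survives in the plain-text output.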
+ for table in doc.tables:
+ for row in table.rows:
+ row_text = ' | '.join(cell.text.strip() for cell in row.cells)
+ if row_text.strip():
+ text_parts.append(row_text)
+
+ extracted_text = '\n'.join(text_parts)
+
+ if not extracted_text.strip():
+ logger.warning(f"No text extracted from DOCX: {docx_path}")
+ return None
+
+ logger.info(f"Successfully extracted {len(extracted_text)} chars from DOCX")
+ return extracted_text
+
+ except Exception as e:
+ logger.error(f"Error parsing DOCX: {e}")
+ return None
+
+ def cleanup(self):
+ """Clean up temporary files"""
+ try:
+ import shutil
+ if os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+ logger.info(f"Cleaned up temp dir: {self.temp_dir}")
+ except Exception as e:
+ logger.error(f"Error cleaning up temp dir: {e}")
+
+
+# Global parser instance
+_parser_instance = None
+
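+# The instance is created lazily on first call and shared process-wide; its
+# temp directory persists until cleanup() is invoked.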
+def get_parser() -> DocumentParser:
+ """Get or create global parser instance"""
+ global _parser_instance
+ if _parser_instance is None:
+ _parser_instance = DocumentParser()
+ return _parser_instance
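
A minimal usage sketch of the new module, for reference (the sample file path is hypothetical; get_parser, parse_file, and cleanup are as defined in utils/document_parser.py above):

    from utils.document_parser import get_parser

    parser = get_parser()
    # Returns the extracted text, or None if every parsing backend fails
    text = parser.parse_file("samples/report.pdf", ".pdf")  # hypothetical path
    if text:
        print(f"Extracted {len(text)} characters")
    parser.cleanup()  # removes the parser's temp directory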