diff --git a/backend.py b/backend.py
index 292dec41..78557e10 100644
--- a/backend.py
+++ b/backend.py
@@ -28,6 +28,14 @@ from utils.logger import logger
 import ast
 
+# Import document parser
+try:
+    from utils.document_parser import get_parser
+    DOCUMENT_PARSER_AVAILABLE = True
+except ImportError as e:
+    DOCUMENT_PARSER_AVAILABLE = False
+    logger.warning(f"Document parser not available: {e}")
+
 # Try to import GraphRAG components
 try:
     from models.constructor import kt_gen as constructor
@@ -262,15 +270,21 @@ async def websocket_endpoint(websocket: WebSocket, client_id: str):
 async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
     """Upload files and prepare for graph construction"""
     try:
-        # Use original filename (without extension) as dataset name
-        # If multiple files, use the first file's name
-        main_file = files[0]
-        original_name = os.path.splitext(main_file.filename)[0]
-        # Clean filename to be filesystem-safe
-        dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
-        dataset_name = dataset_name.replace(' ', '_')
+        # Generate dataset name based on file count
+        if len(files) == 1:
+            # Single file: use its name
+            main_file = files[0]
+            original_name = os.path.splitext(main_file.filename)[0]
+            # Clean filename to be filesystem-safe
+            dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
+            dataset_name = dataset_name.replace(' ', '_')
+        else:
+            # Multiple files: create a descriptive name with date
+            from datetime import datetime
+            date_str = datetime.now().strftime("%Y%m%d")
+            dataset_name = f"{len(files)}files_{date_str}"
 
-        # Add timestamp if dataset already exists
+        # Add counter if dataset already exists
         base_name = dataset_name
         counter = 1
         while os.path.exists(f"data/uploaded/{dataset_name}"):
@@ -286,7 +300,13 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
         corpus_data = []
         skipped_files: List[str] = []
         processed_count = 0
-        allowed_extensions = {".txt", ".md", ".json"}
+        allowed_extensions = {".txt", ".md", ".json", ".pdf", ".docx", ".doc"}
+
+        # Initialize document parser if needed
+        doc_parser = None
+        if DOCUMENT_PARSER_AVAILABLE:
+            doc_parser = get_parser()
+
         for i, file in enumerate(files):
             file_path = os.path.join(upload_dir, file.filename)
             with open(file_path, "wb") as buffer:
@@ -303,6 +323,35 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
                 progress = 10 + (i + 1) * 80 // len(files)
                 await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
                 continue
+
+            # Handle PDF and DOCX/DOC files with document parser
+            if ext in ['.pdf', '.docx', '.doc']:
+                if not doc_parser:
+                    logger.warning(f"Document parser not available, skipping {file.filename}")
+                    skipped_files.append(file.filename)
+                    progress = 10 + (i + 1) * 80 // len(files)
+                    await send_progress_update(client_id, "upload", progress, f"Skipped {file.filename} (parser unavailable)")
+                    continue
+
+                try:
+                    text = doc_parser.parse_file(file_path, ext)
+                    if text and text.strip():
+                        corpus_data.append({
+                            "title": file.filename,
+                            "text": text
+                        })
+                        processed_count += 1
+                        await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Parsed {file.filename}")
+                    else:
+                        logger.warning(f"No text extracted from {file.filename}")
+                        skipped_files.append(file.filename)
+                        await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"No text in {file.filename}")
+                except Exception as e:
+                    logger.error(f"Error parsing {file.filename}: {e}")
+                    skipped_files.append(file.filename)
+                    await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Failed to parse {file.filename}")
+                continue
+
             # Treat plain text formats explicitly (.txt and .md)
             if filename_lower.endswith(('.txt', '.md')):
                 text = decode_bytes_with_detection(content_bytes)
@@ -333,7 +382,7 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
         # Ensure at least one valid file processed
         if processed_count == 0:
-            msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
+            msg = "No supported files were uploaded. Allowed: .txt, .md, .json, .pdf, .docx, .doc"
             if skipped_files:
                 msg += f"; skipped: {', '.join(skipped_files)}"
             await send_progress_update(client_id, "upload", 0, msg)
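For review convenience, the new dataset-naming rule above can be read as a self-contained sketch. This is an illustration, not code from the PR: the function name `make_dataset_name` and the `root` default are hypothetical, and the collision-loop body is inferred from the `base_name`/`counter` setup visible in the hunk (the loop body itself falls outside the shown context).

```python
# Hypothetical sketch of the new dataset-naming rule (not part of the PR).
import os
from datetime import datetime

def make_dataset_name(filenames: list[str], root: str = "data/uploaded") -> str:
    """Single upload keeps its sanitized stem; multi-file uploads become
    '<N>files_<YYYYMMDD>'; name collisions append a numeric suffix."""
    if len(filenames) == 1:
        stem = os.path.splitext(filenames[0])[0]
        # Keep only filesystem-safe characters, then normalize spaces
        name = "".join(c for c in stem if c.isalnum() or c in (' ', '-', '_')).rstrip()
        name = name.replace(' ', '_')
    else:
        name = f"{len(filenames)}files_{datetime.now().strftime('%Y%m%d')}"
    base_name, counter = name, 1
    while os.path.exists(os.path.join(root, name)):  # loop body inferred from base_name/counter
        name = f"{base_name}_{counter}"
        counter += 1
    return name
```

For example, `make_dataset_name(['My Report.pdf'])` yields `My_Report` on a clean data/uploaded directory, while a three-file batch yields something like `3files_20240101`.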

diff --git a/frontend/index.html b/frontend/index.html
index 14ec66c6..e271d371 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1089,7 +1089,7 @@
     Upload Documents
     Drag & Drop Files Here
    or click to browse files
-    Currently supports: .txt, .md, .json
+    Currently supports: .txt, .md, .json, .pdf, .docx, .doc
     📄 Sample Format (.json):
@@ -1106,7 +1106,7 @@
     Drag & Drop Files Here
     ]
-
+
@@ -1213,7 +1213,7 @@
             uploadDocuments: 'Upload Documents',
             uploadDragTitle: 'Drag & Drop Files Here',
             uploadDragDesc: 'or click to browse files',
-            uploadSupports: 'Currently supports: .txt, .md, .json',
+            uploadSupports: 'Currently supports: .txt, .md, .json, .pdf, .docx, .doc',
             uploadSampleHeader: '📄 Sample Format (.json):',
             uploadBtn: 'Upload Files',
             clearUploadBtn: 'Clear',
@@ -1233,7 +1233,15 @@
             retrievalProgress: (found,cand) => `Retrieval progress: ${found} relevant nodes from ${cand} candidates.`,
             answerStart: 'Synthesizing the final answer...',
             qaCompleted: 'QA process completed!',
-            completedBadge: '✔️ Completed'
+            completedBadge: '✔️ Completed',
+            datasetDocCountLabel: (count) => count === 1 ? ' (1 document)' : ` (${count} documents)`,
+            uploadSuccessMessage: (name, count) => {
+                const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || 'dataset';
+                if (typeof count === 'number' && count > 0) {
+                    return `Dataset "${friendlyName}" uploaded successfully${count === 1 ? ' (1 document)' : ` (${count} documents)`}!`;
+                }
+                return `Dataset "${friendlyName}" uploaded successfully!`;
+            }
         },
         zh: {
             processing: '正在处理你的问题…',
@@ -1249,7 +1257,7 @@
             uploadDocuments: '上传文档',
             uploadDragTitle: '拖拽文件到这里',
             uploadDragDesc: '或点击选择文件',
-            uploadSupports: '当前支持:.md,.txt,.json',
+            uploadSupports: '当前支持:.txt、.md、.json、.pdf、.docx、.doc',
             uploadSampleHeader: '📄 示例格式(.json):',
             uploadBtn: '开始上传',
             clearUploadBtn: '清空',
@@ -1269,7 +1277,15 @@
             retrievalProgress: (found,cand) => `检索进度:在 ${cand} 个候选中找到 ${found} 个相关节点。`,
             answerStart: '正在综合生成最终答案…',
             qaCompleted: '问答流程完成!',
-            completedBadge: '✔️ 已完成'
+            completedBadge: '✔️ 已完成',
+            datasetDocCountLabel: (count) => count === 1 ? '(1篇文档)' : `(共${count}篇文档)`,
+            uploadSuccessMessage: (name, count) => {
+                const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || '数据集';
+                if (typeof count === 'number' && count > 0) {
+                    return `数据集“${friendlyName}”上传成功${count === 1 ? '(1篇文档)' : `(共${count}篇文档)`}!`;
+                }
+                return `数据集“${friendlyName}”上传成功!`;
+            }
         }
     };
@@ -1311,6 +1327,11 @@
         // Processing text & badge (if visible)
         const procText = document.getElementById('qaProcessingText');
         if (procText) procText.textContent = t.processing;
         const badge = document.getElementById('qaCompletionBadge');
         if (badge && badge.style.display !== 'none') badge.textContent = t.completedBadge;
+
+        // Refresh dataset renderings to reflect language-specific labels
+        displayDatasets();
+        loadDatasetOptions('graphDataset');
+        loadDatasetOptions('qaDataset');
     }

     // API base URL
@@ -1452,6 +1473,20 @@
         return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
     }

+    function formatDatasetDisplayName(dataset) {
+        if (!dataset) return '';
+        const rawName = (dataset.display_label || dataset.name || '').toString();
+        const cleanedName = rawName.replace(/_/g, ' ').trim() || (dataset.name || '');
+        const docCount = typeof dataset.docs_count === 'number' ? dataset.docs_count : null;
+        if (docCount && docCount > 0) {
+            const langConfig = i18n[currentLang] || i18n.en;
+            const suffixFn = langConfig?.datasetDocCountLabel;
+            const suffix = suffixFn ? suffixFn(docCount) : ` (${docCount} documents)`;
+            return `${cleanedName}${suffix}`;
+        }
+        return cleanedName;
+    }
+
     async function uploadFiles() {
         if (selectedFiles.length === 0) return;
@@ -1484,7 +1519,19 @@
             progressFill.style.width = '100%';
             progressText.textContent = 'Upload completed!';
-            showMessage('Files uploaded successfully!', 'success');
+            const uploadData = response?.data || {};
+            const datasetName = uploadData.dataset_name;
+            const filesCount = uploadData.files_count;
+            const langConfig = i18n[currentLang] || i18n.en;
+            if (langConfig?.uploadSuccessMessage) {
+                showMessage(langConfig.uploadSuccessMessage(datasetName, filesCount), 'success');
+            } else {
+                let successMessage = datasetName ? `Dataset "${datasetName}" uploaded successfully!` : 'Files uploaded successfully!';
+                if (typeof filesCount === 'number' && filesCount > 0) {
+                    successMessage += filesCount === 1 ? ' (1 document)' : ` (${filesCount} documents)`;
+                }
+                showMessage(successMessage, 'success');
+            }

             clearFiles();
             refreshData();
@@ -1524,6 +1571,7 @@
     function displayDatasets() {
         const container = document.getElementById('datasetsList');
+        if (!container) return;

         if (datasets.length === 0) {
             container.innerHTML = 'No datasets available. Upload some files to get started.';
             return;
@@ -1532,7 +1580,7 @@
         container.innerHTML = datasets.map(dataset => `
-            ${dataset.name}
+            ${formatDatasetDisplayName(dataset)}
             (${dataset.type})
             ${dataset.type !== 'demo' ? `Schema: ${dataset.has_custom_schema ? 'custom' : 'default'}` : ''}
@@ -1790,10 +1838,19 @@
     function loadDatasetOptions(selectId) {
         const select = document.getElementById(selectId);
+        if (!select) return;
+
         const readyDatasets = datasets.filter(d => d.status === 'ready');
-
-        select.innerHTML = '<option value="">Select a dataset...</option>' +
-            readyDatasets.map(d => `<option value="${d.name}">${d.name}</option>`).join('');
+        const currentValue = select.value;
+        const langConfig = i18n[currentLang] || i18n.en;
+        const placeholder = langConfig?.selectDatasetPlaceholder || 'Select a dataset...';
+
+        select.innerHTML = `<option value="">${placeholder}</option>` +
+            readyDatasets.map(d => `<option value="${d.name}">${formatDatasetDisplayName(d)}</option>`).join('');
+
+        if (readyDatasets.some(d => d.name === currentValue)) {
+            select.value = currentValue;
+        }
     }

     async function loadGraphData() {
diff --git a/requirements.txt b/requirements.txt
index 797c2f9e..1710eb90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,13 +22,17 @@ spacy==3.7.5
 tiktoken==0.9.0
 json-repair==0.46.2
 
+# Document Parsing
+magic-pdf==1.3.12
+python-docx==1.1.2
+pypdf==4.2.0
+
 # API & HTTP Clients
 openai==1.102.0
 requests==2.32.5
 
 # Configuration & Utilities
 PyYAML==6.0.2
-pydantic==2.5.0
 nanoid==2.0.0
 dotenv==0.9.9
 python-dotenv==1.1.1
diff --git a/utils/document_parser.py b/utils/document_parser.py
new file mode 100644
index 00000000..79f604b5
--- /dev/null
+++ b/utils/document_parser.py
@@ -0,0 +1,225 @@
+"""
+Document Parser Utility
+Supports parsing PDF, DOCX, DOC files using MinerU and python-docx
+"""
+
+import os
+import tempfile
+from typing import Optional, Dict
+from pathlib import Path
+
+from utils.logger import logger
+
+try:
+    from magic_pdf.data.dataset import PymuDocDataset
+    MINERU_AVAILABLE = True
+except ImportError as e:
+    MINERU_AVAILABLE = False
+    logger.warning(f"MinerU not available: {e}")
+
+try:
+    import fitz  # type: ignore[attr-defined]
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+    logger.warning("PyMuPDF (fitz) not available; PDF text extraction may be limited")
+
+try:
+    from docx import Document as DocxDocument
+    DOCX_AVAILABLE = True
+except ImportError:
+    DOCX_AVAILABLE = False
+    logger.warning("python-docx not available")
+
+
+class DocumentParser:
+    """Parse various document formats to extract text content"""
+
+    def __init__(self):
+        self.temp_dir = tempfile.mkdtemp(prefix="youtu_graphrag_")
+        logger.info(f"DocumentParser initialized with temp dir: {self.temp_dir}")
+
+    def parse_file(self, file_path: str, file_type: str) -> Optional[str]:
+        """
+        Parse a document file and extract text content
+
+        Args:
+            file_path: Path to the document file
+            file_type: File extension (.pdf, .docx, .doc)
+
+        Returns:
+            Extracted text content or None if parsing fails
+        """
+        file_type = file_type.lower()
+
+        try:
+            if file_type == '.pdf':
+                return self._parse_pdf(file_path)
+            elif file_type in ['.docx', '.doc']:
+                return self._parse_docx(file_path)
+            else:
+                logger.warning(f"Unsupported file type: {file_type}")
+                return None
+        except Exception as e:
+            logger.error(f"Error parsing {file_type} file: {e}")
+            return None
+
+    def _parse_pdf(self, pdf_path: str) -> Optional[str]:
+        """
+        Parse PDF using MinerU, falling back to PyMuPDF and then pypdf
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted text content
+        """
+        try:
+            with open(pdf_path, 'rb') as f:
+                pdf_bytes = f.read()
+        except Exception as e:
+            logger.error(f"Unable to read PDF file {pdf_path}: {e}")
+            return None
+
+        # -------- MinerU / PyMuPDF pipeline --------
+        if MINERU_AVAILABLE:
+            try:
+                dataset = PymuDocDataset(pdf_bytes, lang='auto')
+                text_parts: list[str] = []
+                for page_index in range(len(dataset)):
+                    try:
+                        page_doc = dataset.get_page(page_index).get_doc()
+                        page_text = page_doc.get_text("text")
+                        if page_text:
+                            text_parts.append(page_text.strip())
+                    except Exception as page_err:
+                        logger.warning(
+                            f"MinerU pipeline failed to extract page {page_index} of {pdf_path}: {page_err}"
+                        )
+                        continue
+
+                if text_parts:
+                    extracted_text = '\n\n'.join(text_parts)
+                    logger.info(
+                        f"Successfully extracted {len(extracted_text)} chars from PDF via MinerU pipeline"
+                    )
+                    return extracted_text
+            except Exception as e:
+                logger.error(f"MinerU PDF parsing failed: {e}")
+
+        # -------- Direct PyMuPDF fallback --------
+        if PYMUPDF_AVAILABLE:
+            try:
+                text_parts: list[str] = []
+                with fitz.open(pdf_path) as doc:  # type: ignore[attr-defined]
+                    for page in doc:
+                        page_text = page.get_text("text")
+                        if page_text:
+                            text_parts.append(page_text.strip())
+                if text_parts:
+                    extracted_text = '\n\n'.join(text_parts)
+                    logger.info(
+                        f"Successfully extracted {len(extracted_text)} chars from PDF via PyMuPDF fallback"
+                    )
+                    return extracted_text
+            except Exception as e:
+                logger.error(f"PyMuPDF fallback failed: {e}")
+
+        # -------- pypdf fallback --------
+        try:
+            from pypdf import PdfReader  # type: ignore
+
+            reader = PdfReader(pdf_path)
+            text_parts = []
+            for page_index, page in enumerate(reader.pages):
+                try:
+                    page_text = page.extract_text()
+                except Exception as page_err:
+                    logger.warning(
+                        f"pypdf failed to extract page {page_index} of {pdf_path}: {page_err}"
+                    )
+                    continue
+                if page_text:
+                    text_parts.append(page_text.strip())
+
+            if text_parts:
+                extracted_text = '\n\n'.join(text_parts)
+                logger.info(
+                    f"Successfully extracted {len(extracted_text)} chars from PDF via pypdf fallback"
+                )
+                return extracted_text
+        except ImportError:
+            logger.debug("pypdf not installed; skipping pypdf fallback")
+        except Exception as e:
+            logger.error(f"pypdf fallback failed: {e}")
+
+        logger.error(f"Unable to extract text from PDF: {pdf_path}")
+        return None
+
+    def _parse_docx(self, docx_path: str) -> Optional[str]:
+        """
+        Parse DOCX/DOC using python-docx
+
+        Args:
+            docx_path: Path to DOCX/DOC file
+
+        Returns:
+            Extracted text content
+        """
+        if not DOCX_AVAILABLE:
+            logger.error("python-docx is not installed. Cannot parse DOCX/DOC files.")
+            return None
+
+        try:
+            # Note: python-docx reads OOXML (.docx); a legacy binary .doc will
+            # raise here and be reported by the except block below.
+            doc = DocxDocument(docx_path)
+            text_parts = []
+
+            # Extract paragraphs
+            for para in doc.paragraphs:
+                if para.text.strip():
+                    text_parts.append(para.text)
+
+            # Extract tables
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = ' | '.join(cell.text.strip() for cell in row.cells)
+                    if row_text.strip():
+                        text_parts.append(row_text)
+
+            extracted_text = '\n'.join(text_parts)
+
+            if not extracted_text.strip():
+                logger.warning(f"No text extracted from DOCX: {docx_path}")
+                return None
+
+            logger.info(f"Successfully extracted {len(extracted_text)} chars from DOCX")
+            return extracted_text
+
+        except Exception as e:
+            logger.error(f"Error parsing DOCX: {e}")
+            return None
+
+    def cleanup(self):
+        """Clean up temporary files"""
+        try:
+            import shutil
+            if os.path.exists(self.temp_dir):
+                shutil.rmtree(self.temp_dir)
+                logger.info(f"Cleaned up temp dir: {self.temp_dir}")
+        except Exception as e:
+            logger.error(f"Error cleaning up temp dir: {e}")
+
+
+# Global parser instance
+_parser_instance = None
+
+def get_parser() -> DocumentParser:
+    """Get or create global parser instance"""
+    global _parser_instance
+    if _parser_instance is None:
+        _parser_instance = DocumentParser()
+    return _parser_instance
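To close, a minimal usage sketch for the new module, assuming it is importable the same way backend.py imports it; the PDF path below is illustrative, not part of the PR:

```python
# Minimal usage sketch for utils/document_parser.py (file path is illustrative).
from utils.document_parser import get_parser

parser = get_parser()  # process-wide singleton, as used by backend.py
text = parser.parse_file("data/uploaded/demo/report.pdf", ".pdf")
if text:
    print(f"Extracted {len(text)} characters")
else:
    print("No text extracted; the upload flow records such files as skipped")
parser.cleanup()  # removes the temp dir created in __init__
```

Note that `parse_file` swallows exceptions and returns `None`, so callers only need the truthiness check shown here, which is how `upload_files` in backend.py treats the result before appending to `corpus_data`.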