diff --git a/backend.py b/backend.py
index 292dec41..78557e10 100644
--- a/backend.py
+++ b/backend.py
@@ -28,6 +28,14 @@ from utils.logger import logger
 import ast
 
+# Import document parser
+try:
+    from utils.document_parser import get_parser
+    DOCUMENT_PARSER_AVAILABLE = True
+except ImportError as e:
+    DOCUMENT_PARSER_AVAILABLE = False
+    logger.warning(f"Document parser not available: {e}")
+
 # Try to import GraphRAG components
 try:
     from models.constructor import kt_gen as constructor
@@ -262,15 +270,21 @@ async def websocket_endpoint(websocket: WebSocket, client_id: str):
 async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
     """Upload files and prepare for graph construction"""
     try:
-        # Use original filename (without extension) as dataset name
-        # If multiple files, use the first file's name
-        main_file = files[0]
-        original_name = os.path.splitext(main_file.filename)[0]
-        # Clean filename to be filesystem-safe
-        dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
-        dataset_name = dataset_name.replace(' ', '_')
+        # Generate dataset name based on file count
+        if len(files) == 1:
+            # Single file: use its name
+            main_file = files[0]
+            original_name = os.path.splitext(main_file.filename)[0]
+            # Clean filename to be filesystem-safe
+            dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
+            dataset_name = dataset_name.replace(' ', '_')
+        else:
+            # Multiple files: create a descriptive name with date
+            from datetime import datetime
+            date_str = datetime.now().strftime("%Y%m%d")
+            dataset_name = f"{len(files)}files_{date_str}"
 
-        # Add timestamp if dataset already exists
+        # Add counter if dataset already exists
         base_name = dataset_name
         counter = 1
         while os.path.exists(f"data/uploaded/{dataset_name}"):
@@ -286,7 +300,13 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
         corpus_data = []
         skipped_files: List[str] = []
         processed_count = 0
-        allowed_extensions = {".txt", ".md", ".json"}
+        allowed_extensions = {".txt", ".md", ".json", ".pdf", ".docx", ".doc"}
+
+        # Initialize document parser if needed
+        doc_parser = None
+        if DOCUMENT_PARSER_AVAILABLE:
+            doc_parser = get_parser()
+
         for i, file in enumerate(files):
             file_path = os.path.join(upload_dir, file.filename)
             with open(file_path, "wb") as buffer:
@@ -303,6 +323,35 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
                 progress = 10 + (i + 1) * 80 // len(files)
                 await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
                 continue
+
+            # Handle PDF and DOCX/DOC files with document parser
+            if ext in ['.pdf', '.docx', '.doc']:
+                if not doc_parser:
+                    logger.warning(f"Document parser not available, skipping {file.filename}")
+                    skipped_files.append(file.filename)
+                    progress = 10 + (i + 1) * 80 // len(files)
+                    await send_progress_update(client_id, "upload", progress, f"Skipped {file.filename} (parser unavailable)")
+                    continue
+
+                try:
+                    text = doc_parser.parse_file(file_path, ext)
+                    if text and text.strip():
+                        corpus_data.append({
+                            "title": file.filename,
+                            "text": text
+                        })
+                        processed_count += 1
+                        await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Parsed {file.filename}")
+                    else:
+                        logger.warning(f"No text extracted from {file.filename}")
+                        skipped_files.append(file.filename)
+                        await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"No text in {file.filename}")
+                except Exception as e:
+                    logger.error(f"Error parsing {file.filename}: {e}")
+                    skipped_files.append(file.filename)
+                    await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Failed to parse {file.filename}")
+                continue
+
             # Treat plain text formats explicitly (.txt and .md)
             if filename_lower.endswith(('.txt', '.md')):
                 text = decode_bytes_with_detection(content_bytes)
@@ -333,7 +382,7 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
         # Ensure at least one valid file processed
         if processed_count == 0:
-            msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
+            msg = "No supported files were uploaded. Allowed: .txt, .md, .json, .pdf, .docx, .doc"
             if skipped_files:
                 msg += f"; skipped: {', '.join(skipped_files)}"
             await send_progress_update(client_id, "upload", 0, msg)
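For review convenience, the new dataset-naming rule above can be read as a self-contained sketch. This is an illustration, not code from the PR: the function name `make_dataset_name` and the `root` default are hypothetical, and the collision-loop body is inferred from the `base_name`/`counter` setup visible in the hunk (the loop body itself falls outside the shown context).

```python
# Hypothetical sketch of the new dataset-naming rule (not part of the PR).
import os
from datetime import datetime

def make_dataset_name(filenames: list[str], root: str = "data/uploaded") -> str:
    """Single upload keeps its sanitized stem; multi-file uploads become
    '<N>files_<YYYYMMDD>'; name collisions append a numeric suffix."""
    if len(filenames) == 1:
        stem = os.path.splitext(filenames[0])[0]
        # Keep only filesystem-safe characters, then normalize spaces
        name = "".join(c for c in stem if c.isalnum() or c in (' ', '-', '_')).rstrip()
        name = name.replace(' ', '_')
    else:
        name = f"{len(filenames)}files_{datetime.now().strftime('%Y%m%d')}"
    base_name, counter = name, 1
    while os.path.exists(os.path.join(root, name)):  # loop body inferred from base_name/counter
        name = f"{base_name}_{counter}"
        counter += 1
    return name
```

For example, `make_dataset_name(['My Report.pdf'])` yields `My_Report` on a clean data/uploaded directory, while a three-file batch yields something like `3files_20240101`.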

diff --git a/frontend/index.html b/frontend/index.html
index 14ec66c6..e271d371 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1089,7 +1089,7 @@
     Upload Documents
     Drag & Drop Files Here
    or click to browse files
-    Currently supports: .txt, .md, .json
+    Currently supports: .txt, .md, .json, .pdf, .docx, .doc
     📄 Sample Format (.json):
@@ -1106,7 +1106,7 @@
     Drag & Drop Files Here
     ]
-
+
@@ -1213,7 +1213,7 @@
             uploadDocuments: 'Upload Documents',
             uploadDragTitle: 'Drag & Drop Files Here',
             uploadDragDesc: 'or click to browse files',
-            uploadSupports: 'Currently supports: .txt, .md, .json',
+            uploadSupports: 'Currently supports: .txt, .md, .json, .pdf, .docx, .doc',
             uploadSampleHeader: '📄 Sample Format (.json):',
             uploadBtn: 'Upload Files',
             clearUploadBtn: 'Clear',
@@ -1233,7 +1233,15 @@
             retrievalProgress: (found,cand) => `Retrieval progress: ${found} relevant nodes from ${cand} candidates.`,
             answerStart: 'Synthesizing the final answer...',
             qaCompleted: 'QA process completed!',
-            completedBadge: '✔️ Completed'
+            completedBadge: '✔️ Completed',
+            datasetDocCountLabel: (count) => count === 1 ? ' (1 document)' : ` (${count} documents)`,
+            uploadSuccessMessage: (name, count) => {
+                const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || 'dataset';
+                if (typeof count === 'number' && count > 0) {
+                    return `Dataset "${friendlyName}" uploaded successfully${count === 1 ? ' (1 document)' : ` (${count} documents)`}!`;
+                }
+                return `Dataset "${friendlyName}" uploaded successfully!`;
+            }
         },
         zh: {
             processing: '正在处理你的问题…',
@@ -1249,7 +1257,7 @@
             uploadDocuments: '上传文档',
             uploadDragTitle: '拖拽文件到这里',
             uploadDragDesc: '或点击选择文件',
-            uploadSupports: '当前支持:.md,.txt,.json',
+            uploadSupports: '当前支持:.txt、.md、.json、.pdf、.docx、.doc',
             uploadSampleHeader: '📄 示例格式(.json):',
             uploadBtn: '开始上传',
             clearUploadBtn: '清空',
@@ -1269,7 +1277,15 @@
             retrievalProgress: (found,cand) => `检索进度:在 ${cand} 个候选中找到 ${found} 个相关节点。`,
             answerStart: '正在综合生成最终答案…',
             qaCompleted: '问答流程完成!',
-            completedBadge: '✔️ 已完成'
+            completedBadge: '✔️ 已完成',
+            datasetDocCountLabel: (count) => count === 1 ? '(1篇文档)' : `(共${count}篇文档)`,
+            uploadSuccessMessage: (name, count) => {
+                const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || '数据集';
+                if (typeof count === 'number' && count > 0) {
+                    return `数据集“${friendlyName}”上传成功${count === 1 ? '(1篇文档)' : `(共${count}篇文档)`}!`;
+                }
+                return `数据集“${friendlyName}”上传成功!`;
+            }
         }
     };
@@ -1311,6 +1327,11 @@
         // Processing text & badge (if visible)
         const procText = document.getElementById('qaProcessingText');
         if (procText) procText.textContent = t.processing;
         const badge = document.getElementById('qaCompletionBadge');
         if (badge && badge.style.display !== 'none') badge.textContent = t.completedBadge;
+
+        // Refresh dataset renderings to reflect language-specific labels
+        displayDatasets();
+        loadDatasetOptions('graphDataset');
+        loadDatasetOptions('qaDataset');
     }

     // API base URL
@@ -1452,6 +1473,20 @@
         return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
     }

+    function formatDatasetDisplayName(dataset) {
+        if (!dataset) return '';
+        const rawName = (dataset.display_label || dataset.name || '').toString();
+        const cleanedName = rawName.replace(/_/g, ' ').trim() || (dataset.name || '');
+        const docCount = typeof dataset.docs_count === 'number' ? dataset.docs_count : null;
+        if (docCount && docCount > 0) {
+            const langConfig = i18n[currentLang] || i18n.en;
+            const suffixFn = langConfig?.datasetDocCountLabel;
+            const suffix = suffixFn ? suffixFn(docCount) : ` (${docCount} documents)`;
+            return `${cleanedName}${suffix}`;
+        }
+        return cleanedName;
+    }
+
     async function uploadFiles() {
         if (selectedFiles.length === 0) return;
@@ -1484,7 +1519,19 @@
             progressFill.style.width = '100%';
             progressText.textContent = 'Upload completed!';
-            showMessage('Files uploaded successfully!', 'success');
+            const uploadData = response?.data || {};
+            const datasetName = uploadData.dataset_name;
+            const filesCount = uploadData.files_count;
+            const langConfig = i18n[currentLang] || i18n.en;
+            if (langConfig?.uploadSuccessMessage) {
+                showMessage(langConfig.uploadSuccessMessage(datasetName, filesCount), 'success');
+            } else {
+                let successMessage = datasetName ? `Dataset "${datasetName}" uploaded successfully!` : 'Files uploaded successfully!';
+                if (typeof filesCount === 'number' && filesCount > 0) {
+                    successMessage += filesCount === 1 ? ' (1 document)' : ` (${filesCount} documents)`;
+                }
+                showMessage(successMessage, 'success');
+            }

             clearFiles();
             refreshData();
@@ -1524,6 +1571,7 @@
     function displayDatasets() {
         const container = document.getElementById('datasetsList');
+        if (!container) return;

         if (datasets.length === 0) {
             container.innerHTML = 'No datasets available. Upload some files to get started.';
             return;
@@ -1532,7 +1580,7 @@
         container.innerHTML = datasets.map(dataset => `
-            ${dataset.name}
+            ${formatDatasetDisplayName(dataset)}
             (${dataset.type})
             ${dataset.type !== 'demo' ? `Schema: ${dataset.has_custom_schema ? 'custom' : 'default'}` : ''}
@@ -1790,10 +1838,19 @@
     function loadDatasetOptions(selectId) {
         const select = document.getElementById(selectId);
+        if (!select) return;
+
         const readyDatasets = datasets.filter(d => d.status === 'ready');
-
-        select.innerHTML = '<option value="">Select a dataset...</option>' +
-            readyDatasets.map(d => `<option value="${d.name}">${d.name}</option>`).join('');
+        const currentValue = select.value;
+        const langConfig = i18n[currentLang] || i18n.en;
+        const placeholder = langConfig?.selectDatasetPlaceholder || 'Select a dataset...';
+
+        select.innerHTML = `<option value="">${placeholder}</option>` +
+            readyDatasets.map(d => `<option value="${d.name}">${formatDatasetDisplayName(d)}</option>`).join('');
+
+        if (readyDatasets.some(d => d.name === currentValue)) {
+            select.value = currentValue;
+        }
     }

     async function loadGraphData() {
diff --git a/requirements.txt b/requirements.txt
index 797c2f9e..1710eb90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,13 +22,17 @@ spacy==3.7.5
 tiktoken==0.9.0
 json-repair==0.46.2
 
+# Document Parsing
+magic-pdf==1.3.12
+python-docx==1.1.2
+pypdf==4.2.0
+
 # API & HTTP Clients
 openai==1.102.0
 requests==2.32.5
 
 # Configuration & Utilities
 PyYAML==6.0.2
-pydantic==2.5.0
 nanoid==2.0.0
 dotenv==0.9.9
 python-dotenv==1.1.1
diff --git a/utils/document_parser.py b/utils/document_parser.py
new file mode 100644
index 00000000..79f604b5
--- /dev/null
+++ b/utils/document_parser.py
@@ -0,0 +1,225 @@
+"""
+Document Parser Utility
+Supports parsing PDF, DOCX, DOC files using MinerU and python-docx
+"""
+
+import os
+import tempfile
+from typing import Optional, Dict
+from pathlib import Path
+
+from utils.logger import logger
+
+try:
+    from magic_pdf.data.dataset import PymuDocDataset
+    MINERU_AVAILABLE = True
+except ImportError as e:
+    MINERU_AVAILABLE = False
+    logger.warning(f"MinerU not available: {e}")
+
+try:
+    import fitz  # type: ignore[attr-defined]
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+    logger.warning("PyMuPDF (fitz) not available; PDF text extraction may be limited")
+
+try:
+    from docx import Document as DocxDocument
+    DOCX_AVAILABLE = True
+except ImportError:
+    DOCX_AVAILABLE = False
+    logger.warning("python-docx not available")
+
+
+class DocumentParser:
+    """Parse various document formats to extract text content"""
+
+    def __init__(self):
+        self.temp_dir = tempfile.mkdtemp(prefix="youtu_graphrag_")
+        logger.info(f"DocumentParser initialized with temp dir: {self.temp_dir}")
+
+    def parse_file(self, file_path: str, file_type: str) -> Optional[str]:
+        """
+        Parse a document file and extract text content
+
+        Args:
+            file_path: Path to the document file
+            file_type: File extension (.pdf, .docx, .doc)
+
+        Returns:
+            Extracted text content or None if parsing fails
+        """
+        file_type = file_type.lower()
+
+        try:
+            if file_type == '.pdf':
+                return self._parse_pdf(file_path)
+            elif file_type in ['.docx', '.doc']:
+                return self._parse_docx(file_path)
+            else:
+                logger.warning(f"Unsupported file type: {file_type}")
+                return None
+        except Exception as e:
+            logger.error(f"Error parsing {file_type} file: {e}")
+            return None
+
+    def _parse_pdf(self, pdf_path: str) -> Optional[str]:
+        """
+        Parse PDF using MinerU, falling back to PyMuPDF and then pypdf
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted text content
+        """
+        try:
+            with open(pdf_path, 'rb') as f:
+                pdf_bytes = f.read()
+        except Exception as e:
+            logger.error(f"Unable to read PDF file {pdf_path}: {e}")
+            return None
+
+        # -------- MinerU / PyMuPDF pipeline --------
+        if MINERU_AVAILABLE:
+            try:
+                dataset = PymuDocDataset(pdf_bytes, lang='auto')
+                text_parts: list[str] = []
+                for page_index in range(len(dataset)):
+                    try:
+                        page_doc = dataset.get_page(page_index).get_doc()
+                        page_text = page_doc.get_text("text")
+                        if page_text:
+                            text_parts.append(page_text.strip())
+                    except Exception as page_err:
+                        logger.warning(
+                            f"MinerU pipeline failed to extract page {page_index} of {pdf_path}: {page_err}"
+                        )
+                        continue
+
+                if text_parts:
+                    extracted_text = '\n\n'.join(text_parts)
+                    logger.info(
+                        f"Successfully extracted {len(extracted_text)} chars from PDF via MinerU pipeline"
+                    )
+                    return extracted_text
+            except Exception as e:
+                logger.error(f"MinerU PDF parsing failed: {e}")
+
+        # -------- Direct PyMuPDF fallback --------
+        if PYMUPDF_AVAILABLE:
+            try:
+                text_parts: list[str] = []
+                with fitz.open(pdf_path) as doc:  # type: ignore[attr-defined]
+                    for page in doc:
+                        page_text = page.get_text("text")
+                        if page_text:
+                            text_parts.append(page_text.strip())
+                if text_parts:
+                    extracted_text = '\n\n'.join(text_parts)
+                    logger.info(
+                        f"Successfully extracted {len(extracted_text)} chars from PDF via PyMuPDF fallback"
+                    )
+                    return extracted_text
+            except Exception as e:
+                logger.error(f"PyMuPDF fallback failed: {e}")
+
+        # -------- pypdf fallback --------
+        try:
+            from pypdf import PdfReader  # type: ignore
+
+            reader = PdfReader(pdf_path)
+            text_parts = []
+            for page_index, page in enumerate(reader.pages):
+                try:
+                    page_text = page.extract_text()
+                except Exception as page_err:
+                    logger.warning(
+                        f"pypdf failed to extract page {page_index} of {pdf_path}: {page_err}"
+                    )
+                    continue
+                if page_text:
+                    text_parts.append(page_text.strip())
+
+            if text_parts:
+                extracted_text = '\n\n'.join(text_parts)
+                logger.info(
+                    f"Successfully extracted {len(extracted_text)} chars from PDF via pypdf fallback"
+                )
+                return extracted_text
+        except ImportError:
+            logger.debug("pypdf not installed; skipping pypdf fallback")
+        except Exception as e:
+            logger.error(f"pypdf fallback failed: {e}")
+
+        logger.error(f"Unable to extract text from PDF: {pdf_path}")
+        return None
+
+    def _parse_docx(self, docx_path: str) -> Optional[str]:
+        """
+        Parse DOCX/DOC using python-docx
+
+        Args:
+            docx_path: Path to DOCX/DOC file
+
+        Returns:
+            Extracted text content
+        """
+        if not DOCX_AVAILABLE:
+            logger.error("python-docx is not installed. Cannot parse DOCX/DOC files.")
+            return None
+
+        try:
+            # Note: python-docx reads OOXML (.docx); a legacy binary .doc will
+            # raise here and be reported by the except block below.
+            doc = DocxDocument(docx_path)
+            text_parts = []
+
+            # Extract paragraphs
+            for para in doc.paragraphs:
+                if para.text.strip():
+                    text_parts.append(para.text)
+
+            # Extract tables
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = ' | '.join(cell.text.strip() for cell in row.cells)
+                    if row_text.strip():
+                        text_parts.append(row_text)
+
+            extracted_text = '\n'.join(text_parts)
+
+            if not extracted_text.strip():
+                logger.warning(f"No text extracted from DOCX: {docx_path}")
+                return None
+
+            logger.info(f"Successfully extracted {len(extracted_text)} chars from DOCX")
+            return extracted_text
+
+        except Exception as e:
+            logger.error(f"Error parsing DOCX: {e}")
+            return None
+
+    def cleanup(self):
+        """Clean up temporary files"""
+        try:
+            import shutil
+            if os.path.exists(self.temp_dir):
+                shutil.rmtree(self.temp_dir)
+                logger.info(f"Cleaned up temp dir: {self.temp_dir}")
+        except Exception as e:
+            logger.error(f"Error cleaning up temp dir: {e}")
+
+
+# Global parser instance
+_parser_instance = None
+
+def get_parser() -> DocumentParser:
+    """Get or create global parser instance"""
+    global _parser_instance
+    if _parser_instance is None:
+        _parser_instance = DocumentParser()
+    return _parser_instance
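To close, a minimal usage sketch for the new module, assuming it is importable the same way backend.py imports it; the PDF path below is illustrative, not part of the PR:

```python
# Minimal usage sketch for utils/document_parser.py (file path is illustrative).
from utils.document_parser import get_parser

parser = get_parser()  # process-wide singleton, as used by backend.py
text = parser.parse_file("data/uploaded/demo/report.pdf", ".pdf")
if text:
    print(f"Extracted {len(text)} characters")
else:
    print("No text extracted; the upload flow records such files as skipped")
parser.cleanup()  # removes the temp dir created in __init__
```

Note that `parse_file` swallows exceptions and returns `None`, so callers only need the truthiness check shown here, which is how `upload_files` in backend.py treats the result before appending to `corpus_data`.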