Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
from utils.logger import logger
import ast

# Optional dependency: utils.document_parser supplies get_parser(), used later
# in upload handling to extract text from binary document formats (.pdf,
# .docx, .doc). The availability flag lets uploads degrade gracefully —
# unsupported files are skipped with a warning instead of crashing the app
# at import time when the parser (or its own dependencies) is missing.
try:
    from utils.document_parser import get_parser
    DOCUMENT_PARSER_AVAILABLE = True
except ImportError as e:
    DOCUMENT_PARSER_AVAILABLE = False
    # NOTE(review): assumes `logger` (imported above from utils.logger) is
    # already configured when this module is imported — confirm import order.
    logger.warning(f"Document parser not available: {e}")

# Try to import GraphRAG components
try:
from models.constructor import kt_gen as constructor
Expand Down Expand Up @@ -262,15 +270,21 @@ async def websocket_endpoint(websocket: WebSocket, client_id: str):
async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
"""Upload files and prepare for graph construction"""
try:
# Use original filename (without extension) as dataset name
# If multiple files, use the first file's name
main_file = files[0]
original_name = os.path.splitext(main_file.filename)[0]
# Clean filename to be filesystem-safe
dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
dataset_name = dataset_name.replace(' ', '_')
# Generate dataset name based on file count
if len(files) == 1:
# Single file: use its name
main_file = files[0]
original_name = os.path.splitext(main_file.filename)[0]
# Clean filename to be filesystem-safe
dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
dataset_name = dataset_name.replace(' ', '_')
else:
# Multiple files: create a descriptive name with date
from datetime import datetime
date_str = datetime.now().strftime("%Y%m%d")
dataset_name = f"{len(files)}files_{date_str}"

# Add timestamp if dataset already exists
# Add counter if dataset already exists
base_name = dataset_name
counter = 1
while os.path.exists(f"data/uploaded/{dataset_name}"):
Expand All @@ -286,7 +300,13 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
corpus_data = []
skipped_files: List[str] = []
processed_count = 0
allowed_extensions = {".txt", ".md", ".json"}
allowed_extensions = {".txt", ".md", ".json", ".pdf", ".docx", ".doc"}

# Initialize document parser if needed
doc_parser = None
if DOCUMENT_PARSER_AVAILABLE:
doc_parser = get_parser()

for i, file in enumerate(files):
file_path = os.path.join(upload_dir, file.filename)
with open(file_path, "wb") as buffer:
Expand All @@ -303,6 +323,35 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
continue

# Handle PDF and DOCX/DOC files with document parser
if ext in ['.pdf', '.docx', '.doc']:
if not doc_parser:
logger.warning(f"Document parser not available, skipping {file.filename}")
skipped_files.append(file.filename)
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped {file.filename} (parser unavailable)")
continue

try:
text = doc_parser.parse_file(file_path, ext)
if text and text.strip():
corpus_data.append({
"title": file.filename,
"text": text
})
processed_count += 1
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Parsed {file.filename}")
else:
logger.warning(f"No text extracted from {file.filename}")
skipped_files.append(file.filename)
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"No text in {file.filename}")
except Exception as e:
logger.error(f"Error parsing {file.filename}: {e}")
skipped_files.append(file.filename)
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Failed to parse {file.filename}")
continue

# Treat plain text formats explicitly (.txt and .md)
if filename_lower.endswith(('.txt', '.md')):
text = decode_bytes_with_detection(content_bytes)
Expand Down Expand Up @@ -333,7 +382,7 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

# Ensure at least one valid file processed
if processed_count == 0:
msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
msg = "No supported files were uploaded. Allowed: .txt, .md, .json, .pdf, .docx, .doc"
if skipped_files:
msg += f"; skipped: {', '.join(skipped_files)}"
await send_progress_update(client_id, "upload", 0, msg)
Expand Down
79 changes: 68 additions & 11 deletions frontend/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,7 @@ <h2 id="uploadDocumentsTitle">Upload Documents</h2>
<h3 id="uploadDragTitle">Drag & Drop Files Here</h3>
<p id="uploadDragDesc">or click to browse files</p>
<p id="uploadSupports" style="font-size: 12px; color: rgba(255,255,255,0.6); margin-top: 12px;">
Currently supports: .txt, .md, .json
Currently supports: .txt, .md, .json, .pdf, .docx, .doc
</p>
<div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.1); border-radius: 10px; text-align: left; font-size: 11px; color: rgba(255,255,255,0.8);">
<div id="uploadSampleHeader" style="font-weight: 600; margin-bottom: 8px;">📄 Sample Format (.json):</div>
Expand All @@ -1106,7 +1106,7 @@ <h3 id="uploadDragTitle">Drag & Drop Files Here</h3>
]
</pre>
</div>
<input type="file" id="fileInput" multiple accept=".txt,.json,.md" style="display: none;">
<input type="file" id="fileInput" multiple accept=".txt,.json,.md,.pdf,.docx,.doc" style="display: none;">
</div>

<div id="fileList" class="file-list hidden"></div>
Expand Down Expand Up @@ -1213,7 +1213,7 @@ <h3 id="answerTitle">Answer:</h3>
uploadDocuments: 'Upload Documents',
uploadDragTitle: 'Drag & Drop Files Here',
uploadDragDesc: 'or click to browse files',
uploadSupports: 'Currently supports: .txt, .md, .json',
uploadSupports: 'Currently supports: .txt, .md, .json, .pdf, .docx, .doc',
uploadSampleHeader: '📄 Sample Format (.json):',
uploadBtn: 'Upload Files',
clearUploadBtn: 'Clear',
Expand All @@ -1233,7 +1233,15 @@ <h3 id="answerTitle">Answer:</h3>
retrievalProgress: (found,cand) => `Retrieval progress: ${found} relevant nodes from ${cand} candidates.`,
answerStart: 'Synthesizing the final answer...',
qaCompleted: 'QA process completed!',
completedBadge: '✔️ Completed'
completedBadge: '✔️ Completed',
datasetDocCountLabel: (count) => count === 1 ? ' (1 document)' : ` (${count} documents)`,
uploadSuccessMessage: (name, count) => {
const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || 'dataset';
if (typeof count === 'number' && count > 0) {
return `Dataset "${friendlyName}" uploaded successfully${count === 1 ? ' (1 document)' : ` (${count} documents)`}!`;
}
return `Dataset "${friendlyName}" uploaded successfully!`;
}
},
zh: {
processing: '正在处理你的问题…',
Expand All @@ -1249,7 +1257,7 @@ <h3 id="answerTitle">Answer:</h3>
uploadDocuments: '上传文档',
uploadDragTitle: '拖拽文件到这里',
uploadDragDesc: '或点击选择文件',
uploadSupports: '当前支持:.md,.txt,.json',
uploadSupports: '当前支持:.txt、.md、.json、.pdf、.docx、.doc',
uploadSampleHeader: '📄 示例格式(.json):',
uploadBtn: '开始上传',
clearUploadBtn: '清空',
Expand All @@ -1269,7 +1277,15 @@ <h3 id="answerTitle">Answer:</h3>
retrievalProgress: (found,cand) => `检索进度:在 ${cand} 个候选中找到 ${found} 个相关节点。`,
answerStart: '正在综合生成最终答案…',
qaCompleted: '问答流程完成!',
completedBadge: '✔️ 已完成'
completedBadge: '✔️ 已完成',
datasetDocCountLabel: (count) => count === 1 ? '(1篇文档)' : `(共${count}篇文档)`,
uploadSuccessMessage: (name, count) => {
const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || '数据集';
if (typeof count === 'number' && count > 0) {
return `数据集“${friendlyName}”上传成功${count === 1 ? '(1篇文档)' : `(共${count}篇文档)`}!`;
}
return `数据集“${friendlyName}”上传成功!`;
}
}
};

Expand Down Expand Up @@ -1311,6 +1327,11 @@ <h3 id="answerTitle">Answer:</h3>
// Processing text & badge (if visible)
const procText = document.getElementById('qaProcessingText'); if (procText) procText.textContent = t.processing;
const badge = document.getElementById('qaCompletionBadge'); if (badge && badge.style.display !== 'none') badge.textContent = t.completedBadge;

// Refresh dataset renderings to reflect language-specific labels
displayDatasets();
loadDatasetOptions('graphDataset');
loadDatasetOptions('qaDataset');
}

// API base URL
Expand Down Expand Up @@ -1452,6 +1473,20 @@ <h3 id="answerTitle">Answer:</h3>
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}

// Build the human-readable label for a dataset entry.
// Underscores in the stored name become spaces; when a positive numeric
// docs_count is present, a language-specific "(N documents)" suffix is added.
function formatDatasetDisplayName(dataset) {
    if (!dataset) return '';

    // Prefer an explicit display label, falling back to the raw dataset name.
    const source = (dataset.display_label || dataset.name || '').toString();
    const label = source.replace(/_/g, ' ').trim() || (dataset.name || '');

    const count = typeof dataset.docs_count === 'number' ? dataset.docs_count : null;
    if (!count || count <= 0) {
        return label;
    }

    // Localized suffix, with an English fallback when the active language
    // table does not define datasetDocCountLabel.
    const lang = i18n[currentLang] || i18n.en;
    const makeSuffix = lang?.datasetDocCountLabel;
    return label + (makeSuffix ? makeSuffix(count) : ` (${count} documents)`);
}

async function uploadFiles() {
if (selectedFiles.length === 0) return;

Expand Down Expand Up @@ -1484,7 +1519,19 @@ <h3 id="answerTitle">Answer:</h3>
progressFill.style.width = '100%';
progressText.textContent = 'Upload completed!';

showMessage('Files uploaded successfully!', 'success');
const uploadData = response?.data || {};
const datasetName = uploadData.dataset_name;
const filesCount = uploadData.files_count;
const langConfig = i18n[currentLang] || i18n.en;
if (langConfig?.uploadSuccessMessage) {
showMessage(langConfig.uploadSuccessMessage(datasetName, filesCount), 'success');
} else {
let successMessage = datasetName ? `Dataset "${datasetName}" uploaded successfully!` : 'Files uploaded successfully!';
if (typeof filesCount === 'number' && filesCount > 0) {
successMessage += filesCount === 1 ? ' (1 document)' : ` (${filesCount} documents)`;
}
showMessage(successMessage, 'success');
}
clearFiles();
refreshData();

Expand Down Expand Up @@ -1524,6 +1571,7 @@ <h3 id="answerTitle">Answer:</h3>

function displayDatasets() {
const container = document.getElementById('datasetsList');
if (!container) return;
if (datasets.length === 0) {
container.innerHTML = '<p style="color: rgba(255,255,255,0.6);">No datasets available. Upload some files to get started.</p>';
return;
Expand All @@ -1532,7 +1580,7 @@ <h3 id="answerTitle">Answer:</h3>
container.innerHTML = datasets.map(dataset => `
<div class="file-item">
<div>
<strong>${dataset.name}</strong>
<strong>${formatDatasetDisplayName(dataset)}</strong>
<span style="color: rgba(255,255,255,0.6);"> (${dataset.type})</span>
${dataset.type !== 'demo' ? `<span style="margin-left:8px; font-size:12px; color:${dataset.has_custom_schema ? '#228B22' : 'rgba(255,255,255,0.6)'}; font-weight:600;">Schema: ${dataset.has_custom_schema ? 'custom' : 'default'}</span>` : ''}
</div>
Expand Down Expand Up @@ -1790,10 +1838,19 @@ <h3 id="answerTitle">Answer:</h3>

// Repopulate a dataset <select> with every dataset whose status is 'ready',
// preserving the user's current selection when that dataset is still listed.
function loadDatasetOptions(selectId) {
    const dropdown = document.getElementById(selectId);
    if (!dropdown) return;

    const ready = datasets.filter(entry => entry.status === 'ready');

    // Remember the current choice so a refresh does not silently reset it.
    const previous = dropdown.value;
    const lang = i18n[currentLang] || i18n.en;
    const placeholderText = lang?.selectDatasetPlaceholder || 'Select a dataset...';

    const options = [`<option value="">${placeholderText}</option>`];
    for (const entry of ready) {
        options.push(`<option value="${entry.name}">${formatDatasetDisplayName(entry)}</option>`);
    }
    dropdown.innerHTML = options.join('');

    // Restore the previous choice only if that dataset is still ready.
    if (ready.some(entry => entry.name === previous)) {
        dropdown.value = previous;
    }
}

async function loadGraphData() {
Expand Down
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ spacy==3.7.5
tiktoken==0.9.0
json-repair==0.46.2

# Document Parsing
magic-pdf==1.3.12
python-docx==1.1.2
pypdf==4.2.0

# API & HTTP Clients
openai==1.102.0
requests==2.32.5

# Configuration & Utilities
PyYAML==6.0.2
pydantic==2.5.0
nanoid==2.0.0
dotenv==0.9.9
python-dotenv==1.1.1
Expand Down
Loading