Upload encode detect #101
Changes from all commits:
```diff
@@ -149,6 +149,44 @@ async def send_progress_update(client_id: str, stage: str, progress: int, messag
         "timestamp": datetime.now().isoformat()
     }, client_id)


+# -------- Encoding detection helpers --------
+def _detect_encoding_from_bytes(data: bytes) -> Optional[str]:
+    """Detect encoding using chardet if available; return lower-cased encoding name or None."""
+    try:
+        import chardet  # type: ignore
+        result = chardet.detect(data) or {}
+        enc = result.get("encoding")
+        if enc:
+            return enc.lower()
+    except Exception:
+        pass
+    return None
+
+
+def decode_bytes_with_detection(data: bytes) -> str:
+    """Decode bytes to string with encoding detection and robust fallbacks.
+    Order: detected -> utf-8/utf-8-sig -> common Chinese encodings -> utf-16 variants -> latin-1 -> replace.
+    """
+    candidates = []
+    detected = _detect_encoding_from_bytes(data)
+    if detected:
+        candidates.append(detected)
+    candidates.extend([
+        "utf-8", "utf-8-sig", "gb18030", "gbk", "big5",
+        "utf-16", "utf-16le", "utf-16be", "latin-1"
+    ])
+    # De-duplicate while preserving order
+    tried = set()
+    for enc in candidates:
+        if enc in tried or not enc:
+            continue
+        tried.add(enc)
+        try:
+            return data.decode(enc)
+        except Exception:
+            continue
+    # Last resort
+    return data.decode("utf-8", errors="replace")
+
+
 async def clear_cache_files(dataset_name: str):
     """Clear all cache files for a dataset before graph construction"""
     try:
```
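To make the fallback order concrete, here is a standalone sketch (illustrative only, not the PR's code) of the same candidate chain without the chardet step. It shows why the old `errors='ignore'` decode loses GBK text while the new chain recovers it:

```python
def decode_with_fallbacks(data: bytes) -> str:
    # Same candidate order as the PR's helper, minus the chardet step.
    for enc in ("utf-8", "utf-8-sig", "gb18030", "gbk", "big5",
                "utf-16", "utf-16le", "utf-16be", "latin-1"):
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue
    return data.decode("utf-8", errors="replace")

gbk_bytes = "编码检测".encode("gbk")

# The old upload path decoded with errors='ignore', silently dropping
# the invalid UTF-8 byte sequences that GBK-encoded Chinese produces:
print(repr(gbk_bytes.decode("utf-8", errors="ignore")))  # most or all bytes vanish

# With the fallback chain, utf-8 fails and gb18030 decodes it correctly:
print(decode_with_fallbacks(gbk_bytes))  # 编码检测
```

Note that gb18030 is tried before latin-1: latin-1 maps every byte to a character and therefore never fails, so it must stay last or it would mask the Chinese encodings.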
```diff
@@ -246,40 +284,61 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

         # Process uploaded files
         corpus_data = []
+        skipped_files: List[str] = []
+        processed_count = 0
+        allowed_extensions = {".txt", ".md", ".json"}
         for i, file in enumerate(files):
             file_path = os.path.join(upload_dir, file.filename)
             with open(file_path, "wb") as buffer:
-                content = await file.read()
-                buffer.write(content)
+                content_bytes = await file.read()
+                buffer.write(content_bytes)

-            # Process file content
-            if file.filename.endswith('.txt'):
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read()
+            # Process file content using encoding detection
+            filename_lower = (file.filename or "").lower()
+            ext = os.path.splitext(filename_lower)[1]
+            if ext not in allowed_extensions:
+                # Skip unsupported file types to avoid processing binary files as text
+                logger.warning(f"Skipping unsupported file type: {file.filename}")
+                skipped_files.append(file.filename)
+                progress = 10 + (i + 1) * 80 // len(files)
+                await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
+                continue
+            # Treat plain text formats explicitly (.txt and .md)
+            if filename_lower.endswith(('.txt', '.md')):
+                text = decode_bytes_with_detection(content_bytes)
                 corpus_data.append({
                     "title": file.filename,
-                    "text": content
+                    "text": text
                 })
-            elif file.filename.endswith('.json'):
+                processed_count += 1
+            elif filename_lower.endswith('.json'):
                 try:
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        data = json.load(f)
-                    if isinstance(data, list):
-                        corpus_data.extend(data)
-                    else:
-                        corpus_data.append(data)
-                except:
+                    json_text = decode_bytes_with_detection(content_bytes)
+                    data_obj = json.loads(json_text)
+                    if isinstance(data_obj, list):
+                        corpus_data.extend(data_obj)
+                    else:
+                        corpus_data.append(data_obj)
+                    processed_count += 1
+                except Exception:
```
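Two details of the new gate are easy to miss: the lower-casing makes the extension check case-insensitive, and `os.path.splitext` only reports the final suffix. A small illustrative sketch (not part of the PR) of how the gate treats a few filenames:

```python
import os

# Mirrors the new check in upload_files: lower-case the name, then compare
# the extension that os.path.splitext reports against the allow-list.
allowed_extensions = {".txt", ".md", ".json"}

for name in ["Notes.TXT", "README.md", "corpus.json",
             "photo.png", "archive.tar.gz", "no_extension"]:
    ext = os.path.splitext(name.lower())[1]
    verdict = "processed" if ext in allowed_extensions else "skipped"
    print(f"{name:>14} -> {verdict}")

# Notes.TXT is processed (lower-casing makes the check case-insensitive);
# archive.tar.gz is skipped because splitext only sees ".gz";
# no_extension is skipped because its extension is the empty string.
```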
Review comment on lines +307 to +314, suggesting a narrower exception clause:

```diff
-                except Exception:
+                except (json.JSONDecodeError, UnicodeDecodeError):
```

The chardet import should be moved to the top of the file with the other imports rather than being imported inside the function. This avoids repeated import overhead and makes the dependency more visible.
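A minimal sketch of that suggestion, assuming the import should stay optional the way the PR's in-function `try/except` kept it (the module-level guard is an assumption, not code from this PR):

```python
from typing import Optional

# chardet imported once at module level, kept optional so the app still
# runs when the package is missing.
try:
    import chardet  # type: ignore
except ImportError:
    chardet = None  # type: ignore


def _detect_encoding_from_bytes(data: bytes) -> Optional[str]:
    """Return a lower-cased encoding name, or None if detection fails."""
    if chardet is None:
        return None
    result = chardet.detect(data) or {}
    enc = result.get("encoding")
    return enc.lower() if enc else None
```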