Upload encode detect #101
Changes from all commits:
```diff
@@ -149,6 +149,44 @@ async def send_progress_update(client_id: str, stage: str, progress: int, messag
         "timestamp": datetime.now().isoformat()
     }, client_id)


+# -------- Encoding detection helpers --------
+def _detect_encoding_from_bytes(data: bytes) -> Optional[str]:
+    """Detect encoding using chardet if available; return lower-cased encoding name or None."""
+    try:
+        import chardet  # type: ignore
+        result = chardet.detect(data) or {}
+        enc = result.get("encoding")
+        if enc:
+            return enc.lower()
+    except Exception:
+        pass
+    return None
+
+
+def decode_bytes_with_detection(data: bytes) -> str:
+    """Decode bytes to string with encoding detection and robust fallbacks.
+    Order: detected -> utf-8/utf-8-sig -> common Chinese encodings -> utf-16 variants -> latin-1 -> replace.
+    """
+    candidates = []
+    detected = _detect_encoding_from_bytes(data)
+    if detected:
+        candidates.append(detected)
+    candidates.extend([
+        "utf-8", "utf-8-sig", "gb18030", "gbk", "big5",
+        "utf-16", "utf-16le", "utf-16be", "latin-1"
+    ])
+    # De-duplicate while preserving order
+    tried = set()
+    for enc in candidates:
+        if enc in tried or not enc:
+            continue
+        tried.add(enc)
+        try:
+            return data.decode(enc)
+        except Exception:
+            continue
+    # Last resort
+    return data.decode("utf-8", errors="replace")
+
+
 async def clear_cache_files(dataset_name: str):
     """Clear all cache files for a dataset before graph construction"""
     try:
```
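To make the fallback order concrete, here is a standalone sketch (illustrative only, not the PR's code) of the same candidate chain without the chardet step. It shows why the old `errors='ignore'` decode loses GBK text while the new chain recovers it:

```python
def decode_with_fallbacks(data: bytes) -> str:
    # Same candidate order as the PR's helper, minus the chardet step.
    for enc in ("utf-8", "utf-8-sig", "gb18030", "gbk", "big5",
                "utf-16", "utf-16le", "utf-16be", "latin-1"):
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue
    return data.decode("utf-8", errors="replace")

gbk_bytes = "编码检测".encode("gbk")

# The old upload path decoded with errors='ignore', silently dropping
# the invalid UTF-8 byte sequences that GBK-encoded Chinese produces:
print(repr(gbk_bytes.decode("utf-8", errors="ignore")))  # most or all bytes vanish

# With the fallback chain, utf-8 fails and gb18030 decodes it correctly:
print(decode_with_fallbacks(gbk_bytes))  # 编码检测
```

Note that gb18030 is tried before latin-1: latin-1 maps every byte to a character and therefore never fails, so it must stay last or it would mask the Chinese encodings.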
```diff
@@ -246,40 +284,61 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

         # Process uploaded files
         corpus_data = []
+        skipped_files: List[str] = []
+        processed_count = 0
+        allowed_extensions = {".txt", ".md", ".json"}
         for i, file in enumerate(files):
             file_path = os.path.join(upload_dir, file.filename)
             with open(file_path, "wb") as buffer:
-                content = await file.read()
-                buffer.write(content)
+                content_bytes = await file.read()
+                buffer.write(content_bytes)

-            # Process file content
-            if file.filename.endswith('.txt'):
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read()
+            # Process file content using encoding detection
+            filename_lower = (file.filename or "").lower()
+            ext = os.path.splitext(filename_lower)[1]
+            if ext not in allowed_extensions:
+                # Skip unsupported file types to avoid processing binary files as text
+                logger.warning(f"Skipping unsupported file type: {file.filename}")
+                skipped_files.append(file.filename)
+                progress = 10 + (i + 1) * 80 // len(files)
+                await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
+                continue
+            # Treat plain text formats explicitly (.txt and .md)
+            if filename_lower.endswith(('.txt', '.md')):
+                text = decode_bytes_with_detection(content_bytes)
                 corpus_data.append({
                     "title": file.filename,
-                    "text": content
+                    "text": text
                 })
-            elif file.filename.endswith('.json'):
+                processed_count += 1
+            elif filename_lower.endswith('.json'):
                 try:
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        data = json.load(f)
-                    if isinstance(data, list):
-                        corpus_data.extend(data)
-                    else:
-                        corpus_data.append(data)
-                except:
+                    json_text = decode_bytes_with_detection(content_bytes)
+                    data_obj = json.loads(json_text)
+                    if isinstance(data_obj, list):
+                        corpus_data.extend(data_obj)
+                    else:
+                        corpus_data.append(data_obj)
+                    processed_count += 1
+                except Exception:
```
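Two details of the new gate are easy to miss: the lower-casing makes the extension check case-insensitive, and `os.path.splitext` only reports the final suffix. A small illustrative sketch (not part of the PR) of how the gate treats a few filenames:

```python
import os

# Mirrors the new check in upload_files: lower-case the name, then compare
# the extension that os.path.splitext reports against the allow-list.
allowed_extensions = {".txt", ".md", ".json"}

for name in ["Notes.TXT", "README.md", "corpus.json",
             "photo.png", "archive.tar.gz", "no_extension"]:
    ext = os.path.splitext(name.lower())[1]
    verdict = "processed" if ext in allowed_extensions else "skipped"
    print(f"{name:>14} -> {verdict}")

# Notes.TXT is processed (lower-casing makes the check case-insensitive);
# archive.tar.gz is skipped because splitext only sees ".gz";
# no_extension is skipped because its extension is the empty string.
```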
Review comment on lines +307 to +314, suggesting a narrower exception clause:

```diff
-                except Exception:
+                except (json.JSONDecodeError, UnicodeDecodeError):
```

The chardet import should be moved to the top of the file with the other imports rather than being imported inside the function. This avoids repeated import overhead and makes the dependency more visible.
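A minimal sketch of that suggestion, assuming the import should stay optional the way the PR's in-function `try/except` kept it (the module-level guard is an assumption, not code from this PR):

```python
from typing import Optional

# chardet imported once at module level, kept optional so the app still
# runs when the package is missing.
try:
    import chardet  # type: ignore
except ImportError:
    chardet = None  # type: ignore


def _detect_encoding_from_bytes(data: bytes) -> Optional[str]:
    """Return a lower-cased encoding name, or None if detection fails."""
    if chardet is None:
        return None
    result = chardet.detect(data) or {}
    enc = result.get("encoding")
    return enc.lower() if enc else None
```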