Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 84 additions & 21 deletions backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,44 @@ async def send_progress_update(client_id: str, stage: str, progress: int, messag
"timestamp": datetime.now().isoformat()
}, client_id)

# -------- Encoding detection helpers --------
def _detect_encoding_from_bytes(data: bytes) -> Optional[str]:
"""Detect encoding using chardet if available; return lower-cased encoding name or None."""
try:
import chardet # type: ignore
Copy link

Copilot AI Oct 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The chardet import should be moved to the top of the file with other imports rather than being imported inside the function. This avoids repeated import overhead and makes dependencies more visible.

Copilot uses AI. Check for mistakes.
result = chardet.detect(data) or {}
enc = result.get("encoding")
if enc:
return enc.lower()
except Exception:
pass
return None

def decode_bytes_with_detection(data: bytes) -> str:
    """Decode *data* to ``str`` using encoding detection plus ordered fallbacks.

    Candidate order: chardet-detected encoding (if any) -> utf-8 / utf-8-sig ->
    common Chinese encodings (gb18030, gbk, big5) -> utf-16 variants -> latin-1.
    Because latin-1 maps every byte value, the loop effectively always succeeds;
    the trailing ``errors="replace"`` decode is a defensive last resort only.

    Args:
        data: Raw bytes (e.g. an uploaded file body).

    Returns:
        The decoded text. Never raises on malformed input.
    """
    candidates = []
    detected = _detect_encoding_from_bytes(data)
    if detected:
        candidates.append(detected)
    candidates.extend([
        "utf-8", "utf-8-sig", "gb18030", "gbk", "big5",
        "utf-16", "utf-16le", "utf-16be", "latin-1",
    ])
    # De-duplicate while preserving order so the detected encoding is tried first.
    tried = set()
    for enc in candidates:
        if not enc or enc in tried:
            continue
        tried.add(enc)
        try:
            return data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: bytes are invalid for this codec.
            # LookupError: chardet returned a name Python's codec registry
            # doesn't know. Anything else (e.g. MemoryError) should propagate.
            continue
    # Defensive last resort (unreachable while latin-1 is in the candidate list).
    return data.decode("utf-8", errors="replace")

async def clear_cache_files(dataset_name: str):
"""Clear all cache files for a dataset before graph construction"""
try:
Expand Down Expand Up @@ -246,40 +284,61 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

# Process uploaded files
corpus_data = []
skipped_files: List[str] = []
processed_count = 0
allowed_extensions = {".txt", ".md", ".json"}
for i, file in enumerate(files):
file_path = os.path.join(upload_dir, file.filename)
with open(file_path, "wb") as buffer:
content = await file.read()
buffer.write(content)
content_bytes = await file.read()
buffer.write(content_bytes)

# Process file content
if file.filename.endswith('.txt'):
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Process file content using encoding detection
filename_lower = (file.filename or "").lower()
ext = os.path.splitext(filename_lower)[1]
if ext not in allowed_extensions:
# Skip unsupported file types to avoid processing binary files as text
logger.warning(f"Skipping unsupported file type: {file.filename}")
skipped_files.append(file.filename)
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
continue
# Treat plain text formats explicitly (.txt and .md)
if filename_lower.endswith(('.txt', '.md')):
text = decode_bytes_with_detection(content_bytes)
corpus_data.append({
"title": file.filename,
"text": content
"text": text
})
elif file.filename.endswith('.json'):
processed_count += 1
elif filename_lower.endswith('.json'):
Comment on lines +307 to +314
Copy link

Copilot AI Oct 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file extension checking logic is duplicated - first checking ext not in allowed_extensions then using filename_lower.endswith(). Consider using a consistent approach throughout, such as always using the extracted ext variable.

Copilot uses AI. Check for mistakes.
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
if isinstance(data, list):
corpus_data.extend(data)
else:
corpus_data.append(data)
except:
json_text = decode_bytes_with_detection(content_bytes)
data_obj = json.loads(json_text)
if isinstance(data_obj, list):
corpus_data.extend(data_obj)
else:
corpus_data.append(data_obj)
processed_count += 1
except Exception:
Copy link

Copilot AI Oct 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider catching more specific exceptions (e.g., json.JSONDecodeError, UnicodeDecodeError) instead of the broad Exception to provide better error handling and debugging information.

Suggested change
except Exception:
except (json.JSONDecodeError, UnicodeDecodeError):

Copilot uses AI. Check for mistakes.
# If JSON parsing fails, treat as text
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
text = decode_bytes_with_detection(content_bytes)
corpus_data.append({
"title": file.filename,
"text": content
"text": text
})

progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Processed {file.filename}")

# Ensure at least one valid file processed
if processed_count == 0:
msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
if skipped_files:
msg += f"; skipped: {', '.join(skipped_files)}"
await send_progress_update(client_id, "upload", 0, msg)
raise HTTPException(status_code=400, detail=msg)

# Save corpus data
corpus_path = f"{upload_dir}/corpus.json"
with open(corpus_path, 'w', encoding='utf-8') as f:
Expand All @@ -290,11 +349,14 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

await send_progress_update(client_id, "upload", 100, "Upload completed successfully!")

msg_ok = "Files uploaded successfully"
if skipped_files:
msg_ok += f"; skipped unsupported: {', '.join(skipped_files)}"
return FileUploadResponse(
success=True,
message="Files uploaded successfully",
message=msg_ok,
dataset_name=dataset_name,
files_count=len(files)
files_count=processed_count
)

except Exception as e:
Expand Down Expand Up @@ -978,7 +1040,8 @@ async def upload_schema(dataset_name: str, schema_file: UploadFile = File(...)):

content = await schema_file.read()
try:
data = json.loads(content)
schema_text = decode_bytes_with_detection(content)
data = json.loads(schema_text)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
if not isinstance(data, dict):
Expand Down
6 changes: 3 additions & 3 deletions frontend/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1088,8 +1088,8 @@ <h2 id="uploadDocumentsTitle">Upload Documents</h2>
<div class="upload-icon">📁</div>
<h3 id="uploadDragTitle">Drag & Drop Files Here</h3>
<p id="uploadDragDesc">or click to browse files</p>
<p id="uploadSupports" style="font-size: 12px; color: rgba(255,255,255,0.6); margin-top: 12px;">
Currently supports: .json
<p id="uploadSupports" style="font-size: 12px; color: rgba(255,255,255,0.6); margin-top: 12px;">
Currently supports: .txt, .md, .json
</p>
<div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.1); border-radius: 10px; text-align: left; font-size: 11px; color: rgba(255,255,255,0.8);">
<div id="uploadSampleHeader" style="font-weight: 600; margin-bottom: 8px;">📄 Sample Format (.json):</div>
Expand Down Expand Up @@ -1213,7 +1213,7 @@ <h3 id="answerTitle">Answer:</h3>
uploadDocuments: 'Upload Documents',
uploadDragTitle: 'Drag & Drop Files Here',
uploadDragDesc: 'or click to browse files',
uploadSupports: 'Currently supports: .json',
uploadSupports: 'Currently supports: .txt, .md, .json',
uploadSampleHeader: '📄 Sample Format (.json):',
uploadBtn: 'Upload Files',
clearUploadBtn: 'Clear',
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ PyYAML==6.0.2
pydantic==2.5.0
nanoid==2.0.0
dotenv==0.9.9
python-dotenv==1.1.1
python-dotenv==1.1.1
chardet==5.2.0