Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
from utils.logger import logger
import ast

# Optional dependency: utils.document_parser supplies get_parser(), used later
# in upload handling to extract text from binary document formats (.pdf,
# .docx, .doc). The availability flag lets uploads degrade gracefully —
# unsupported files are skipped with a warning instead of crashing the app
# at import time when the parser (or its own dependencies) is missing.
try:
    from utils.document_parser import get_parser
    DOCUMENT_PARSER_AVAILABLE = True
except ImportError as e:
    DOCUMENT_PARSER_AVAILABLE = False
    # NOTE(review): assumes `logger` (imported above from utils.logger) is
    # already configured when this module is imported — confirm import order.
    logger.warning(f"Document parser not available: {e}")

# Try to import GraphRAG components
try:
from models.constructor import kt_gen as constructor
Expand Down Expand Up @@ -262,15 +270,21 @@ async def websocket_endpoint(websocket: WebSocket, client_id: str):
async def upload_files(files: List[UploadFile] = File(...), client_id: str = "default"):
"""Upload files and prepare for graph construction"""
try:
# Use original filename (without extension) as dataset name
# If multiple files, use the first file's name
main_file = files[0]
original_name = os.path.splitext(main_file.filename)[0]
# Clean filename to be filesystem-safe
dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
dataset_name = dataset_name.replace(' ', '_')
# Generate dataset name based on file count
if len(files) == 1:
# Single file: use its name
main_file = files[0]
original_name = os.path.splitext(main_file.filename)[0]
# Clean filename to be filesystem-safe
dataset_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
dataset_name = dataset_name.replace(' ', '_')
else:
# Multiple files: create a descriptive name with date
from datetime import datetime
date_str = datetime.now().strftime("%Y%m%d")
dataset_name = f"{len(files)}files_{date_str}"

# Add timestamp if dataset already exists
# Add counter if dataset already exists
base_name = dataset_name
counter = 1
while os.path.exists(f"data/uploaded/{dataset_name}"):
Expand All @@ -286,7 +300,13 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
corpus_data = []
skipped_files: List[str] = []
processed_count = 0
allowed_extensions = {".txt", ".md", ".json"}
allowed_extensions = {".txt", ".md", ".json", ".pdf", ".docx", ".doc"}

# Initialize document parser if needed
doc_parser = None
if DOCUMENT_PARSER_AVAILABLE:
doc_parser = get_parser()

for i, file in enumerate(files):
file_path = os.path.join(upload_dir, file.filename)
with open(file_path, "wb") as buffer:
Expand All @@ -303,6 +323,35 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped unsupported file: {file.filename}")
continue

# Handle PDF and DOCX/DOC files with document parser
if ext in ['.pdf', '.docx', '.doc']:
if not doc_parser:
logger.warning(f"Document parser not available, skipping {file.filename}")
skipped_files.append(file.filename)
progress = 10 + (i + 1) * 80 // len(files)
await send_progress_update(client_id, "upload", progress, f"Skipped {file.filename} (parser unavailable)")
continue

try:
text = doc_parser.parse_file(file_path, ext)
if text and text.strip():
corpus_data.append({
"title": file.filename,
"text": text
})
processed_count += 1
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Parsed {file.filename}")
else:
logger.warning(f"No text extracted from {file.filename}")
skipped_files.append(file.filename)
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"No text in {file.filename}")
except Exception as e:
logger.error(f"Error parsing {file.filename}: {e}")
skipped_files.append(file.filename)
await send_progress_update(client_id, "upload", 10 + (i + 1) * 80 // len(files), f"Failed to parse {file.filename}")
continue

# Treat plain text formats explicitly (.txt and .md)
if filename_lower.endswith(('.txt', '.md')):
text = decode_bytes_with_detection(content_bytes)
Expand Down Expand Up @@ -333,7 +382,7 @@ async def upload_files(files: List[UploadFile] = File(...), client_id: str = "de

# Ensure at least one valid file processed
if processed_count == 0:
msg = "No supported files were uploaded. Allowed: .txt, .md, .json"
msg = "No supported files were uploaded. Allowed: .txt, .md, .json, .pdf, .docx, .doc"
if skipped_files:
msg += f"; skipped: {', '.join(skipped_files)}"
await send_progress_update(client_id, "upload", 0, msg)
Expand Down
79 changes: 68 additions & 11 deletions frontend/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,7 @@ <h2 id="uploadDocumentsTitle">Upload Documents</h2>
<h3 id="uploadDragTitle">Drag & Drop Files Here</h3>
<p id="uploadDragDesc">or click to browse files</p>
<p id="uploadSupports" style="font-size: 12px; color: rgba(255,255,255,0.6); margin-top: 12px;">
Currently supports: .txt, .md, .json
Currently supports: .txt, .md, .json, .pdf, .docx, .doc
</p>
<div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.1); border-radius: 10px; text-align: left; font-size: 11px; color: rgba(255,255,255,0.8);">
<div id="uploadSampleHeader" style="font-weight: 600; margin-bottom: 8px;">📄 Sample Format (.json):</div>
Expand All @@ -1106,7 +1106,7 @@ <h3 id="uploadDragTitle">Drag & Drop Files Here</h3>
]
</pre>
</div>
<input type="file" id="fileInput" multiple accept=".txt,.json,.md" style="display: none;">
<input type="file" id="fileInput" multiple accept=".txt,.json,.md,.pdf,.docx,.doc" style="display: none;">
</div>

<div id="fileList" class="file-list hidden"></div>
Expand Down Expand Up @@ -1213,7 +1213,7 @@ <h3 id="answerTitle">Answer:</h3>
uploadDocuments: 'Upload Documents',
uploadDragTitle: 'Drag & Drop Files Here',
uploadDragDesc: 'or click to browse files',
uploadSupports: 'Currently supports: .txt, .md, .json',
uploadSupports: 'Currently supports: .txt, .md, .json, .pdf, .docx, .doc',
uploadSampleHeader: '📄 Sample Format (.json):',
uploadBtn: 'Upload Files',
clearUploadBtn: 'Clear',
Expand All @@ -1233,7 +1233,15 @@ <h3 id="answerTitle">Answer:</h3>
retrievalProgress: (found,cand) => `Retrieval progress: ${found} relevant nodes from ${cand} candidates.`,
answerStart: 'Synthesizing the final answer...',
qaCompleted: 'QA process completed!',
completedBadge: '✔️ Completed'
completedBadge: '✔️ Completed',
datasetDocCountLabel: (count) => count === 1 ? ' (1 document)' : ` (${count} documents)`,
uploadSuccessMessage: (name, count) => {
const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || 'dataset';
if (typeof count === 'number' && count > 0) {
return `Dataset "${friendlyName}" uploaded successfully${count === 1 ? ' (1 document)' : ` (${count} documents)`}!`;
}
return `Dataset "${friendlyName}" uploaded successfully!`;
}
},
zh: {
processing: '正在处理你的问题…',
Expand All @@ -1249,7 +1257,7 @@ <h3 id="answerTitle">Answer:</h3>
uploadDocuments: '上传文档',
uploadDragTitle: '拖拽文件到这里',
uploadDragDesc: '或点击选择文件',
uploadSupports: '当前支持:.md,.txt,.json',
uploadSupports: '当前支持:.txt、.md、.json、.pdf、.docx、.doc',
uploadSampleHeader: '📄 示例格式(.json):',
uploadBtn: '开始上传',
clearUploadBtn: '清空',
Expand All @@ -1269,7 +1277,15 @@ <h3 id="answerTitle">Answer:</h3>
retrievalProgress: (found,cand) => `检索进度:在 ${cand} 个候选中找到 ${found} 个相关节点。`,
answerStart: '正在综合生成最终答案…',
qaCompleted: '问答流程完成!',
completedBadge: '✔️ 已完成'
completedBadge: '✔️ 已完成',
datasetDocCountLabel: (count) => count === 1 ? '(1篇文档)' : `(共${count}篇文档)`,
uploadSuccessMessage: (name, count) => {
const friendlyName = (name || '').replace(/_/g, ' ').trim() || name || '数据集';
if (typeof count === 'number' && count > 0) {
return `数据集“${friendlyName}”上传成功${count === 1 ? '(1篇文档)' : `(共${count}篇文档)`}!`;
}
return `数据集“${friendlyName}”上传成功!`;
}
}
};

Expand Down Expand Up @@ -1311,6 +1327,11 @@ <h3 id="answerTitle">Answer:</h3>
// Processing text & badge (if visible)
const procText = document.getElementById('qaProcessingText'); if (procText) procText.textContent = t.processing;
const badge = document.getElementById('qaCompletionBadge'); if (badge && badge.style.display !== 'none') badge.textContent = t.completedBadge;

// Refresh dataset renderings to reflect language-specific labels
displayDatasets();
loadDatasetOptions('graphDataset');
loadDatasetOptions('qaDataset');
}

// API base URL
Expand Down Expand Up @@ -1452,6 +1473,20 @@ <h3 id="answerTitle">Answer:</h3>
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}

// Build the human-readable label for a dataset entry.
// Underscores in the stored name become spaces; when a positive numeric
// docs_count is present, a language-specific "(N documents)" suffix is added.
function formatDatasetDisplayName(dataset) {
    if (!dataset) return '';

    // Prefer an explicit display label, falling back to the raw dataset name.
    const source = (dataset.display_label || dataset.name || '').toString();
    const label = source.replace(/_/g, ' ').trim() || (dataset.name || '');

    const count = typeof dataset.docs_count === 'number' ? dataset.docs_count : null;
    if (!count || count <= 0) {
        return label;
    }

    // Localized suffix, with an English fallback when the active language
    // table does not define datasetDocCountLabel.
    const lang = i18n[currentLang] || i18n.en;
    const makeSuffix = lang?.datasetDocCountLabel;
    return label + (makeSuffix ? makeSuffix(count) : ` (${count} documents)`);
}

async function uploadFiles() {
if (selectedFiles.length === 0) return;

Expand Down Expand Up @@ -1484,7 +1519,19 @@ <h3 id="answerTitle">Answer:</h3>
progressFill.style.width = '100%';
progressText.textContent = 'Upload completed!';

showMessage('Files uploaded successfully!', 'success');
const uploadData = response?.data || {};
const datasetName = uploadData.dataset_name;
const filesCount = uploadData.files_count;
const langConfig = i18n[currentLang] || i18n.en;
if (langConfig?.uploadSuccessMessage) {
showMessage(langConfig.uploadSuccessMessage(datasetName, filesCount), 'success');
} else {
let successMessage = datasetName ? `Dataset "${datasetName}" uploaded successfully!` : 'Files uploaded successfully!';
if (typeof filesCount === 'number' && filesCount > 0) {
successMessage += filesCount === 1 ? ' (1 document)' : ` (${filesCount} documents)`;
}
showMessage(successMessage, 'success');
}
clearFiles();
refreshData();

Expand Down Expand Up @@ -1524,6 +1571,7 @@ <h3 id="answerTitle">Answer:</h3>

function displayDatasets() {
const container = document.getElementById('datasetsList');
if (!container) return;
if (datasets.length === 0) {
container.innerHTML = '<p style="color: rgba(255,255,255,0.6);">No datasets available. Upload some files to get started.</p>';
return;
Expand All @@ -1532,7 +1580,7 @@ <h3 id="answerTitle">Answer:</h3>
container.innerHTML = datasets.map(dataset => `
<div class="file-item">
<div>
<strong>${dataset.name}</strong>
<strong>${formatDatasetDisplayName(dataset)}</strong>
<span style="color: rgba(255,255,255,0.6);"> (${dataset.type})</span>
${dataset.type !== 'demo' ? `<span style="margin-left:8px; font-size:12px; color:${dataset.has_custom_schema ? '#228B22' : 'rgba(255,255,255,0.6)'}; font-weight:600;">Schema: ${dataset.has_custom_schema ? 'custom' : 'default'}</span>` : ''}
</div>
Expand Down Expand Up @@ -1790,10 +1838,19 @@ <h3 id="answerTitle">Answer:</h3>

// Repopulate a dataset <select> with every dataset whose status is 'ready',
// preserving the user's current selection when that dataset is still listed.
function loadDatasetOptions(selectId) {
    const dropdown = document.getElementById(selectId);
    if (!dropdown) return;

    const ready = datasets.filter(entry => entry.status === 'ready');

    // Remember the current choice so a refresh does not silently reset it.
    const previous = dropdown.value;
    const lang = i18n[currentLang] || i18n.en;
    const placeholderText = lang?.selectDatasetPlaceholder || 'Select a dataset...';

    const options = [`<option value="">${placeholderText}</option>`];
    for (const entry of ready) {
        options.push(`<option value="${entry.name}">${formatDatasetDisplayName(entry)}</option>`);
    }
    dropdown.innerHTML = options.join('');

    // Restore the previous choice only if that dataset is still ready.
    if (ready.some(entry => entry.name === previous)) {
        dropdown.value = previous;
    }
}

async function loadGraphData() {
Expand Down
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ spacy==3.7.5
tiktoken==0.9.0
json-repair==0.46.2

# Document Parsing
magic-pdf==1.3.12
python-docx==1.1.2
pypdf==4.2.0

# API & HTTP Clients
openai==1.102.0
requests==2.32.5

# Configuration & Utilities
PyYAML==6.0.2
pydantic==2.5.0
nanoid==2.0.0
dotenv==0.9.9
python-dotenv==1.1.1
Expand Down
Loading