fix(sem_search): add semantic search quality evaluation with llm judge and improve descriptions#2370
fix(sem_search): add semantic search quality evaluation with llm judge and improve descriptions #2370 — tusharmath merged 12 commits into main from
Conversation
Ensure EXPECTED_TYPES and SHOULD_AVOID are always passed - Always pass both parameters to llm_judge.ts, even when empty - Fixes issue where llm_judge.ts requires both parameters unconditionally - Sets empty string defaults for missing EXPECTED_TYPES and SHOULD_AVOID Co-Authored-By: ForgeCode <noreply@forgecode.dev>
|
Fixed in commit 65b896c ✅ The review comment was correct — the script was conditionally adding parameters that llm_judge.ts requires unconditionally. Changes made:
Co-Authored-By: ForgeCode <noreply@forgecode.dev> |
Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | ||
| ./run_eval.sh /tmp/test_semantic_eval/full_context.json > /dev/null 2>&1; then | ||
| echo " ✓ PASSED (score >= 70)" | ||
| PASSED=$((PASSED + 1)) | ||
| else | ||
| echo " ✗ FAILED (expected pass)" | ||
| FAILED=$((FAILED + 1)) | ||
| fi | ||
|
|
||
| # Test 2: Documentation queries - should PASS (may be marginal) | ||
| echo "Test 2: Documentation queries..." | ||
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | ||
| ./run_eval.sh /tmp/test_semantic_eval/doc_context.json > /dev/null 2>&1; then | ||
| echo " ✓ PASSED (score >= 70)" | ||
| PASSED=$((PASSED + 1)) | ||
| else | ||
| echo " ✗ FAILED (expected pass)" | ||
| FAILED=$((FAILED + 1)) | ||
| fi | ||
|
|
||
| # Test 3: Bad queries - should FAIL | ||
| echo "Test 3: Bad queries (generic keywords)..." | ||
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | ||
| ./run_eval.sh /tmp/test_semantic_eval/bad_context.json > /dev/null 2>&1; then | ||
| echo " ✗ FAILED (expected failure, got pass)" | ||
| FAILED=$((FAILED + 1)) | ||
| else | ||
| echo " ✓ PASSED (correctly failed - score < 70)" | ||
| PASSED=$((PASSED + 1)) | ||
| fi | ||
|
|
||
| # Test 4: Missing sem_search - should FAIL early | ||
| echo "Test 4: Missing sem_search tool..." | ||
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | ||
| ./run_eval.sh /tmp/test_semantic_eval/no_sem_search_context.json > /dev/null 2>&1; then | ||
| echo " ✗ FAILED (expected early exit failure)" | ||
| FAILED=$((FAILED + 1)) | ||
| else | ||
| echo " ✓ PASSED (correctly failed early)" | ||
| PASSED=$((PASSED + 1)) |
There was a problem hiding this comment.
Hardcoded absolute path /Users/amit/code-forge/benchmarks/evals/semantic_search_quality will fail on any machine other than the original developer's. This breaks portability and will cause test failures in CI/CD or on other developer machines.
Fix: Use relative paths or dynamically determine the script directory:
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if cd "$SCRIPT_DIR" && \
./run_eval.sh /tmp/test_semantic_eval/full_context.json > /dev/null 2>&1; then
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/full_context.json > /dev/null 2>&1; then | |
| echo " ✓ PASSED (score >= 70)" | |
| PASSED=$((PASSED + 1)) | |
| else | |
| echo " ✗ FAILED (expected pass)" | |
| FAILED=$((FAILED + 1)) | |
| fi | |
| # Test 2: Documentation queries - should PASS (may be marginal) | |
| echo "Test 2: Documentation queries..." | |
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/doc_context.json > /dev/null 2>&1; then | |
| echo " ✓ PASSED (score >= 70)" | |
| PASSED=$((PASSED + 1)) | |
| else | |
| echo " ✗ FAILED (expected pass)" | |
| FAILED=$((FAILED + 1)) | |
| fi | |
| # Test 3: Bad queries - should FAIL | |
| echo "Test 3: Bad queries (generic keywords)..." | |
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/bad_context.json > /dev/null 2>&1; then | |
| echo " ✗ FAILED (expected failure, got pass)" | |
| FAILED=$((FAILED + 1)) | |
| else | |
| echo " ✓ PASSED (correctly failed - score < 70)" | |
| PASSED=$((PASSED + 1)) | |
| fi | |
| # Test 4: Missing sem_search - should FAIL early | |
| echo "Test 4: Missing sem_search tool..." | |
| if cd /Users/amit/code-forge/benchmarks/evals/semantic_search_quality && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/no_sem_search_context.json > /dev/null 2>&1; then | |
| echo " ✗ FAILED (expected early exit failure)" | |
| FAILED=$((FAILED + 1)) | |
| else | |
| echo " ✓ PASSED (correctly failed early)" | |
| PASSED=$((PASSED + 1)) | |
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | |
| if cd "$SCRIPT_DIR" && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/full_context.json > /dev/null 2>&1; then | |
| echo " ✓ PASSED (score >= 70)" | |
| PASSED=$((PASSED + 1)) | |
| else | |
| echo " ✗ FAILED (expected pass)" | |
| FAILED=$((FAILED + 1)) | |
| fi | |
| # Test 2: Documentation queries - should PASS (may be marginal) | |
| echo "Test 2: Documentation queries..." | |
| if cd "$SCRIPT_DIR" && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/doc_context.json > /dev/null 2>&1; then | |
| echo " ✓ PASSED (score >= 70)" | |
| PASSED=$((PASSED + 1)) | |
| else | |
| echo " ✗ FAILED (expected pass)" | |
| FAILED=$((FAILED + 1)) | |
| fi | |
| # Test 3: Bad queries - should FAIL | |
| echo "Test 3: Bad queries (generic keywords)..." | |
| if cd "$SCRIPT_DIR" && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/bad_context.json > /dev/null 2>&1; then | |
| echo " ✗ FAILED (expected failure, got pass)" | |
| FAILED=$((FAILED + 1)) | |
| else | |
| echo " ✓ PASSED (correctly failed - score < 70)" | |
| PASSED=$((PASSED + 1)) | |
| fi | |
| # Test 4: Missing sem_search - should FAIL early | |
| echo "Test 4: Missing sem_search tool..." | |
| if cd "$SCRIPT_DIR" && \ | |
| ./run_eval.sh /tmp/test_semantic_eval/no_sem_search_context.json > /dev/null 2>&1; then | |
| echo " ✗ FAILED (expected early exit failure)" | |
| FAILED=$((FAILED + 1)) | |
| else | |
| echo " ✓ PASSED (correctly failed early)" | |
| PASSED=$((PASSED + 1)) | |
| fi |
Spotted by Graphite Agent
Is this helpful? React 👍 or 👎 to let us know.
| CMD="npx tsx llm_judge.ts --context \"$CONTEXT_FILE\" --intent \"$INTENT\" --expected-file-types \"$EXPECTED_TYPES\" --should-avoid \"$SHOULD_AVOID\"" | ||
|
|
||
| eval $CMD |
There was a problem hiding this comment.
Using eval with string interpolation is a shell injection vulnerability. If $CONTEXT_FILE, $INTENT, $EXPECTED_TYPES, or $SHOULD_AVOID contain shell metacharacters, arbitrary commands could be executed.
# Fix: Execute command directly without eval
npx tsx llm_judge.ts --context "$CONTEXT_FILE" --intent "$INTENT" --expected-file-types "$EXPECTED_TYPES" --should-avoid "$SHOULD_AVOID"
| CMD="npx tsx llm_judge.ts --context \"$CONTEXT_FILE\" --intent \"$INTENT\" --expected-file-types \"$EXPECTED_TYPES\" --should-avoid \"$SHOULD_AVOID\"" | |
| eval $CMD | |
| npx tsx llm_judge.ts --context "$CONTEXT_FILE" --intent "$INTENT" --expected-file-types "$EXPECTED_TYPES" --should-avoid "$SHOULD_AVOID" |
Spotted by Graphite Agent
Is this helpful? React 👍 or 👎 to let us know.
No description provided.