ai_engineer_loop/run_evaluations.sh at main · elecnix/ai_engineer_loop · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash

# Ensure pytest is installed using uv
if ! uv pip list | grep -q "pytest"; then
  uv add pytest
fi

# List of prompts to evaluate
PROMPTS=(
  "sample_prompts/1_distributed_rate_limiter.txt"
  "sample_prompts/2_inmemory_database.txt"
  "sample_prompts/3_neural_network_framework.txt"
  "sample_prompts/4_distributed_task_scheduler.txt"
  "sample_prompts/5_compiler.txt"
)

# Challenge names
CHALLENGE_NAMES=(
  "distributed_rate_limiter"
  "inmemory_database"
  "neural_network_framework"
  "distributed_task_scheduler"
  "compiler"
)

# Models to evaluate (filtered models)
MODELS="deepseek-coder:1.3b
deepseek-coder:6.7b-instruct
granite3.2:8b
deepseek-coder:latest
exaone3.5:7.8b
exaone3.5:2.4b
qwen2.5-coder:0.5b-instruct-q4_K_S
qwen2.5-coder:14b-instruct-q2_K
qwen2.5-coder:0.5b
qwen2.5-coder:1.5b
qwen2.5-coder:3b
qwen2.5-coder:7b
qwen2.5-coder:latest
codegeex4:9b
starcoder2:7b
granite-code:8b
granite-code:3b
dolphin-mistral:7b
deepseek-coder:6.7b
deepseek-coder-v2:lite
granite3-dense:2b
granite3-dense:8b
deepseek-v2:latest
starcoder2:3b"

# Run evaluations for each prompt
for i in "${!PROMPTS[@]}"; do
  PROMPT="${PROMPTS[$i]}"
  CHALLENGE_NAME="${CHALLENGE_NAMES[$i]}"
  OUTPUT_DIR="generated/model_evaluations/${CHALLENGE_NAME}"

  echo "==============================================================="
  echo "Evaluating prompt: $PROMPT"
  echo "Challenge: $CHALLENGE_NAME"
  echo "Output directory: $OUTPUT_DIR"
  echo "==============================================================="

  # Create output directory if it doesn't exist
  mkdir -p "$OUTPUT_DIR"

  # Run the evaluation with the specified base directory using uv run
  uv run model_evaluator.py "$PROMPT" --models $MODELS --base-dir "$OUTPUT_DIR" --resume > "$OUTPUT_DIR/evaluation.log" 2>&1

  echo "Evaluation complete for $CHALLENGE_NAME. Results saved to $OUTPUT_DIR/"
  echo ""
done

echo "All evaluations complete!"