from pathlib import Path
import json
from datetime import datetime
Evaluation Reports
Dashboard for RLM evaluation task results
This notebook loads and visualizes results from the RLM evaluation framework. Eval tasks are defined in evals/tasks/ and executed via the CLI:
# Run all tasks
python -m evals.cli run
# Run specific category
python -m evals.cli run 'regression/*'
Results are saved to evals/results/ as JSON files and loaded here for analysis.
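The exact schema of a result file is defined by the eval framework; the analysis below only relies on a handful of fields, all read defensively with .get(). The dict below is an illustrative sketch of that minimal shape with hypothetical values, not the framework's authoritative format:
# Illustrative result shape (hypothetical values); missing keys default to 0 / empty in the code below.
example_result = {
    'task_id': 'regression_example',   # hypothetical task identifier
    'total_trials': 3,
    'passed_trials': 2,
    'pass_at_k': 1.0,
    'pass_power_k': 0.0,
    'avg_iterations': 4.3,
    'trials': [
        {'passed': True, 'iterations': 3},
        {'passed': True, 'iterations': 5},
        {'passed': False, 'iterations': 5},
    ],
}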
Load Results
Load all result JSON files from the results directory:
results_dir = Path('../evals/results')
def load_results(results_dir: Path) -> list:
"""Load all evaluation results from directory."""
if not results_dir.exists():
return []
results = []
for result_file in results_dir.glob('*.json'):
try:
with open(result_file) as f:
data = json.load(f)
results.append(data)
except Exception as e:
print(f"Warning: Could not load {result_file}: {e}")
return results
results = load_results(results_dir)
print(f"Loaded {len(results)} result files")Summary Statistics
Overall pass rates across all tasks:
if results:
    total_trials = sum(r.get('total_trials', 0) for r in results)
    total_passed = sum(r.get('passed_trials', 0) for r in results)
    avg_pass_rate = total_passed / total_trials if total_trials > 0 else 0
    print(f"Tasks evaluated: {len(results)}")
    print(f"Total trials: {total_trials}")
    print(f"Total passed: {total_passed}")
    print(f"Overall pass rate: {avg_pass_rate:.1%}")
else:
    print("No results found. Run evals first:")
    print(" python -m evals.cli run")
Results by Category
Break down performance by task category:
if results:
    from collections import defaultdict
    by_category = defaultdict(list)
    for r in results:
        task_id = r.get('task_id', 'unknown')
        category = task_id.split('_')[0] if '_' in task_id else 'unknown'
        by_category[category].append(r)
    print("\nResults by Category:")
    print("=" * 60)
    for category in sorted(by_category.keys()):
        cat_results = by_category[category]
        cat_trials = sum(r.get('total_trials', 0) for r in cat_results)
        cat_passed = sum(r.get('passed_trials', 0) for r in cat_results)
        cat_rate = cat_passed / cat_trials if cat_trials > 0 else 0
        print(f"\n{category.upper()}")
        print("-" * 40)
        for r in cat_results:
            task_id = r.get('task_id', 'unknown')
            passed = r.get('passed_trials', 0)
            total = r.get('total_trials', 0)
            pass_at_k = r.get('pass_at_k', 0)
            status = "PASS" if passed == total else "FAIL"
            print(f" [{status}] {task_id}: {passed}/{total} ({pass_at_k:.0%})")
        print(f" Category total: {cat_passed}/{cat_trials} ({cat_rate:.0%})")
Detailed Results
Show full details for each task:
if results:
print("\nDetailed Results:")
print("=" * 60)
for r in results:
task_id = r.get('task_id', 'unknown')
print(f"\n{task_id}")
print("-" * 40)
print(f" pass@{r.get('total_trials', 0)}: {r.get('pass_at_k', 0):.1%}")
print(f" pass^{r.get('total_trials', 0)}: {r.get('pass_power_k', 0):.1%}")
print(f" Passed: {r.get('passed_trials', 0)}/{r.get('total_trials', 0)}")
print(f" Avg iterations: {r.get('avg_iterations', 0):.1f}")
if 'trials' in r:
print(f" Trial details:")
for i, trial in enumerate(r['trials'], 1):
passed = trial.get('passed', False)
status = "✓" if passed else "✗"
print(f" {status} Trial {i}: {trial.get('iterations', 0)} iterations")Next Steps
Next Steps
To run evaluations and generate results:
# Run all tasks
python -m evals.cli run
# Execute this notebook to show results
nbdev exec_nb --path nbs/eval_reports.ipynb
# Regenerate documentation
nbdev_docs