Evaluating agents is harder than evaluating simple LLM calls. Agents take multiple steps, use tools, and produce intermediate results — so you need metrics that go beyond just "was the final answer correct?" This lesson covers task completion metrics, trajectory evaluation, cost and latency tracking, and benchmarking different agent architectures.
| Aspect | Simple LLM Call | Agent |
|---|---|---|
| Output | Single response | Final answer + full trajectory |
| Steps | 1 | Variable (1–50+) |
| Cost | Predictable | Variable (depends on steps taken) |
| Latency | Predictable | Variable (depends on tool calls, steps) |
| Success criteria | Output quality | Output quality + efficiency + safety |
Start by recording one structured result per run; the simplest metric over a set of runs is the task completion rate:

```python
from dataclasses import dataclass


@dataclass
class AgentResult:
    task: str
    final_answer: str
    steps_taken: int
    tools_used: list[str]
    total_tokens: int
    total_cost_usd: float
    latency_seconds: float
    success: bool
    error: str | None = None


def task_completion_rate(results: list[AgentResult]) -> float:
    """Percentage of tasks the agent completed successfully."""
    if not results:
        return 0.0
    return sum(1 for r in results if r.success) / len(results)
```
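For example, with two hypothetical runs (all task strings and numbers below are made up for illustration):

```python
results = [
    AgentResult(
        task="Find the 2023 revenue of Acme Corp",
        final_answer="$12.4M",
        steps_taken=5,
        tools_used=["web_search", "calculator"],
        total_tokens=8_200,
        total_cost_usd=0.07,
        latency_seconds=14.2,
        success=True,
    ),
    AgentResult(
        task="Summarise the attached contract",
        final_answer="",
        steps_taken=12,
        tools_used=["read_file"],
        total_tokens=21_000,
        total_cost_usd=0.19,
        latency_seconds=41.8,
        success=False,
        error="Exceeded maximum steps",
    ),
]

print(task_completion_rate(results))  # 0.5
```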
Completion rate says nothing about answer quality. A common approach is to use a stronger model as a judge and score the final answer against a few criteria:

```python
from openai import OpenAI
import json

client = OpenAI()


def judge_agent_output(
    task: str,
    agent_answer: str,
    reference_answer: str = "",
) -> dict:
    """Use a strong LLM to evaluate agent output quality."""
    prompt = f"""Evaluate this agent's output on a scale of 1-5 for each criterion.
Return JSON with scores and brief justifications.
Task: {task}
Agent Answer: {agent_answer}
{"Reference Answer: " + reference_answer if reference_answer else ""}
Criteria:
- correctness: Is the answer factually correct?
- completeness: Does it address all parts of the task?
- relevance: Is the answer focused on what was asked?
- clarity: Is it well-organised and easy to understand?"""

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
    )
    return json.loads(response.choices[0].message.content)
```
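A call might look like this (it needs an `OPENAI_API_KEY` in the environment; the task and answer are illustrative, and the exact JSON keys depend on how the judge model responds):

```python
scores = judge_agent_output(
    task="List three risks of deploying the agent to production",
    agent_answer="1) Prompt injection via tool outputs, 2) runaway cost, 3) stale data.",
)

# Expected shape, roughly:
# {"correctness": 5, "completeness": 4, "relevance": 5, "clarity": 4, ...}
print(scores)
```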
The trajectory (sequence of thoughts, actions, and observations) reveals how efficiently and safely the agent worked.
```python
@dataclass
class TrajectoryStep:
    step_number: int
    thought: str
    action: str
    observation: str
    tokens_used: int
    latency_ms: float


@dataclass
class TrajectoryMetrics:
    total_steps: int
    unique_tools_used: int
    repeated_actions: int
    total_tokens: int
    total_latency_ms: float
    average_step_latency_ms: float
    wasted_steps: int  # Steps that did not contribute to the answer


def analyse_trajectory(steps: list[TrajectoryStep]) -> TrajectoryMetrics:
    """Analyse an agent's trajectory for efficiency."""
    actions = [s.action for s in steps]
    unique_tools = len(set(a.split("(")[0] for a in actions))

    # Count repeated consecutive actions
    repeated = 0
    for i in range(1, len(actions)):
        if actions[i] == actions[i - 1]:
            repeated += 1

    total_latency = sum(s.latency_ms for s in steps)
    return TrajectoryMetrics(
        total_steps=len(steps),
        unique_tools_used=unique_tools,
        repeated_actions=repeated,
        total_tokens=sum(s.tokens_used for s in steps),
        total_latency_ms=total_latency,
        average_step_latency_ms=total_latency / len(steps) if steps else 0,
        wasted_steps=repeated,  # Simplified: repeated = wasted
    )
```
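With a small made-up trajectory, a duplicated tool call shows up as a repeated (and, under this simplification, wasted) step:

```python
steps = [
    TrajectoryStep(1, "Need the revenue figure", "web_search('Acme 2023 revenue')",
                   "Found investor report", 1_200, 900.0),
    TrajectoryStep(2, "Retry the same query", "web_search('Acme 2023 revenue')",
                   "Same results as before", 1_100, 850.0),
    TrajectoryStep(3, "Compute growth", "calculator('12.4 / 11.1 - 1')",
                   "0.117", 400, 150.0),
]

metrics = analyse_trajectory(steps)
print(metrics.repeated_actions)   # 1 -- the duplicated web_search call
print(metrics.unique_tools_used)  # 2 (web_search, calculator)
```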
You can also compare the number of steps the agent took against a known optimal trajectory:

```python
def trajectory_efficiency(
    steps_taken: int,
    optimal_steps: int,
) -> float:
    """Score from 0 to 1 — how close to optimal was the trajectory?"""
    if steps_taken == 0:
        return 0.0
    return min(optimal_steps / steps_taken, 1.0)


# Example: Agent took 8 steps but optimal was 4
efficiency = trajectory_efficiency(8, 4)  # 0.5
```
Because the number of steps varies from run to run, so does the bill. A cost tracker needs per-model prices for input and output tokens (here, USD per million tokens):

```python
class AgentCostTracker:
    """Track the cost of an agent run."""

    # USD per 1M tokens
    PRICES = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    }
```
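The class above shows only the price table; a minimal sketch of turning token counts into dollars could look like this (the `estimate_call_cost` helper is an illustration, not the lesson's actual API):

```python
def estimate_call_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Hypothetical helper: convert one LLM call's token counts to USD."""
    prices = AgentCostTracker.PRICES[model]
    return (
        input_tokens / 1_000_000 * prices["input"]
        + output_tokens / 1_000_000 * prices["output"]
    )


# Example: a gpt-4o call with 12k prompt tokens and 1.5k completion tokens
print(estimate_call_cost("gpt-4o", 12_000, 1_500))  # ~0.045 USD
```

Summing this over every step of a run gives the per-task cost that belongs in `AgentResult.total_cost_usd`.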