How to Test and Debug Your AI Agent
AI agents fail in unexpected ways. A structured testing approach — unit tests for tools, integration tests for the full loop, and observability tools — keeps your agent reliable.
Builder of AI agents, crypto trading bots, and open-source automation tools. Sharing practical guides on how to build, deploy, and profit from AI and DeFi technology.
AI agents are harder to test than regular code because their behavior is non-deterministic. The same input can produce different outputs. Here's how to build reliable, testable agents.
Why Agent Testing Is Different
Regular unit testing: same input → same output. LLM-based agent testing: same input → probably similar output, but not guaranteed.
This requires a different testing philosophy:
- Test tool implementations (these are deterministic)
- Test agent behavior on representative examples (sample-based)
- Monitor output quality rather than exact matches
- Use evaluators to grade responses
Testing Tool Implementations
Tools are ordinary Python functions, so test them with standard pytest techniques. Mock any network calls to keep the tests deterministic:
import pytest
from unittest.mock import patch, MagicMock
import requests
# Tool under test
def get_crypto_price(symbol: str) -> dict:
    """Fetch the USD price and 24h change for a coin from CoinGecko.

    Args:
        symbol: Value sent as the ``ids`` query parameter (the tests in
            this file pass ``"bitcoin"``).

    Returns:
        The decoded JSON body of the response. The accompanying tests
        expect a shape like
        ``{"bitcoin": {"usd": 65000, "usd_24h_change": 2.5}}``.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection failures or when the
            10-second timeout expires.
    """
    response = requests.get(
        # Plain literal: the URL has no placeholders, so no f-prefix is needed.
        "https://api.coingecko.com/api/v3/simple/price",
        params={
            "ids": symbol,
            "vs_currencies": "usd",
            "include_24hr_change": "true",
        },
        timeout=10,
    )
    # Surface 4xx/5xx answers as exceptions instead of returning an error body.
    response.raise_for_status()
    return response.json()
# Tests
class TestGetCryptoPrice:
    """Tests for ``get_crypto_price``: two live-API checks and two mocked ones."""

    def test_valid_symbol_returns_price(self):
        """Live call: the response carries a numeric USD price for bitcoin."""
        # NOTE: this calls the real API, so it needs network access.
        result = get_crypto_price("bitcoin")
        assert "bitcoin" in result
        assert "usd" in result["bitcoin"]
        assert isinstance(result["bitcoin"]["usd"], (int, float))

    def test_price_is_reasonable(self):
        """Live call: sanity-check the price against a very wide range."""
        result = get_crypto_price("bitcoin")
        btc_price = result["bitcoin"]["usd"]
        assert 1000 < btc_price < 1_000_000, f"BTC price {btc_price} seems unreasonable"

    def test_network_error_handled(self):
        """A connection failure propagates to the caller as ConnectionError."""
        # Expect the specific exception: ``pytest.raises(Exception)`` would
        # also pass on unrelated bugs such as a TypeError inside the tool.
        with patch("requests.get", side_effect=requests.ConnectionError):
            with pytest.raises(requests.ConnectionError):
                get_crypto_price("bitcoin")

    @patch("requests.get")
    def test_mocked_response(self, mock_get):
        """With ``requests.get`` patched, the tool returns the mocked JSON."""
        mock_get.return_value = MagicMock(
            json=lambda: {"bitcoin": {"usd": 65000, "usd_24h_change": 2.5}},
            status_code=200,
        )
        # Make the status check a no-op so the success path is exercised.
        mock_get.return_value.raise_for_status = lambda: None
        result = get_crypto_price("bitcoin")
        assert result["bitcoin"]["usd"] == 65000
        # Confirm the tool really went through the patched HTTP call.
        mock_get.assert_called_once()
Testing the Full Agent Loop
from openai import OpenAI
import json
# Module-level client shared by the agent helpers in this file. It is built
# with no explicit arguments; presumably credentials come from the
# environment (confirm against the OpenAI SDK documentation).
client = OpenAI()
def run_agent_test(
    user_message: str,
    tools: list,
    expected_tools: "list | None" = None,
    max_iterations: int = 10,
) -> dict:
    """Run the agent loop with mocked tool results and report what happened.

    The model is called repeatedly. Every tool call it requests is recorded
    and answered with a fixed mock payload (no real tool is executed) until
    the model replies without tool calls or the iteration cap is reached.

    Args:
        user_message: The user prompt that starts the conversation.
        tools: Tool definitions forwarded to the chat completion call.
        expected_tools: Optional names of tools the agent is expected to
            call. Any that were not called are listed under
            ``"missing_tools"`` in the result.
        max_iterations: Upper bound on model calls, so an agent that keeps
            requesting tools cannot loop forever.

    Returns:
        A dict with ``"response"`` (final text, or ``None`` when the cap was
        hit), ``"tools_called"`` (tool names in call order),
        ``"missing_tools"`` and ``"success"``. When the cap is hit,
        ``"success"`` is ``False`` and ``"reason"`` is ``"max_iterations"``.
    """
    messages = [{"role": "user", "content": user_message}]
    tools_called = []

    def _missing() -> list:
        # Expected tools the agent never invoked (empty if none were given).
        return [name for name in (expected_tools or []) if name not in tools_called]

    for _ in range(max_iterations):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        msg = response.choices[0].message
        messages.append(msg)

        # No tool calls means the model produced its final answer.
        if not msg.tool_calls:
            return {
                "response": msg.content,
                "tools_called": tools_called,
                "missing_tools": _missing(),
                "success": True,
            }

        for tc in msg.tool_calls:
            tools_called.append(tc.function.name)
            # Mock tool execution: answer every call with the same payload.
            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": json.dumps({"mock": "result", "data": "test_value"}),
            })

    return {
        "response": None,
        "tools_called": tools_called,
        "missing_tools": _missing(),
        "success": False,
        "reason": "max_iterations",
    }
# Test that agent uses expected tools
def test_agent_uses_price_tool():
    """Check that a price question makes the agent call ``get_crypto_price``."""
    outcome = run_agent_test(
        "What is the current price of Ethereum?",
        tools=[price_tool_definition],
        expected_tools=["get_crypto_price"],
    )

    # The run must have finished with a final answer.
    assert outcome["success"], "Agent didn't complete"

    called = outcome["tools_called"]
    assert "get_crypto_price" in called, "Agent didn't use price tool"
    assert outcome["response"] is not None
    print(f"✅ Agent correctly called: {called}")
# Invoke the check immediately so the snippet also works as a plain script.
# NOTE(review): this runs whenever the module is imported, so a test runner
# that imports this file would trigger a live model call; consider an
# `if __name__ == "__main__":` guard.
test_agent_uses_price_tool()
Using LangSmith for Production Observability
LangSmith (by LangChain) is the best tool for monitoring production LLM applications:
from langsmith import Client
from langsmith.run_helpers import traceable
import os
# LangSmith tracing settings, written into this process's environment.
# "your_langsmith_key" is a placeholder value to be replaced with a real key.
os.environ.update(
    {
        "LANGCHAIN_API_KEY": "your_langsmith_key",
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "crypto-agent",
    }
)
@traceable  # This decorator automatically logs to LangSmith
def run_agent(user_message: str) -> str:
    """Send one user message to the model and return its text reply.

    Uses the module-level ``client`` and ``tools`` names, which must be
    defined before this function is called.

    Args:
        user_message: The user prompt sent as the only message.

    Returns:
        The assistant message's text. An empty string is returned when the
        message has no text content (for example when the model answers with
        a tool call instead), so callers always receive a ``str``.
    """
    messages = [{"role": "user", "content": user_message}]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=tools,
    )
    # ``content`` is None when the reply is a tool call; normalise it so the
    # ``-> str`` contract holds and callers can safely use string methods.
    return response.choices[0].message.content or ""
# Every call to run_agent() is now logged to LangSmith.
# You can see: latency, tokens used, tool calls, errors.
# Example invocation; the returned text is kept in `result`.
result = run_agent("Should I buy ETH today?")
Regression Testing with Saved Examples
import json
from pathlib import Path
# Golden examples: input + expected output characteristics.
# Each entry is checked by run_regression_tests():
#   "input"              - prompt passed to the agent
#   "should_contain"     - phrases that must ALL appear (case-insensitive)
#   "should_not_contain" - phrases that must NOT appear (case-insensitive)
#   "min_length"         - minimum response length in characters
GOLDEN_TESTS = [
    {
        "input": "What is Bitcoin's current price?",
        "should_contain": ["$", "Bitcoin", "BTC"],
        "should_not_contain": ["error", "sorry", "I don't know"],
        "min_length": 20,
    },
    {
        "input": "Should I buy ETH with my life savings?",
        "should_contain": ["risk", "advice", "not financial"],
        "should_not_contain": ["yes, buy", "definitely"],
        "min_length": 50,
    },
]


def run_regression_tests(agent_func, tests=GOLDEN_TESTS):
    """Run an agent against golden examples and print a pass/fail report.

    Args:
        agent_func: Callable taking the input prompt and returning the
            agent's response text.
        tests: Test cases shaped like the entries of ``GOLDEN_TESTS``.
            Every key except ``"input"`` is optional.

    Returns:
        ``True`` if every test passed (also for an empty test list),
        otherwise ``False``.
    """
    results = []
    for test in tests:
        response = agent_func(test["input"])
        failures = []

        if not isinstance(response, str):
            # For example None from an agent that produced no text. Record a
            # failure instead of crashing on ``.lower()`` / ``len()``.
            failures.append(f"Agent returned no text response: {response!r}")
        else:
            # Lower-case once rather than once per phrase.
            lowered = response.lower()
            failures.extend(
                f"Missing expected: '{phrase}'"
                for phrase in test.get("should_contain", [])
                if phrase.lower() not in lowered
            )
            failures.extend(
                f"Contains unexpected: '{phrase}'"
                for phrase in test.get("should_not_contain", [])
                if phrase.lower() in lowered
            )
            if len(response) < test.get("min_length", 0):
                failures.append(f"Response too short: {len(response)} chars")

        results.append({
            "input": test["input"][:50],  # truncated to keep the report short
            "passed": not failures,
            "failures": failures,
        })

    passed_count = sum(1 for r in results if r["passed"])
    print(f"\nRegression test: {passed_count}/{len(results)} passed")
    for r in results:
        status = "✅" if r["passed"] else "❌"
        print(f"{status} {r['input']}")
        for failure in r["failures"]:
            print(f" → {failure}")

    return all(r["passed"] for r in results)
# Pass the agent function directly; wrapping it in a lambda adds nothing.
run_regression_tests(run_agent)
Debugging Common Agent Failures
Agent loops forever: Add a max_iterations counter and log each step.
Agent ignores tools: Check tool descriptions — they need to be clear enough for the LLM to know when to use them.
Wrong tool arguments: Check that your JSON schema matches what the LLM is expected to output.
Hallucinated tool results: The LLM may ignore tool results. Try explicitly saying in the system prompt: "Always use tool results. Never answer from memory when a tool provides current data."
Good testing and observability pay dividends immediately — the first time your agent silently fails on a live trade, you'll wish you had them.
Related Articles
AI Agent Security: Protecting Your Bot From Attacks and Exploits
5 min read
AI Agents — Agentic AI: The Next Evolution Beyond ChatGPT (Complete 2025 Guide)
9 min read
AI Agents — How to Give Your AI Agent Long-Term Memory
6 min read
AI Agents — Autonomous vs Semi-Autonomous AI Agents: When to Choose Each
4 min read