AI Unit Test Generation
Writing tests is the least-liked part of development: it is clear what needs to be done, but doing it by hand is tedious. AI generates typical test cases better than a developer who is in "just need to finish the coverage" mode. The goal of the system is not merely to cover lines of code, but to generate tests that check real behavior: edge cases, boundary values, error handling.
Test Generator Architecture
from anthropic import Anthropic
import ast
import inspect
from pathlib import Path
from typing import Optional
import subprocess
client = Anthropic()
class TestGenerator:
def __init__(self, project_root: str):
self.project_root = project_root
def extract_function_info(self, source_code: str, function_name: str) -> dict:
"""Extracts function metadata via AST"""
tree = ast.parse(source_code)
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
if node.name == function_name:
return {
"name": node.name,
"args": [arg.arg for arg in node.args.args],
"decorators": [ast.unparse(d) for d in node.decorator_list],
"is_async": isinstance(node, ast.AsyncFunctionDef),
"has_return": any(
isinstance(n, ast.Return) and n.value
for n in ast.walk(node)
),
"raises": [
ast.unparse(n.exc) for n in ast.walk(node)
if isinstance(n, ast.Raise) and n.exc
],
"source": ast.unparse(node),
}
return {}
def find_related_tests(self, source_file: str) -> str:
"""Finds existing tests to understand style"""
source_path = Path(source_file)
# Search for test_*.py or *_test.py
test_candidates = [
source_path.parent / f"test_{source_path.name}",
source_path.parent.parent / "tests" / f"test_{source_path.name}",
source_path.parent / "tests" / f"test_{source_path.name}",
]
for test_file in test_candidates:
if test_file.exists():
return test_file.read_text()[:2000]
return ""
def generate_tests(
self,
source_file: str,
function_name: Optional[str] = None,
) -> str:
"""Generates tests for a file or specific function"""
source_code = Path(source_file).read_text()
existing_tests = self.find_related_tests(source_file)
# If function specified — focus on it
if function_name:
func_info = self.extract_function_info(source_code, function_name)
context = f"Function to test:\n```python\n{func_info.get('source', '')}\n```"
else:
context = f"File to test:\n```python\n{source_code[:4000]}\n```"
existing_context = ""
if existing_tests:
existing_context = f"\nExisting test style (follow this pattern):\n```python\n{existing_tests}\n```"
response = client.messages.create(
model="claude-sonnet-4-5",
max_tokens=4096,
system="""You are a senior developer writing pytest tests.
Rules:
- Test behavior, not implementation
- One test = one check (AAA: Arrange, Act, Assert)
- Name tests as: test_<function>_<scenario>_<expectation>
- Cover: happy path, edge cases, errors/exceptions, boundary values
- Use pytest.mark.parametrize for similar tests
- For async functions — pytest-asyncio
- Mock external dependencies via pytest-mock""",
messages=[{
"role": "user",
"content": f"""{context}{existing_context}
Generate complete test file with pytest. Return only code, no explanations."""
}]
)
return response.content[0].text
Parametrized Tests with Boundary Values
def generate_parametrized_tests(
    self,
    function_source: str,
    function_signature: str,
) -> str:
    """Ask Claude for a ``pytest.mark.parametrize`` test with boundary values.

    Args:
        function_source: Source code of the function under test.
        function_signature: Its signature, passed to the model as extra context.

    Returns:
        The text of the model's first content block, unmodified.
    """
    # Each code sample gets its own balanced Markdown fence, so the signature
    # and the format instructions are read as prose rather than as part of
    # the function's code block.
    prompt = (
        "Generate pytest.mark.parametrize test for this function\n"
        "with boundary values and edge cases.\n"
        "\n"
        "```python\n"
        f"{function_source}\n"
        "```\n"
        "\n"
        f"Signature: {function_signature}\n"
        "\n"
        "Response format — Python code only:\n"
        "\n"
        "```python\n"
        '@pytest.mark.parametrize("input,expected", [\n'
        "    # happy path\n"
        "    # edge cases\n"
        "    # boundary values\n"
        "    # error cases (pytest.raises)\n"
        "])\n"
        "def test_<function_name>(input, expected):\n"
        "    ...\n"
        "```"
    )
    response = client.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
def run_and_fix(self, test_file: str, source_file: str, max_attempts: int = 3) -> str:
    """Run the tests and let the model iteratively repair failures.

    Each attempt runs pytest on ``test_file``. On failure the test file, the
    source under test and the tail of pytest's output are sent to the model,
    and the reply overwrites ``test_file``.

    Args:
        test_file: Path of the generated test file; rewritten in place.
        source_file: Path of the code under test, included in the fix prompt
            (first 4000 characters) when the file exists.
        max_attempts: Maximum number of run-and-fix rounds.

    Returns:
        The final content of the test file. NOTE: after the last round the
        newest fix is written but not re-run, so the returned tests are not
        guaranteed to pass.

    Raises:
        subprocess.TimeoutExpired: If a pytest run exceeds 60 seconds.
    """
    import sys  # local import: only needed to locate the running interpreter

    def strip_code_fences(text: str) -> str:
        """Remove a Markdown fence wrapping the whole reply, if there is one."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return text
        body = stripped.splitlines()[1:]
        if body and body[-1].strip() == "```":
            body = body[:-1]
        return "\n".join(body) + "\n"

    test_path = Path(test_file)
    test_content = test_path.read_text(encoding="utf-8")

    source_path = Path(source_file)
    source_code = ""
    if source_path.is_file():
        source_code = source_path.read_text(encoding="utf-8")[:4000]

    for _ in range(max_attempts):
        result = subprocess.run(
            # sys.executable keeps pytest in the interpreter (and virtualenv)
            # that runs this tool; a bare "python" may resolve elsewhere.
            [sys.executable, "-m", "pytest", test_file, "-v", "--tb=short"],
            capture_output=True, text=True, timeout=60,
        )
        if result.returncode == 0:
            return test_content  # All tests passed

        # stderr is included as well as stdout so failures reported there
        # are not lost.
        failure_output = (result.stdout + result.stderr)[-2000:]

        prompt = (
            "Tests failed. Fix the test file.\n"
            "\n"
            "Test file:\n"
            "```python\n"
            f"{test_content}\n"
            "```\n"
            "\n"
        )
        if source_code:
            prompt += (
                "Source under test:\n"
                "```python\n"
                f"{source_code}\n"
                "```\n"
                "\n"
            )
        prompt += (
            "Errors:\n"
            "```\n"
            f"{failure_output}\n"
            "```\n"
            "\n"
            "Return fixed complete test file. Return only code, no explanations."
        )

        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
        )
        # The reply is written to a .py file, so a surrounding fence must go.
        test_content = strip_code_fences(response.content[0].text)
        test_path.write_text(test_content, encoding="utf-8")

    return test_content
### Mutation Testing for Test Quality Assessment
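Generated tests should themselves be verified: do they actually catch real bugs? Mutation testing answers this by injecting small changes (mutants) into the source code and checking whether the tests fail.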
```python
import mutmut
from pathlib import Path
def evaluate_test_quality(source_file: str, test_file: str) -> dict:
    """Run mutation testing with the ``mutmut`` CLI to assess test quality.

    Args:
        source_file: Path passed to ``--paths-to-mutate``.
        test_file: Path passed to ``--tests-dir``.
            NOTE(review): the option name suggests a directory; confirm that
            the installed mutmut version accepts a single file here.

    Returns:
        A dict with ``mutation_score`` (killed / (killed + survived), or 0
        when nothing was counted), ``killed_mutants``, ``survived_mutants``
        and a ``verdict`` of ``"excellent"`` (> 0.8), ``"good"`` (> 0.6) or
        ``"needs_improvement"``.

    Raises:
        subprocess.TimeoutExpired: If mutmut runs longer than 300 seconds.
    """
    import re  # local import: only used to parse mutmut's output

    result = subprocess.run(
        ["mutmut", "run", f"--paths-to-mutate={source_file}", f"--tests-dir={test_file}"],
        capture_output=True, text=True, timeout=300
    )
    output = result.stdout

    # Preferred: read the numeric counters from mutmut's progress summary.
    # The expected shape is "N/N  🎉 killed  ⏰ ..  🤔 ..  🙁 survived  🔇 ..",
    # which should be verified against the installed mutmut version. The last
    # match is the final state of the run.
    killed_counts = re.findall(r"🎉\s*(\d+)", output)
    survived_counts = re.findall(r"🙁\s*(\d+)", output)

    if killed_counts and survived_counts:
        killed = int(killed_counts[-1])
        survived = int(survived_counts[-1])
    else:
        # Fallback: count output lines mentioning each outcome. This counts
        # lines, not mutants, so it is only a rough signal.
        survived = 0
        killed = 0
        for line in output.splitlines():
            lowered = line.lower()
            if "survived" in lowered:
                survived += 1
            elif "killed" in lowered:
                killed += 1

    total = survived + killed
    mutation_score = killed / total if total > 0 else 0

    if mutation_score > 0.8:
        verdict = "excellent"
    elif mutation_score > 0.6:
        verdict = "good"
    else:
        verdict = "needs_improvement"

    return {
        "mutation_score": mutation_score,
        "killed_mutants": killed,
        "survived_mutants": survived,
        "verdict": verdict,
    }
Integration with pytest-cov and Automatic Report
def generate_coverage_report(test_file: str, source_file: str) -> dict:
    """Run the tests under pytest-cov and summarise coverage of *source_file*.

    Args:
        test_file: Test file to run with pytest.
        source_file: Source file whose coverage is reported.

    Returns:
        A dict with ``coverage_percent``, ``uncovered_lines`` and ``passed``.
        When the JSON report lists ``source_file``, the figures are for that
        file only; otherwise they fall back to the report totals and the
        missing lines of every file. If no readable report was produced, the
        result is 0 percent, no lines and ``passed=False``.
    """
    import json
    import sys
    import tempfile

    source_path = Path(source_file)

    # The report goes into a fresh temporary directory so a stale
    # coverage.json from an earlier run can never be mistaken for this one.
    with tempfile.TemporaryDirectory() as report_dir:
        report_path = Path(report_dir) / "coverage.json"
        result = subprocess.run(
            [
                sys.executable, "-m", "pytest", test_file,
                # coverage treats a non-directory --cov value as a package
                # name, so a "file.py" path would collect nothing. Measure
                # the containing directory and pick the file out below.
                f"--cov={source_path.parent}",
                f"--cov-report=json:{report_path}",
                "--cov-report=term-missing",
                "-v",
            ],
            capture_output=True, text=True,
        )
        try:
            coverage_data = json.loads(report_path.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError):
            # Same keys as the success path so callers can index safely.
            return {"coverage_percent": 0, "uncovered_lines": [], "passed": False}

    passed = result.returncode == 0
    files = coverage_data.get("files", {})

    # Report paths may be relative to the working directory; compare
    # resolved paths to find the entry for the requested source file.
    target = source_path.resolve()
    for file_name, file_data in files.items():
        if Path(file_name).resolve() == target:
            return {
                "coverage_percent": file_data.get("summary", {}).get("percent_covered", 0),
                "uncovered_lines": list(file_data.get("missing_lines", [])),
                "passed": passed,
            }

    # Fallback: overall totals and the missing lines of every reported file.
    uncovered_lines = []
    for file_data in files.values():
        uncovered_lines.extend(file_data.get("missing_lines", []))
    return {
        "coverage_percent": coverage_data.get("totals", {}).get("percent_covered", 0),
        "uncovered_lines": uncovered_lines,
        "passed": passed,
    }
CLI for Team
import click
@click.command()
@click.argument("source_file")
@click.option("--function", "-f", help="Specific function to test")
@click.option("--output", "-o", help="Output test file path")
@click.option("--run/--no-run", default=True, help="Run tests after generation")
def generate(source_file: str, function: Optional[str], output: Optional[str], run: bool):
    """Generates unit tests for Python file"""
    # The docstring above is shown by click as the command's --help text, so
    # implementation notes are kept in comments instead.
    #
    # Flow: generate tests -> write them to `output` (default:
    # test_<source name> next to the source file) -> optionally run the
    # fix loop and print coverage. `function` and `output` are None when the
    # corresponding option is not given.
    generator = TestGenerator(".")
    tests = generator.generate_tests(source_file, function)

    if not output:
        source_path = Path(source_file)
        output = str(source_path.parent / f"test_{source_path.name}")

    # NOTE(review): an existing file at `output` is overwritten without
    # warning; consider a confirmation or a --force flag.
    Path(output).write_text(tests, encoding="utf-8")
    click.echo(f"Tests written to {output}")

    if run:
        click.echo("Running tests...")
        # run_and_fix rewrites `output` in place; its return value (the final
        # file content) is not needed here.
        generator.run_and_fix(output, source_file)
        coverage = generate_coverage_report(output, source_file)
        click.echo(f"Coverage: {coverage['coverage_percent']:.1f}%")
        # The fix loop can finish with tests that still fail, so say so.
        if coverage["passed"]:
            click.echo("All tests passed")
        else:
            click.echo("Some tests still fail - manual review needed")
# Run the click command only when the file is executed as a script.
if __name__ == "__main__":
    generate()
Practical Case: Legacy Python Service Without Tests
Task: 8000 lines of Python code, 0% coverage, refactoring impossible without tests.
Process:
- Automatic analysis of all `.py` files via AST
- Test generation by files (batch, 5 files in parallel)
- Auto-run and fix cycle (up to 3 iterations)
- Manual review of tests with coverage < 60%
Results in 2 weeks:
- Generated 847 test functions
- Coverage: 0% → 71%
- Found 12 real bugs during generation (AI noticed behavior/type mismatch)
- 94% of generated tests passed without changes
- 6% required manual fixes (complex mock dependencies)
Final test mutation score: 0.74 (good, but not excellent — AI missed some edge cases).
Timeline
- Basic generator (one file, code export): 1–2 days
- Auto-run and fix-cycle: 2–3 days
- CI/CD integration with coverage gate: 1 week
- Full pipeline for legacy codebase: 2–3 weeks







