AI Unit Test Generation System

We design and deploy artificial intelligence systems, from prototype to production-ready solution. Our team combines expertise in machine learning, data engineering, and MLOps so that AI works in real business, not just in the lab.
Showing 1 of 1 services · All 1566 services
AI Unit Test Generation System
Medium
~1-2 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1240
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1167
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    867
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1084
  • image_logo-advance_0.png
    B2B Advance company logo design
    563
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    829

AI Unit Test Generation

Writing tests is the most disliked part of development: it's clear what needs to be done, but tedious to do by hand. AI generates typical test cases better than a developer who is in "need to finish coverage" mode. The system's goal isn't just to cover lines of code, but to generate tests that check real behavior: edge cases, boundary values, and error handling.

Test Generator Architecture

from anthropic import Anthropic
import ast
import inspect
from pathlib import Path
from typing import Optional
import subprocess

# Module-level Anthropic client shared by every model call in this file.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment (e.g. ANTHROPIC_API_KEY) — confirm against the SDK docs.
client = Anthropic()

class TestGenerator:
    """Generates pytest test files for Python source code with an LLM.

    The helper methods gather prompt context (function metadata via ``ast``
    and a sample of already existing tests); ``generate_tests`` sends that
    context to the module-level ``client`` and returns the generated code.
    """

    # The class name starts with "Test": opt out of pytest collection so that
    # importing it into a test module does not produce a collection warning.
    __test__ = False

    # Prompt budget: how many characters of an existing test file / of the
    # source file are embedded into the request.
    _STYLE_SAMPLE_CHARS = 2000
    _SOURCE_SAMPLE_CHARS = 4000

    def __init__(self, project_root: str):
        """Stores the project root.

        Args:
            project_root: Root directory of the project under test.
        """
        self.project_root = project_root

    def extract_function_info(self, source_code: str, function_name: str) -> dict:
        """Extracts function metadata via AST

        Args:
            source_code: Python source text to parse.
            function_name: Name of the (sync or async) function to look up.

        Returns:
            A dict with the keys ``name``, ``args``, ``decorators``,
            ``is_async``, ``has_return``, ``raises`` and ``source`` for the
            first matching definition found by ``ast.walk``, or ``{}`` when
            no function with that name exists. ``args`` lists every
            parameter name in signature order (positional-only, regular,
            ``*args``, keyword-only, ``**kwargs``).

        Raises:
            SyntaxError: If ``source_code`` is not valid Python.
        """
        tree = ast.parse(source_code)

        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if node.name != function_name:
                continue

            # Collect *all* parameter names, not only the plain positional
            # ones, so keyword-only and variadic parameters are not lost.
            params = node.args
            arg_names = [a.arg for a in params.posonlyargs + params.args]
            if params.vararg:
                arg_names.append(params.vararg.arg)
            arg_names.extend(a.arg for a in params.kwonlyargs)
            if params.kwarg:
                arg_names.append(params.kwarg.arg)

            # NOTE: the walk includes nested functions/classes, so their
            # ``return``/``raise`` statements are reported as well.
            body_nodes = list(ast.walk(node))
            return {
                "name": node.name,
                "args": arg_names,
                "decorators": [ast.unparse(d) for d in node.decorator_list],
                "is_async": isinstance(node, ast.AsyncFunctionDef),
                "has_return": any(
                    isinstance(n, ast.Return) and n.value is not None
                    for n in body_nodes
                ),
                "raises": [
                    ast.unparse(n.exc) for n in body_nodes
                    if isinstance(n, ast.Raise) and n.exc
                ],
                "source": ast.unparse(node),
            }
        return {}

    def find_related_tests(self, source_file: str) -> str:
        """Finds existing tests to understand style

        Looks for ``test_<name>.py`` and ``<name>_test.py`` next to the
        source file, in a sibling ``tests`` directory of its parent, and in
        a ``tests`` directory beside it. ``test_`` candidates are checked
        first.

        Args:
            source_file: Path of the module that tests are generated for.

        Returns:
            The beginning of the first test file found (a bounded sample),
            or an empty string when none exists.
        """
        source_path = Path(source_file)
        search_dirs = [
            source_path.parent,
            source_path.parent.parent / "tests",
            source_path.parent / "tests",
        ]
        prefixed = f"test_{source_path.name}"
        suffixed = f"{source_path.stem}_test{source_path.suffix}"
        test_candidates = (
            [directory / prefixed for directory in search_dirs]
            + [directory / suffixed for directory in search_dirs]
        )

        for test_file in test_candidates:
            # is_file() rather than exists(): a directory with a matching
            # name must not be handed to read_text().
            if test_file.is_file():
                sample = test_file.read_text(encoding="utf-8")
                return sample[: self._STYLE_SAMPLE_CHARS]
        return ""

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Removes a surrounding markdown code fence from a model reply."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return text
        code_lines = stripped.splitlines()[1:]  # drop the opening ```lang line
        if code_lines and code_lines[-1].strip() == "```":
            code_lines.pop()
        return "\n".join(code_lines) + "\n"

    def generate_tests(
        self,
        source_file: str,
        function_name: Optional[str] = None,
    ) -> str:
        """Generates tests for a file or specific function

        Args:
            source_file: Path of the Python module to test.
            function_name: When given, only this function is sent to the
                model; otherwise the beginning of the whole file is sent.

        Returns:
            The generated test module as Python source. A markdown code
            fence around the model reply is removed so the result can be
            written to a ``.py`` file directly.

        Raises:
            ValueError: If ``function_name`` is given but not defined in
                ``source_file``.
            SyntaxError: If ``function_name`` is given and the file cannot
                be parsed.
        """
        source_code = Path(source_file).read_text(encoding="utf-8")
        existing_tests = self.find_related_tests(source_file)

        # If function specified — focus on it
        if function_name:
            func_info = self.extract_function_info(source_code, function_name)
            if not func_info:
                # Fail before spending an API call on an empty code block.
                raise ValueError(
                    f"Function {function_name!r} not found in {source_file}"
                )
            context = f"Function to test:\n```python\n{func_info['source']}\n```"
        else:
            file_sample = source_code[: self._SOURCE_SAMPLE_CHARS]
            context = f"File to test:\n```python\n{file_sample}\n```"

        existing_context = ""
        if existing_tests:
            existing_context = f"\nExisting test style (follow this pattern):\n```python\n{existing_tests}\n```"

        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=4096,
            system="""You are a senior developer writing pytest tests.
Rules:
- Test behavior, not implementation
- One test = one check (AAA: Arrange, Act, Assert)
- Name tests as: test_<function>_<scenario>_<expectation>
- Cover: happy path, edge cases, errors/exceptions, boundary values
- Use pytest.mark.parametrize for similar tests
- For async functions — pytest-asyncio
- Mock external dependencies via pytest-mock""",
            messages=[{
                "role": "user",
                "content": f"""{context}{existing_context}

Generate complete test file with pytest. Return only code, no explanations."""
            }]
        )

        return self._strip_code_fences(response.content[0].text)

Parametrized Tests with Boundary Values

    def generate_parametrized_tests(
        self,
        function_source: str,
        function_signature: str,
    ) -> str:
        """Generates parametrized tests with boundary values"""

        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=2048,
            messages=[{
                "role": "user",
                "content": f"""Generate pytest.mark.parametrize test for this function
with boundary values and edge cases.

```python
{function_source}

Signature: {function_signature}

Response format — Python code only:

@pytest.mark.parametrize("input,expected", [
    # happy path
    # edge cases
    # boundary values
    # error cases (pytest.raises)
])
def test_<function_name>(input, expected):
    ...
```"""
            }]
        )

        return response.content[0].text

    def run_and_fix(self, test_file: str, source_file: str, max_attempts: int = 3) -> str:
        """Runs tests and iteratively fixes errors"""
        test_content = Path(test_file).read_text()

        for attempt in range(max_attempts):
            result = subprocess.run(
                ["python", "-m", "pytest", test_file, "-v", "--tb=short"],
                capture_output=True, text=True, timeout=60
            )

            if result.returncode == 0:
                return test_content  # All tests passed

            # Fix errors
            response = client.messages.create(
                model="claude-sonnet-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": f"""Tests failed. Fix the test file.

Test file:
```python
{test_content}

Errors:

{result.stdout[-2000:]}

Return fixed complete test file.""" }] )

        test_content = response.content[0].text
        Path(test_file).write_text(test_content)

    return test_content

### Mutation Testing for Test Quality Assessment

Generated tests should themselves be verified: do they actually catch real bugs?

```python
import mutmut
from pathlib import Path

def evaluate_test_quality(source_file: str, test_file: str) -> dict:
    """Runs mutation testing to assess test quality

    Invokes the ``mutmut`` command line tool and derives a mutation score
    (killed / (killed + survived)) from its output.

    Args:
        source_file: Path passed to mutmut as ``--paths-to-mutate``.
        test_file: Path passed to mutmut as ``--tests-dir``.

    Returns:
        A dict with ``mutation_score`` (0..1), ``killed_mutants``,
        ``survived_mutants`` and a ``verdict`` of ``"excellent"`` (> 0.8),
        ``"good"`` (> 0.6) or ``"needs_improvement"``.

    Raises:
        FileNotFoundError: If the ``mutmut`` executable is not installed.
        subprocess.TimeoutExpired: If the run exceeds 300 seconds.
    """
    import re  # local import: only this function parses mutmut output

    result = subprocess.run(
        ["mutmut", "run", f"--paths-to-mutate={source_file}", f"--tests-dir={test_file}"],
        capture_output=True, text=True, timeout=300
    )

    # Parse result
    # mutmut 2.x reports counts in a progress summary such as
    #   "20/20  <party> 14  <clock> 0  <thinking> 0  <frown> 6  <muted> 0"
    # where the party-popper emoji marks killed and the frowning face marks
    # survived mutants. The summary is rewritten in place while running, so
    # the LAST occurrence carries the final numbers.
    killed_mark = "\U0001F389"    # party popper: killed mutants
    survived_mark = "\U0001F641"  # slightly frowning face: survived mutants
    summaries = re.findall(
        rf"{killed_mark}\s*(\d+).*?{survived_mark}\s*(\d+)", result.stdout
    )

    if summaries:
        killed, survived = (int(count) for count in summaries[-1])
    else:
        # Fallback for output without the numeric summary: count lines that
        # mention the outcome. NOTE(review): this also matches descriptive
        # text (e.g. a legend), so treat the result as a rough estimate.
        survived = 0
        killed = 0
        for line in result.stdout.splitlines():
            lowered = line.lower()
            if "survived" in lowered:
                survived += 1
            elif "killed" in lowered:
                killed += 1

    total = survived + killed
    mutation_score = killed / total if total > 0 else 0.0

    return {
        "mutation_score": mutation_score,
        "killed_mutants": killed,
        "survived_mutants": survived,
        "verdict": "excellent" if mutation_score > 0.8 else "good" if mutation_score > 0.6 else "needs_improvement"
    }

Integration with pytest-cov and Automatic Report

def generate_coverage_report(test_file: str, source_file: str) -> dict:
    """Runs tests with coverage and returns report

    Runs pytest with pytest-cov options and reads the JSON report that is
    written to ``coverage.json`` in the current working directory.

    Args:
        test_file: Test module passed to pytest.
        source_file: Value passed to ``--cov``.

    Returns:
        A dict that always has the keys ``coverage_percent``,
        ``uncovered_lines`` (missing line numbers of all reported files)
        and ``passed`` (True when pytest exited with code 0). When no
        readable report was produced the result is
        ``{"coverage_percent": 0, "uncovered_lines": [], "passed": False}``.
    """
    import json
    import os
    import sys

    report_path = "coverage.json"

    # Remove a report left over from an earlier run; otherwise a run that
    # fails before writing its own report would return stale numbers.
    try:
        os.remove(report_path)
    except FileNotFoundError:
        pass

    result = subprocess.run(
        [
            # Same interpreter/venv as this tool, not whatever "python" is
            # first on PATH.
            sys.executable, "-m", "pytest", test_file,
            f"--cov={source_file}",
            f"--cov-report=json:{report_path}",
            "--cov-report=term-missing",
            "-v"
        ],
        capture_output=True, text=True
    )

    try:
        with open(report_path, encoding="utf-8") as f:
            coverage_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable report (pytest-cov missing, usage error, truncated
        # file): same shape as the success case so callers can rely on
        # every key being present.
        return {"coverage_percent": 0, "uncovered_lines": [], "passed": False}

    uncovered_lines = [
        line
        for file_data in coverage_data.get("files", {}).values()
        for line in file_data.get("missing_lines", [])
    ]

    return {
        "coverage_percent": coverage_data.get("totals", {}).get("percent_covered", 0),
        "uncovered_lines": uncovered_lines,
        "passed": result.returncode == 0,
    }

CLI for Team

import click

# CLI entry point: generates a test file for SOURCE_FILE, writes it to
# --output (default: test_<source name> next to the source) and, unless
# --no-run is given, runs the fix loop and prints the coverage percentage.
# The function docstring is kept short because click shows it as --help text.
@click.command()
@click.argument("source_file")
@click.option("--function", "-f", help="Specific function to test")
@click.option("--output", "-o", help="Output test file path")
@click.option("--run/--no-run", default=True, help="Run tests after generation")
def generate(source_file: str, function: Optional[str], output: Optional[str], run: bool):
    """Generates unit tests for Python file"""

    generator = TestGenerator(".")
    tests = generator.generate_tests(source_file, function)

    # click passes None for options that were not supplied on the command line.
    if not output:
        source_path = Path(source_file)
        output = str(source_path.parent / f"test_{source_path.name}")

    # Python source files are UTF-8 by default, so write the module that way
    # regardless of the platform's locale encoding.
    Path(output).write_text(tests, encoding="utf-8")
    click.echo(f"Tests written to {output}")

    if run:
        click.echo("Running tests...")
        # run_and_fix rewrites the file at `output` in place; its return
        # value (the final file content) is not needed here.
        generator.run_and_fix(output, source_file)
        coverage = generate_coverage_report(output, source_file)
        click.echo(f"Coverage: {coverage['coverage_percent']:.1f}%")

# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    generate()

Practical Case: Legacy Python Service Without Tests

Task: 8000 lines of Python code, 0% coverage, refactoring impossible without tests.

Process:

  1. Automatic analysis of all .py files via AST
  2. Test generation file by file (in batches, 5 files in parallel)
  3. Auto-run and fix cycle (up to 3 iterations)
  4. Manual review of tests with coverage < 60%

Results in 2 weeks:

  • Generated 847 test functions
  • Coverage: 0% → 71%
  • Found 12 real bugs during generation (AI noticed behavior/type mismatch)
  • 94% of generated tests passed without changes
  • 6% required manual fixes (complex mock dependencies)

Final test mutation score: 0.74 (good, but not excellent — AI missed some edge cases).

Timeline

  • Basic generator (one file, code export): 1–2 days
  • Auto-run and fix-cycle: 2–3 days
  • CI/CD integration with coverage gate: 1 week
  • Full pipeline for legacy codebase: 2–3 weeks