Automated AI Code Review System
Automated code review doesn't replace architectural review — it eliminates the mechanical part: style checks, obvious vulnerabilities, test coverage, and violations of accepted patterns. A senior developer spends 15–20% of their time on review, and most of that time goes to comments like "there's no error handling here" or "give this variable a more specific name than `data`". An AI system takes this layer off their plate.
System Components
Diff Analyzer — receives a GitHub/GitLab webhook with the PR diff and parses the changes file by file.
Code Analyzer — LLM agent with tools: search the codebase, read related files, run static analysis.
Review Generator — forms structured comments with line numbers, severity, and fix suggestions.
PR Commenter — publishes comments via GitHub/GitLab API on specific lines.
Review Agent with Tools
import json
import os
import subprocess
from dataclasses import dataclass
from typing import Literal

from anthropic import Anthropic
from github import Github
# No API key is passed explicitly, so the Anthropic SDK is expected to read
# ANTHROPIC_API_KEY from the environment (the CI workflow exports it).
client = Anthropic()
# Read the GitHub token from the environment instead of passing the literal
# string "GITHUB_TOKEN" as the credential. `.get` keeps importing this module
# safe when the variable is unset (the client is then unauthenticated).
gh = Github(os.environ.get("GITHUB_TOKEN"))
@dataclass
class ReviewComment:
file: str
line: int
severity: Literal["critical", "warning", "suggestion", "nitpick"]
category: Literal["security", "performance", "style", "logic", "test_coverage", "error_handling"]
comment: str
suggestion: str | None = None
def get_pr_diff(repo_name: str, pr_number: int) -> dict:
    """Collect the per-file changes of a pull request.

    Looks the pull request up through the module-level ``gh`` client and
    returns a dict keyed by file name. Each value carries the file's
    ``patch``, ``additions``, ``deletions`` and ``status`` exactly as the
    GitHub client reports them.
    """
    pull_request = gh.get_repo(repo_name).get_pull(pr_number)
    return {
        changed.filename: {
            "patch": changed.patch,
            "additions": changed.additions,
            "deletions": changed.deletions,
            "status": changed.status,  # added/modified/removed
        }
        for changed in pull_request.get_files()
    }
def run_static_analysis(code: str, language: str) -> str:
    """Run a static analyzer over a code snippet and return its report.

    Only Python is analyzed: the snippet is piped to ``ruff check`` on
    stdin. Any other language yields an empty string.

    Args:
        code: Source text to analyze.
        language: Language of ``code``; only ``"python"`` triggers analysis.

    Returns:
        At most the first 2000 characters of the analyzer's stdout, an
        empty string for unsupported languages, or a short explanatory
        message when the analyzer could not be run. The function never
        raises for a missing or hanging analyzer, so the calling agent
        loop can hand the message back to the model.
    """
    if language != "python":
        return ""

    try:
        completed = subprocess.run(
            ["ruff", "check", "--select=ALL", "-"],
            input=code.encode(),
            capture_output=True,
            timeout=60,  # a stuck analyzer must not stall the whole review
            check=False,  # a non-zero exit just means findings were reported
        )
    except subprocess.TimeoutExpired:
        return "Static analysis timed out"
    except OSError as exc:
        # Covers a missing or non-executable ruff binary.
        return f"Static analysis unavailable: could not run ruff ({exc})"

    return completed.stdout.decode(errors="replace")[:2000]
def read_related_file(file_path: str) -> str:
    """Read the beginning of a file so it can be used as review context.

    Args:
        file_path: Path of the file to read.

    Returns:
        Up to the first 3000 characters of the file, or a human-readable
        message when the file is missing or cannot be read. The function
        does not raise for unreadable paths, so a bad path supplied by the
        model is reported back instead of aborting the review.
    """
    # NOTE(review): file_path is not confined to the repository checkout, so
    # whoever supplies it can read any file this process can access. Consider
    # restricting it to the checkout root before relying on this in CI.
    try:
        # errors="replace" keeps binary or non-UTF-8 files from raising
        # UnicodeDecodeError; read(3000) avoids loading the whole file only
        # to slice it afterwards.
        with open(file_path, encoding="utf-8", errors="replace") as f:
            return f.read(3000)
    except FileNotFoundError:
        return f"File {file_path} not found"
    except OSError as exc:
        # Directories, permission problems, and similar failures.
        return f"File {file_path} could not be read: {exc.strerror or exc}"
# JSON schema for the arguments of the static-analysis tool.
_STATIC_ANALYSIS_SCHEMA = {
    "type": "object",
    "properties": {
        "code": {"type": "string"},
        "language": {"type": "string", "enum": ["python", "typescript", "go"]},
    },
    "required": ["code", "language"],
}

# JSON schema for the arguments of the file-reading tool.
_READ_RELATED_FILE_SCHEMA = {
    "type": "object",
    "properties": {"file_path": {"type": "string"}},
    "required": ["file_path"],
}

# Tool definitions offered to the model; each name matches a function in this
# module that the agent loop dispatches to.
REVIEW_TOOLS = [
    {
        "name": "run_static_analysis",
        "description": "Run static analysis (ruff/eslint/etc) on code",
        "input_schema": _STATIC_ANALYSIS_SCHEMA,
    },
    {
        "name": "read_related_file",
        "description": "Read a related file to understand context (models, tests, etc)",
        "input_schema": _READ_RELATED_FILE_SCHEMA,
    },
]
def _build_review_prompt(filename: str, diff: str, full_code: str | None) -> str:
    """Render the user prompt that asks the model to review one file."""
    full_code_section = (
        f"Full file content:\n```\n{full_code}\n```" if full_code else ""
    )
    # The template lines start at column 0 on purpose: they are part of the
    # prompt text, so indenting them would change what the model receives.
    return f"""Review this code change. File: {filename}
Diff:
{diff}
{full_code_section}
Use tools to run static analysis and read related files if needed.
Then return a JSON array of review comments with this structure:
{{
"comments": [
{{
"line": <line_number_in_diff>,
"severity": "critical|warning|suggestion|nitpick",
"category": "security|performance|style|logic|test_coverage|error_handling",
"comment": "<explanation>",
"suggestion": "<code suggestion if applicable>"
}}
]
}}"""


def _run_review_tool(name: str, tool_input: dict) -> str:
    """Execute one tool call requested by the model and return its text result."""
    handlers = {
        "run_static_analysis": run_static_analysis,
        "read_related_file": read_related_file,
    }
    handler = handlers.get(name)
    if handler is None:
        return "Unknown tool"
    try:
        return handler(**tool_input)
    except (TypeError, ValueError, OSError, subprocess.SubprocessError) as exc:
        # Wrong or missing arguments, or a failing tool, are reported back to
        # the model instead of aborting the whole review.
        return f"Tool {name} failed: {exc}"


def _parse_review_comments(filename: str, text: str) -> list[ReviewComment]:
    """Extract review comments from the model's final answer.

    The JSON object is taken from the first ``{`` to the last ``}`` of
    ``text``. Entries that are malformed are skipped one by one, so a single
    bad comment does not discard the valid ones.
    """
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        return []
    try:
        data = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return []

    raw_comments = data.get("comments", []) if isinstance(data, dict) else []
    if not isinstance(raw_comments, list):
        return []

    # Only these keys are forwarded; anything else the model adds (including
    # a stray "file" key) would otherwise make the constructor raise.
    allowed = ("line", "severity", "category", "comment", "suggestion")
    parsed = []
    for raw in raw_comments:
        if not isinstance(raw, dict):
            continue
        fields = {key: raw[key] for key in allowed if key in raw}
        try:
            fields["line"] = int(fields["line"])
            parsed.append(ReviewComment(file=filename, **fields))
        except (KeyError, TypeError, ValueError):
            # Missing required field or a non-numeric line: skip this entry.
            continue
    return parsed


def review_file(
    filename: str,
    diff: str,
    full_code: str | None = None,
    max_turns: int = 10,
) -> list[ReviewComment]:
    """Review one changed file with the tool-using model.

    Sends the diff (and optionally the full file) to the model, executes the
    tool calls it requests, and parses its final answer into comments.

    Args:
        filename: Path of the file under review; stored on every comment.
        diff: Patch text of the change.
        full_code: Full file content to include in the prompt, if available.
        max_turns: Upper bound on model round-trips, so a model that keeps
            requesting tools cannot loop forever.

    Returns:
        The parsed review comments. The list is empty when the answer holds
        no usable JSON or when ``max_turns`` is exhausted before a final
        answer arrives.
    """
    messages = [{
        "role": "user",
        "content": _build_review_prompt(filename, diff, full_code),
    }]

    # Agentic loop
    for _ in range(max_turns):
        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=4096,
            tools=REVIEW_TOOLS,
            messages=messages,
        )

        if response.stop_reason != "tool_use":
            # Any stop reason other than a tool request (end_turn, max_tokens,
            # ...) leaves nothing to execute; continuing would send an empty
            # tool-result message. Parse whatever text was produced.
            final_text = next(
                (
                    block.text
                    for block in reversed(response.content)
                    if block.type == "text"
                ),
                "",
            )
            return _parse_review_comments(filename, final_text)

        # Process tool calls
        tool_results = [
            {
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": _run_review_tool(block.name, block.input),
            }
            for block in response.content
            if block.type == "tool_use"
        ]
        messages.append({"role": "assistant", "content": response.content})
        messages.append({"role": "user", "content": tool_results})

    return []
Publishing Comments in PR
def post_review(repo_name: str, pr_number: int, comments: list[ReviewComment]):
    """Publish the collected comments as one review on a GitHub pull request.

    Each comment becomes an inline remark on its file and line. The review
    requests changes when there is at least one critical comment or more
    than three warnings; otherwise it is posted as a plain comment.
    """
    pull_request = gh.get_repo(repo_name).get_pull(pr_number)
    # Inline comments are attached to the last commit listed for the PR.
    head_commit = list(pull_request.get_commits())[-1]

    inline_comments = []
    for item in comments:
        text = f"**[{item.severity.upper()}]** `{item.category}`\n\n{item.comment}"
        if item.suggestion:
            text += f"\n\n**Suggestion:**\n```python\n{item.suggestion}\n```"
        inline_comments.append({
            "path": item.file,
            "line": item.line,
            "body": text,
        })

    critical_count = sum(1 for item in comments if item.severity == "critical")
    warning_count = sum(1 for item in comments if item.severity == "warning")

    # Overall review status
    if critical_count > 0:
        verdict = "REQUEST_CHANGES"
        summary = f"AI Review: found {critical_count} critical issues. Requires fixing."
    elif warning_count > 3:
        verdict = "REQUEST_CHANGES"
        summary = f"AI Review: found {len(comments)} comments, {critical_count} critical."
    else:
        verdict = "COMMENT"
        summary = f"AI Review: found {len(comments)} minor comments."

    pull_request.create_review(
        commit=head_commit,
        body=summary,
        event=verdict,
        comments=inline_comments,
    )
Specialized Checkers
An LLM is good at spotting logic errors, but known vulnerability patterns are caught more cheaply and more reliably by specialized, deterministic checks:
import ast
import re
class SecurityChecker:
    """Checks Python code for typical vulnerabilities.

    Two kinds of findings are reported, both with severity ``critical`` and
    category ``security``: calls to functions listed in
    ``DANGEROUS_FUNCTIONS``, and ``execute``/``executemany`` calls whose
    query argument is built with string formatting.
    """

    DANGEROUS_FUNCTIONS = {"eval", "exec", "compile", "pickle.loads", "yaml.load"}

    # Legacy text heuristics, kept so existing references to this attribute
    # keep working. check_python no longer uses them: matched against raw
    # text they flag every execute("...") call, any .format( call, and
    # %(name)s placeholders of properly parameterized queries.
    SQL_INJECTION_PATTERNS = [
        r'execute\s*\(\s*[f"\']',  # f-string in execute()
        r'\.format\s*\(',          # .format() in SQL
        r'%\s*\(',                 # % in SQL query
    ]

    # Call names treated as "run this SQL".
    _SQL_EXECUTE_METHODS = frozenset({"execute", "executemany"})

    def check_python(self, code: str) -> list[dict]:
        """Scan Python source and return the security issues found.

        Args:
            code: Python source text.

        Returns:
            A list of dicts with ``line``, ``severity``, ``category`` and
            ``comment`` keys, ordered by line number. The list is empty when
            ``code`` cannot be parsed.
        """
        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            # ValueError: some Python versions raise it for null bytes.
            return []

        issues = []
        for node in ast.walk(tree):
            if not isinstance(node, ast.Call):
                continue

            # Check dangerous function calls
            func_name = self._call_name(node.func)
            if func_name in self.DANGEROUS_FUNCTIONS:
                issues.append(self._issue(
                    node.lineno,
                    f"Use of {func_name} is potentially dangerous",
                ))

            # SQL built with string formatting inside execute()
            if self._is_formatted_sql_call(node):
                issues.append(self._issue(
                    node.lineno,
                    "Possible SQL injection: use parameterized queries",
                ))

        # ast.walk is breadth-first; sort so the report reads top to bottom.
        issues.sort(key=lambda issue: issue["line"])
        return issues

    @staticmethod
    def _issue(line: int, comment: str) -> dict:
        """Build one finding in the shape check_python returns."""
        return {
            "line": line,
            "severity": "critical",
            "category": "security",
            "comment": comment,
        }

    @staticmethod
    def _call_name(func: ast.expr) -> str:
        """Return ``name`` or ``module.attr`` for a call target, else ''."""
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            owner = func.value.id if isinstance(func.value, ast.Name) else ""
            return f"{owner}.{func.attr}"
        return ""

    @classmethod
    def _is_formatted_sql_call(cls, call: ast.Call) -> bool:
        """Tell whether ``call`` runs SQL assembled by string formatting."""
        func = call.func
        if isinstance(func, ast.Attribute):
            method = func.attr
        elif isinstance(func, ast.Name):
            method = func.id
        else:
            return False
        if method not in cls._SQL_EXECUTE_METHODS or not call.args:
            return False

        query = call.args[0]
        if isinstance(query, ast.JoinedStr):
            # An f-string is only risky when it actually interpolates values.
            return any(isinstance(part, ast.FormattedValue) for part in query.values)
        if isinstance(query, ast.BinOp) and isinstance(query.op, ast.Mod):
            # "..." % values
            return True
        # "...".format(values)
        return (
            isinstance(query, ast.Call)
            and isinstance(query.func, ast.Attribute)
            and query.func.attr == "format"
        )
GitHub Actions Integration
# .github/workflows/ai-review.yml
# Runs the AI review script against a pull request and lets it post comments.
name: AI Code Review
on:
  # Trigger when a PR is opened and again whenever new commits are pushed to it.
  pull_request:
    types: [opened, synchronize]
jobs:
  review:
    runs-on: ubuntu-latest
    permissions:
      # Write access to pull requests is what allows posting the review.
      pull-requests: write
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          # 0 fetches the full history instead of a shallow clone.
          fetch-depth: 0
      - name: Run AI Review
        env:
          # Both secrets are exposed to the script as environment variables.
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          pip install anthropic pygithub ruff
          python scripts/ai_review.py \
          --repo "${{ github.repository }}" \
          --pr "${{ github.event.pull_request.number }}"
Practical Case: 8-Person Developer Team
Problem: the team's senior developer spent 6–8 hours per week on review, and 40% of the comments were repetitive (no error handling, hardcoded config, no tests).
AI Review Implementation:
- Critical issues (security, obvious bugs): block merge
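- Warnings: shown but don't block on their own (the `post_review` code above requests changes only when a PR has more than three warnings)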
- Nitpicks: optional checklist
Results:
- Mechanical comments from senior: -71%
- Average time to first review: 4 hours → 3 minutes (AI triggers immediately)
- Bugs reaching production: -34%
- Senior's review time: 7 hours → 2 hours (only architectural decisions)
Important finding: AI system found real bugs in 23% of PRs — not just style comments.
Timeline
- Basic review with GitHub posting: 3–5 days
- Specialized security checkers + static analysis: 1 week
- Fine-tuning for project conventions: 1–2 weeks
- CI/CD integration with merge policies: 1 week







