AI Agent Development with Code Execution Capability (Code Interpreter)
An AI agent with Code Interpreter capability executes arbitrary code within an isolated environment, obtains real computation results, and uses them for answering. This fundamentally differs from code generation without execution: the agent can iteratively write, run, and fix code until obtaining correct results.
Code Interpreter Architecture
Request → LLM generates code → Sandbox executes → Result/error
              ↑                                            │
              └──────────── Iteration on error ────────────┘
Key requirement: secure isolated execution environment. Without sandboxing, the agent could execute arbitrary system code.
Implementation via Docker Sandbox
import docker
import tempfile
import os
from pathlib import Path
class DockerCodeExecutor:
    """Execute untrusted Python code inside a locked-down Docker container.

    Each call to :meth:`execute` runs in a fresh temporary directory that is
    bind-mounted into the container as ``/workspace``.  The container gets no
    network access, a memory cap, and a CPU quota, and is force-removed after
    every run, so a misbehaving script cannot touch the host.
    """

    def __init__(self, image: str = "python:3.11-slim", timeout: int = 30):
        """
        Args:
            image: Docker image to run code in.  For data-analysis workloads
                use a pre-built image with numpy/pandas/matplotlib, e.g.
                ``docker build -t code-executor-sandbox -f Dockerfile.sandbox .``
            timeout: wall-clock limit (seconds) for a single execution.
        """
        self.client = docker.from_env()
        self.image = image
        self.timeout = timeout

    def execute(self, code: str, files: dict = None) -> dict:
        """Run ``code`` in an isolated container and return a result dict.

        Args:
            code: Python source to execute.
            files: optional ``{filename: bytes}`` input data files, made
                available in the container's working directory.

        Returns:
            On success: ``{"status": "success", "output": <stdout>, "files": [...]}``.
            On failure: ``{"status": "error", "output": <message>,
            "error_type": "runtime" | "timeout" | "system"}``.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            # Write input data files, then the script itself.
            if files:
                for fname, content in files.items():
                    (workdir / fname).write_bytes(content)
            (workdir / "script.py").write_text(code, encoding="utf-8")

            container = None
            try:
                # Detach so we can enforce the timeout ourselves:
                # docker-py's containers.run() accepts no `timeout` kwarg —
                # passing one is forwarded as an unknown create parameter
                # and fails at runtime instead of limiting execution.
                container = self.client.containers.run(
                    self.image,
                    command=["python", "/workspace/script.py"],
                    volumes={tmpdir: {"bind": "/workspace", "mode": "rw"}},
                    detach=True,
                    mem_limit="512m",
                    cpu_quota=50000,  # 50% of one CPU
                    network_disabled=True,  # No network!
                )
                try:
                    exit_info = container.wait(timeout=self.timeout)
                except Exception:
                    # wait() raised — most likely the read timeout fired.
                    # Kill the runaway script and report a timeout.
                    container.kill()
                    return {
                        "status": "error",
                        "output": f"Execution exceeded {self.timeout}s",
                        "error_type": "timeout",
                    }
                stdout = container.logs(stdout=True, stderr=False).decode("utf-8", "replace")
                stderr = container.logs(stdout=False, stderr=True).decode("utf-8", "replace")
                if exit_info.get("StatusCode", 1) != 0:
                    return {
                        "status": "error",
                        "output": stderr or stdout,
                        "error_type": "runtime",
                    }
                return {
                    "status": "success",
                    "output": stdout,
                    "files": self._list_output_files(tmpdir),
                }
            except Exception as e:
                # Daemon/image/volume errors and anything else unexpected.
                return {"status": "error", "output": str(e), "error_type": "system"}
            finally:
                # Best-effort cleanup; the container may already be gone.
                if container is not None:
                    try:
                        container.remove(force=True)
                    except Exception:
                        pass

    def _list_output_files(self, tmpdir: str) -> list:
        """Names of artifacts the script produced (plots, data exports)."""
        keep = {".png", ".csv", ".json", ".txt"}
        return [f.name for f in Path(tmpdir).iterdir() if f.suffix in keep]
Agent with Code Interpreter
from openai import OpenAI
import json

client = OpenAI()
executor = DockerCodeExecutor()

# JSON-schema for the single tool the model may call.
_EXECUTE_PYTHON_PARAMS = {
    "type": "object",
    "properties": {
        "code": {"type": "string", "description": "Python code to execute"},
        "description": {"type": "string", "description": "What this code does (for logging)"},
    },
    "required": ["code"],
}

# Tool list passed to the chat-completions API: one `execute_python` function.
code_tools = [
    {
        "type": "function",
        "function": {
            "name": "execute_python",
            "description": "Execute Python code and return result. Use for computations, data analysis, visualization.",
            "parameters": _EXECUTE_PYTHON_PARAMS,
        },
    }
]
def code_interpreter_agent(user_request: str, data_files: dict = None,
                           max_iterations: int = 8) -> str:
    """Answer ``user_request`` by iteratively generating and executing code.

    The model is offered the ``execute_python`` tool; every tool call is run
    in the Docker sandbox and the (possibly failing) result is fed back so
    the model can correct its own code on the next round-trip.

    Args:
        user_request: the analysis task, in natural language.
        data_files: optional ``{filename: bytes}`` inputs made available to
            every code execution.
        max_iterations: safety cap on model round-trips before giving up
            (previously a hard-coded constant of 8).

    Returns:
        The model's final text answer, or a notice that the cap was hit.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a data analyst with Python access.
Always write and execute code for calculations, not approximate.
Available libraries: pandas, numpy, matplotlib, scipy, sklearn, json, csv.
On error — analyze traceback and fix code."""
        },
        {"role": "user", "content": user_request},
    ]
    for _ in range(max_iterations):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=code_tools,
            tool_choice="auto",
        )
        message = response.choices[0].message
        messages.append(message)
        # No tool calls means the model has produced its final answer.
        if not message.tool_calls:
            return message.content
        for tool_call in message.tool_calls:
            try:
                code = json.loads(tool_call.function.arguments)["code"]
            except (json.JSONDecodeError, KeyError) as e:
                # Malformed tool arguments: report back instead of crashing,
                # so the model can retry with valid JSON.
                result = {
                    "status": "error",
                    "output": f"Invalid tool arguments: {e}",
                    "error_type": "arguments",
                }
            else:
                result = executor.execute(code, files=data_files)
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result, ensure_ascii=False),
            })
    return "Max iterations reached"
OpenAI Built-in Code Interpreter
OpenAI Assistants API provides built-in code interpreter (no need for own Docker):
from openai import OpenAI

client = OpenAI()

# Upload the data file first so it can be attached to the message below.
with open("sales_data.csv", "rb") as f:
    file = client.files.create(file=f, purpose="assistants")

# Assistant with the hosted Code Interpreter tool enabled — OpenAI runs the
# generated code server-side, so no local sandbox is required.
assistant = client.beta.assistants.create(
    name="Data Analyst",
    instructions="Analyze data using Python. Create visualizations.",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4o",
)

# A thread holds the conversation; the message attaches the uploaded file
# for the Code Interpreter tool to read.
thread = client.beta.threads.create()
client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="Analyze sales data and build a monthly chart",
    attachments=[{"file_id": file.id, "tools": [{"type": "code_interpreter"}]}],
)

# Start the run and block until it completes.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=assistant.id,
)
Practical Case: Financial Analyst with Code Interpreter
Task: automatic financial report building — agent receives CSV with transactions, independently writes code for analysis, builds charts, generates Excel report.
Request: "Analyze the attached Q1 2026 sales data. Calculate monthly dynamics, top 10 products, conversion funnel. Create PDF report with visualizations."
Agent Iterations:
- Load and verify CSV structure (5 columns, 45K rows)
- Clean data (duplicates, null values)
- Calculate monthly dynamics + bar chart
- ABC product analysis + Pareto chart
- Conversion funnel + funnel visualization
- Generate the PDF report via reportlab
Results:
- Report creation time: 3–4 hours (manual analyst) → 8 minutes
- Coverage of metrics: identical
- Requires review: interpretations and conclusions (agent formulates, human validates)
E2B Sandbox as Docker Alternative
E2B — managed sandbox for Code Interpreter without DevOps:
import e2b_code_interpreter as e2b

# E2B provisions and manages the sandbox for you — no local Docker setup.
sandbox = e2b.CodeInterpreter()
# Code execution: exec_cell runs the snippet in a notebook-style session
# and returns an object carrying the captured output.
execution = sandbox.notebook.exec_cell("""
import pandas as pd
df = pd.read_csv('/data/sales.csv')
print(df.describe())
""")
print(execution.stdout)
# Shut down the remote sandbox when finished.
sandbox.close()
Timeline
- Setup Docker sandbox + basic agent: 1–2 weeks
- Specialized analytical agent: 2–4 weeks
- Data sources integration: 1–2 weeks
- Total: 4–8 weeks







