Developing AI-Based Automated Test and Exam Generation System
AI-based generation of test questions produces unique items from educational materials at different difficulty levels (Bloom's taxonomy), with multiple exam variants per question — preventing cheating in mass testing.
Multi-Level Question Generation
import asyncio
import json
import random
from enum import Enum

from openai import AsyncOpenAI
client = AsyncOpenAI()
class BloomLevel(Enum):
REMEMBER = "remember" # memorize facts
UNDERSTAND = "understand" # explain concepts
APPLY = "apply" # apply in new situation
ANALYZE = "analyze" # break down into parts
EVALUATE = "evaluate" # critically evaluate
CREATE = "create" # create something new
BLOOM_PROMPTS = {
BloomLevel.REMEMBER: "Create a question testing fact memorization, dates, definitions",
BloomLevel.UNDERSTAND: "Create a comprehension question: explanation, paraphrasing, examples",
BloomLevel.APPLY: "Create a practical task: applying knowledge in new situation",
BloomLevel.ANALYZE: "Create an analysis question: comparison, identifying causes, structuring",
BloomLevel.EVALUATE: "Create an evaluation question: justifying judgment, critiquing approach",
BloomLevel.CREATE: "Create a synthesis task: developing solution, creating product",
}
async def generate_question(
topic: str,
source_text: str,
question_type: str, # multiple_choice, true_false, open_answer, case_study
bloom_level: BloomLevel = BloomLevel.UNDERSTAND,
difficulty: str = "medium"
) -> dict:
response = await client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "system",
"content": f"""Create a test question.
Type: {question_type}.
Bloom's taxonomy level: {bloom_level.value}. {BLOOM_PROMPTS[bloom_level]}.
Difficulty: {difficulty}.
For multiple_choice: 4 options, 1 correct, 3 plausible distractors.
For open_answer: reference answer + assessment criteria.
For case_study: scenario + 3-5 questions at different levels.
Return JSON: {{
question: "question text",
type: "{question_type}",
bloom_level: "{bloom_level.value}",
options: ["A...", "B...", ...], // only for MC
correct_answer: "...",
explanation: "why this answer is correct",
scoring_rubric: {{...}} // for open_answer
}}"""
}, {
"role": "user",
"content": f"Topic: {topic}\n\nMaterial:\n{source_text[:2000]}"
}],
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)
Generating Complete Exam Variant
async def generate_exam_variant(
course_topics: list[str],
total_questions: int = 30,
time_limit_min: int = 60,
bloom_distribution: dict = None
) -> dict:
"""
bloom_distribution: {BloomLevel.REMEMBER: 0.3, BloomLevel.APPLY: 0.4, ...}
Default: 20% Remember, 30% Understand, 30% Apply, 20% Analyze
"""
if not bloom_distribution:
bloom_distribution = {
BloomLevel.REMEMBER: 0.2,
BloomLevel.UNDERSTAND: 0.3,
BloomLevel.APPLY: 0.3,
BloomLevel.ANALYZE: 0.2
}
questions_by_level = {
level: int(total_questions * fraction)
for level, fraction in bloom_distribution.items()
}
all_questions = []
tasks = []
for level, count in questions_by_level.items():
for i in range(count):
topic = course_topics[i % len(course_topics)]
q_type = "multiple_choice" if level in [BloomLevel.REMEMBER, BloomLevel.UNDERSTAND] else "open_answer"
tasks.append(generate_question(
topic=topic,
source_text="",
question_type=q_type,
bloom_level=level
))
all_questions = await asyncio.gather(*tasks)
return {
"variant_id": f"V{random.randint(1000, 9999)}",
"time_limit_min": time_limit_min,
"total_points": sum(q.get("points", 1) for q in all_questions),
"questions": list(all_questions),
"bloom_distribution": {l.value: c for l, c in questions_by_level.items()}
}
Automatic Grading of Open Answers
async def auto_grade_open_answer(
question: str,
correct_answer: str,
rubric: dict,
student_answer: str
) -> dict:
response = await client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "system",
"content": f"""Grade student answer using rubric.
Question: {question}
Reference answer: {correct_answer}
Assessment criteria: {json.dumps(rubric, ensure_ascii=False)}
Grade the answer and return JSON:
{{score: 0-100, feedback: "detailed feedback", strengths: [], weaknesses: []}}"""
}, {
"role": "user",
"content": f"Student answer: {student_answer}"
}],
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)
Anti-Plagiarism — Unique Variants
async def generate_unique_variants(
base_question: str,
n_variants: int = 30,
maintain_difficulty: bool = True
) -> list[dict]:
"""Generate N unique versions of a question"""
response = await client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "system",
"content": f"""Create {n_variants} unique versions of the question.
Vary: numbers, names, context, order of answer options.
Difficulty {'should remain the same' if maintain_difficulty else 'can vary'}.
Return JSON array."""
}, {
"role": "user",
"content": f"Original question: {base_question}"
}],
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)["variants"]
Timeframe: a test generator working from text material takes 1-2 weeks; a full-fledged platform with auto-grading, analytics, and LMS integration (Moodle/iSpring) takes 2-3 months.







