AI Automatic Test and Exam Task Generation System

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. View all 1566 services.
AI Automatic Test and Exam Task Generation System
Medium
~1-2 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

Developing AI-Based Automated Test and Exam Generation System

AI generation of test questions creates unique questions from educational materials at different difficulty levels (Bloom's taxonomy), with multiple variants of each question — preventing cheating in mass testing.

Multi-Level Question Generation

import asyncio
import json
import random
from enum import Enum

from openai import AsyncOpenAI

client = AsyncOpenAI()

# Bloom's taxonomy cognitive levels, ordered from lowest to highest demand.
# Built with the Enum functional API; the values are the lowercase level
# names that get interpolated into prompts and serialized into question JSON.
BloomLevel = Enum(
    "BloomLevel",
    [
        ("REMEMBER", "remember"),      # memorize facts
        ("UNDERSTAND", "understand"),  # explain concepts
        ("APPLY", "apply"),            # apply in new situation
        ("ANALYZE", "analyze"),        # break down into parts
        ("EVALUATE", "evaluate"),      # critically evaluate
        ("CREATE", "create"),          # create something new
    ],
)

# System-prompt fragments, one per Bloom level; interpolated into the
# question-generation prompt to steer the cognitive demand of each question.
BLOOM_PROMPTS = dict([
    (BloomLevel.REMEMBER, "Create a question testing fact memorization, dates, definitions"),
    (BloomLevel.UNDERSTAND, "Create a comprehension question: explanation, paraphrasing, examples"),
    (BloomLevel.APPLY, "Create a practical task: applying knowledge in new situation"),
    (BloomLevel.ANALYZE, "Create an analysis question: comparison, identifying causes, structuring"),
    (BloomLevel.EVALUATE, "Create an evaluation question: justifying judgment, critiquing approach"),
    (BloomLevel.CREATE, "Create a synthesis task: developing solution, creating product"),
])

async def generate_question(
    topic: str,
    source_text: str,
    question_type: str,  # multiple_choice, true_false, open_answer, case_study
    bloom_level: BloomLevel = BloomLevel.UNDERSTAND,
    difficulty: str = "medium"
) -> dict:
    """Generate a single test question from source material via the LLM.

    The system prompt pins the question type, Bloom level and difficulty;
    the user message carries the topic plus the (truncated) source text.
    Returns the parsed JSON question object produced by the model.
    """
    system_prompt = f"""Create a test question.
            Type: {question_type}.
            Bloom's taxonomy level: {bloom_level.value}. {BLOOM_PROMPTS[bloom_level]}.
            Difficulty: {difficulty}.

            For multiple_choice: 4 options, 1 correct, 3 plausible distractors.
            For open_answer: reference answer + assessment criteria.
            For case_study: scenario + 3-5 questions at different levels.

            Return JSON: {{
                question: "question text",
                type: "{question_type}",
                bloom_level: "{bloom_level.value}",
                options: ["A...", "B...", ...],  // only for MC
                correct_answer: "...",
                explanation: "why this answer is correct",
                scoring_rubric: {{...}}  // for open_answer
            }}"""
    # Cap the source material at 2000 chars to keep the prompt size bounded.
    user_prompt = f"Topic: {topic}\n\nMaterial:\n{source_text[:2000]}"

    completion = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "json_object"}
    )
    return json.loads(completion.choices[0].message.content)

Generating Complete Exam Variant

async def generate_exam_variant(
    course_topics: list[str],
    total_questions: int = 30,
    time_limit_min: int = 60,
    bloom_distribution: dict | None = None
) -> dict:
    """Assemble a complete exam variant by generating questions concurrently.

    Args:
        course_topics: Topics cycled through when assigning questions.
        total_questions: Target number of questions in the variant.
        time_limit_min: Time limit recorded in the variant metadata.
        bloom_distribution: Mapping of BloomLevel -> fraction of questions
            (fractions should sum to ~1.0), e.g.
            {BloomLevel.REMEMBER: 0.3, BloomLevel.APPLY: 0.4, ...}.
            Default: 20% Remember, 30% Understand, 30% Apply, 20% Analyze.

    Returns:
        Dict with variant_id, time_limit_min, total_points, the generated
        questions, and the per-level question counts.
    """
    if not bloom_distribution:
        bloom_distribution = {
            BloomLevel.REMEMBER: 0.2,
            BloomLevel.UNDERSTAND: 0.3,
            BloomLevel.APPLY: 0.3,
            BloomLevel.ANALYZE: 0.2
        }

    questions_by_level = {
        level: int(total_questions * fraction)
        for level, fraction in bloom_distribution.items()
    }
    # int() truncation can drop questions (e.g. three levels at 1/3 of 10
    # yields 9); hand the shortfall out one question per level so the exam
    # always contains exactly total_questions.
    shortfall = total_questions - sum(questions_by_level.values())
    for level in questions_by_level:
        if shortfall <= 0:
            break
        questions_by_level[level] += 1
        shortfall -= 1

    tasks = []
    for level, count in questions_by_level.items():
        for i in range(count):
            # Cycle through topics so all topics are covered evenly.
            topic = course_topics[i % len(course_topics)]
            # Lower Bloom levels are cheap to auto-grade as multiple choice;
            # higher levels need free-form answers.
            q_type = "multiple_choice" if level in (BloomLevel.REMEMBER, BloomLevel.UNDERSTAND) else "open_answer"
            tasks.append(generate_question(
                topic=topic,
                source_text="",
                question_type=q_type,
                bloom_level=level
            ))

    # Generate all questions concurrently; results keep task order.
    all_questions = await asyncio.gather(*tasks)

    return {
        "variant_id": f"V{random.randint(1000, 9999)}",
        "time_limit_min": time_limit_min,
        "total_points": sum(q.get("points", 1) for q in all_questions),
        "questions": list(all_questions),
        "bloom_distribution": {l.value: c for l, c in questions_by_level.items()}
    }

Automatic Grading of Open Answers

async def auto_grade_open_answer(
    question: str,
    correct_answer: str,
    rubric: dict,
    student_answer: str
) -> dict:
    """Score a free-form student answer against a reference answer and rubric.

    Returns the model's parsed JSON verdict:
    {score: 0-100, feedback, strengths, weaknesses}.
    """
    grader_instructions = f"""Grade student answer using rubric.
            Question: {question}
            Reference answer: {correct_answer}
            Assessment criteria: {json.dumps(rubric, ensure_ascii=False)}

            Grade the answer and return JSON:
            {{score: 0-100, feedback: "detailed feedback", strengths: [], weaknesses: []}}"""
    conversation = [
        {"role": "system", "content": grader_instructions},
        {"role": "user", "content": f"Student answer: {student_answer}"},
    ]

    result = await client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_format={"type": "json_object"}
    )
    return json.loads(result.choices[0].message.content)

Anti-Plagiarism — Unique Variants

async def generate_unique_variants(
    base_question: str,
    n_variants: int = 30,
    maintain_difficulty: bool = True
) -> list[dict]:
    """Generate N unique versions of a question (anti-plagiarism variants).

    Args:
        base_question: The question to paraphrase / re-parameterize.
        n_variants: How many distinct versions to request.
        maintain_difficulty: If True, instruct the model to keep difficulty constant.

    Returns:
        The list stored under the "variants" key of the model's JSON reply.
    """
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "system",
            # The json_object response format requires the model to emit a
            # JSON *object*, so ask explicitly for {"variants": [...]} —
            # the original prompt ("Return JSON array.") contradicted the
            # response_format and made the ["variants"] lookup unreliable.
            "content": f"""Create {n_variants} unique versions of the question.
            Vary: numbers, names, context, order of answer options.
            Difficulty {'should remain the same' if maintain_difficulty else 'can vary'}.
            Return a JSON object: {{"variants": [ ... ]}} containing exactly {n_variants} items."""
        }, {
            "role": "user",
            "content": f"Original question: {base_question}"
        }],
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)["variants"]

Timeframe: test generator from text material — 1-2 weeks. Full-fledged platform with auto-grading, analytics, and LMS integration (Moodle/iSpring) — 2-3 months.