Tabular Data Generation Model Training

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
Tabular Data Generation Model Training
Medium
~5 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1221
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    855
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1056
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    828

Training Model for Tabular Data Generation (Tabular ML)

Tabular data generation means training a model that synthesizes new rows that are statistically indistinguishable from the training dataset. Applications: augmentation for imbalanced datasets, synthetic data for privacy, test data.

CTGAN: Training and Tuning

from typing import Optional

import pandas as pd

from sdv.evaluation.single_table import evaluate_quality, run_diagnostic
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

def train_tabular_generator(df: pd.DataFrame,
                              target_column: Optional[str] = None) -> CTGANSynthesizer:
    """Fit a CTGAN synthesizer on *df* and return the trained model.

    Args:
        df: Training table; column sdtypes are auto-detected from the data.
        target_column: Optional name of a target column to force to the
            'categorical' sdtype (auto-detection may misread a binary 0/1
            target as numerical).

    Returns:
        A fitted ``CTGANSynthesizer`` ready for ``.sample(...)``.
    """
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    # Fine-tuning column types
    if target_column:
        # Binary target — always categorical
        metadata.update_column(target_column, sdtype='categorical')

    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=500,
        batch_size=500,
        generator_dim=(256, 256, 256),    # Deeper for complex data
        discriminator_dim=(256, 256, 256),
        pac=10,                            # Packing for training stabilization
        log_frequency=True,
        verbose=True,
    )

    synthesizer.fit(df)
    return synthesizer

def evaluate_generator(real_df: pd.DataFrame,
                        synthetic_df: pd.DataFrame,
                        metadata: SingleTableMetadata) -> dict:
    """Score synthetic data against the real table.

    Runs SDV's quality report (statistical similarity of columns and
    column pairs) and diagnostic report (validity checks), returning a
    single summary dict.
    """
    report = evaluate_quality(real_df, synthetic_df, metadata)
    diagnostics = run_diagnostic(real_df, synthetic_df, metadata)

    summary = {
        'quality_score': report.get_score(),
        'column_shapes': report.get_details('Column Shapes'),
        'column_pair_trends': report.get_details('Column Pair Trends'),
        'diagnostic': diagnostics.get_results(),
    }
    return summary

SMOTE for Imbalanced Classes

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek

def balance_dataset(X: pd.DataFrame, y: pd.Series,
                     method: str = 'smote') -> tuple:
    """Oversample the minority class(es) of ``(X, y)``.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        method: One of ``'smote'``, ``'adasyn'``, ``'borderline'``,
            ``'smote_tomek'``.

    Returns:
        ``(X_resampled, y_resampled)`` with balanced class counts.

    Raises:
        ValueError: If *method* is not a known resampling strategy.
    """
    # Factories so that only the requested resampler is instantiated.
    factories = {
        'smote': lambda: SMOTE(sampling_strategy='auto', random_state=42),
        'adasyn': lambda: ADASYN(sampling_strategy='auto', random_state=42),
        'borderline': lambda: BorderlineSMOTE(random_state=42),
        'smote_tomek': lambda: SMOTETomek(random_state=42),  # SMOTE + Tomek Links cleanup
    }
    try:
        resampler = factories[method]()
    except KeyError:
        # An explicit error beats the bare KeyError a bad key would raise.
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(factories)}"
        ) from None

    X_res, y_res = resampler.fit_resample(X, y)

    print(f"Original distribution: {y.value_counts().to_dict()}")
    print(f"Resampled distribution: {pd.Series(y_res).value_counts().to_dict()}")

    return X_res, y_res

TabDDPM: Diffusion Models for Tables

Latest approach for complex multimodal tabular data:

# TabDDPM — SOTA for tabular generation
from tab_ddpm import TabDDPM

# NOTE(review): X_train is assumed to be defined earlier (features only,
# with 10 numerical + 5 categorical columns to match the config below) —
# confirm against the surrounding context.
model = TabDDPM(
    num_numerical=10,  # Number of numerical features
    num_categorical=5, # Number of categorical features
    d_layers=[256, 256, 256],  # hidden-layer widths of the denoiser MLP
    dropout=0.0,
    rtdl_params={},  # extra backbone options; empty dict -> library defaults
)

# Fit the diffusion model, then draw as many synthetic rows as the
# training set contains.
model.fit(X_train, num_epochs=1000, batch_size=256)
synthetic = model.sample(n=len(X_train))

ML Utility Test (TSTR)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

def _fit_auc(X_train: pd.DataFrame, y_train, X_test: pd.DataFrame, y_test) -> float:
    """Fit a GradientBoostingClassifier and return its test-set ROC AUC.

    Assumes a binary target: scores the positive-class probability column.
    """
    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_train, y_train)
    return roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

def tstr_evaluation(real_train: pd.DataFrame, synthetic_train: pd.DataFrame,
                     real_test: pd.DataFrame, target: str) -> dict:
    """Train on Synthetic, Test on Real (TSTR) utility check.

    Trains one classifier on real data and one on synthetic data, scores
    both on the same held-out real test set, and reports the relative AUC
    drop ("utility gap") caused by training on synthetic data.

    Args:
        real_train: Real training table including the *target* column.
        synthetic_train: Synthetic training table with the same schema.
        real_test: Held-out real table used for evaluation.
        target: Name of the (binary) target column.

    Returns:
        Dict with 'auc_real', 'auc_synthetic' and 'utility_gap'.
    """
    X_test = real_test.drop(columns=[target])
    y_test = real_test[target]

    # Model on real data (baseline)
    auc_real = _fit_auc(real_train.drop(columns=[target]),
                        real_train[target], X_test, y_test)

    # Model on synthetic data
    auc_synth = _fit_auc(synthetic_train.drop(columns=[target]),
                         synthetic_train[target], X_test, y_test)

    utility_gap = (auc_real - auc_synth) / auc_real
    print(f"AUC Real: {auc_real:.4f}, AUC Synthetic: {auc_synth:.4f}")
    print(f"ML Utility Gap: {utility_gap:.2%} (target: < 5%)")

    return {'auc_real': auc_real, 'auc_synthetic': auc_synth, 'utility_gap': utility_gap}

For most tasks, CTGAN with 300-500 training epochs achieves an ML utility gap of < 5%, given high-quality original data. TabDDPM shows better results on complex, high-dimensional datasets.