Training Model for Tabular Data Generation (Tabular ML)
Tabular data generation means training a model that synthesizes new rows that are statistically indistinguishable from the training dataset. Applications: augmentation for imbalanced datasets, synthetic data for privacy, and test data.
CTGAN: Training and Tuning
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic
import pandas as pd
def train_tabular_generator(df: pd.DataFrame,
                            target_column: str | None = None) -> CTGANSynthesizer:
    """Train a CTGAN synthesizer on a tabular dataset.

    Args:
        df: Training data; column sdtypes are auto-detected from the frame.
        target_column: Optional name of a target column to force to the
            'categorical' sdtype (e.g. a binary label that auto-detection
            might mislabel as numerical).

    Returns:
        A fitted CTGANSynthesizer ready to ``.sample()`` new rows.

    Raises:
        ValueError: If ``target_column`` is given but absent from ``df``.
    """
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)
    # Fine-tuning column types
    if target_column:
        if target_column not in df.columns:
            raise ValueError(f"target_column {target_column!r} not in dataframe")
        # Binary target — always categorical
        metadata.update_column(target_column, sdtype='categorical')
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=500,
        batch_size=500,
        generator_dim=(256, 256, 256),       # Deeper for complex data
        discriminator_dim=(256, 256, 256),
        pac=10,                              # Packing for training stabilization
        log_frequency=True,
        verbose=True,
    )
    synthesizer.fit(df)
    return synthesizer
def evaluate_generator(real_df: pd.DataFrame,
                       synthetic_df: pd.DataFrame,
                       metadata: SingleTableMetadata) -> dict:
    """Score synthetic data against the real dataset.

    Runs SDV's quality report (statistical fidelity) and diagnostic
    (structural validity) and packages the key results in one dict.
    """
    quality = evaluate_quality(real_df, synthetic_df, metadata)
    diagnostic = run_diagnostic(real_df, synthetic_df, metadata)
    report = {
        'quality_score': quality.get_score(),
        'column_shapes': quality.get_details('Column Shapes'),
        'column_pair_trends': quality.get_details('Column Pair Trends'),
        'diagnostic': diagnostic.get_results(),
    }
    return report
SMOTE for Imbalanced Classes
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek
def balance_dataset(X: pd.DataFrame, y: pd.Series,
                    method: str = 'smote') -> tuple:
    """Rebalance an imbalanced dataset by synthetic oversampling.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        method: One of 'smote', 'adasyn', 'borderline', 'smote_tomek'.

    Returns:
        Tuple ``(X_res, y_res)`` of the resampled features and labels.

    Raises:
        ValueError: If ``method`` is not a supported strategy name.
    """
    methods = {
        'smote': SMOTE(sampling_strategy='auto', random_state=42),
        'adasyn': ADASYN(sampling_strategy='auto', random_state=42),
        'borderline': BorderlineSMOTE(random_state=42),
        'smote_tomek': SMOTETomek(random_state=42),  # SMOTE + Tomek Links cleanup
    }
    # Explicit validation: a bare dict lookup would raise an unhelpful KeyError.
    if method not in methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(methods)}"
        )
    resampler = methods[method]
    X_res, y_res = resampler.fit_resample(X, y)
    print(f"Original distribution: {y.value_counts().to_dict()}")
    print(f"Resampled distribution: {pd.Series(y_res).value_counts().to_dict()}")
    return X_res, y_res
TableDiffusion: Diffusion Models for Tables
Latest approach for complex multimodal tabular data:
# TabDDPM — SOTA for tabular generation
from tab_ddpm import TabDDPM
# Build a tabular diffusion model; feature counts must match the
# training frame's split into numerical vs. categorical columns.
# NOTE(review): X_train is defined outside this snippet — assumed to be a
# preprocessed feature matrix; confirm against the surrounding pipeline.
model = TabDDPM(
    num_numerical=10,      # Number of numerical features
    num_categorical=5,     # Number of categorical features
    d_layers=[256, 256, 256],
    dropout=0.0,
    rtdl_params={},
)
model.fit(X_train, num_epochs=1000, batch_size=256)
# Sample a synthetic dataset the same size as the training data.
synthetic = model.sample(n=len(X_train))
ML Utility Test (TSTR)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
def tstr_evaluation(real_train: pd.DataFrame, synthetic_train: pd.DataFrame,
                    real_test: pd.DataFrame, target: str) -> dict:
    """Train on Synthetic, Test on Real (TSTR) ML-utility evaluation.

    Fits one classifier on real training data and one on synthetic
    training data, scores both on the same held-out real test set, and
    reports the relative drop in AUC ("utility gap").

    Args:
        real_train: Real training rows including the target column.
        synthetic_train: Synthetic training rows including the target column.
        real_test: Held-out real rows used for scoring both models.
        target: Name of the binary target column.

    Returns:
        Dict with 'auc_real', 'auc_synthetic', and 'utility_gap'.
    """
    X_test = real_test.drop(columns=[target])
    y_test = real_test[target]

    def _fit_auc(train_df: pd.DataFrame) -> float:
        # One GBM per training source; fixed seed keeps the comparison fair.
        clf = GradientBoostingClassifier(random_state=42)
        clf.fit(train_df.drop(columns=[target]), train_df[target])
        return roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    auc_real = _fit_auc(real_train)
    auc_synth = _fit_auc(synthetic_train)
    # Relative degradation of the synthetic-trained model vs. the real one.
    utility_gap = (auc_real - auc_synth) / auc_real
    print(f"AUC Real: {auc_real:.4f}, AUC Synthetic: {auc_synth:.4f}")
    print(f"ML Utility Gap: {utility_gap:.2%} (target: < 5%)")
    return {'auc_real': auc_real, 'auc_synthetic': auc_synth, 'utility_gap': utility_gap}
For most tasks, CTGAN with 300-500 training epochs achieves an ML utility gap below 5%, given high-quality original data. TabDDPM shows better results on complex, high-dimensional datasets.







