ML-targeting of audiences in advertising
Machine learning transforms targeting from "show to all women 25-34" into "show to users who are 73%+ likely to convert in the next 7 days." With the same budget, this yields a 3-5x difference in effectiveness.
Predictive Targeting: From Segments to Probabilities
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder
class PredictiveAudienceBuilder:
    """Build advertising audiences from per-user conversion probabilities."""

    def build_intent_features(self, user_events: pd.DataFrame) -> pd.DataFrame:
        """Derive purchase-intent features from raw user events.

        Parameters
        ----------
        user_events : pd.DataFrame
            Expected columns: user_id, event_type, page_url, timestamp,
            session_id.

        Returns
        -------
        pd.DataFrame
            Indexed by user_id, with behavioral counters, recency and a
            7-day activity trend; missing counts are filled with 0.
        """
        df = user_events.copy()
        df['ts'] = pd.to_datetime(df['timestamp'])

        # Recency: hours since each user's last event, measured against the
        # newest event in the dataset (which acts as "now").
        now = df['ts'].max()
        recency = (
            df.groupby('user_id')['ts'].max()
            .apply(lambda t: (now - t).total_seconds() / 3600)
            .rename('hours_since_last_event')
        )

        # Behavioral counters per user.
        behavior = df.groupby('user_id').agg(
            total_sessions=('session_id', 'nunique'),
            total_events=('event_type', 'count'),
            product_views=('event_type', lambda x: (x == 'product_view').sum()),
            cart_adds=('event_type', lambda x: (x == 'add_to_cart').sum()),
            checkout_starts=('event_type', lambda x: (x == 'checkout_start').sum()),
            search_queries=('event_type', lambda x: (x == 'search').sum()),
        )

        # Weighted funnel depth normalized per session; the clip guards
        # against division by zero when total_sessions is 0.
        behavior['funnel_depth'] = (
            behavior['product_views'] * 1 +
            behavior['cart_adds'] * 3 +
            behavior['checkout_starts'] * 7
        ) / behavior['total_sessions'].clip(lower=1)

        # Activity trend: events in the last 7 days vs the previous 7.
        # NOTE(review): `between` is inclusive on both ends, so an event
        # falling exactly at now-7d is counted in both windows — confirm
        # whether that boundary overlap is intended.
        last_7d = df[df['ts'] >= now - pd.Timedelta(days=7)]
        prev_7d = df[df['ts'].between(now - pd.Timedelta(days=14),
                                      now - pd.Timedelta(days=7))]
        activity_last = last_7d.groupby('user_id')['event_type'].count().rename('events_last_7d')
        activity_prev = prev_7d.groupby('user_id')['event_type'].count().rename('events_prev_7d')

        result = behavior.join(recency).join(activity_last).join(activity_prev).fillna(0)
        # +1 smoothing keeps the ratio finite for users with no prior activity.
        result['activity_trend'] = (
            result['events_last_7d'] - result['events_prev_7d']
        ) / (result['events_prev_7d'] + 1)
        return result

    def score_purchase_propensity(self, features: pd.DataFrame,
                                  model: 'lgb.LGBMClassifier') -> pd.DataFrame:
        """Score each user's purchase probability and assign an audience tier.

        Parameters
        ----------
        features : pd.DataFrame
            Feature matrix indexed by user_id (as produced by
            build_intent_features).
        model : lgb.LGBMClassifier
            Fitted binary classifier; any estimator exposing
            ``predict_proba`` works.

        Returns
        -------
        pd.DataFrame
            Columns user_id, purchase_probability, audience_tier, sorted by
            probability descending.
        """
        scores = model.predict_proba(features)[:, 1]
        result = pd.DataFrame({
            'user_id': features.index,
            'purchase_probability': scores,
            # include_lowest=True so a probability of exactly 0.0 lands in
            # 'cold' instead of falling outside the bins and producing a
            # NaN tier (bug fix).
            'audience_tier': pd.cut(
                scores,
                bins=[0, 0.1, 0.3, 0.6, 1.0],
                labels=['cold', 'warm', 'hot', 'ready_to_buy'],
                include_lowest=True,
            )
        })
        return result.sort_values('purchase_probability', ascending=False)
class BehavioralClusteringAudience:
    """Unsupervised behavioral segmentation of users."""

    def segment_by_behavior(self, user_features: pd.DataFrame,
                            n_clusters: int = 8) -> 'tuple[pd.DataFrame, pd.DataFrame]':
        """Cluster users with MiniBatchKMeans to surface latent segments.

        Parameters
        ----------
        user_features : pd.DataFrame
            One row per user; only numeric columns are used for clustering
            (NaNs are treated as 0).
        n_clusters : int
            Number of clusters to fit.

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame]
            (users with an added 'cluster' column, per-cluster mean-feature
            profiles). Fix: the original annotation claimed a single
            DataFrame, but the method has always returned this tuple.
        """
        from sklearn.preprocessing import StandardScaler

        # Capture numeric columns BEFORE adding 'cluster' so the profile
        # table does not include the label itself.
        numeric_cols = user_features.select_dtypes(include=[np.number]).columns
        X = user_features[numeric_cols].fillna(0)
        X_scaled = StandardScaler().fit_transform(X)

        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        assignments = kmeans.fit_predict(X_scaled)

        labeled = user_features.copy()
        labeled['cluster'] = assignments
        # Per-cluster means give an interpretable profile of each segment.
        profiles = labeled.groupby('cluster')[numeric_cols].mean()
        return labeled, profiles

    def label_clusters(self, cluster_profiles: pd.DataFrame) -> dict:
        """Heuristically name clusters from their mean-feature profiles.

        Rules are checked in priority order; clusters matching none of
        them get a generic 'segment_<id>' label. Missing columns fall back
        to defaults via Series.get, so partial profiles are tolerated.
        """
        labels = {}
        for cluster_id, row in cluster_profiles.iterrows():
            if row.get('checkout_starts', 0) > 2:
                label = 'high_intent_buyers'
            elif row.get('product_views', 0) > 10 and row.get('cart_adds', 0) == 0:
                label = 'browsers_not_buyers'
            elif row.get('total_sessions', 0) > 20:
                label = 'loyal_visitors'
            elif row.get('hours_since_last_event', 9999) > 720:
                label = 'dormant_users'
            else:
                label = f'segment_{cluster_id}'
            labels[cluster_id] = label
        return labels
Contextual targeting without cookies
class ContextualTargetingEngine:
    """Cookieless ML targeting based on page content."""

    def classify_page_context(self, page_text: str,
                              page_url: str) -> dict:
        """Categorize a page against the IAB taxonomy for contextual targeting.

        Works without user-level data (GDPR-compliant). In production this
        would be a BERT-based classifier trained on the IAB taxonomy; this
        is a simplified keyword-count version.

        Returns
        -------
        dict
            primary_category, all_categories, confidence, url_signals.
            Pages with no keyword hits fall back to 'IAB24' at confidence
            0.5. Fix: the fallback previously returned a different schema
            ({'categories': [...]} with no primary_category/url_signals),
            forcing callers to special-case it; the legacy 'categories'
            key is kept for backward compatibility.
        """
        url_signals = self._extract_url_signals(page_url)

        iab_keywords = {
            'IAB19': ['technology', 'software', 'programming', 'tech'],
            'IAB13': ['finance', 'investment', 'stock', 'crypto', 'money'],
            'IAB7': ['health', 'fitness', 'medical', 'diet'],
            'IAB9': ['hobby', 'crafts', 'games', 'gaming'],
        }
        text_lower = page_text.lower()
        scores = {}
        for iab_cat, keywords in iab_keywords.items():
            # Substring counts: e.g. 'tech' also matches inside 'technology'.
            score = sum(text_lower.count(kw) for kw in keywords)
            if score > 0:
                scores[iab_cat] = score

        if not scores:
            return {
                'primary_category': 'IAB24',
                'all_categories': ['IAB24'],
                'categories': ['IAB24'],  # legacy key, kept for compatibility
                'confidence': 0.5,
                'url_signals': url_signals,
            }

        primary_cat = max(scores, key=scores.get)
        total = sum(scores.values())
        return {
            'primary_category': primary_cat,
            'all_categories': list(scores.keys()),
            # Share of keyword hits captured by the winning category.
            'confidence': round(scores[primary_cat] / total, 2),
            'url_signals': url_signals,
        }

    def _extract_url_signals(self, url: str) -> list:
        """Coarse page-type signals inferred from URL path segments."""
        signals = []
        if '/news/' in url or '/article/' in url:
            signals.append('editorial_content')
        if '/product/' in url or '/shop/' in url:
            signals.append('ecommerce')
        if '/blog/' in url:
            signals.append('blog_content')
        return signals
Comparison of targeting methods
| Method | CPM | CTR | Conversion | Privacy |
|---|---|---|---|---|
| Demographics (age/gender) | Low | 0.05-0.1% | Low | Safe |
| Behavioral (3rd-party cookies) | High | 0.2-0.5% | Medium | Limited |
| Predictive (ML propensity) | Medium | 0.3-0.8% | High | 1st-party |
| Lookalike ML | Medium | 0.2-0.6% | Medium | 1st-party |
| Contextual (cookieless) | Medium | 0.1-0.3% | Medium | Safe |
Predictive targeting based on first-party data is the most robust option in a world without third-party cookies. Training a propensity model to an acceptable AUC (> 0.72) requires high-quality event data — at least 50,000 users with conversion history.







