AI-powered news feed personalization system
News feed personalization is about balancing relevance and diversity. Pure relevance optimization creates "filter bubbles" and reduces engagement after 2-3 weeks. Modern systems (Google News, Apple News) explicitly introduce a diversity component and ensure exposure to viewpoints beyond the echo chamber.
Multi-factor ranking
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
class NewsPersonalizationEngine:
    """Personalized news-feed ranking.

    Combines relevance (topic affinity + semantic similarity), freshness,
    quality, and an explicit diversity constraint so the feed does not
    collapse into a single-topic filter bubble.
    """

    def __init__(self):
        # Multilingual model so one profile embedding works across article languages.
        self.encoder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

    def build_user_interest_profile(self,
                                    reading_history: list[dict],
                                    explicit_preferences: dict = None,
                                    recency_decay: float = 0.95) -> dict:
        """Build an interest profile from reading history.

        Args:
            reading_history: chronologically ordered events, e.g.
                [{'article_id': ..., 'topic': ..., 'time_spent_sec': ...,
                  'completed': ..., 'title': ...}]
            explicit_preferences: optional user-declared preferences,
                passed through unchanged.
            recency_decay: exponential decay applied per step of age; the
                most recent event has weight 1.0. (Implements the recency
                decay the previous version only promised in a comment.)

        Returns:
            dict with normalized topic weights, the top-5 interests,
            an optional profile embedding, and a cold-start flag.
        """
        if not reading_history:
            return {'topics': {}, 'is_cold_start': True}

        # Weight interests by reading time + completion, decayed by recency
        # so stale interests count less.
        topic_weights: dict[str, float] = {}
        n = len(reading_history)
        for idx, article in enumerate(reading_history):
            topic = article.get('topic', 'general')
            time_weight = min(article.get('time_spent_sec', 30) / 180, 1.0)  # cap at 3 min
            completion_bonus = 0.5 if article.get('completed') else 0.0
            recency = recency_decay ** (n - 1 - idx)
            topic_weights[topic] = (topic_weights.get(topic, 0.0)
                                    + (time_weight + completion_bonus) * recency)

        total = sum(topic_weights.values())
        if total <= 0:
            # Every event carried zero signal (0-second reads, nothing
            # completed): treat as cold start instead of dividing by zero.
            return {'topics': {}, 'is_cold_start': True}
        normalized = {t: w / total for t, w in topic_weights.items()}

        # Profile embedding from recently completed, non-empty titles
        # (empty strings would only dilute the mean embedding).
        recent_titles = [a.get('title', '') for a in reading_history[-20:]
                         if a.get('completed') and a.get('title')]
        profile_embedding = None
        if recent_titles:
            profile_embedding = np.mean(
                self.encoder.encode(recent_titles, normalize_embeddings=True),
                axis=0
            )

        return {
            'topics': normalized,
            'top_interests': sorted(normalized.items(), key=lambda x: -x[1])[:5],
            'profile_embedding': profile_embedding,
            'is_cold_start': False,
            'explicit_preferences': explicit_preferences or {}
        }

    def score_article(self, article: dict,
                      user_profile: dict,
                      seen_topics_last_hour: list[str]) -> dict:
        """Multi-factor score for a single article.

        Factors: relevance (topic affinity + semantic similarity to the
        profile embedding), freshness (exponential time decay), quality
        (engagement, source trust, length), a repeated-topic diversity
        penalty, and a breaking-news boost.
        """
        topic = article.get('topic', 'general')
        topics = user_profile.get('topics', {})

        # === Relevance ===
        topic_score = topics.get(topic, 0.05)  # small default interest for unseen topics

        semantic_score = 0.5  # neutral default when either embedding is missing (cold start)
        profile_emb = user_profile.get('profile_embedding')
        if profile_emb is not None and article.get('embedding') is not None:
            semantic_score = float(cosine_similarity(
                profile_emb.reshape(1, -1),
                np.array(article['embedding']).reshape(1, -1)
            )[0, 0])
        relevance = topic_score * 0.4 + semantic_score * 0.6

        # === Freshness ===
        # Exponential decay with a 12-hour e-folding time (~0.37 at 12h).
        # NOTE: this is not a half-life; a true 12h half-life would be 2**(-h/12).
        hours_old = article.get('hours_since_published', 24)
        freshness = np.exp(-hours_old / 12)

        # === Quality ===
        quality_score = (
            article.get('engagement_rate', 0.5) * 0.4 +
            article.get('source_trust_score', 0.7) * 0.3 +
            min(article.get('word_count', 500) / 800, 1.0) * 0.3
        )

        # === Diversity penalty ===
        # Each recent sighting of the same topic shaves 10% multiplicatively.
        topic_seen_count = seen_topics_last_hour.count(topic)
        diversity_penalty = 0.9 ** topic_seen_count  # 0 -> 1.0, 1 -> 0.9, 2 -> 0.81...

        # === Breaking news boost ===
        breaking_boost = 1.5 if article.get('is_breaking') else 1.0

        # === Final score ===
        # The 0.15 constant is a deterministic serendipity *floor* (not
        # noise): it keeps low-relevance items above zero so they can
        # still surface occasionally.
        final_score = (
            relevance * 0.40 +
            freshness * 0.25 +
            quality_score * 0.20 +
            0.15
        ) * diversity_penalty * breaking_boost

        return {
            'article_id': article.get('id'),
            'final_score': round(final_score, 4),
            'relevance': round(relevance, 3),
            'freshness': round(freshness, 3),
            'quality': round(quality_score, 3),
            'diversity_penalty': round(diversity_penalty, 3),
        }

    def rank_feed(self, articles: list[dict],
                  user_profile: dict,
                  max_items: int = 20,
                  diversity_floor: float = 0.15) -> list[dict]:
        """Rank the final feed with diversity constraints.

        Uses greedy selection: each slot goes to the highest-scoring
        remaining candidate, re-scored against the topics already placed.
        (The previous implementation scored everything against an empty
        topic history before selecting, so the diversity penalty was a
        no-op.)

        Args:
            articles: candidate article dicts.
            user_profile: output of build_user_interest_profile.
            max_items: feed length cap.
            diversity_floor: minimum share of items outside the two
                most-represented topics in the result.
        """
        known_topics = user_profile.get('topics') or {'general': 1}
        # Hard per-topic cap so one topic cannot dominate the feed.
        # (Guarded `or` also avoids ZeroDivisionError on an empty topics dict.)
        max_per_topic = max(2, max_items // len(known_topics))

        pool = list(articles)
        result: list[dict] = []
        seen_topics: list[str] = []
        topic_counts: dict[str, int] = {}

        while pool and len(result) < max_items:
            best_idx = None
            best_data = None
            for i, candidate in enumerate(pool):
                topic = candidate.get('topic', 'general')
                if topic_counts.get(topic, 0) >= max_per_topic:
                    continue
                data = self.score_article(candidate, user_profile, seen_topics)
                if best_data is None or data['final_score'] > best_data['final_score']:
                    best_idx, best_data = i, data
            if best_idx is None:
                break  # every remaining candidate has hit its topic cap
            chosen = pool.pop(best_idx)
            topic = chosen.get('topic', 'general')
            result.append({**chosen, **best_data})
            topic_counts[topic] = topic_counts.get(topic, 0) + 1
            seen_topics.append(topic)

        # Enforce the diversity floor: ensure enough items outside the two
        # most-represented topics. Top topics are taken by *count* (the old
        # code used dict insertion order) and only unselected candidates
        # are considered (the old slice could re-insert duplicates).
        if len(result) > 5:
            by_count = sorted(topic_counts.items(), key=lambda kv: -kv[1])
            top_topics = {t for t, _ in by_count[:2]}

            def off_topic_share(items: list[dict]) -> float:
                return sum(1 for it in items
                           if it.get('topic', 'general') not in top_topics) / len(items)

            if off_topic_share(result) < diversity_floor:
                extras = [a for a in pool
                          if a.get('topic', 'general') not in top_topics]
                extras.sort(key=lambda a: -self.score_article(
                    a, user_profile, seen_topics)['final_score'])
                for extra in extras:
                    data = self.score_article(extra, user_profile, seen_topics)
                    # Insert mid-feed so diversity items are actually seen.
                    result.insert(len(result) // 2, {**extra, **data})
                    if off_topic_share(result) >= diversity_floor:
                        break

        return result[:max_items]
class EngagementTracker:
    """Incrementally update a user's topic weights from session behavior."""

    # Per-action weight delta applied to the event's topic.
    # Unknown actions are ignored.
    _ACTION_DELTAS = {
        'completed_read': 0.3,
        'share': 0.5,
        'quick_skip': -0.1,
        'dislike': -0.3,
    }

    def update_profile_from_session(self, user_profile: dict,
                                    session_events: list[dict]) -> dict:
        """Apply one session's events to the profile's topic weights.

        Args:
            user_profile: profile dict holding a 'topics' weight mapping.
            session_events: [{'topic': ..., 'action': ...}]; actions
                outside _ACTION_DELTAS are no-ops.

        Returns:
            A new profile dict with re-normalized topic weights; the
            input profile's 'topics' mapping is not mutated.
        """
        profile = user_profile.copy()
        topics = dict(profile.get('topics', {}))

        for event in session_events:
            delta = self._ACTION_DELTAS.get(event.get('action'))
            if delta is None:
                continue  # unrecognized action
            topic = event.get('topic', 'general')
            updated = topics.get(topic, 0) + delta
            # Negative feedback floors the weight at zero, matching the
            # original per-branch max(0, ...) clamps.
            topics[topic] = max(0, updated) if delta < 0 else updated

        # Re-normalize to a probability-like distribution; if everything
        # decayed to zero, leave the previous topics untouched.
        total = sum(topics.values())
        if total > 0:
            profile['topics'] = {t: w / total for t, w in topics.items()}
        return profile
Properly configured personalization increases time-on-site by 25-40% and DAU/MAU by 8-15%. Without a diversity constraint, engagement grows in the short term but long-term churn follows from information fatigue. Google News publicly states that it includes diversity as an explicit objective in its ranking.







