AI System for Planogram Control in Retail
A planogram is a product placement plan on a shelf. Traditional planogram compliance control is done manually by a merchandiser once a week. Automation via CV cameras or photos from sales representatives reduces reaction time from days to minutes.
Task: From Shelf Photo to Violation Report
The pipeline consists of three steps: detection of products on the shelf → SKU identification of each detected product → comparison of the detected layout with the reference planogram.
from ultralytics import YOLO
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
class PlanogramComplianceChecker:
    """Check a shelf photo against a reference planogram.

    Pipeline:
        Step 1: fine-tuned YOLO detects all products on the shelf (bbox + conf).
        Step 2: CLIP identifies the specific SKU of each crop via cosine
                similarity against a precomputed SKU embedding index.
        Step 3: detected (row, col) grid positions are compared with the
                reference planogram to produce a violation report.
    """
    def __init__(
        self,
        detector_path: str,       # fine-tuned YOLO weights for shelf products
        sku_embeddings_path: str, # .npz with 'embeddings' and 'sku_ids'
        planogram: dict           # {"row_col": sku_id} expected layout
    ):
        self.detector = YOLO(detector_path)
        sku_data = np.load(sku_embeddings_path)
        # (N_SKU, embedding_dim); assumed L2-normalized by the indexing script
        self.sku_embeddings = torch.from_numpy(
            sku_data['embeddings']
        ).float()
        self.sku_ids = sku_data['sku_ids'].tolist()
        self.planogram = planogram
        # Fall back to CPU when no GPU is available instead of crashing
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # CLIP for SKU identification
        from transformers import CLIPProcessor, CLIPModel
        self.clip_model = CLIPModel.from_pretrained(
            'openai/clip-vit-large-patch14'
        ).eval().to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained(
            'openai/clip-vit-large-patch14'
        )
    def analyze_shelf(
        self,
        shelf_image: Image.Image,
        confidence_threshold: float = 0.5
    ) -> dict:
        """Run the full detect → identify → compare pipeline on one photo.

        Returns the compliance report dict produced by _check_compliance.
        """
        img_array = np.array(shelf_image)
        # Step 1: detection
        detections = self.detector.predict(
            img_array, conf=confidence_threshold, verbose=False
        )[0]
        shelf_products = []
        for box in detections.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            crop = shelf_image.crop((x1, y1, x2, y2))
            # Step 2: SKU identification via CLIP
            sku_id, similarity = self._identify_sku(crop)
            shelf_products.append({
                'bbox': [x1, y1, x2, y2],
                'sku_id': sku_id,
                'confidence': float(box.conf),
                'sku_similarity': float(similarity),
                'position': self._get_shelf_position(
                    [x1, y1, x2, y2], img_array.shape
                )
            })
        # Step 3: comparison with planogram
        compliance = self._check_compliance(shelf_products)
        return compliance
    @torch.no_grad()
    def _identify_sku(
        self, crop: Image.Image
    ) -> tuple[str, float]:
        """Return (sku_id, cosine_similarity) of the closest indexed SKU."""
        inputs = self.clip_processor(
            images=crop, return_tensors='pt'
        ).to(self.device)
        features = self.clip_model.get_image_features(**inputs)
        features = F.normalize(features, dim=-1).cpu()
        # (1, N_SKU) cosine similarities. squeeze(0) — not squeeze() — so a
        # single-SKU index does not collapse to a 0-dim tensor, which would
        # make similarities[best_idx] raise IndexError.
        similarities = (features @ self.sku_embeddings.T).squeeze(0)
        best_idx = similarities.argmax().item()
        return self.sku_ids[best_idx], float(similarities[best_idx])
    def _get_shelf_position(
        self, bbox: list, img_shape: tuple
    ) -> dict:
        """Map a bbox center to a coarse 10x5 grid cell (col 0-9, row 0-4)."""
        h, w = img_shape[:2]
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        return {
            'col': int(cx / w * 10),  # 0-9 — ten columns
            'row': int(cy / h * 5)    # 0-4 — five rows
        }
    def _check_compliance(self, shelf_products: list) -> dict:
        """Compare detected products with the planogram; list violations.

        Violation types:
            out_of_stock  — planogram position has no detected product
            wrong_product — detected SKU differs from expected SKU
        """
        violations = []
        # NOTE(review): if two detections land in the same grid cell, the
        # later one silently wins here — consider keeping the one with the
        # highest sku_similarity instead; verify against real shelf data.
        actual_positions = {
            f"{p['position']['row']}_{p['position']['col']}": p['sku_id']
            for p in shelf_products
        }
        for position_key, expected_sku in self.planogram.items():
            actual_sku = actual_positions.get(position_key)
            if actual_sku is None:
                violations.append({
                    'type': 'out_of_stock',
                    'position': position_key,
                    'expected_sku': expected_sku
                })
            elif actual_sku != expected_sku:
                violations.append({
                    'type': 'wrong_product',
                    'position': position_key,
                    'expected_sku': expected_sku,
                    'actual_sku': actual_sku
                })
        # At most one violation per planogram position, so score is in [0, 1];
        # max(..., 1) guards against an empty planogram.
        compliance_score = 1.0 - len(violations) / max(len(self.planogram), 1)
        return {
            'compliance_score': round(compliance_score, 3),
            'violations': violations,
            'total_positions': len(self.planogram),
            'violations_count': len(violations),
            'detected_products': len(shelf_products)
        }
SKU Indexing via CLIP Embeddings
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
def build_sku_index(
    product_images_dir: str,  # directory layout: {sku_id}/{image1.jpg, ...}
    output_path: str,
    model_name: str = 'openai/clip-vit-large-patch14',
    images_per_sku: int = 5   # average embedding over multiple photos
) -> None:
    """Build a CLIP embedding index of all SKUs.

    Each SKU directory contributes up to `images_per_sku` photos whose
    L2-normalized CLIP embeddings are averaged and re-normalized, giving a
    more stable reference vector per SKU.

    Saves an .npz at `output_path` with:
        embeddings: (N_SKU, embedding_dim) float array
        sku_ids:    (N_SKU,) array of SKU directory names
    """
    # Fall back to CPU when no GPU is available instead of crashing
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = CLIPModel.from_pretrained(model_name).eval().to(device)
    processor = CLIPProcessor.from_pretrained(model_name)
    sku_embeddings = []
    sku_ids = []
    for sku_dir in sorted(Path(product_images_dir).iterdir()):
        if not sku_dir.is_dir():
            continue
        sku_id = sku_dir.name
        # pathlib.glob has NO brace expansion: '*.{jpg,jpeg,png}' matches
        # nothing, which silently skipped every SKU. Glob each extension
        # separately; sort for a deterministic selection of images.
        image_files = sorted(
            path
            for pattern in ('*.jpg', '*.jpeg', '*.png')
            for path in sku_dir.glob(pattern)
        )[:images_per_sku]
        if not image_files:
            continue
        batch_embeddings = []
        for img_path in image_files:
            image = Image.open(img_path).convert('RGB')
            inputs = processor(images=image, return_tensors='pt').to(device)
            with torch.no_grad():
                emb = model.get_image_features(**inputs)
            emb = F.normalize(emb, dim=-1).cpu().numpy()
            batch_embeddings.append(emb)
        # Average per-image embeddings, then re-normalize to unit length so
        # downstream cosine similarity stays a plain dot product.
        mean_emb = np.mean(batch_embeddings, axis=0)
        mean_emb = mean_emb / np.linalg.norm(mean_emb)
        sku_embeddings.append(mean_emb.squeeze())
        sku_ids.append(sku_id)
    np.savez(
        output_path,
        embeddings=np.array(sku_embeddings),
        sku_ids=np.array(sku_ids)
    )
    print(f'Indexed {len(sku_ids)} SKUs')
Timeline
| Task | Timeline |
|---|---|
| Product detector on shelf (fine-tuning YOLO) | 3–5 weeks |
| Full system (detection + identification + planogram) | 7–12 weeks |
| Integration with ERP / mobile app for merchandisers | 10–16 weeks |







