AI Visual Search System for Product Search by Photo

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services
AI Visual Search System for Product Search by Photo
Medium
~1-2 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

AI Visual Search for Products by Photo

A user photographs an item they want to buy, and the system finds similar products in the catalog. This is reverse image search adapted for e-commerce: the goal is to find visually similar products, not byte-identical images.

Architecture: Embedding + Vector Search

import hashlib

import numpy as np
import qdrant_client
import torch
import torch.nn.functional as F
from PIL import Image
from qdrant_client.models import Distance, VectorParams, PointStruct
from transformers import CLIPProcessor, CLIPModel

class VisualSearchEngine:
    """
    CLIP-embedding + Qdrant vector DB for photo search.
    CLIP can do text→image and image→image search out of the box.
    """
    def __init__(
        self,
        qdrant_url: str = 'http://localhost:6333',
        collection_name: str = 'products',
        embedding_dim: int = 768    # CLIP ViT-L/14
    ):
        self.clip_model = CLIPModel.from_pretrained(
            'openai/clip-vit-large-patch14'
        ).eval().cuda()
        self.clip_processor = CLIPProcessor.from_pretrained(
            'openai/clip-vit-large-patch14'
        )

        self.client = qdrant_client.QdrantClient(url=qdrant_url)
        self.collection_name = collection_name
        self._ensure_collection(embedding_dim)

    def _ensure_collection(self, dim: int) -> None:
        if not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=dim,
                    distance=Distance.COSINE
                )
            )

    @torch.no_grad()
    def embed_image(self, image: Image.Image) -> np.ndarray:
        inputs = self.clip_processor(
            images=image, return_tensors='pt'
        ).to('cuda')
        emb = self.clip_model.get_image_features(**inputs)
        return F.normalize(emb, dim=-1).cpu().numpy().squeeze()

    @torch.no_grad()
    def embed_text(self, text: str) -> np.ndarray:
        inputs = self.clip_processor(
            text=[text], return_tensors='pt', padding=True
        ).to('cuda')
        emb = self.clip_model.get_text_features(**inputs)
        return F.normalize(emb, dim=-1).cpu().numpy().squeeze()

    def index_product(
        self,
        product_id: str,
        product_image: Image.Image,
        metadata: dict
    ) -> None:
        embedding = self.embed_image(product_image)
        self.client.upsert(
            collection_name=self.collection_name,
            points=[PointStruct(
                id=hash(product_id) % (2**63),
                vector=embedding.tolist(),
                payload={
                    'product_id': product_id,
                    'category': metadata.get('category'),
                    'price': metadata.get('price'),
                    'brand': metadata.get('brand'),
                    **metadata
                }
            )]
        )

    def search_by_image(
        self,
        query_image: Image.Image,
        top_k: int = 20,
        filters: dict = None,       # {'category': 'shoes', 'max_price': 5000}
        score_threshold: float = 0.65
    ) -> list[dict]:
        query_embedding = self.embed_image(query_image)

        # Build Qdrant filters
        qdrant_filter = None
        if filters:
            from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
            conditions = []
            for key, value in filters.items():
                if key == 'max_price':
                    conditions.append(
                        FieldCondition(key='price', range=Range(lte=value))
                    )
                elif key == 'min_price':
                    conditions.append(
                        FieldCondition(key='price', range=Range(gte=value))
                    )
                else:
                    conditions.append(
                        FieldCondition(key=key, match=MatchValue(value=value))
                    )
            qdrant_filter = Filter(must=conditions)

        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding.tolist(),
            limit=top_k,
            query_filter=qdrant_filter,
            score_threshold=score_threshold,
            with_payload=True
        )

        return [
            {
                'product_id': r.payload['product_id'],
                'score': round(r.score, 4),
                'metadata': {k: v for k, v in r.payload.items()
                             if k != 'product_id'}
            }
            for r in results
        ]

    def search_by_text(
        self, query_text: str, top_k: int = 20
    ) -> list[dict]:
        """Text-to-image search: 'red Nike sneakers' → results"""
        query_embedding = self.embed_text(query_text)
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding.tolist(),
            limit=top_k,
            with_payload=True
        )
        return [{'product_id': r.payload['product_id'],
                 'score': r.score} for r in results]

Object Cropping Before Search

If a user uploads a photo with a complex background, the CLIP embedding will be "contaminated" by that background. Pre-processing that removes the background improves search accuracy by 10–18%.

from rembg import remove as rembg_remove

def prepare_search_query(
    user_image: Image.Image,
    remove_background: bool = True,
    crop_to_object: bool = True
) -> Image.Image:
    """
    Clean up a user photo before embedding.

    Strips the background with rembg, optionally crops to the object's
    bounding box, and composites the cutout onto a white canvas.
    Returns the image untouched when remove_background is False.
    """
    if not remove_background:
        return user_image

    cutout = rembg_remove(user_image)   # rembg yields RGBA with alpha mask
    if crop_to_object:
        # Tight crop around the opaque region: (left, upper, right, lower).
        object_box = cutout.getbbox()
        if object_box:
            cutout = cutout.crop(object_box)

    # Flatten onto white, using the alpha channel as the paste mask.
    canvas = Image.new('RGB', cutout.size, (255, 255, 255))
    canvas.paste(cutout, mask=cutout.getchannel('A'))
    return canvas

Fine-tuning CLIP for Fashion Domain

CLIP is trained on general-purpose data, so for specific domains (fashion, furniture, electronics) fine-tuning makes sense:

from transformers import CLIPModel, CLIPProcessor
import torch
from torch.optim import AdamW

def finetune_clip_for_domain(
    model: CLIPModel,
    train_loader,          # yields (image_tensor, tokenized_text) batches
    num_epochs: int = 10,
    learning_rate: float = 1e-6   # very small LR — CLIP already well-trained
) -> CLIPModel:
    """
    Fine-tune CLIP's visual side on a domain dataset (fashion, furniture, ...).

    The text tower is frozen so text→image search keeps matching
    embeddings produced by the pre-trained text encoder.

    Fix vs. the original: freezing only ``model.text_model`` left
    ``model.text_projection`` trainable, so text embeddings still
    drifted during training. Both are frozen now.
    """
    # Freeze the full text path: the transformer AND its projection head.
    for param in model.text_model.parameters():
        param.requires_grad = False
    for param in model.text_projection.parameters():
        param.requires_grad = False

    # Optimize only what is still trainable (visual tower + logit scale).
    optimizer = AdamW(
        (p for p in model.parameters() if p.requires_grad),
        lr=learning_rate, weight_decay=0.01
    )

    model.train()
    for epoch in range(num_epochs):
        for batch_images, batch_texts in train_loader:
            # NOTE(review): assumes the model already lives on the GPU
            # the batches are moved to — confirm against the caller.
            outputs = model(
                input_ids=batch_texts['input_ids'].cuda(),
                attention_mask=batch_texts['attention_mask'].cuda(),
                pixel_values=batch_images.cuda()
            )
            # Symmetric InfoNCE: the i-th image matches the i-th caption,
            # so the targets are the diagonal indices.
            logits_per_image = outputs.logits_per_image
            labels = torch.arange(
                logits_per_image.shape[0], device=logits_per_image.device
            )
            loss = (F.cross_entropy(logits_per_image, labels) +
                    F.cross_entropy(logits_per_image.T, labels)) / 2

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Return in eval mode, ready for inference/indexing.
    model.eval()
    return model

Performance

Catalog Size Method Search Latency Accuracy (R@10)
10,000 products CLIP + Qdrant 8ms 74%
100,000 products CLIP + Qdrant 12ms 74%
1M products CLIP + Qdrant (HNSW) 25ms 73%
10,000 products CLIP fine-tuned + Qdrant 8ms 86%

Timeline

Task Timeline
CLIP zero-shot visual search (ready catalog) 2–3 weeks
Fine-tuning + indexing large catalog 5–8 weeks
Full system with multimodal search (photo + text) 8–13 weeks