AI Video Background Replacement Implementation

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services
AI Video Background Replacement Implementation
Medium
~3-5 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

AI Development for Video Background Replacement

Background replacement in video is more complex than in static images: temporal coherence between frames is required (otherwise the background will "flicker"). Applications: virtual backgrounds in videoconferencing (Zoom, Teams), sports broadcasting, news studios without a chroma key, and social networks. Real-time conferencing requires 15–30 FPS with latency under 50 ms on CPU/GPU with low power consumption.

RVM — Robust Video Matting with Temporal Coherence

import torch
import torchvision.transforms as T
from PIL import Image
import numpy as np
import cv2

class VideoBackgroundReplacer:
    """Replace the background in video using RVM (Robust Video Matting).

    The model's recurrent state is carried across frames, which is what
    produces temporally coherent (non-flickering) alpha mattes.
    Frames are exchanged with OpenCV in BGR order.
    """

    # File extensions treated as a static background image; anything else
    # is opened as a background video.
    _IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

    def __init__(self, model_path: str, device: str = 'cuda'):
        """Load a TorchScript RVM model.

        model_path: path to a torch.jit-exported RVM checkpoint.
        device: torch device string, e.g. 'cuda' or 'cpu'.
        """
        self.device = device
        # RVM with recurrent state — key to temporal coherence.
        self.model = torch.jit.load(model_path).to(device)
        self.model.eval()
        self.transform = T.ToTensor()
        # Four recurrent tensors preserved between frames; None means
        # "no history yet" (the model initializes them internally).
        self.rec = [None] * 4

    def reset_state(self):
        """Drop the recurrent state. Call on a scene cut or source change,
        otherwise stale history bleeds into the new footage."""
        self.rec = [None] * 4

    @torch.no_grad()
    def process_frame(self, frame_bgr: np.ndarray,
                       background_bgr: np.ndarray) -> np.ndarray:
        """Composite one BGR frame over the given BGR background.

        The recurrent state (self.rec) is updated in place, so consecutive
        calls must come from consecutive frames of the same source.
        Returns a BGR uint8 frame of the same height/width as the input.
        """
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        bg_rgb = cv2.cvtColor(background_bgr, cv2.COLOR_BGR2RGB)

        h, w = frame_rgb.shape[:2]
        src = self.transform(Image.fromarray(frame_rgb)).unsqueeze(0).to(self.device)
        # Background is resized to match the frame; PIL.resize takes (w, h).
        bgr_tensor = self.transform(
            Image.fromarray(bg_rgb).resize((w, h))
        ).unsqueeze(0).to(self.device)

        # Main inference; the model internally downsamples by this ratio
        # for matting and returns the next recurrent state.
        fgr, pha, *self.rec = self.model(src, *self.rec, downsample_ratio=0.25)

        # Alpha compositing: foreground * alpha + background * (1 - alpha).
        composite = fgr * pha + bgr_tensor * (1 - pha)
        result = (composite.squeeze().permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
        return cv2.cvtColor(result, cv2.COLOR_RGB2BGR)

    def replace_in_video(self, input_path: str,
                          background_path: str,
                          output_path: str) -> dict:
        """Replace the background of a whole video file.

        background_path may be an image (see _IMAGE_EXTS) or a video;
        a background video shorter than the input loops from the start.
        Returns {'frames': processed frame count, 'fps': output fps,
        'output': output_path}.
        Raises ValueError if a background video yields no frames.
        """
        cap = cv2.VideoCapture(input_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            fps = 30.0  # some containers/streams report 0 FPS
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Load background (static image or looping video).
        bg_cap = None
        if background_path.lower().endswith(self._IMAGE_EXTS):
            bg = cv2.resize(cv2.imread(background_path), (w, h))
        else:
            bg_cap = cv2.VideoCapture(background_path)
            bg = None

        out = cv2.VideoWriter(output_path,
                              cv2.VideoWriter_fourcc(*'mp4v'),
                              fps, (w, h))

        self.reset_state()
        frame_count = 0
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if bg_cap is not None:
                    ret_bg, bg_frame = bg_cap.read()
                    if not ret_bg:
                        # Loop the background video from its first frame.
                        bg_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        ret_bg, bg_frame = bg_cap.read()
                        if not ret_bg:
                            raise ValueError(
                                f'cannot read background video: {background_path}')
                    bg = cv2.resize(bg_frame, (w, h))

                out.write(self.process_frame(frame, bg))
                frame_count += 1
        finally:
            # Release capture/writer handles even if processing fails.
            cap.release()
            out.release()
            if bg_cap is not None:
                bg_cap.release()

        return {'frames': frame_count, 'fps': fps, 'output': output_path}

Real-Time for Videoconferences (WebRTC/ONNX)

import onnxruntime as ort

class RealtimeBackgroundProcessor:
    """Low-latency RVM inference via ONNX Runtime.

    Providers are tried in order (TensorRT -> CUDA -> CPU), so the same
    code runs accelerated where possible and falls back to the tuned CPU
    path on machines without a GPU. Target: 30 FPS on a laptop,
    latency <33 ms/frame.
    """

    def __init__(self, onnx_model_path: str):
        # Session settings tuned for CPU throughput; ignored knobs are
        # harmless when a GPU provider is selected.
        opts = ort.SessionOptions()
        opts.intra_op_num_threads = 4
        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        self.session = ort.InferenceSession(
            onnx_model_path,
            sess_options=opts,
            providers=['TensorrtExecutionProvider',
                       'CUDAExecutionProvider',
                       'CPUExecutionProvider']
        )

        # Recurrent state as numpy arrays; (1,1,1,1) zeros are the
        # "no history" initializer expected by the RVM ONNX export.
        self.rec_states = [
            np.zeros((1, 1, 1, 1), dtype=np.float32) for _ in range(4)
        ]

    def reset_state(self):
        """Reset recurrent state on scene/source change."""
        self.rec_states = [
            np.zeros((1, 1, 1, 1), dtype=np.float32) for _ in range(4)
        ]

    def process_frame_fast(self, frame_rgb: np.ndarray,
                             target_size: tuple = (256, 144)) -> np.ndarray:
        """Return a float alpha mask at the original frame resolution.

        The frame is downscaled to target_size (width, height) for
        real-time CPU inference, then the mask is upscaled back with
        bilinear interpolation.

        Input names ('src', 'r1i'..'r4i', 'downsample_ratio') match the
        official RVM ONNX export — NOTE(review): verify against the
        actual model if a custom export is used.
        """
        orig_h, orig_w = frame_rgb.shape[:2]
        small = cv2.resize(frame_rgb, target_size)
        small_f = small.astype(np.float32) / 255.0
        src = small_f.transpose(2, 0, 1)[np.newaxis]  # [1, 3, H, W]

        outputs = self.session.run(
            None,
            {'src': src, 'r1i': self.rec_states[0], 'r2i': self.rec_states[1],
             'r3i': self.rec_states[2], 'r4i': self.rec_states[3],
             # float32 required: ONNX Runtime rejects the default float64
             # produced by np.array([0.25]).
             'downsample_ratio': np.array([0.25], dtype=np.float32)}
        )
        fgr, pha = outputs[0], outputs[1]
        # Carry the four recurrent outputs into the next call.
        self.rec_states = list(outputs[2:6])

        # Upscale mask back to original size; cv2.resize takes (w, h).
        alpha_small = pha[0, 0]
        alpha_full = cv2.resize(alpha_small, (orig_w, orig_h),
                                interpolation=cv2.INTER_LINEAR)
        return alpha_full

Virtual Blurred Background (Bokeh Effect)

def apply_background_blur(frame: np.ndarray,
                            alpha: np.ndarray,
                            blur_radius: int = 25) -> np.ndarray:
    """
    Alternative to replacement — background blur (like Google Meet/Teams).
    Doesn't require image loading, works faster.
    """
    # GaussianBlur needs an odd kernel size, guaranteed by 2r + 1.
    kernel = blur_radius * 2 + 1
    background = cv2.GaussianBlur(frame, (kernel, kernel), 0)

    # Broadcast the single-channel alpha over all three color channels
    # and blend: sharp foreground where alpha→1, blurred where alpha→0.
    weights = np.repeat(alpha[:, :, np.newaxis], 3, axis=2)
    blended = frame * weights + background * (1 - weights)
    return blended.astype(np.uint8)
| Method | FPS (CPU) | FPS (GPU) | Quality |
|---|---|---|---|
| RVM MobileNetV3 | 28–35 | 100–140 | High |
| MediaPipe Selfie Segmentation | 60+ | — | Medium |
| RVM ResNet50 | 8–12 | 45–60 | Best |
| Background Matting V2 | 5–8 | 30–40 | High |
| Task | Timeline |
|---|---|
| RVM integration for video files | 1–2 weeks |
| Real-time plugin for videoconferences | 4–8 weeks |
| Mobile app with background replacement | 8–14 weeks |