AI Development for Video Background Replacement
Background replacement in video is more complex than in static images: temporal coherence between frames is required (otherwise the background will "flicker"). Applications: virtual backgrounds in videoconferences (Zoom, Teams), sports broadcasting, news studios without a chroma key, social networks. Real-time requirement for conferences: 15–30 FPS with latency under 50 ms on CPU/GPU at low power consumption.
RVM — Robust Video Matting with Temporal Coherence
import torch
import torchvision.transforms as T
from PIL import Image
import numpy as np
import cv2
class VideoBackgroundReplacer:
    """Replace the background of a video using RVM (Robust Video Matting).

    RVM carries a recurrent state (r1..r4) across frames, which provides
    temporal coherence of the matte — without it the background would
    "flicker" from frame to frame.
    """

    def __init__(self, model_path: str, device: str = 'cuda'):
        """
        Args:
            model_path: Path to a TorchScript-exported RVM model.
            device: torch device string, e.g. 'cuda' or 'cpu'.
        """
        self.device = device
        # RVM with recurrent state — key to temporal coherence
        self.model = torch.jit.load(model_path).to(device)
        self.model.eval()
        self.transform = T.ToTensor()
        # Recurrent state preserved between frames (four RVM hidden tensors);
        # None means "no previous frame" and is what the model expects initially.
        self.rec = [None] * 4

    def reset_state(self):
        """Reset recurrent state on a scene/source change."""
        self.rec = [None] * 4

    @torch.no_grad()
    def process_frame(self, frame_bgr: np.ndarray,
                      background_bgr: np.ndarray) -> np.ndarray:
        """Process a single video frame.

        The recurrent state (self.rec) is preserved between calls for
        temporal smoothness; call reset_state() when the source changes.

        Args:
            frame_bgr: Input frame, HxWx3 uint8 in BGR order (OpenCV layout).
            background_bgr: Replacement background, HxWx3 uint8 BGR
                (resized to the frame size internally).

        Returns:
            Composited frame, HxWx3 uint8 BGR.
        """
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        bg_rgb = cv2.cvtColor(background_bgr, cv2.COLOR_BGR2RGB)
        h, w = frame_rgb.shape[:2]
        src = self.transform(Image.fromarray(frame_rgb)).unsqueeze(0).to(self.device)
        # Background is resized to the frame size so it can be composited 1:1.
        bgr_tensor = self.transform(
            Image.fromarray(bg_rgb).resize((w, h))
        ).unsqueeze(0).to(self.device)
        # Main inference with recurrent state fed back in and updated.
        # downsample_ratio=0.25 lets the model run matting at quarter scale.
        fgr, pha, *self.rec = self.model(src, *self.rec, downsample_ratio=0.25)
        # Alpha compositing: fg * a + bg * (1 - a).
        composite = fgr * pha + bgr_tensor * (1 - pha)
        # squeeze(0) (not squeeze()) so degenerate H/W of 1 are never collapsed.
        result = (composite.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
        return cv2.cvtColor(result, cv2.COLOR_RGB2BGR)

    def replace_in_video(self, input_path: str,
                         background_path: str,
                         output_path: str) -> dict:
        """Replace the background of an entire video file.

        Args:
            input_path: Source video path.
            background_path: Background image (.jpg/.jpeg/.png/.bmp/.webp)
                or a background video (looped if shorter than the input).
            output_path: Destination path (mp4v codec).

        Returns:
            dict with 'frames' (count written), 'fps', and 'output' path.

        Raises:
            IOError: If the input, background image, or background video
                cannot be opened/read.
        """
        cap = cv2.VideoCapture(input_path)
        if not cap.isOpened():
            raise IOError(f'Cannot open input video: {input_path}')
        fps = cap.get(cv2.CAP_PROP_FPS)
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Load background (can be a static image or a video).
        bg_cap = None
        if background_path.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.webp')):
            bg = cv2.imread(background_path)
            if bg is None:
                # cv2.imread signals failure with None, not an exception.
                raise IOError(f'Cannot read background image: {background_path}')
            bg = cv2.resize(bg, (w, h))
            bg_is_video = False
        else:
            bg_cap = cv2.VideoCapture(background_path)
            if not bg_cap.isOpened():
                raise IOError(f'Cannot open background video: {background_path}')
            bg_is_video = True
        out = cv2.VideoWriter(output_path,
                              cv2.VideoWriter_fourcc(*'mp4v'),
                              fps, (w, h))
        self.reset_state()
        frame_count = 0
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if bg_is_video:
                    ret_bg, bg = bg_cap.read()
                    if not ret_bg:
                        # Background video exhausted — loop it from the start.
                        bg_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        ret_bg, bg = bg_cap.read()
                        if not ret_bg:
                            raise IOError('Background video has no readable frames')
                    bg = cv2.resize(bg, (w, h))
                result = self.process_frame(frame, bg)
                out.write(result)
                frame_count += 1
        finally:
            # Release all handles even on error (original leaked bg_cap).
            cap.release()
            out.release()
            if bg_cap is not None:
                bg_cap.release()
        return {'frames': frame_count, 'fps': fps, 'output': output_path}
Real-Time for Videoconferences (WebRTC/ONNX)
import onnxruntime as ort
class RealtimeBackgroundProcessor:
    """
    Real-time RVM matting via ONNX Runtime, optimized for CPU-only machines.
    Target: 30 FPS on a laptop, latency <33 ms/frame.
    """

    def __init__(self, onnx_model_path: str):
        """
        Args:
            onnx_model_path: Path to an RVM model exported to ONNX.
        """
        # Session settings for maximum CPU performance.
        opts = ort.SessionOptions()
        opts.intra_op_num_threads = 4
        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Providers are tried in order; ORT silently falls back to CPU
        # when TensorRT/CUDA are not available on the machine.
        self.session = ort.InferenceSession(
            onnx_model_path,
            sess_options=opts,
            providers=['TensorrtExecutionProvider',
                       'CUDAExecutionProvider',
                       'CPUExecutionProvider']
        )
        # Recurrent state (r1i..r4i) as numpy arrays; a 1x1x1x1 float32 zero
        # tensor is the RVM convention for "no previous frame".
        self.rec_states = [
            np.zeros((1, 1, 1, 1), dtype=np.float32) for _ in range(4)
        ]

    def reset_state(self):
        """Reset recurrent state on a scene/source change."""
        self.rec_states = [
            np.zeros((1, 1, 1, 1), dtype=np.float32) for _ in range(4)
        ]

    def process_frame_fast(self, frame_rgb: np.ndarray,
                           target_size: tuple = (256, 144)) -> np.ndarray:
        """Return the alpha matte for a single RGB frame.

        The frame is downscaled to `target_size` (width, height) so CPU
        inference stays real-time; the predicted matte is upscaled back to
        the original resolution with bilinear interpolation.

        Args:
            frame_rgb: HxWx3 uint8 RGB frame.
            target_size: (width, height) passed to cv2.resize.

        Returns:
            Alpha matte as a float32 HxW array at the original resolution.
        """
        orig_h, orig_w = frame_rgb.shape[:2]
        small = cv2.resize(frame_rgb, target_size)
        small_f = small.astype(np.float32) / 255.0
        src = small_f.transpose(2, 0, 1)[np.newaxis]  # [1, 3, H, W]
        outputs = self.session.run(
            None,
            {'src': src, 'r1i': self.rec_states[0], 'r2i': self.rec_states[1],
             'r3i': self.rec_states[2], 'r4i': self.rec_states[3],
             # RVM's ONNX export declares downsample_ratio as an FP32 tensor;
             # a bare np.array([0.25]) is float64 and fails ORT type checking.
             'downsample_ratio': np.array([0.25], dtype=np.float32)}
        )
        fgr, pha = outputs[0], outputs[1]
        # Carry the recurrent outputs (r1o..r4o) into the next call.
        self.rec_states = list(outputs[2:6])
        # Upscale the low-res matte back to the original frame size.
        alpha_small = pha[0, 0]
        alpha_full = cv2.resize(alpha_small, (orig_w, orig_h),
                                interpolation=cv2.INTER_LINEAR)
        return alpha_full
Virtual Blurred Background (Bokeh Effect)
def apply_background_blur(frame: np.ndarray,
                          alpha: np.ndarray,
                          blur_radius: int = 25) -> np.ndarray:
    """
    Blur the background instead of replacing it (Google Meet/Teams style).

    Requires no background image and is cheaper than full replacement.
    `alpha` is assumed to be the foreground matte in [0, 1] — TODO confirm
    with the matting stage; soft edges are preserved by the weighted blend.
    """
    # Kernel size must be odd for GaussianBlur; derive it from the radius.
    ksize = blur_radius * 2 + 1
    blurred_bg = cv2.GaussianBlur(frame, (ksize, ksize), 0)
    # Broadcast the single-channel matte over all three color channels.
    weights = alpha[:, :, None]
    blended = frame * weights + blurred_bg * (1 - weights)
    return blended.astype(np.uint8)
| Method | FPS (CPU) | FPS (GPU) | Quality |
|---|---|---|---|
| RVM MobileNetV3 | 28–35 | 100–140 | High |
| MediaPipe Selfie Segmentation | 60+ | — | Medium |
| RVM ResNet50 | 8–12 | 45–60 | Best |
| Background Matting V2 | 5–8 | 30–40 | High |
| Task | Timeline |
|---|---|
| RVM integration for video files | 1–2 weeks |
| Real-time plugin for videoconferences | 4–8 weeks |
| Mobile app with background replacement | 8–14 weeks |