AI Development for Video Stabilization
Camera shake is an inevitable artifact of hand-held shooting, drones, and sports cameras. Classical stabilization works through optical flow: estimate the motion between frames, smooth the trajectory, and compensate for the shake. AI methods add semantic understanding: they distinguish operator movement from subject movement, handle dynamic scenes better, and can restore "cropped" pixels through inpainting.
Classical Stabilization via Optical Flow
import cv2
import numpy as np
import torch
from scipy.signal import medfilt
class VideoStabilizer:
    """Classical optical-flow video stabilizer.

    Pipeline: estimate frame-to-frame affine transforms from tracked
    features, smooth the cumulative translation trajectory with a moving
    average, warp each frame by the smoothing delta, then crop the borders
    to hide the black edges introduced by warping.
    """

    def __init__(self, smoothing_window: int = 30,
                 crop_ratio: float = 0.1):
        # Moving-average window length (in frames) for trajectory smoothing.
        self.smoothing_window = smoothing_window
        # Fraction of the shorter side cropped from each edge after warping.
        self.crop_ratio = crop_ratio

    def stabilize(self, input_path: str, output_path: str) -> dict:
        """Stabilize the video at input_path and write it to output_path.

        Returns:
            dict with 'frames' (number of inter-frame transforms processed)
            and 'smoothing_window'.
        """
        cap = cv2.VideoCapture(input_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Step 1: compute the camera trajectory (frame-to-frame transforms).
        transforms = self._estimate_transforms(cap)
        cap.release()
        # Step 2: smooth the trajectory.
        smoothed = self._smooth_trajectory(transforms)
        # Step 3: apply the stabilizing transforms on a second pass.
        cap = cv2.VideoCapture(input_path)
        out = cv2.VideoWriter(output_path,
                              cv2.VideoWriter_fourcc(*'mp4v'),
                              fps, (w, h))
        try:
            for orig, smooth in zip(transforms, smoothed):
                ret, frame = cap.read()
                if not ret:
                    break
                out.write(self._apply_transform(frame, orig, smooth, w, h))
            # BUG FIX: there are N-1 transforms for N frames, so the original
            # loop silently dropped the last frame. Write it with an identity
            # warp so it gets the same crop/zoom as the rest of the clip.
            ret, frame = cap.read()
            if ret:
                identity = np.eye(2, 3, dtype=np.float64)
                out.write(self._apply_transform(frame, identity, identity, w, h))
        finally:
            # BUG FIX: release resources even if a warp/write raises.
            cap.release()
            out.release()
        return {'frames': len(transforms),
                'smoothing_window': self.smoothing_window}

    def _estimate_transforms(self, cap) -> list[np.ndarray]:
        """Estimate affine transforms between adjacent frames."""
        identity = np.eye(2, 3, dtype=np.float64)
        ret, prev = cap.read()
        if not ret:
            # BUG FIX: an empty/unreadable video previously crashed in cvtColor.
            return []
        prev_gray = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
        transforms = []
        while True:
            ret, curr = cap.read()
            if not ret:
                break
            curr_gray = cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)
            # Feature detection and sparse optical-flow tracking.
            prev_pts = cv2.goodFeaturesToTrack(
                prev_gray, maxCorners=200, qualityLevel=0.01,
                minDistance=30, blockSize=3
            )
            m = None
            # BUG FIX: goodFeaturesToTrack returns None on featureless frames;
            # passing that to calcOpticalFlowPyrLK raised an error.
            if prev_pts is not None:
                curr_pts, status, _ = cv2.calcOpticalFlowPyrLK(
                    prev_gray, curr_gray, prev_pts, None
                )
                # Keep only points the tracker marked as reliable.
                valid_prev = prev_pts[status == 1]
                valid_curr = curr_pts[status == 1]
                # estimateAffinePartial2D needs at least 2 point pairs.
                if len(valid_prev) >= 2:
                    m, _ = cv2.estimateAffinePartial2D(valid_prev, valid_curr)
            if m is None:
                # Fall back to "no motion" when estimation fails.
                m = identity.copy()
            transforms.append(m)
            prev_gray = curr_gray
        return transforms

    def _smooth_trajectory(self, transforms: list) -> list:
        """Moving-average smoothing of the cumulative translation trajectory.

        Only the translation column of each affine transform is smoothed;
        the delta between smoothed and raw trajectories is folded back into
        the per-frame transforms.
        """
        if not transforms:
            return []
        # Cumulative (x, y) translation over time.
        trajectory = np.cumsum([m[:, 2] for m in transforms], axis=0)
        smoothed = np.zeros_like(trajectory)
        half = self.smoothing_window // 2
        for i in range(len(trajectory)):
            start = max(0, i - half)
            end = min(len(trajectory), i + half)
            smoothed[i] = trajectory[start:end].mean(axis=0)
        # Delta transforms to apply on top of the raw inter-frame motion.
        delta = smoothed - trajectory
        result = []
        for i, m in enumerate(transforms):
            m_smooth = m.copy()
            m_smooth[:, 2] += delta[i]
            result.append(m_smooth)
        return result

    def _apply_transform(self, frame: np.ndarray,
                         orig_m: np.ndarray,
                         smooth_m: np.ndarray,
                         w: int, h: int) -> np.ndarray:
        """Warp one frame by its smoothed transform, then crop and re-zoom.

        NOTE: orig_m is currently unused; it is kept for interface stability
        with callers that supply both raw and smoothed transforms.
        """
        stabilized = cv2.warpAffine(frame, smooth_m, (w, h))
        # Crop to hide black edges introduced by the warp, then zoom back.
        crop = int(min(w, h) * self.crop_ratio)
        stabilized = stabilized[crop:h - crop, crop:w - crop]
        return cv2.resize(stabilized, (w, h))
DUT — Deep Unified Transformer for AI Stabilization
class DeepVideoStabilizer:
    """
    AI approach: a model trained to stabilize video on unstable/stable pairs.

    Advantage over the classical pipeline: better handling of rolling
    shutter, fast motion, and motion blur.
    """

    def __init__(self, checkpoint_path: str, device: str = 'cuda'):
        # The DUT implementation is vendored outside the package path.
        import sys
        sys.path.append('/opt/DUT')
        from model import DUTStabilizer
        self.model = DUTStabilizer()
        # BUG FIX: map_location lets a checkpoint saved on CUDA load on a
        # CPU-only host instead of raising a deserialization error.
        self.model.load_state_dict(
            torch.load(checkpoint_path, map_location=device))
        self.model.eval().to(device)
        self.device = device

    @torch.no_grad()
    def stabilize_clip(self, frames: list[np.ndarray],
                       window_size: int = 16) -> list[np.ndarray]:
        """
        Process the clip in half-overlapping windows of window_size frames.

        Key feature of DUT: it uses future frames within the window to
        predict the current frame's stabilization, so windows advance by
        window_size // 2 and only the first half of each output is kept.

        NOTE(review): relies on self._frames_to_tensor / self._tensor_to_frames,
        which are not defined in this excerpt — presumably provided elsewhere.
        """
        results = []
        # BUG FIX: window_size <= 1 made the stride 0, which raises in range().
        step = max(1, window_size // 2)
        for i in range(0, len(frames), step):
            window = frames[i:i + window_size]
            if len(window) < window_size:
                # Pad short tail windows by repeating the last frame.
                window = window + [window[-1]] * (window_size - len(window))
            tensor = self._frames_to_tensor(window)
            stabilized_tensor = self.model(tensor.to(self.device))
            stabilized_frames = self._tensor_to_frames(stabilized_tensor)
            # Keep only the non-overlapping first half of each window.
            results.extend(stabilized_frames[:step])
        # Trim padding so output length matches input length.
        return results[:len(frames)]
Video Stabilization Quality Metrics
def evaluate_stabilization(unstable_frames: list, stable_frames: list) -> dict:
    """
    Compute stability metrics for a stabilized clip.

    Reported metrics:
    - stability_score: std-dev of mean inter-frame optical-flow magnitude
      in the stabilized video (lower = better)
    - mean_motion / max_motion: mean and max flow magnitude

    NOTE(review): unstable_frames is accepted for the Cropping Ratio /
    Distortion Value metrics described in the literature, but those are
    not implemented here yet — only the stability metrics are computed.

    Returns zeroed metrics when fewer than two stabilized frames are given
    (no optical flow can be computed).
    """
    # BUG FIX: the original crashed (np.max of an empty list) and produced a
    # NaN std for clips with fewer than two frames.
    if len(stable_frames) < 2:
        return {'stability_score': 0.0, 'mean_motion': 0.0, 'max_motion': 0.0}
    # Stability: dense optical-flow magnitude between consecutive frames.
    flows = []
    for prev, curr in zip(stable_frames, stable_frames[1:]):
        prev_gray = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
        curr_gray = cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None,
                                            0.5, 3, 15, 3, 5, 1.2, 0)
        flows.append(np.abs(flow).mean())
    return {
        'stability_score': float(np.std(flows)),
        'mean_motion': float(np.mean(flows)),
        'max_motion': float(np.max(flows)),
    }
| Method | Stability↓ | Cropping↑ | Speed |
|---|---|---|---|
| OpenCV (vidstab) | 0.35 | 0.91 | Real-time |
| DIFRINT | 0.18 | 0.89 | 5–10 FPS |
| DUT | 0.14 | 0.87 | 3–5 FPS |
| StabNet | 0.16 | 0.90 | 8 FPS |
| Task | Timeline |
|---|---|
| Batch stabilization via OpenCV | 1 week |
| AI stabilization with DUT/DIFRINT | 4–6 weeks |
| Real-time stabilization for streams | 6–10 weeks |