Video understanding has lagged behind image and text AI, but 2025 is changing that. Models can now watch videos and answer questions about them. This enables new categories of applications.
Here’s how to build applications that understand video.
Current Capabilities
What’s Possible
video_ai_capabilities:
  content_understanding:
    - Action recognition
    - Scene description
    - Object tracking
    - Event detection
  temporal_reasoning:
    - Sequence understanding
    - Cause and effect
    - Timeline extraction
    - Change detection
  practical_applications:
    - Video summarization
    - Content moderation
    - Meeting analysis
    - Security monitoring
    - Sports analysis
Model Options
video_models_2025:
  gemini_15_pro:
    context: "Up to 1 hour of video"
    strengths: ["Long context", "Google integration"]
    use_case: "Long-form video analysis"
  gpt4_vision:
    approach: "Frame extraction"
    strengths: ["Quality", "Reasoning"]
    use_case: "Detailed analysis of clips"
  specialized_models:
    examples: ["Twelve Labs", "Video LLaVA"]
    strengths: ["Domain-specific", "Efficiency"]
    use_case: "Production video pipelines"
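Which of these to reach for depends mostly on clip length. As a rough illustration of routing by duration, here is a small helper; the thresholds and the pick_approach name are assumptions for this sketch, not vendor guidance:

import cv2

def pick_approach(video_path: str) -> str:
    """Pick a video-analysis strategy based on clip duration."""
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS) or 30
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
    video.release()
    duration_seconds = frame_count / fps

    if duration_seconds <= 120:
        return "frame_extraction"      # short clips: sample frames, use any vision model
    if duration_seconds <= 3600:
        return "native_video"          # up to ~1 hour: a native video model like Gemini 1.5 Pro
    return "specialized_pipeline"      # longer or high-volume: chunking or a dedicated pipeline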
Implementation Patterns
Frame-Based Analysis
class FrameBasedVideoAnalyzer:
    """Analyze video by extracting key frames."""

    def __init__(self, vision_model, frames_per_minute: int = 2):
        self.model = vision_model
        self.frames_per_minute = frames_per_minute

    async def analyze_video(
        self,
        video_path: str,
        prompt: str
    ) -> VideoAnalysis:
        # Extract key frames
        frames = self._extract_frames(video_path)

        # Analyze frames with context
        frame_analyses = []
        for i, frame in enumerate(frames):
            analysis = await self.model.analyze_image(
                image=frame.image,
                prompt=f"""This is frame {i+1}/{len(frames)} from a video.
Timestamp: {frame.timestamp}

{prompt}

Describe what's happening in this frame:"""
            )
            frame_analyses.append(FrameAnalysis(
                timestamp=frame.timestamp,
                description=analysis
            ))

        # Synthesize overall analysis
        summary = await self._synthesize(frame_analyses, prompt)

        return VideoAnalysis(
            frames=frame_analyses,
            summary=summary
        )

    def _extract_frames(self, video_path: str) -> list[Frame]:
        # Use OpenCV to extract frames at regular intervals
        import cv2

        video = cv2.VideoCapture(video_path)
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps * 60 / self.frames_per_minute)

        frames = []
        frame_count = 0
        while True:
            success, image = video.read()
            if not success:
                break
            if frame_count % frame_interval == 0:
                timestamp = frame_count / fps
                frames.append(Frame(image=image, timestamp=timestamp))
            frame_count += 1

        video.release()
        return frames
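The analyzer assumes a few small data containers for frames and results. The exact fields are an assumption here; a minimal sketch:

import numpy as np
from dataclasses import dataclass

@dataclass
class Frame:
    image: np.ndarray        # raw BGR frame as returned by OpenCV
    timestamp: float         # seconds from the start of the video

@dataclass
class FrameAnalysis:
    timestamp: float         # when the frame occurs
    description: str         # model's description of the frame

@dataclass
class VideoAnalysis:
    frames: list[FrameAnalysis]
    summary: str             # synthesis across all frame descriptions

The _synthesize step can then pass the timestamped descriptions to a text model and ask for an overall answer to the original prompt.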
Native Video Models
class NativeVideoAnalyzer:
    """Use models with native video understanding."""

    async def analyze_with_gemini(
        self,
        video_path: str,
        prompt: str
    ) -> str:
        import asyncio
        import google.generativeai as genai

        # Upload video to the Files API
        video_file = genai.upload_file(video_path)

        # Wait for processing to finish
        while video_file.state.name == "PROCESSING":
            await asyncio.sleep(5)
            video_file = genai.get_file(video_file.name)

        # Analyze the whole video in one request
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = await model.generate_content_async([
            video_file,
            prompt
        ])
        return response.text
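Calling it is a one-liner once the API key is configured. The file name and prompt below are placeholders:

import asyncio
import google.generativeai as genai

genai.configure(api_key="...")  # or set the GOOGLE_API_KEY environment variable

async def main():
    analyzer = NativeVideoAnalyzer()
    answer = await analyzer.analyze_with_gemini(
        "team_meeting.mp4",                              # hypothetical local file
        "Summarize the key moments with timestamps."
    )
    print(answer)

asyncio.run(main())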
Use Cases
Meeting Summarization
class MeetingAnalyzer:
    """Analyze recorded meetings."""

    async def analyze_meeting(
        self,
        video_path: str
    ) -> MeetingSummary:
        # Extract transcript (if not provided)
        transcript = await self._extract_transcript(video_path)

        # Analyze video for visual context
        visual_analysis = await self.video_analyzer.analyze_video(
            video_path,
            prompt="Note any presentations, demos, or visual materials shown."
        )

        # Combine for comprehensive summary
        summary = await self.llm.generate(
            prompt=f"""Summarize this meeting.

Transcript:
{transcript}

Visual elements observed:
{visual_analysis.summary}

Provide:
1. Key discussion points
2. Decisions made
3. Action items
4. Visual materials referenced
"""
        )

        return MeetingSummary.parse(summary)
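The _extract_transcript helper is left abstract above. One way to fill it in (an assumption, not the only option) is local speech-to-text with openai-whisper, which handles video files directly because it decodes audio through ffmpeg:

import asyncio
import whisper  # openai-whisper; requires ffmpeg on the PATH

async def _extract_transcript(self, video_path: str) -> str:
    # Whisper is synchronous and compute-bound, so run it off the event loop.
    def transcribe() -> str:
        model = whisper.load_model("base")
        result = model.transcribe(video_path)
        return result["text"]

    return await asyncio.to_thread(transcribe)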
Content Moderation
class VideoModerator:
    """Moderate video content for policy violations."""

    async def moderate(
        self,
        video_path: str,
        policies: list[str]
    ) -> ModerationResult:
        # Sample frames throughout the video
        frames = self._sample_frames(video_path, samples=20)

        violations = []
        for frame in frames:
            check = await self.model.analyze_image(
                image=frame.image,
                prompt=f"""Check this video frame for policy violations.

Policies:
{chr(10).join(f'- {p}' for p in policies)}

If any violation, describe:
- Which policy
- Confidence (high/medium/low)
- What's shown
"""
            )
            if check.has_violation:
                violations.append(Violation(
                    timestamp=frame.timestamp,
                    policy=check.policy,
                    confidence=check.confidence
                ))

        return ModerationResult(
            violations=violations,
            recommendation=self._recommend_action(violations)
        )
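How violations translate into an action is a product decision. One simple policy (the thresholds here are assumptions) is to escalate on any high-confidence hit and send everything else to human review:

def _recommend_action(self, violations: list[Violation]) -> str:
    if not violations:
        return "approve"
    if any(v.confidence == "high" for v in violations):
        return "remove"          # clear violation: block or take down the video
    return "human_review"        # uncertain: route to a human moderator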
Key Takeaways
- Video understanding is becoming practical in 2025
- Frame-based analysis works with any vision model
- Native video models handle longer content
- Meeting analysis is a high-value use case
- Content moderation becomes scalable with AI
- Combine transcript and visual analysis for the best results
- Consider processing costs for long videos
- Start with specific use cases, not general “video understanding”
Video AI opens new application categories. Build for specific needs.