Video Understanding with AI

February 17, 2025

Video understanding has lagged behind image and text AI, but 2025 is changing that. Models can now watch videos and answer questions about them. This enables new categories of applications.

Here’s how to build applications that understand video.

Current Capabilities

What’s Possible

video_ai_capabilities:
  content_understanding:
    - Action recognition
    - Scene description
    - Object tracking
    - Event detection

  temporal_reasoning:
    - Sequence understanding
    - Cause and effect
    - Timeline extraction
    - Change detection

  practical_applications:
    - Video summarization
    - Content moderation
    - Meeting analysis
    - Security monitoring
    - Sports analysis

Model Options

video_models_2025:
  gemini_15_pro:
    context: "Up to 1 hour of video"
    strengths: ["Long context", "Google integration"]
    use_case: "Long-form video analysis"

  gpt4_vision:
    approach: "Frame extraction"
    strengths: ["Quality", "Reasoning"]
    use_case: "Detailed analysis of clips"

  specialized_models:
    examples: ["Twelve Labs", "Video LLaVA"]
    strengths: ["Domain-specific", "Efficiency"]
    use_case: "Production video pipelines"

Implementation Patterns

Frame-Based Analysis

class FrameBasedVideoAnalyzer:
    """Analyze video by extracting key frames."""

    def __init__(self, vision_model, frames_per_minute: int = 2):
        self.model = vision_model
        self.frames_per_minute = frames_per_minute

    async def analyze_video(
        self,
        video_path: str,
        prompt: str
    ) -> VideoAnalysis:
        # Extract key frames
        frames = self._extract_frames(video_path)

        # Analyze frames with context
        frame_analyses = []
        for i, frame in enumerate(frames):
            analysis = await self.model.analyze_image(
                image=frame.image,
                prompt=f"""This is frame {i+1}/{len(frames)} from a video.
Timestamp: {frame.timestamp}

{prompt}

Describe what's happening in this frame:"""
            )
            frame_analyses.append(FrameAnalysis(
                timestamp=frame.timestamp,
                description=analysis
            ))

        # Synthesize overall analysis
        summary = await self._synthesize(frame_analyses, prompt)

        return VideoAnalysis(
            frames=frame_analyses,
            summary=summary
        )

    def _extract_frames(self, video_path: str) -> list[Frame]:
        # Use OpenCV to sample frames at a fixed interval
        import cv2
        video = cv2.VideoCapture(video_path)
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps * 60 / self.frames_per_minute)

        frames = []
        frame_count = 0
        while True:
            success, image = video.read()
            if not success:
                break
            if frame_count % frame_interval == 0:
                timestamp = frame_count / fps
                frames.append(Frame(image=image, timestamp=timestamp))
            frame_count += 1

        video.release()
        return frames
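
The Frame, FrameAnalysis, and VideoAnalysis containers used above aren't defined in the snippet; a minimal version, assuming plain dataclasses and OpenCV-style numpy frames, could be:

from dataclasses import dataclass

import numpy as np


@dataclass
class Frame:
    image: np.ndarray   # BGR image as returned by cv2.VideoCapture.read()
    timestamp: float    # seconds from the start of the video


@dataclass
class FrameAnalysis:
    timestamp: float
    description: str


@dataclass
class VideoAnalysis:
    frames: list[FrameAnalysis]
    summary: str

The _synthesize step is omitted above; in practice it can be a single text-model call that takes the timestamped frame descriptions plus the original prompt and returns an overall summary.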

Native Video Models

class NativeVideoAnalyzer:
    """Use models with native video understanding."""

    async def analyze_with_gemini(
        self,
        video_path: str,
        prompt: str
    ) -> str:
        import asyncio

        import google.generativeai as genai

        # Upload video
        video_file = genai.upload_file(video_path)

        # Wait for the File API to finish processing the upload
        while video_file.state.name == "PROCESSING":
            await asyncio.sleep(5)
            video_file = genai.get_file(video_file.name)

        if video_file.state.name == "FAILED":
            raise ValueError("Video processing failed")

        # Analyze
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = await model.generate_content_async([
            video_file,
            prompt
        ])

        return response.text
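
Assuming an API key has been set up with genai.configure (or via the GOOGLE_API_KEY environment variable), calling the analyzer from a script looks like this; the file name and prompt are placeholders:

import asyncio

import google.generativeai as genai

genai.configure(api_key="...")  # or rely on the GOOGLE_API_KEY environment variable

analyzer = NativeVideoAnalyzer()
summary = asyncio.run(
    analyzer.analyze_with_gemini(
        "team_meeting.mp4",
        "Summarize the main topics covered in this recording.",
    )
)
print(summary)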

Use Cases

Meeting Summarization

class MeetingAnalyzer:
    """Analyze recorded meetings."""

    def __init__(self, video_analyzer: FrameBasedVideoAnalyzer, llm):
        # Reuses the frame-based analyzer from above plus any text-generation client
        self.video_analyzer = video_analyzer
        self.llm = llm

    async def analyze_meeting(
        self,
        video_path: str
    ) -> MeetingSummary:
        # Extract transcript (if not provided)
        transcript = await self._extract_transcript(video_path)

        # Analyze video for visual context
        visual_analysis = await self.video_analyzer.analyze_video(
            video_path,
            prompt="Note any presentations, demos, or visual materials shown."
        )

        # Combine for comprehensive summary
        summary = await self.llm.generate(
            prompt=f"""Summarize this meeting.

Transcript:
{transcript}

Visual elements observed:
{visual_analysis.summary}

Provide:
1. Key discussion points
2. Decisions made
3. Action items
4. Visual materials referenced
"""
        )

        return MeetingSummary.parse(summary)
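
_extract_transcript is not shown. One straightforward option, assuming the open-source whisper package (and ffmpeg, which it uses for decoding) is installed, is to transcribe the recording locally:

import asyncio

import whisper


async def extract_transcript(video_path: str, model_size: str = "base") -> str:
    # Whisper decodes the audio track via ffmpeg, so a video path works directly.
    def _run() -> str:
        model = whisper.load_model(model_size)
        result = model.transcribe(video_path)
        return result["text"]

    # Transcription is compute-bound; keep it out of the async event loop.
    return await asyncio.to_thread(_run)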

Content Moderation

class VideoModerator:
    """Moderate video content for policy violations."""

    async def moderate(
        self,
        video_path: str,
        policies: list[str]
    ) -> ModerationResult:
        # Sample frames throughout video
        frames = self._sample_frames(video_path, samples=20)

        violations = []
        for frame in frames:
            # Assumes analyze_image returns a structured result with
            # has_violation, policy, and confidence fields parsed from the model output
            check = await self.model.analyze_image(
                image=frame.image,
                prompt=f"""Check this video frame for policy violations.

Policies:
{chr(10).join(f'- {p}' for p in policies)}

If any violation, describe:
- Which policy
- Confidence (high/medium/low)
- What's shown
"""
            )
            if check.has_violation:
                violations.append(Violation(
                    timestamp=frame.timestamp,
                    policy=check.policy,
                    confidence=check.confidence
                ))

        return ModerationResult(
            violations=violations,
            recommendation=self._recommend_action(violations)
        )
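
_sample_frames differs from the earlier _extract_frames: it takes a fixed number of evenly spaced frames rather than sampling at a fixed rate. A minimal OpenCV version, reusing the Frame dataclass from above, could be:

import cv2


def sample_frames(video_path: str, samples: int = 20) -> list[Frame]:
    # Spread `samples` frame indices evenly across the whole video.
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = {int(i * total / samples) for i in range(samples)}

    frames = []
    frame_count = 0
    while True:
        success, image = video.read()
        if not success:
            break
        if frame_count in indices:
            frames.append(Frame(image=image, timestamp=frame_count / fps))
        frame_count += 1

    video.release()
    return frames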

Key Takeaways

Video AI opens new application categories, from meeting summaries to content moderation. The pattern to pick depends on the job: frame extraction plus an image model for short clips and tight cost control, native video models like Gemini 1.5 Pro for long-form content, and specialized models for production pipelines. Build for specific needs rather than generic video understanding.