From db2bf164274195ea3463d9dafe8c24aa9ef51fd6 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 10 Mar 2026 21:18:33 -0700 Subject: [PATCH] docs: resolve videodb review findings --- skills/videodb/SKILL.md | 128 ++++++++--------- skills/videodb/reference/capture-reference.md | 46 +++--- skills/videodb/reference/editor.md | 11 +- skills/videodb/reference/streaming.md | 6 +- skills/videodb/scripts/ws_listener.py | 131 ++++++++++++------ 5 files changed, 188 insertions(+), 134 deletions(-) diff --git a/skills/videodb/SKILL.md b/skills/videodb/SKILL.md index 27787036..e9195d7d 100644 --- a/skills/videodb/SKILL.md +++ b/skills/videodb/SKILL.md @@ -10,59 +10,48 @@ argument-hint: "[task description]" **Perception + memory + actions for video, live streams, and desktop sessions.** -Use this skill when you need to: +## When to Use -## 1) Desktop Perception +### Desktop Perception - Start/stop a **desktop session** capturing **screen, mic, and system audio** - Stream **live context** and store **episodic session memory** - Run **real-time alerts/triggers** on what's spoken and what's happening on screen - Produce **session summaries**, a searchable timeline, and **playable evidence links** -## 2) Video ingest + stream +### Video ingest + stream - Ingest a **file or URL** and return a **playable web stream link** - Transcode/normalize: **codec, bitrate, fps, resolution, aspect ratio** -## 3) Index + search (timestamps + evidence) +### Index + search (timestamps + evidence) - Build **visual**, **spoken**, and **keyword** indexes - Search and return exact moments with **timestamps** and **playable evidence** - Auto-create **clips** from search results -## 4) Timeline editing + generation +### Timeline editing + generation - Subtitles: **generate**, **translate**, **burn-in** - Overlays: **text/image/branding**, motion captions - Audio: **background music**, **voiceover**, **dubbing** - Programmatic composition and exports via **timeline operations** -## 5) Live streams 
(RTSP) + monitoring +### Live streams (RTSP) + monitoring - Connect **RTSP/live feeds** - Run **real-time visual and spoken understanding** and emit **events/alerts** for monitoring workflows ---- +## How It Works -## Common inputs +### Common inputs - Local **file path**, public **URL**, or **RTSP URL** - Desktop capture request: **start / stop / summarize session** - Desired operations: get context for understanding, transcode spec, index spec, search query, clip ranges, timeline edits, alert rules -## Common outputs +### Common outputs - **Stream URL** - Search results with **timestamps** and **evidence links** - Generated assets: subtitles, audio, images, clips - **Event/alert payloads** for live streams - Desktop **session summaries** and memory entries ---- - -## Canonical prompts (examples) -- "Start desktop capture and alert when a password field appears." -- "Record my session and produce an actionable summary when it ends." -- "Ingest this file and return a playable stream link." -- "Index this folder and find every scene with people, return timestamps." -- "Generate subtitles, burn them in, and add light background music." -- "Connect this RTSP URL and alert when a person enters the zone." - - -## Running Python code +### Running Python code Before running any VideoDB code, change to the project directory and load environment variables: @@ -96,7 +85,7 @@ print(f"Videos: {len(coll.get_videos())}") EOF ``` -## Setup +### Setup When the user asks to "setup videodb" or similar: @@ -123,7 +112,7 @@ Get a free API key at https://console.videodb.io (50 free uploads, no credit car **Do NOT** read, write, or handle the API key yourself. Always let the user set it. 
-## Quick Reference +### Quick Reference ### Upload media @@ -298,6 +287,55 @@ except InvalidRequestError as e: | Negative timestamps on Timeline | Silently produces broken stream | Always validate `start >= 0` before creating `VideoAsset` | | `generate_video()` / `create_collection()` fails | `Operation not allowed` or `maximum limit` | Plan-gated features — inform the user about plan limits | +## Examples + +### Canonical prompts +- "Start desktop capture and alert when a password field appears." +- "Record my session and produce an actionable summary when it ends." +- "Ingest this file and return a playable stream link." +- "Index this folder and find every scene with people, return timestamps." +- "Generate subtitles, burn them in, and add light background music." +- "Connect this RTSP URL and alert when a person enters the zone." + +### Screen Recording (Desktop Capture) + +Use `ws_listener.py` to capture WebSocket events during recording sessions. Desktop capture supports **macOS** only. + +#### Quick Start + +1. **Start listener**: `python scripts/ws_listener.py --clear &` +2. **Get WebSocket ID**: `cat "${VIDEODB_EVENTS_DIR:-$HOME/.local/state/videodb}/videodb_ws_id"` +3. **Run capture code** (see reference/capture.md for the full workflow) +4. **Events written to**: `${VIDEODB_EVENTS_DIR:-$HOME/.local/state/videodb}/videodb_events.jsonl` + +Use `--clear` whenever you start a fresh capture run so stale transcript and visual events do not leak into the new session. 
+ +#### Query Events + +```python +import json +import time +from pathlib import Path + +events_file = Path.home() / ".local" / "state" / "videodb" / "videodb_events.jsonl" +events = [] + +if events_file.exists(): + with events_file.open(encoding="utf-8") as handle: + for line in handle: + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + +transcripts = [e["data"]["text"] for e in events if e.get("channel") == "transcript"] +cutoff = time.time() - 300 +recent_visual = [ + e for e in events + if e.get("channel") == "visual_index" and e["unix_ts"] > cutoff +] +``` + ## Additional docs Reference documentation is in the `reference/` directory adjacent to this SKILL.md file. Use the Glob tool to locate it if needed. @@ -313,50 +351,6 @@ Reference documentation is in the `reference/` directory adjacent to this SKILL. - [reference/capture-reference.md](reference/capture-reference.md) - Capture SDK and WebSocket events - [reference/use-cases.md](reference/use-cases.md) - Common video processing patterns and examples -## Screen Recording (Desktop Capture) - -Use `ws_listener.py` to capture WebSocket events during recording sessions. Desktop capture supports **macOS** only. - -### Quick Start - -1. **Start listener**: `python scripts/ws_listener.py &` -2. **Get WebSocket ID**: `cat /tmp/videodb_ws_id` -3. **Run capture code** (see reference/capture.md for full workflow) -4. 
**Events written to**: `/tmp/videodb_events.jsonl` - -### Query Events - -```python -import json -from pathlib import Path - -events_file = Path("/tmp/videodb_events.jsonl") -events = [] - -if events_file.exists(): - with events_file.open(encoding="utf-8") as handle: - for line in handle: - try: - events.append(json.loads(line)) - except json.JSONDecodeError: - continue - -# Get all transcripts -transcripts = [e["data"]["text"] for e in events if e.get("channel") == "transcript"] - -# Get visual descriptions from last 5 minutes -import time -cutoff = time.time() - 300 -recent_visual = [e for e in events - if e.get("channel") == "visual_index" and e["unix_ts"] > cutoff] -``` - -### Utility Scripts - -- [scripts/ws_listener.py](scripts/ws_listener.py) - WebSocket event listener (dumps to JSONL) - -For complete capture workflow, see [reference/capture.md](reference/capture.md). - **Do not use ffmpeg, moviepy, or local encoding tools** when VideoDB supports the operation. The following are all handled server-side by VideoDB — trimming, combining clips, overlaying audio or music, adding subtitles, text/image overlays, transcoding, resolution changes, aspect-ratio conversion, resizing for platform requirements, transcription, and media generation. Only fall back to local tools for operations listed under Limitations in reference/editor.md (transitions, speed changes, crop/zoom, colour grading, volume mixing). diff --git a/skills/videodb/reference/capture-reference.md b/skills/videodb/reference/capture-reference.md index 4427aae9..302e0ef1 100644 --- a/skills/videodb/reference/capture-reference.md +++ b/skills/videodb/reference/capture-reference.md @@ -8,7 +8,7 @@ Code-level details for VideoDB capture sessions. For workflow guide, see [captur Real-time events from capture sessions and AI pipelines. No webhooks or polling required. -Use [scripts/ws_listener.py](../scripts/ws_listener.py) to connect and dump events to `/tmp/videodb_events.jsonl`. 
+Use [scripts/ws_listener.py](../scripts/ws_listener.py) to connect and dump events to `${VIDEODB_EVENTS_DIR:-$HOME/.local/state/videodb}/videodb_events.jsonl`. ### Event Channels @@ -137,12 +137,12 @@ The script outputs `WS_ID=` on the first line, then listens indef **Get the ws_id:** ```bash -cat /tmp/videodb_ws_id +cat "${VIDEODB_EVENTS_DIR:-$HOME/.local/state/videodb}/videodb_ws_id" ``` **Stop the listener:** ```bash -kill $(cat /tmp/videodb_ws_pid) +kill "$(cat "${VIDEODB_EVENTS_DIR:-$HOME/.local/state/videodb}/videodb_ws_pid")" ``` **Functions that accept `ws_connection_id`:** @@ -152,7 +152,7 @@ kill $(cat /tmp/videodb_ws_pid) | `conn.create_capture_session()` | Session lifecycle events | | RTStream methods | See [rtstream-reference.md](rtstream-reference.md) | -**Output files** (in output directory, default `/tmp`): +**Output files** (in output directory, default `${XDG_STATE_HOME:-$HOME/.local/state}/videodb`): - `videodb_ws_id` - WebSocket connection ID - `videodb_events.jsonl` - All events - `videodb_ws_pid` - Process ID for easy termination @@ -176,20 +176,27 @@ Each line is a JSON object with added timestamps: ```python import json -events = [json.loads(l) for l in open("/tmp/videodb_events.jsonl")] - -# Filter by channel -transcripts = [e for e in events if e.get("channel") == "transcript"] - -# Filter by time (last 10 minutes) import time -cutoff = time.time() - 600 -recent = [e for e in events if e["unix_ts"] > cutoff] +from pathlib import Path -# Filter visual events containing keyword -visual = [e for e in events - if e.get("channel") == "visual_index" - and "code" in e.get("data", {}).get("text", "").lower()] +events_path = Path.home() / ".local" / "state" / "videodb" / "videodb_events.jsonl" +transcripts = [] +recent = [] +visual = [] + +cutoff = time.time() - 600 +with events_path.open(encoding="utf-8") as handle: + for line in handle: + event = json.loads(line) + if event.get("channel") == "transcript": + transcripts.append(event) + if 
event.get("unix_ts", 0) > cutoff: + recent.append(event) + if ( + event.get("channel") == "visual_index" + and "code" in event.get("data", {}).get("text", "").lower() + ): + visual.append(event) ``` --- @@ -224,7 +231,9 @@ ws_id = ws.connection_id ### Create a Capture Session ```python -ws_id = open("/tmp/videodb_ws_id").read().strip() +from pathlib import Path + +ws_id = (Path.home() / ".local" / "state" / "videodb" / "videodb_ws_id").read_text().strip() session = conn.create_capture_session( end_user_id="user-123", # required @@ -390,7 +399,8 @@ For RTStream methods (indexing, transcription, alerts, batch config), see [rtstr ┌───────────────┐ WebSocket: capture_session.exported │ exported │ ──> Access video_id, stream_url, player_url └───────────────┘ - + + unrecoverable capture error │ v ┌───────────────┐ WebSocket: capture_session.failed diff --git a/skills/videodb/reference/editor.md b/skills/videodb/reference/editor.md index 22262f5b..6f772af0 100644 --- a/skills/videodb/reference/editor.md +++ b/skills/videodb/reference/editor.md @@ -280,6 +280,7 @@ For more streaming options (segment streams, search-to-stream, audio playback), ```python import videodb from videodb import SearchType +from videodb.exceptions import InvalidRequestError from videodb.timeline import Timeline from videodb.asset import VideoAsset, TextAsset, TextStyle @@ -289,8 +290,14 @@ video = coll.get_video("your-video-id") # 1. Search for key moments video.index_spoken_words(force=True) -results = video.search("product announcement", search_type=SearchType.semantic) -shots = results.get_shots() # may be empty if no results +try: + results = video.search("product announcement", search_type=SearchType.semantic) + shots = results.get_shots() +except InvalidRequestError as exc: + if "No results found" in str(exc): + shots = [] + else: + raise # 2. 
Build timeline timeline = Timeline(conn) diff --git a/skills/videodb/reference/streaming.md b/skills/videodb/reference/streaming.md index 55e653c5..cf591d92 100644 --- a/skills/videodb/reference/streaming.md +++ b/skills/videodb/reference/streaming.md @@ -47,10 +47,10 @@ video.play() ```python # Index and add subtitles first video.index_spoken_words(force=True) -video.add_subtitle() +stream_url = video.add_subtitle() -# Stream now includes subtitles -stream_url = video.generate_stream() +# Returned URL already includes subtitles +print(f"Subtitled stream: {stream_url}") ``` ### Specific Segments diff --git a/skills/videodb/scripts/ws_listener.py b/skills/videodb/scripts/ws_listener.py index 968038fe..456bfb20 100644 --- a/skills/videodb/scripts/ws_listener.py +++ b/skills/videodb/scripts/ws_listener.py @@ -6,7 +6,7 @@ Usage: python scripts/ws_listener.py [OPTIONS] [output_dir] Arguments: - output_dir Directory for output files (default: /tmp or VIDEODB_EVENTS_DIR env var) + output_dir Directory for output files (default: XDG_STATE_HOME/videodb or ~/.local/state/videodb) Options: --clear Clear the events file before starting (use when starting a new session) @@ -20,10 +20,10 @@ Output (first line, for parsing): WS_ID= Examples: - python scripts/ws_listener.py & # Run in background - python scripts/ws_listener.py --clear # Clear events and start fresh - python scripts/ws_listener.py --clear /tmp/mydir # Custom dir with clear - kill $(cat /tmp/videodb_ws_pid) # Stop the listener + python scripts/ws_listener.py & # Run in background + python scripts/ws_listener.py --clear # Clear events and start fresh + python scripts/ws_listener.py --clear /tmp/mydir # Custom dir with clear + kill "$(cat ~/.local/state/videodb/videodb_ws_pid)" # Stop the listener """ import os import sys @@ -31,6 +31,7 @@ import json import signal import asyncio import logging +import contextlib from datetime import datetime, timezone from pathlib import Path @@ -52,6 +53,27 @@ logging.basicConfig( 
+def default_output_dir() -> Path:
+    """Return the events directory: VIDEODB_EVENTS_DIR override, else per-user XDG state dir."""
+    if override := os.environ.get("VIDEODB_EVENTS_DIR"):
+        return Path(override)
+    xdg_state_home = os.environ.get("XDG_STATE_HOME")
+    return Path(xdg_state_home) / "videodb" if xdg_state_home else Path.home() / ".local" / "state" / "videodb"
log(f"Connected (ws_id={ws_id})") - - # Reset retry state on successful connection - retry_count = 0 - backoff = INITIAL_BACKOFF - - # Listen for messages - async for msg in ws.receive(): - append_event(msg) - channel = msg.get("channel", msg.get("event", "unknown")) - text = msg.get("data", {}).get("text", "") - if text: - print(f"[{channel}] {text[:80]}", flush=True) - - # If we exit the loop normally, connection was closed - log("Connection closed by server") - except asyncio.CancelledError: log("Shutdown requested") raise - except Exception as e: + except RETRYABLE_ERRORS as e: retry_count += 1 log(f"Connection error: {e}") @@ -165,6 +154,52 @@ async def listen_with_retry(): log(f"Reconnecting in {backoff}s (attempt {retry_count}/{MAX_RETRIES})...") await asyncio.sleep(backoff) backoff = min(backoff * 2, MAX_BACKOFF) + continue + + OUTPUT_DIR.mkdir(parents=True, exist_ok=True, mode=0o700) + + if _first_connection and CLEAR_EVENTS: + EVENTS_FILE.unlink(missing_ok=True) + log("Cleared events file") + _first_connection = False + + WS_ID_FILE.write_text(ws_id) + + if retry_count == 0: + print(f"WS_ID={ws_id}", flush=True) + log(f"Connected (ws_id={ws_id})") + + retry_count = 0 + backoff = INITIAL_BACKOFF + + receiver = ws.receive().__aiter__() + while True: + try: + msg = await anext(receiver) + except StopAsyncIteration: + log("Connection closed by server") + break + except asyncio.CancelledError: + log("Shutdown requested") + raise + except RETRYABLE_ERRORS as e: + retry_count += 1 + log(f"Connection error: {e}") + + if retry_count >= MAX_RETRIES: + log(f"Max retries ({MAX_RETRIES}) exceeded, exiting") + return + + log(f"Reconnecting in {backoff}s (attempt {retry_count}/{MAX_RETRIES})...") + await asyncio.sleep(backoff) + backoff = min(backoff * 2, MAX_BACKOFF) + break + + append_event(msg) + channel = msg.get("channel", msg.get("event", "unknown")) + text = msg.get("data", {}).get("text", "") + if text: + print(f"[{channel}] {text[:80]}", flush=True) async def 
main_async(): @@ -178,7 +213,8 @@ async def main_async(): # Register signal handlers for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, handle_signal) + with contextlib.suppress(NotImplementedError): + loop.add_signal_handler(sig, handle_signal) # Run listener with cancellation support listen_task = asyncio.create_task(listen_with_retry()) @@ -188,6 +224,9 @@ async def main_async(): [listen_task, shutdown_task], return_when=asyncio.FIRST_COMPLETED, ) + + if listen_task.done(): + await listen_task # Cancel remaining tasks for task in pending: @@ -196,6 +235,10 @@ async def main_async(): await task except asyncio.CancelledError: pass + + for sig in (signal.SIGINT, signal.SIGTERM): + with contextlib.suppress(NotImplementedError): + loop.remove_signal_handler(sig) log("Shutdown complete")