xpark.dataset.VideoCompute#

class xpark.dataset.VideoCompute(*args, **kwargs)[source]#

Note

Do not construct this class; use its static methods instead.

Methods

`average_rate`(videos)	The average framerate of the video, as a float
`base_rate`(videos)	The fundamental framerate of the stream, as a float
`bit_rate`(videos)	The average bitrate of the video stream, in bits per second
`codec`(videos)	The name of the codec used for the video stream, e.g., 'h264', 'vp9'
`display_aspect_ratio`(videos)	The display aspect ratio (DAR) of the video, e.g., 16:9 is 1.777...
`extract_audio`(videos[, codec, sample_rate, ...])	Extract audio from video.
`extract_frame_at`(videos, timestamp)	Extract a video frame at specified timestamp.
`extract_frames`(videos[, timestamps, fps, ...])	Extract frames from video.
`height`(videos)	Video height, in pixels
`pix_fmt`(videos)	The pixel format of the video, e.g., 'yuv420p'
`split_by_duration`(videos[, ...])	Split video by duration.
`split_by_key_frame`(videos)	Split video by keyframe
`time_base`(videos)	The time base of the stream, representing the unit of time for timestamps
`width`(videos)	Video width, in pixels

static average_rate(videos: pa.ChunkedArray) → pa.Array#: The average framerate of the video, as a float

static base_rate(videos: pa.ChunkedArray) → pa.Array#: The fundamental framerate of the stream, as a float

static bit_rate(videos: pa.ChunkedArray) → pa.Array#: The average bitrate of the video stream, in bits per second

static codec(videos: pa.ChunkedArray) → pa.Array#: The name of the codec used for the video stream, e.g., ‘h264’, ‘vp9’

static display_aspect_ratio(videos: pa.ChunkedArray) → pa.Array#: The display aspect ratio (DAR) of the video, e.g., 16:9 is 1.777…

static extract_audio(videos: pa.ChunkedArray, codec: str | None = None, sample_rate: int | None = None, stream_index: int | None = None, start_second: float = 0, end_second: float | None = None) → pa.Array#

Extract audio from video.

This processor extracts audio data from video files and returns the audio binary data along with its file extension. It supports various file systems including COS, S3, HTTP, binary sources, and other fsspec-compatible storage systems.

Parameters:

videos – The videos to be processed.
codec – Output audio format.
sample_rate – Output audio sample rate.
stream_index – Index of the audio stream to extract.
start_second – Start time of the audio to extract.
end_second – End time of the audio to extract.

Examples

from xpark.dataset import from_items
from xpark.dataset.expressions import col
from xpark.dataset.processors.video_compute import VideoCompute
from xpark.dataset.context import DatasetContext

ctx = DatasetContext.get_current()
# set cos storage_options
# ctx.storage_options = {"cos": {"endpoint_url": "https://your-cos-endpoint"}}
# VIDEO_COS_PATH is cos path like cos://bucket/path/to/video.mp4

ds = from_items([{"video": VIDEO_COS_PATH}])
ds = ds.with_column(
    "audio",
    VideoCompute.extract_audio
    .options(num_workers={"CPU": 1}, batch_size=1)
    .with_column(col("video")),
)

audio_bytes = ds.take(1)[0]['audio']

static extract_frame_at(videos: pa.ChunkedArray, timestamp: pa.ChunkedArray | float) → pa.Array#

Extract a video frame at specified timestamp.

This processor decodes a video frame from a video file at the given timestamp and returns it as a numpy array in RGB mode. It uses PyAV (ffmpeg wrapper) for decoding. It supports various file systems including COS, S3, HTTP, binary sources, and other fsspec-compatible storage systems.

Parameters:

videos – Column of video file paths (strings) or binary data. Supports local paths and remote paths (cos://, s3://, hf://, etc.).
timestamp – Column of timestamps in seconds, or a single float value to extract frames at the same timestamp for all videos.

Returns:

ArrowTensorArray of decoded frames, each with shape (H, W, C), dtype uint8, values in range [0, 255].

Examples

Basic usage:

from xpark.dataset import from_items
from xpark.dataset.expressions import col
from xpark.dataset.processors.video_compute import VideoCompute

ds = from_items([
    {"video": "video.mp4", "timestamp": 0.5},
    {"video": "video.mp4", "timestamp": 1.0},
])

# Decode video frame at specified timestamp
ds = ds.with_column(
    "frame",
    VideoCompute.extract_frame_at(col("video"), col("timestamp")),
)

# Get the extracted frames
frames = ds.take(2)

With float input:

ds = from_items([{"video": "video.mp4"}, {"video": "video1.mp4"}])
ds = ds.with_column(
    "frame",
    VideoCompute.extract_frame_at(col("video"), 0.5),
)

With custom options:

ds = ds.with_column(
    "frame",
    VideoCompute.extract_frame_at
    .options(num_workers={"CPU": 2}, batch_size=16)
    .with_column(
        col("video"),
        col("timestamp"),
    ),
)

static extract_frames(videos: pa.ChunkedArray, timestamps: pa.ChunkedArray | None = None, *, fps: float | None = None, keyframes_only: bool = False, num_frames: int | None = None, start_time: float = 0.0, end_time: float | None = None) → pa.Array#

Extract frames from video.

When keyframes_only is True, the extraction process will extract the nearest keyframe before each specified timestamp.

Three extraction modes are supported:

1. Specify the timestamps parameter: Extract frames at the corresponding timestamps.
2. Specify the fps parameter: Extract frames uniformly at the specified frame rate from start_time to end_time.
3. Specify the num_frames parameter: Extract a specified number of frames uniformly distributed from start_time to end_time.

Note: fps and num_frames parameters cannot be specified simultaneously.

Parameters:

videos – Column of video file paths (strings) or binary data (bytes). Supports local paths and remote paths (cos://, s3://, hf://, etc.).
timestamps – Optional column of timestamps in seconds (floats) at which to extract frames. Can be a single float or a list of floats per video.
fps – Target frames per second to extract. For example, fps=2.0 extracts 2 frames per second of video.
keyframes_only – If True, extract only keyframes (I-frames). Keyframes are independently decodable frames, useful for scene detection.
num_frames – Number of frames to extract uniformly distributed across the video duration.
start_time – Start time in seconds for extraction range. Defaults to 0.0, meaning extraction starts from the beginning of the video.
end_time – End time in seconds for extraction range. Defaults to None, meaning extraction continues until the end of the video.

Returns:

ArrowTensorArray of extracted frames with shape (N, H, W, C) per video, dtype uint8, values in range [0, 255].

Examples

Extract frames at specific timestamps:

from xpark.dataset import from_items
from xpark.dataset.expressions import col
from xpark.dataset.processors.video_compute import VideoCompute

ds = from_items([{"video": "video.mp4", "ts": [0.5, 1.0, 1.5]}])
ds = ds.with_column(
    "frames",
    VideoCompute.extract_frames(col("video"), col("ts")),
)

Extract frames at 2 FPS:

ds = ds.with_column(
    "frames",
    VideoCompute.extract_frames(col("video"), fps=2.0, start_time=0.0, end_time=5.0),
)

Extract keyframes only:

ds = ds.with_column(
    "keyframes",
    VideoCompute.extract_frames(col("video"), keyframes_only=True, num_frames=10, start_time=0.0, end_time=50.0),
)

Extract 10 frames uniformly:

ds = ds.with_column(
    "frames",
    VideoCompute.extract_frames(col("video"), num_frames=10, start_time=0.0),
)

See also

extract_frame_at(): Simpler API for single frame extraction.
data-juicer VideoExtractFramesMapper for similar functionality.

static height(videos: pa.ChunkedArray) → pa.Array#: Video height, in pixels

static pix_fmt(videos: pa.ChunkedArray) → pa.Array#: The pixel format of the video, e.g., ‘yuv420p’

static split_by_duration(videos: pa.ChunkedArray, segment_duration: float = 10, min_segment_duration: float = 0) → pa.Array#

Split video by duration.

This processor splits video files into multiple segments based on a fixed time length (segment_duration). For each video in the input, it outputs a list of binary data for the video segments. The default split points are keyframes. It supports various file systems including COS, S3, HTTP, binary sources, and other fsspec-compatible storage systems.

Parameters:

videos – The videos to be processed.
segment_duration – Target duration of each segment, in seconds. Defaults to 10.
min_segment_duration – Minimum duration for each segment. Segments shorter than this value are discarded, which is useful for dropping overly short segments at the end of a video. Defaults to 0.

Examples

from xpark.dataset import from_items
from xpark.dataset.expressions import col
from xpark.dataset.processors.video_compute import VideoCompute
from xpark.dataset.context import DatasetContext

ctx = DatasetContext.get_current()
# set cos storage_options
# ctx.storage_options = {"cos": {"endpoint_url": "https://your-cos-endpoint"}}
# VIDEO_COS_PATH is cos path like cos://bucket/path/to/video.mp4

ds = from_items([{"video": VIDEO_COS_PATH}])
ds = ds.with_column(
    "split_videos",
    VideoCompute.split_by_duration
    .options(num_workers={"CPU": 1}, batch_size=1)
    .with_column(col("video")),
)

split_videos = ds.take(1)[0]['split_videos']

static split_by_key_frame(videos: pa.ChunkedArray) → pa.Array#

Split video by keyframe

This function splits a video into segments based on keyframes, which are frames in a video stream that contain a complete image. Unlike other frames, keyframes (also known as I-frames) do not rely on previous frames for decoding and can be used as reference points to extract or seek specific video segments. Keyframes are crucial for tasks like video editing, seeking, or streaming, as they represent points where the video can be independently decoded.

It supports various file systems including COS, S3, HTTP, binary sources, and other fsspec-compatible storage systems.

Parameters: videos – The videos to be processed.

Examples

from xpark.dataset import from_items
from xpark.dataset.expressions import col
from xpark.dataset.processors.video_compute import VideoCompute
from xpark.dataset.context import DatasetContext

ctx = DatasetContext.get_current()
# set cos storage_options
# ctx.storage_options = {"cos": {"endpoint_url": "https://your-cos-endpoint"}}
# VIDEO_COS_PATH is cos path like cos://bucket/path/to/video.mp4

ds = from_items([{"video": VIDEO_COS_PATH}])
ds = ds.with_column(
    "split_videos",
    VideoCompute.split_by_key_frame
    .options(num_workers={"CPU": 1}, batch_size=1)
    .with_column(col("video")),
)

split_videos = ds.take(1)[0]['split_videos']

static time_base(videos: pa.ChunkedArray) → pa.Array#: The time base of the stream, representing the unit of time for timestamps

static width(videos: pa.ChunkedArray) → pa.Array#: Video width, in pixels