Welcome to Xpark!

Xpark is a multimodal AI data processing platform designed to streamline and optimize data workflows for AI applications. It provides comprehensive capabilities for data handling, transformation, and seamless integration with AI workflows.

Processing Multimodal Data with Xpark

from xpark.dataset import TextEmbedding, from_items
from xpark.dataset.expressions import col

# Sample sentences to embed.
sentences = [
    "what is the advantage of using the GPU rendering options in Android?",
    "Blank video when converting uncompressed AVI files with ffmpeg",
]
ds = from_items(sentences)

# Local embedding model, configured with num_workers={"CPU": 1}.
embedder = TextEmbedding("Qwen/Qwen3-Embedding-0.6B").options(num_workers={"CPU": 1})

# Apply the embedder to the "item" column and store the result as "embedding".
ds = ds.with_column("embedding", embedder.with_column(col("item")))

output = ds.take_all()

from PIL import Image

from xpark.dataset import ImageCompute, ImageTextSimilarityScore, read_image
from xpark.dataset.expressions import col

# Read the images from the sample directory.
ds = read_image("/data/Test/test-ray-data/data/mini_coco_images")

# Image Data Function: resized_image
resized = ImageCompute.resize(col("image"), size=(224, 224))
ds = ds.with_column("image_resized", resized)

# Image AI Function: image text similarity score
similarity = ImageTextSimilarityScore(text="a photo of a cat").options(
    batch_size=16,
    num_workers={"CPU": 1},
)
ds = ds.with_column("image_text_similarity", similarity.with_column(col("image")))

output = ds.take_all()

# Inspect the first row: print its score and display its resized image.
first_row = output[0]
print(first_row["image_text_similarity"])
Image.fromarray(first_row["image_resized"]).show()

import pyarrow as pa

from xpark.dataset import VideoCompute, from_arrow
from xpark.dataset.expressions import col

# Build a dataset from an Arrow table with a single "video" column of file paths.
ds = from_arrow(
    pa.table(
        {
            "video": ["/path/to/video1.mp4", "/path/to/video2.mp4"],
        }
    )
)

# Get Video Bit Rate
# The column reference must match the "video" column defined in the table above.
ds = ds.with_column("video_bit_rate", VideoCompute.bit_rate(col("video")))

# Extract Audio
ds = ds.with_column("audio", VideoCompute.extract_audio(col("video"), codec="aac", sample_rate=16000))

# Extract frames
ds = ds.with_column("frames", VideoCompute.extract_frames(col("video"), start_time=30, end_time=50, num_frames=3))


output = ds.take_all()

from __future__ import annotations

from xpark.dataset.expressions import col
from xpark.dataset import SpeechToText, from_items

# Dataset built from one audio file path.
ds = from_items(["multilingual.mp3"])

# Local transcriptions model, configured with num_workers={"GPU": 1}.
transcriber = SpeechToText("Systran/faster-whisper-large-v3").options(num_workers={"GPU": 1})

# Apply the transcriber to the "item" column and store the result as "text".
ds = ds.with_column("text", transcriber.with_column(col("item")))

rows = ds.take_all(2)
print(rows)

Cache Model (Required)

Before using any AI processors, you must cache the required models locally.

# Cache only the test models (selected with `-g test`).
python /path/to/xpark/dataset/scripts/cache_models.py -g test

# Cache all models (selected with `-g all`).
python /path/to/xpark/dataset/scripts/cache_models.py -g all

The default model cache path is ~/.cache/xpark. For distributed Ray clusters, it is recommended to use a distributed cloud disk for model caching.

Note

Model caching is currently a manual step. In the future, the EMR product will integrate it into its management system, so that models can be cached through configuration.

Next Steps

Getting Started — A quick tutorial to get you started with Xpark
Dataset API — Full Dataset API reference
Processors — All built-in Data and AI Processors