# Source code for xpark.dataset.namespace_expressions.string_namespace

"""String namespace for expression operations on string-typed columns."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from ray.data.namespace_expressions.string_namespace import _StringNamespace as RayStringNamespace

from xpark.dataset.datatype import DataType
from xpark.dataset.expressions import udf
from xpark.dataset.filters.dedup import utils as dedup_utils
from xpark.dataset.import_utils import lazy_import

if TYPE_CHECKING:
    import pyarrow as pa
    from ray.data.expressions import UDFExpr
else:
    pa = lazy_import("pyarrow", rename="pa")


@dataclass
class _StringNamespace(RayStringNamespace):
    """Namespace for string operations on expression columns.

    This namespace provides methods for operating on string-typed columns
    using PyArrow compute functions.

    Example:
        >>> from xpark.dataset.expressions import col
        >>> # Convert to uppercase
        >>> expr = col("name").str.upper()
        >>> # Get string length
        >>> expr = col("name").str.len()
        >>> # Check if string starts with a prefix
        >>> expr = col("name").str.starts_with("A")
    """

    def word_count(self, tokenizer: str = "cjk") -> UDFExpr:
        """Count words in texts using the specified tokenizer.

        Args:
            tokenizer: The tokenizer type to use for word segmentation.
                Defaults to "cjk" for Chinese-Japanese-Korean text
                processing. Currently "cjk" is the only supported value.

        Returns:
            A UDF expression producing an int32 column with the per-row
            word count.

        Raises:
            ValueError: If ``tokenizer`` is not a supported tokenizer name.
                Raised eagerly when the expression is built.

        Examples:
            .. code-block:: python

                from xpark.dataset import from_items
                from xpark.dataset.expressions import col

                ds = from_items(["Hello world", "This is a test"])
                ds = ds.with_column(
                    "word_count",
                    col("text").str.word_count(tokenizer="cjk"),
                )
                print(ds.take_all())
        """
        # Fail fast: reject an unknown tokenizer at expression-build time
        # rather than deferring the error to UDF execution on the workers.
        if tokenizer != "cjk":
            raise ValueError(f"Invalid tokenizer: {tokenizer}")

        @udf(return_dtype=DataType.int32())
        def word_count_wrapped(
            texts: pa.ChunkedArray,
            tokenizer: str,
        ) -> pa.Array:
            # Re-import inside the UDF body: the wrapped function may be
            # serialized and executed remotely, where the module-level lazy
            # `pa` proxy is not guaranteed to be in scope.
            import pyarrow as pa

            if tokenizer == "cjk":
                tokens = dedup_utils.utf8_split_mixed(texts)
            else:
                # Defensive duplicate of the eager check above, in case the
                # UDF is ever invoked directly with a different argument.
                raise ValueError(f"Invalid tokenizer: {tokenizer}")
            return pa.array([len(words) for words in tokens], pa.int32())

        return word_count_wrapped(texts=self._expr, tokenizer=tokenizer)