# Source code for xpark.dataset.namespace_expressions.string_namespace

"""String namespace for expression operations on string-typed columns."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from ray.data.namespace_expressions.string_namespace import _StringNamespace as RayStringNamespace

from xpark.dataset.datatype import DataType
from xpark.dataset.expressions import udf
from xpark.dataset.filters.dedup import utils as dedup_utils
from xpark.dataset.import_utils import lazy_import

if TYPE_CHECKING:
    import pyarrow as pa
    from ray.data.expressions import UDFExpr
else:
    pa = lazy_import("pyarrow", rename="pa")


@dataclass
class _StringNamespace(RayStringNamespace):
    """Namespace for string operations on expression columns.

    This namespace provides methods for operating on string-typed columns
    using PyArrow compute functions.

    Example:
        >>> from xpark.dataset.expressions import col
        >>> # Convert to uppercase
        >>> expr = col("name").str.upper()
        >>> # Get string length
        >>> expr = col("name").str.len()
        >>> # Check if string starts with a prefix
        >>> expr = col("name").str.starts_with("A")
    """

    def word_count(self, tokenizer: str = "cjk") -> UDFExpr:
        """Count words in texts using the specified tokenizer.

        Args:
            tokenizer: The tokenizer type to use for word segmentation.
                Defaults to "cjk" for Chinese-Japanese-Korean text
                processing. Currently "cjk" is the only supported value.

        Returns:
            A UDF expression producing an int32 column with the per-row
            word count.

        Raises:
            ValueError: If ``tokenizer`` is not a supported tokenizer name.
                Raised eagerly when the expression is built.

        Examples:
            .. code-block:: python

                from xpark.dataset import from_items
                from xpark.dataset.expressions import col

                ds = from_items(["Hello world", "This is a test"])
                ds = ds.with_column(
                    "word_count",
                    col("text").str.word_count(tokenizer="cjk"),
                )
                print(ds.take_all())
        """
        # Fail fast: reject an unknown tokenizer at expression-build time
        # rather than deferring the error to UDF execution on the workers.
        if tokenizer != "cjk":
            raise ValueError(f"Invalid tokenizer: {tokenizer}")

        @udf(return_dtype=DataType.int32())
        def word_count_wrapped(
            texts: pa.ChunkedArray,
            tokenizer: str,
        ) -> pa.Array:
            # Re-import inside the UDF body: the wrapped function may be
            # serialized and executed remotely, where the module-level lazy
            # `pa` proxy is not guaranteed to be in scope.
            import pyarrow as pa

            if tokenizer == "cjk":
                tokens = dedup_utils.utf8_split_mixed(texts)
            else:
                # Defensive duplicate of the eager check above, in case the
                # UDF is ever invoked directly with a different argument.
                raise ValueError(f"Invalid tokenizer: {tokenizer}")
            return pa.array([len(words) for words in tokens], pa.int32())

        return word_count_wrapped(texts=self._expr, tokenizer=tokenizer)