# Source code for xpark.dataset.namespace_expressions.string_namespace
"""String namespace for expression operations on string-typed columns."""
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
from ray.data.namespace_expressions.string_namespace import _StringNamespace as RayStringNamespace
from xpark.dataset.datatype import DataType
from xpark.dataset.expressions import udf
from xpark.dataset.filters.dedup import utils as dedup_utils
from xpark.dataset.import_utils import lazy_import
if TYPE_CHECKING:
import pyarrow as pa
from ray.data.expressions import UDFExpr
else:
pa = lazy_import("pyarrow", rename="pa")
@dataclass
class _StringNamespace(RayStringNamespace):
    """Namespace for string operations on expression columns.

    This namespace provides methods for operating on string-typed columns using
    PyArrow compute functions.

    Example:
        >>> from xpark.dataset.expressions import col
        >>> # Convert to uppercase
        >>> expr = col("name").str.upper()
        >>> # Get string length
        >>> expr = col("name").str.len()
        >>> # Check if string starts with a prefix
        >>> expr = col("name").str.starts_with("A")
    """

    def word_count(self, tokenizer: str = "cjk") -> UDFExpr:
        """Count words in texts using the specified tokenizer.

        Args:
            tokenizer: The tokenizer type to use for word segmentation.
                Defaults to "cjk" for Chinese-Japanese-Korean text processing.
                Currently "cjk" is the only supported value.

        Returns:
            A UDF expression that evaluates to a per-row int32 word count.

        Raises:
            ValueError: If ``tokenizer`` is not a supported tokenizer name
                (raised when the expression is evaluated, not at call time).

        Examples:
            .. code-block:: python

                from xpark.dataset import from_items
                from xpark.dataset.expressions import col

                ds = from_items(["Hello world", "This is a test"])
                ds = ds.with_column(
                    "word_count",
                    col("text").str.word_count(tokenizer="cjk"),
                )
                print(ds.take_all())
        """

        @udf(return_dtype=DataType.int32())
        def word_count_wrapped(
            texts: pa.ChunkedArray,
            tokenizer: str,
        ) -> pa.Array:
            # Local import: module-level ``pa`` is a lazy proxy at runtime,
            # so bind the real pyarrow module here for pa.array/pa.int32.
            import pyarrow as pa

            if tokenizer == "cjk":
                # utf8_split_mixed presumably returns one token list per
                # input row — inferred from usage below; confirm in
                # dedup_utils if extending.
                tokens = dedup_utils.utf8_split_mixed(texts)
            else:
                raise ValueError(f"Invalid tokenizer: {tokenizer}")
            # One int32 count per row: the number of tokens produced.
            return pa.array([len(words) for words in tokens], pa.int32())

        # Bind this column's underlying expression as the UDF's input.
        return word_count_wrapped(texts=self._expr, tokenizer=tokenizer)