Source code for xpark.dataset.processors.text_mask

from __future__ import annotations

import logging
from functools import partial
from typing import TYPE_CHECKING, Any, Iterable

from xpark.dataset.constants import NOT_SET
from xpark.dataset.datatype import DataType
from xpark.dataset.expressions import BatchColumnClassProtocol, udf
from xpark.dataset.import_utils import lazy_import
from xpark.dataset.utils import (
    LabelSpec,
    LLMChatCompletions,
    _format_labels,
    format_prompt,
    reject_cascade_params,
    skip_empty_texts,
)

if TYPE_CHECKING:
    import pyarrow as pa
    from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
else:
    openai = lazy_import("openai")
    pa = lazy_import("pyarrow", rename="pa")

# Module logger; note it is registered under the fixed name "ray" rather than ``__name__``.
logger = logging.getLogger("ray")

# System-prompt section describing the assistant's role and the layout of the labels.
# Passed to ``format_prompt`` as ``roles_and_tasks`` in ``build_prompt``.
# Prompt adapted from Apache Doris (ai_mask.h):
# https://github.com/apache/doris/blob/4.0.2-rc01/be/src/vec/functions/ai/ai_mask.h
_ROLE_AND_TASK_PROMPT = (
    "You are a data privacy assistant. You will identify and mask sensitive information "
    "in the user's input according to the provided labels. "
    "The user will provide `labels` and `input_text`. For each label, you must hide all related "
    "information in the input text. "
    "Labels are organized into multiple lines, with each line containing two fields: label and description, separated by a space. "
    "They represent the label name and the description of the content respectively. The description may be empty."
)

# System-prompt section fixing the output contract: the literal "[MASKED]" placeholder
# and no text other than the masked input. Passed to ``format_prompt`` as ``response_format``.
_RESPONSE_FORMAT_PROMPT = (
    'Replace every piece of sensitive information with the literal placeholder "[MASKED]" '
    "(uppercase, inside square brackets, exactly as written).\n"
    "Output only the masked text, with no preamble, reasoning, or trailing notes."
)

# Wrapper for the rendered labels; ``build_prompt`` fills ``{labels}`` with the output of
# ``_format_labels`` and passes the result to ``format_prompt`` as ``extra``.
_LABELS_BLOCK = "<labels>\n{labels}\n</labels>"

# User-message template; the single positional ``{}`` receives the text to mask.
# The leading and trailing newlines are part of the string sent to the model.
PROMPT_TEMPLATE = """
<input_text>
{}
</input_text>
"""


def build_prompt(
    specs: list[LabelSpec],
    text: str,
    hint: str | list[str] | None = None,
) -> Iterable[ChatCompletionMessageParam]:
    """Assemble the chat messages that ask the model to mask one text.

    Args:
        specs: Label specifications; rendered with ``_format_labels`` and wrapped
            in a ``<labels>`` block that becomes part of the system prompt.
        text: The text to mask. It is converted with ``str()`` and placed inside
            the ``<input_text>`` block of the user message.
        hint: Optional extra guidance, forwarded unchanged to ``format_prompt``.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    # Imported here rather than at module level: the module only binds ``openai``
    # through ``lazy_import`` at runtime, so these names are not otherwise in scope.
    from openai.types.chat.chat_completion_message_param import (
        ChatCompletionSystemMessageParam,
        ChatCompletionUserMessageParam,
    )

    labels_section = _LABELS_BLOCK.format(labels=_format_labels(specs))

    system_message = ChatCompletionSystemMessageParam(
        role="system",
        content=format_prompt(
            roles_and_tasks=_ROLE_AND_TASK_PROMPT,
            response_format=_RESPONSE_FORMAT_PROMPT,
            hint=hint,
            extra=labels_section,
        ),
    )
    user_message = ChatCompletionUserMessageParam(
        role="user",
        content=PROMPT_TEMPLATE.format(str(text)),
    )

    return [system_message, user_message]



[docs]
@udf(return_dtype=DataType.string())
class TextMask(BatchColumnClassProtocol):
    """Replace sensitive information in text with ``[MASKED]`` according to the given labels.

    Each input text is sent to a remote LLM together with the labels; the model is
    instructed to return the text with every matching piece of information replaced
    by the literal placeholder ``[MASKED]``.

    Args:
        labels: The labels to mask. Must be a non-empty list in one of two formats:

            - ``list[str]``: plain label names, e.g. ``["email", "phone_num"]``
            - ``list[dict]``: dicts with ``"label"`` (required) and ``"description"`` (optional),
              e.g. ``[{"label": "email", "description": "email address"}]``

            Descriptions are injected into the prompt to guide the model when label names alone
            are ambiguous. A single label must still be wrapped in a list.
        base_url: The base URL of the LLM server.
        model: The request model name.
        api_key: The request API key.
        max_qps: The maximum query-per-second rate for remote LLM requests.
        max_concurrency: The maximum number of in-flight remote LLM requests allowed concurrently.
        max_retries: The maximum number of retries per request in the event of failures.
            We retry with exponential backoff up to this maximum number of retries.
        fallback_response: The response value to return when the LLM request fails.
            If set to None, the exception will be raised instead.
        hint: Optional extra instructions or constraints to guide the model (e.g. domain-specific
            masking rules, which label takes precedence on overlap, or how to handle partial
            matches). Accepts either a single string or a list of strings, where each item is one
            hint written in plain text. Passing a list is recommended — use one string per hint.
        **kwargs: Keyword arguments to pass to the `openai.AsyncClient.chat.completions.create
            <https://github.com/openai/openai-python/blob/main/src/openai/resources/chat/completions/completions.py>`_ API.

    Raises:
        ValueError: If ``labels`` is empty.
        TypeError: If ``labels`` is a bare ``str`` or ``dict`` instead of a list.

    Examples:
        .. code-block:: python

            import os

            from xpark.dataset.expressions import col
            from xpark.dataset import TextMask, from_items

            ds = from_items(["My email is rarity@example.com and my phone is 123-456-7890"])

            # Plain labels
            ds = ds.with_column(
                "masked_text",
                TextMask(
                    ["email", "phone_num"],
                    model="deepseek-v3-0324",
                    base_url=os.getenv("LLM_ENDPOINT"),
                    api_key=os.getenv("LLM_API_KEY"),
                )
                .options(num_workers={"IO": 1}, batch_size=1)
                .with_column(col("item")),
            )

            # Labels with descriptions
            ds = ds.with_column(
                "masked_text",
                TextMask(
                    [
                        {"label": "email", "description": "email address"},
                        {"label": "phone_num", "description": "phone number"},
                    ],
                    model="deepseek-v3-0324",
                    base_url=os.getenv("LLM_ENDPOINT"),
                    api_key=os.getenv("LLM_API_KEY"),
                )
                .options(num_workers={"IO": 1}, batch_size=1)
                .with_column(col("item")),
            )

            print(ds.take_all())

    """

    def __init__(
        self,
        labels: list[str | dict[str, str]],
        /,
        *,
        base_url: str,
        model: str,
        api_key: str = NOT_SET,
        max_qps: int | None = None,
        max_concurrency: int | None = None,
        max_retries: int = 0,
        fallback_response: str | None = None,
        hint: str | list[str] | None = None,
        **kwargs: Any,
    ):
        """Validate the labels and configure the chat-completions client.

        See the class docstring for the meaning of each argument.
        """
        # Checks the extra keyword arguments before anything else is set up.
        reject_cascade_params("TextMask", kwargs)
        if not labels:
            raise ValueError("labels must not be empty")
        # A bare str would be iterated character by character and a bare dict key by
        # key, silently producing nonsense labels; fail fast with a clear message.
        if isinstance(labels, (str, dict)):
            raise TypeError(
                "labels must be a list of label names or label dicts, "
                f"got a bare {type(labels).__name__}; wrap a single label in a list"
            )
        # Normalise both accepted item formats into LabelSpec instances.
        self.specs = [
            LabelSpec.model_validate({"label": item} if isinstance(item, str) else item)
            for item in labels
        ]
        self.hint = hint

        self.model = LLMChatCompletions(
            base_url=base_url,
            model=model,
            api_key=api_key,
            max_qps=max_qps,
            max_concurrency=max_concurrency,
            max_retries=max_retries,
            fallback_response=fallback_response,
            response_format="text",
            **kwargs,
        )

    @skip_empty_texts
    async def __call__(self, texts: pa.ChunkedArray) -> pa.Array:
        """Mask a batch of texts.

        Args:
            texts: The input texts for this batch.

        Returns:
            The result of ``LLMChatCompletions.batch_generate`` for ``texts``, using
            prompts built from this instance's labels and hint.
        """
        # Bind the labels and hint so batch_generate only has to supply each text.
        return await self.model.batch_generate(
            texts=texts,
            build_prompt=partial(build_prompt, self.specs, hint=self.hint),
        )