# Source code for nv_ingest_client.primitives.tasks.split

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


# pylint: disable=too-few-public-methods
# pylint: disable=too-many-arguments

import logging
from typing import Dict
from typing import Optional

from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema

from .task_base import Task

logger = logging.getLogger(__name__)



class SplitTask(Task):
    """
    Object for document splitting task.

    The constructor arguments are passed through ``IngestTaskSplitSchema``;
    the field values read back from that schema object are what the task
    stores and later reports through ``__str__`` and ``to_dict``.
    """

    def __init__(
        self,
        tokenizer: Optional[str] = None,
        chunk_size: int = 1024,
        chunk_overlap: int = 150,
        params: Optional[dict] = None,
    ):
        """
        Setup Split Task Config.

        Parameters
        ----------
        tokenizer : str, optional
            Tokenizer value forwarded to the schema. Defaults to ``None``.
        chunk_size : int
            Chunk size forwarded to the schema. Defaults to 1024.
        chunk_overlap : int
            Chunk overlap forwarded to the schema. Defaults to 150.
        params : dict, optional
            Additional parameters forwarded to the schema. ``None`` is
            replaced by an empty dict before the schema is built.

        Notes
        -----
        Any error raised while constructing ``IngestTaskSplitSchema`` is not
        caught here and propagates to the caller.
        """
        super().__init__()

        # Handle None params by converting to empty dict for backward compatibility
        if params is None:
            params = {}

        # Use the API schema for validation
        validated = IngestTaskSplitSchema(
            tokenizer=tokenizer,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            params=params,
        )

        # Keep the values as reported by the schema object rather than the raw arguments.
        self._tokenizer = validated.tokenizer
        self._chunk_size = validated.chunk_size
        self._chunk_overlap = validated.chunk_overlap
        self._params = validated.params

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state.

        The text starts with a ``Split Task:`` header, followed by one indented
        ``name: value`` line for the tokenizer, chunk size and chunk overlap,
        then one line per entry of the stored params. Every line ends with a
        newline.
        """
        lines = [
            "Split Task:\n",
            f"  tokenizer: {self._tokenizer}\n",
            f"  chunk_size: {self._chunk_size}\n",
            f"  chunk_overlap: {self._chunk_overlap}\n",
        ]
        lines.extend(f"  {key}: {value}\n" for key, value in self._params.items())
        return "".join(lines)

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis.

        Returns
        -------
        Dict
            ``{"type": "split", "task_properties": {...}}`` where the
            properties hold ``tokenizer``, ``chunk_size``, ``chunk_overlap``
            and ``params``, each included only when its stored value is not
            ``None``.
        """
        # Ordered (key, value) pairs; the order fixes the key order of the output.
        candidates = (
            ("tokenizer", self._tokenizer),
            ("chunk_size", self._chunk_size),
            ("chunk_overlap", self._chunk_overlap),
            ("params", self._params),
        )
        split_params = {key: value for key, value in candidates if value is not None}

        return {"type": "split", "task_properties": split_params}