# Source code for nv_ingest_client.primitives.tasks.dedup

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


# pylint: disable=too-few-public-methods
# pylint: disable=too-many-arguments

import logging
from typing import Dict
from typing import Literal

from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema

from .task_base import Task

logger = logging.getLogger(__name__)


class DedupTask(Task):
    """
    Object for document dedup task

    The constructor arguments are validated through ``IngestTaskDedupSchema``
    and the validated values are kept on the instance; ``to_dict`` turns them
    into the task dictionary used for submission.
    """

    # Only "image" content is accepted by the type annotation of ``__init__``.
    _TypeContentType = Literal["image"]

    def __init__(
        self,
        content_type: _TypeContentType = "image",
        filter: bool = False,  # NOTE: shadows the builtin; name kept for API compatibility
    ) -> None:
        """
        Setup Dedup Task Config

        Parameters
        ----------
        content_type : Literal["image"]
            Content type the dedup task applies to. Defaults to ``"image"``.
        filter : bool
            Value forwarded to the schema as ``params["filter"]``.
            Defaults to ``False``.

        Notes
        -----
        Any error raised by ``IngestTaskDedupSchema`` for invalid input is not
        caught here and propagates to the caller.
        """
        super().__init__()

        # Use the API schema for validation
        validated_data = IngestTaskDedupSchema(
            content_type=content_type,
            params={"filter": filter},
        )

        # Store the values as returned by the schema, not the raw arguments.
        # NOTE(review): the validated content_type is accessed via ``.value``
        # below, so it is presumably an enum member -- confirm against the schema.
        self._content_type = validated_data.content_type
        self._filter = validated_data.params.filter

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state
        """
        return (
            "Dedup Task:\n"
            f" content_type: {self._content_type.value}\n"
            f" filter: {self._filter}\n"
        )

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis

        Returns
        -------
        Dict
            ``{"type": "dedup", "task_properties": {...}}`` where the task
            properties hold the content type's ``.value`` and a ``params``
            dict containing the ``filter`` flag.
        """
        return {
            "type": "dedup",
            "task_properties": {
                "content_type": self._content_type.value,
                "params": {"filter": self._filter},
            },
        }