nv_ingest_client.primitives.tasks package#

Submodules#

nv_ingest_client.primitives.tasks.audio_extraction module#

class nv_ingest_client.primitives.tasks.audio_extraction.AudioExtractionTask( auth_token: str = None, grpc_endpoint: str = None, http_endpoint: str = None, infer_protocol: str = None, function_id: str | None = None, use_ssl: bool = None, ssl_cert: str = None, segment_audio: bool = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.caption module#

class nv_ingest_client.primitives.tasks.caption.CaptionTask( api_key: str = None, endpoint_url: str = None, prompt: str = None, model_name: str = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.chart_extraction module#

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionTask(params: dict = None)[source]#

Bases: Task

Object for chart extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.dedup module#

class nv_ingest_client.primitives.tasks.dedup.DedupTask( content_type: Literal['image'] = 'image', filter: bool = False, )[source]#

Bases: Task

Object for document dedup task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.embed module#

class nv_ingest_client.primitives.tasks.embed.EmbedTask( endpoint_url: str | None = None, model_name: str | None = None, api_key: str | None = None, text: bool | None = None, tables: bool | None = None, filter_errors: bool = False, text_elements_modality: str | None = None, image_elements_modality: str | None = None, structured_elements_modality: str | None = None, audio_elements_modality: str | None = None, )[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() → Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns: A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).
Return type: Dict[str, Any]

nv_ingest_client.primitives.tasks.extract module#

class nv_ingest_client.primitives.tasks.extract.ExtractTask( document_type, extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None, extract_text: bool = False, extract_images: bool = False, extract_tables: bool = False, extract_charts: bool | None = None, extract_audio_params: Dict[str, Any] | None = None, extract_images_method: Literal['group', 'yolox'] = 'group', extract_images_params: Dict[str, Any] | None = None, extract_tables_method: Literal['yolox', 'paddle'] = 'yolox', extract_infographics: bool = False, extract_page_as_image: bool = False, text_depth: str = 'document', paddle_output_format: str = 'pseudo_markdown', table_output_format: str = 'pseudo_markdown', )[source]#

Bases: Task

Object for document extraction task

property document_type#

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.filter module#

class nv_ingest_client.primitives.tasks.filter.FilterTask( content_type: Literal['image'] = 'image', min_size: int = 128, max_aspect_ratio: int | float = 5.0, min_aspect_ratio: int | float = 0.2, filter: bool = False, )[source]#

Bases: Task

Object for document filter task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.infographic_extraction module#

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionTask(params: dict = None)[source]#

Bases: Task

Object for infographic extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.split module#

class nv_ingest_client.primitives.tasks.split.SplitTask( tokenizer: str = None, chunk_size: int = 1024, chunk_overlap: int = 150, params: dict = None, )[source]#

Bases: Task

Object for document splitting task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.store module#

class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.store.StoreTask(

structured: bool = True,

images: bool = False,

store_method: Literal['minio'] = None,

params: dict = None,

**extra_params,

)[source]#

Bases: Task

Object for image storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

nv_ingest_client.primitives.tasks.table_extraction module#

pydantic model nv_ingest_client.primitives.tasks.table_extraction.TableExtractionSchema[source]#

Bases: BaseModel

Show JSON schema

{
   "title": "TableExtractionSchema",
   "type": "object",
   "properties": {},
   "additionalProperties": false
}

Config:

extra: str = forbid

class Config[source]#

Bases: object

extra = 'forbid'#

class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.task_base module#

class nv_ingest_client.primitives.tasks.task_base.Task[source]#

Bases: object

Generic task Object

to_dict() → Dict[source]#: Returns a dictionary with the task specification. This dictionary is used for constructing tasks that are then submitted to the Redis client.

class nv_ingest_client.primitives.tasks.task_base.TaskType(*values)[source]#

Bases: Enum

CAPTION = 1#

CHART_DATA_EXTRACT = 2#

DEDUP = 3#

EMBED = 4#

EXTRACT = 5#

FILTER = 6#

INFOGRAPHIC_DATA_EXTRACT = 7#

SPLIT = 8#

STORE = 9#

STORE_EMBEDDING = 10#

TABLE_DATA_EXTRACT = 11#

TRANSFORM = 12#

UDF = 13#

VDB_UPLOAD = 14#

nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) → bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters: task_type_str (str) – The string to check against the TaskType enum values.
Returns: True if the string is a valid TaskType enum value, False otherwise.
Return type: bool

nv_ingest_client.primitives.tasks.task_factory module#

class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#

Bases: Task

Placeholder for unimplemented tasks

nv_ingest_client.primitives.tasks.task_factory.task_factory(

task_type: TaskType | str,

**kwargs,

) → Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:

task_type (TaskType | str) – The type of the task to create, given either as a TaskType member or as a string.
**kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.

nv_ingest_client.primitives.tasks.udf module#

Bases: Task

User-Defined Function (UDF) task for custom processing logic.

This task allows users to provide custom Python functions that will be executed during the ingestion pipeline. The UDF function must accept a control_message parameter and return an IngestControlMessage.

Supports four UDF function specification formats:

1. Inline function string: ‘def my_func(control_message): …’
2. Module path with colon: ‘my_module.my_submodule:my_function’ (preserves imports)
3. File path: ‘/path/to/file.py:my_function’
4. Legacy import path: ‘my_module.my_function’ (function name only, no imports)

property phase: PipelinePhase#: Returns the pipeline phase for this UDF task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis

property udf_function: str | None#: Returns the UDF function string or specification.

property udf_function_name: str | None#: Returns the UDF function name.

nv_ingest_client.primitives.tasks.vdb_upload module#

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask( filter_errors: bool = False, bulk_ingest: bool = False, bulk_ingest_path: str = 'embeddings/', params: dict = None, )[source]#

Bases: Task

Object for vector database (VDB) upload task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

pydantic model nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema[source]#

Bases: BaseModel

Show JSON schema

{
   "title": "VdbUploadTaskSchema",
   "type": "object",
   "properties": {
      "filter_errors": {
         "default": false,
         "title": "Filter Errors",
         "type": "boolean"
      },
      "bulk_ingest": {
         "default": false,
         "title": "Bulk Ingest",
         "type": "boolean"
      },
      "bulk_ingest_path": {
         "default": "",
         "title": "Bulk Ingest Path",
         "type": "string"
      },
      "params": {
         "additionalProperties": true,
         "default": null,
         "title": "Params",
         "type": "object"
      }
   },
   "additionalProperties": false
}

Config:

extra: str = forbid

Fields:

bulk_ingest (bool)
bulk_ingest_path (str)
filter_errors (bool)
params (dict)

field bulk_ingest: bool = False#

field bulk_ingest_path: str = ''#

field filter_errors: bool = False#

field params: dict = None#

class Config[source]#

Bases: object

extra = 'forbid'#

Module contents#

class nv_ingest_client.primitives.tasks.AudioExtractionTask( auth_token: str = None, grpc_endpoint: str = None, http_endpoint: str = None, infer_protocol: str = None, function_id: str | None = None, use_ssl: bool = None, ssl_cert: str = None, segment_audio: bool = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.CaptionTask( api_key: str = None, endpoint_url: str = None, prompt: str = None, model_name: str = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.ChartExtractionTask(params: dict = None)[source]#

Bases: Task

Object for chart extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.DedupTask( content_type: Literal['image'] = 'image', filter: bool = False, )[source]#

Bases: Task

Object for document dedup task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.EmbedTask( endpoint_url: str | None = None, model_name: str | None = None, api_key: str | None = None, text: bool | None = None, tables: bool | None = None, filter_errors: bool = False, text_elements_modality: str | None = None, image_elements_modality: str | None = None, structured_elements_modality: str | None = None, audio_elements_modality: str | None = None, )[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() → Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns: A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).
Return type: Dict[str, Any]

class nv_ingest_client.primitives.tasks.ExtractTask( document_type, extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None, extract_text: bool = False, extract_images: bool = False, extract_tables: bool = False, extract_charts: bool | None = None, extract_audio_params: Dict[str, Any] | None = None, extract_images_method: Literal['group', 'yolox'] = 'group', extract_images_params: Dict[str, Any] | None = None, extract_tables_method: Literal['yolox', 'paddle'] = 'yolox', extract_infographics: bool = False, extract_page_as_image: bool = False, text_depth: str = 'document', paddle_output_format: str = 'pseudo_markdown', table_output_format: str = 'pseudo_markdown', )[source]#

Bases: Task

Object for document extraction task

property document_type#

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.FilterTask( content_type: Literal['image'] = 'image', min_size: int = 128, max_aspect_ratio: int | float = 5.0, min_aspect_ratio: int | float = 0.2, filter: bool = False, )[source]#

Bases: Task

Object for document filter task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.InfographicExtractionTask(params: dict = None)[source]#

Bases: Task

Object for infographic extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.SplitTask( tokenizer: str = None, chunk_size: int = 1024, chunk_overlap: int = 150, params: dict = None, )[source]#

Bases: Task

Object for document splitting task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.StoreTask(

structured: bool = True,

images: bool = False,

store_method: Literal['minio'] = None,

params: dict = None,

**extra_params,

)[source]#

Bases: Task

Object for image storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.Task[source]#

Bases: object

Generic task Object

to_dict() → Dict[source]#: Returns a dictionary with the task specification. This dictionary is used for constructing tasks that are then submitted to the Redis client.

class nv_ingest_client.primitives.tasks.TaskType(*values)[source]#

Bases: Enum

CAPTION = 1#

CHART_DATA_EXTRACT = 2#

DEDUP = 3#

EMBED = 4#

EXTRACT = 5#

FILTER = 6#

INFOGRAPHIC_DATA_EXTRACT = 7#

SPLIT = 8#

STORE = 9#

STORE_EMBEDDING = 10#

TABLE_DATA_EXTRACT = 11#

TRANSFORM = 12#

UDF = 13#

VDB_UPLOAD = 14#

Bases: Task

User-Defined Function (UDF) task for custom processing logic.

property phase: PipelinePhase#: Returns the pipeline phase for this UDF task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis

property udf_function: str | None#: Returns the UDF function string or specification.

property udf_function_name: str | None#: Returns the UDF function name.

nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) → bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters: task_type_str (str) – The string to check against the TaskType enum values.
Returns: True if the string is a valid TaskType enum value, False otherwise.
Return type: bool

nv_ingest_client.primitives.tasks.task_factory(

task_type: TaskType | str,

**kwargs,

) → Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:

task_type (TaskType | str) – The type of the task to create, given either as a TaskType member or as a string.
**kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.