nv_ingest_client.primitives.tasks package#

Submodules#

nv_ingest_client.primitives.tasks.audio_extraction module#

class nv_ingest_client.primitives.tasks.audio_extraction.AudioExtractionTask(
auth_token: str = None,
grpc_endpoint: str = None,
http_endpoint: str = None,
infer_protocol: str = None,
function_id: str | None = None,
use_ssl: bool = None,
ssl_cert: str = None,
segment_audio: bool = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.caption module#

class nv_ingest_client.primitives.tasks.caption.CaptionTask(
api_key: str = None,
endpoint_url: str = None,
prompt: str = None,
model_name: str = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.chart_extraction module#

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionTask(params: dict = None)[source]#

Bases: Task

Object for chart extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.dedup module#

class nv_ingest_client.primitives.tasks.dedup.DedupTask(
content_type: Literal['image'] = 'image',
filter: bool = False,
)[source]#

Bases: Task

Object for document dedup task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.embed module#

class nv_ingest_client.primitives.tasks.embed.EmbedTask(
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
text: bool | None = None,
tables: bool | None = None,
filter_errors: bool = False,
text_elements_modality: str | None = None,
image_elements_modality: str | None = None,
structured_elements_modality: str | None = None,
audio_elements_modality: str | None = None,
)[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns:

A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).

Return type:

Dict[str, Any]

nv_ingest_client.primitives.tasks.extract module#

class nv_ingest_client.primitives.tasks.extract.ExtractTask(
document_type,
extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None,
extract_text: bool = False,
extract_images: bool = False,
extract_tables: bool = False,
extract_charts: bool | None = None,
extract_audio_params: Dict[str, Any] | None = None,
extract_images_method: Literal['group', 'yolox'] = 'group',
extract_images_params: Dict[str, Any] | None = None,
extract_tables_method: Literal['yolox', 'paddle'] = 'yolox',
extract_infographics: bool = False,
extract_page_as_image: bool = False,
text_depth: str = 'document',
paddle_output_format: str = 'pseudo_markdown',
table_output_format: str = 'pseudo_markdown',
)[source]#

Bases: Task

Object for document extraction task

property document_type#
to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.filter module#

class nv_ingest_client.primitives.tasks.filter.FilterTask(
content_type: Literal['image'] = 'image',
min_size: int = 128,
max_aspect_ratio: int | float = 5.0,
min_aspect_ratio: int | float = 0.2,
filter: bool = False,
)[source]#

Bases: Task

Object for document filter task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.infographic_extraction module#

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionTask(params: dict = None)[source]#

Bases: Task

Object for infographic extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.split module#

class nv_ingest_client.primitives.tasks.split.SplitTask(
tokenizer: str = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = None,
)[source]#

Bases: Task

Object for document splitting task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.store module#

class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict = None, **extra_params)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.store.StoreTask(
structured: bool = True,
images: bool = False,
store_method: Literal['minio'] = None,
params: dict = None,
**extra_params,
)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis (fixme)

nv_ingest_client.primitives.tasks.table_extraction module#

pydantic model nv_ingest_client.primitives.tasks.table_extraction.TableExtractionSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "TableExtractionSchema",
   "type": "object",
   "properties": {},
   "additionalProperties": false
}

Config:
  • extra: str = forbid

class Config[source]#

Bases: object

extra = 'forbid'#
class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.task_base module#

class nv_ingest_client.primitives.tasks.task_base.Task[source]#

Bases: object

Generic task Object

to_dict() Dict[source]#

Returns a dictionary with the task specification. This dictionary is used for constructing tasks that are then submitted to the redis client.

class nv_ingest_client.primitives.tasks.task_base.TaskType(*values)[source]#

Bases: Enum

CAPTION = 1#
CHART_DATA_EXTRACT = 2#
DEDUP = 3#
EMBED = 4#
EXTRACT = 5#
FILTER = 6#
INFOGRAPHIC_DATA_EXTRACT = 7#
SPLIT = 8#
STORE = 9#
STORE_EMBEDDING = 10#
TABLE_DATA_EXTRACT = 11#
TRANSFORM = 12#
UDF = 13#
VDB_UPLOAD = 14#
nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:

task_type_str (str) – The string to check against the TaskType enum values.

Returns:

True if the string is a valid TaskType enum value, False otherwise.

Return type:

bool

nv_ingest_client.primitives.tasks.task_factory module#

class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#

Bases: Task

Placeholder for unimplemented tasks

nv_ingest_client.primitives.tasks.task_factory.task_factory(
task_type: TaskType | str,
**kwargs,
) Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:
  • task_type (TaskType | str) – The type of the task to create.

  • **kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.

nv_ingest_client.primitives.tasks.udf module#

class nv_ingest_client.primitives.tasks.udf.UDFTask(
udf_function: str | None = None,
udf_function_name: str | None = None,
phase: PipelinePhase | int | str | None = PipelinePhase.RESPONSE,
target_stage: str | None = None,
run_before: bool = False,
run_after: bool = False,
)[source]#

Bases: Task

User-Defined Function (UDF) task for custom processing logic.

This task allows users to provide custom Python functions that will be executed during the ingestion pipeline. The UDF function must accept a control_message parameter and return an IngestControlMessage.

Supports four UDF function specification formats:

1. Inline function string: ‘def my_func(control_message): …’
2. Module path with colon: ‘my_module.my_submodule:my_function’ (preserves imports)
3. File path: ‘/path/to/file.py:my_function’
4. Legacy import path: ‘my_module.my_function’ (function name only, no imports)

property phase: PipelinePhase#

Returns the pipeline phase for this UDF task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

property udf_function: str | None#

Returns the UDF function string or specification.

property udf_function_name: str | None#

Returns the UDF function name.

nv_ingest_client.primitives.tasks.vdb_upload module#

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask(
filter_errors: bool = False,
bulk_ingest: bool = False,
bulk_ingest_path: str = 'embeddings/',
params: dict = None,
)[source]#

Bases: Task

Object for document embedding task

to_dict() Dict[source]#

Convert to a dict for submission to redis

pydantic model nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "VdbUploadTaskSchema",
   "type": "object",
   "properties": {
      "filter_errors": {
         "default": false,
         "title": "Filter Errors",
         "type": "boolean"
      },
      "bulk_ingest": {
         "default": false,
         "title": "Bulk Ingest",
         "type": "boolean"
      },
      "bulk_ingest_path": {
         "default": "",
         "title": "Bulk Ingest Path",
         "type": "string"
      },
      "params": {
         "additionalProperties": true,
         "default": null,
         "title": "Params",
         "type": "object"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field bulk_ingest: bool = False#
field bulk_ingest_path: str = ''#
field filter_errors: bool = False#
field params: dict = None#
class Config[source]#

Bases: object

extra = 'forbid'#

Module contents#

class nv_ingest_client.primitives.tasks.AudioExtractionTask(
auth_token: str = None,
grpc_endpoint: str = None,
http_endpoint: str = None,
infer_protocol: str = None,
function_id: str | None = None,
use_ssl: bool = None,
ssl_cert: str = None,
segment_audio: bool = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.CaptionTask(
api_key: str = None,
endpoint_url: str = None,
prompt: str = None,
model_name: str = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.ChartExtractionTask(params: dict = None)[source]#

Bases: Task

Object for chart extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.DedupTask(
content_type: Literal['image'] = 'image',
filter: bool = False,
)[source]#

Bases: Task

Object for document dedup task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.EmbedTask(
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
text: bool | None = None,
tables: bool | None = None,
filter_errors: bool = False,
text_elements_modality: str | None = None,
image_elements_modality: str | None = None,
structured_elements_modality: str | None = None,
audio_elements_modality: str | None = None,
)[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns:

A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).

Return type:

Dict[str, Any]

class nv_ingest_client.primitives.tasks.ExtractTask(
document_type,
extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None,
extract_text: bool = False,
extract_images: bool = False,
extract_tables: bool = False,
extract_charts: bool | None = None,
extract_audio_params: Dict[str, Any] | None = None,
extract_images_method: Literal['group', 'yolox'] = 'group',
extract_images_params: Dict[str, Any] | None = None,
extract_tables_method: Literal['yolox', 'paddle'] = 'yolox',
extract_infographics: bool = False,
extract_page_as_image: bool = False,
text_depth: str = 'document',
paddle_output_format: str = 'pseudo_markdown',
table_output_format: str = 'pseudo_markdown',
)[source]#

Bases: Task

Object for document extraction task

property document_type#
to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.FilterTask(
content_type: Literal['image'] = 'image',
min_size: int = 128,
max_aspect_ratio: int | float = 5.0,
min_aspect_ratio: int | float = 0.2,
filter: bool = False,
)[source]#

Bases: Task

Object for document filter task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.InfographicExtractionTask(params: dict = None)[source]#

Bases: Task

Object for infographic extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.SplitTask(
tokenizer: str = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = None,
)[source]#

Bases: Task

Object for document splitting task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict = None, **extra_params)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.StoreTask(
structured: bool = True,
images: bool = False,
store_method: Literal['minio'] = None,
params: dict = None,
**extra_params,
)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.Task[source]#

Bases: object

Generic task Object

to_dict() Dict[source]#

Returns a dictionary with the task specification. This dictionary is used for constructing tasks that are then submitted to the redis client.

class nv_ingest_client.primitives.tasks.TaskType(*values)[source]#

Bases: Enum

CAPTION = 1#
CHART_DATA_EXTRACT = 2#
DEDUP = 3#
EMBED = 4#
EXTRACT = 5#
FILTER = 6#
INFOGRAPHIC_DATA_EXTRACT = 7#
SPLIT = 8#
STORE = 9#
STORE_EMBEDDING = 10#
TABLE_DATA_EXTRACT = 11#
TRANSFORM = 12#
UDF = 13#
VDB_UPLOAD = 14#
class nv_ingest_client.primitives.tasks.UDFTask(
udf_function: str | None = None,
udf_function_name: str | None = None,
phase: PipelinePhase | int | str | None = PipelinePhase.RESPONSE,
target_stage: str | None = None,
run_before: bool = False,
run_after: bool = False,
)[source]#

Bases: Task

User-Defined Function (UDF) task for custom processing logic.

This task allows users to provide custom Python functions that will be executed during the ingestion pipeline. The UDF function must accept a control_message parameter and return an IngestControlMessage.

Supports four UDF function specification formats:

1. Inline function string: ‘def my_func(control_message): …’
2. Module path with colon: ‘my_module.my_submodule:my_function’ (preserves imports)
3. File path: ‘/path/to/file.py:my_function’
4. Legacy import path: ‘my_module.my_function’ (function name only, no imports)

property phase: PipelinePhase#

Returns the pipeline phase for this UDF task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

property udf_function: str | None#

Returns the UDF function string or specification.

property udf_function_name: str | None#

Returns the UDF function name.

nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:

task_type_str (str) – The string to check against the TaskType enum values.

Returns:

True if the string is a valid TaskType enum value, False otherwise.

Return type:

bool

nv_ingest_client.primitives.tasks.task_factory(
task_type: TaskType | str,
**kwargs,
) Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:
  • task_type (TaskType | str) – The type of the task to create.

  • **kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.