nv_ingest_client.primitives.tasks package#
Submodules#
nv_ingest_client.primitives.tasks.audio_extraction module#
- class nv_ingest_client.primitives.tasks.audio_extraction.AudioExtractionTask(
- auth_token: str = None,
- grpc_endpoint: str = None,
- http_endpoint: str = None,
- infer_protocol: str = None,
- function_id: str | None = None,
- use_ssl: bool = None,
- ssl_cert: str = None,
- segment_audio: bool = None,
Bases:
Task
nv_ingest_client.primitives.tasks.caption module#
nv_ingest_client.primitives.tasks.chart_extraction module#
nv_ingest_client.primitives.tasks.dedup module#
nv_ingest_client.primitives.tasks.embed module#
- class nv_ingest_client.primitives.tasks.embed.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
- text_elements_modality: str | None = None,
- image_elements_modality: str | None = None,
- structured_elements_modality: str | None = None,
- audio_elements_modality: str | None = None,
Bases:
Task
Object for document embedding tasks.
This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.
nv_ingest_client.primitives.tasks.extract module#
- class nv_ingest_client.primitives.tasks.extract.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None,
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['group', 'yolox'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'paddle'] = 'yolox',
- extract_infographics: bool = False,
- extract_page_as_image: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
- table_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
nv_ingest_client.primitives.tasks.filter module#
nv_ingest_client.primitives.tasks.infographic_extraction module#
nv_ingest_client.primitives.tasks.split module#
nv_ingest_client.primitives.tasks.store module#
- class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
nv_ingest_client.primitives.tasks.table_extraction module#
nv_ingest_client.primitives.tasks.task_base module#
- class nv_ingest_client.primitives.tasks.task_base.TaskType(*values)[source]#
Bases:
Enum
- CAPTION = 1#
- CHART_DATA_EXTRACT = 2#
- DEDUP = 3#
- EMBED = 4#
- EXTRACT = 5#
- FILTER = 6#
- INFOGRAPHIC_DATA_EXTRACT = 7#
- SPLIT = 8#
- STORE = 9#
- STORE_EMBEDDING = 10#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 12#
- UDF = 13#
- VDB_UPLOAD = 14#
- nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) → bool [source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool
nv_ingest_client.primitives.tasks.task_factory module#
- class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#
Bases:
Task
Placeholder for unimplemented tasks
nv_ingest_client.primitives.tasks.udf module#
- class nv_ingest_client.primitives.tasks.udf.UDFTask(
- udf_function: str | None = None,
- udf_function_name: str | None = None,
- phase: PipelinePhase | int | str | None = PipelinePhase.RESPONSE,
- target_stage: str | None = None,
- run_before: bool = False,
- run_after: bool = False,
Bases:
Task
User-Defined Function (UDF) task for custom processing logic.
This task allows users to provide custom Python functions that will be executed during the ingestion pipeline. The UDF function must accept a control_message parameter and return an IngestControlMessage.
Supports four UDF function specification formats:
1. Inline function string: 'def my_func(control_message): ...'
2. Module path with colon: 'my_module.my_submodule:my_function' (preserves imports)
3. File path: '/path/to/file.py:my_function'
4. Legacy import path: 'my_module.my_function' (function name only, no imports)
- property phase: PipelinePhase#
Returns the pipeline phase for this UDF task.
- property udf_function: str | None#
Returns the UDF function string or specification.
- property udf_function_name: str | None#
Returns the UDF function name.
nv_ingest_client.primitives.tasks.vdb_upload module#
- class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask(
- filter_errors: bool = False,
- bulk_ingest: bool = False,
- bulk_ingest_path: str = 'embeddings/',
- params: dict = None,
Bases:
Task
Object for vector database (VDB) upload task
- pydantic model nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "VdbUploadTaskSchema", "type": "object", "properties": { "filter_errors": { "default": false, "title": "Filter Errors", "type": "boolean" }, "bulk_ingest": { "default": false, "title": "Bulk Ingest", "type": "boolean" }, "bulk_ingest_path": { "default": "", "title": "Bulk Ingest Path", "type": "string" }, "params": { "additionalProperties": true, "default": null, "title": "Params", "type": "object" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field bulk_ingest: bool = False#
- field bulk_ingest_path: str = ''#
- field filter_errors: bool = False#
- field params: dict = None#
Module contents#
- class nv_ingest_client.primitives.tasks.AudioExtractionTask(
- auth_token: str = None,
- grpc_endpoint: str = None,
- http_endpoint: str = None,
- infer_protocol: str = None,
- function_id: str | None = None,
- use_ssl: bool = None,
- ssl_cert: str = None,
- segment_audio: bool = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.CaptionTask(
- api_key: str = None,
- endpoint_url: str = None,
- prompt: str = None,
- model_name: str = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.ChartExtractionTask(params: dict = None)[source]#
Bases:
Task
Object for chart extraction task
- class nv_ingest_client.primitives.tasks.DedupTask(
- content_type: Literal['image'] = 'image',
- filter: bool = False,
Bases:
Task
Object for document dedup task
- class nv_ingest_client.primitives.tasks.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
- text_elements_modality: str | None = None,
- image_elements_modality: str | None = None,
- structured_elements_modality: str | None = None,
- audio_elements_modality: str | None = None,
Bases:
Task
Object for document embedding tasks.
This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.
- class nv_ingest_client.primitives.tasks.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = None,
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['group', 'yolox'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'paddle'] = 'yolox',
- extract_infographics: bool = False,
- extract_page_as_image: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
- table_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
- class nv_ingest_client.primitives.tasks.FilterTask(
- content_type: Literal['image'] = 'image',
- min_size: int = 128,
- max_aspect_ratio: int | float = 5.0,
- min_aspect_ratio: int | float = 0.2,
- filter: bool = False,
Bases:
Task
Object for document filter task
- class nv_ingest_client.primitives.tasks.InfographicExtractionTask(params: dict = None)[source]#
Bases:
Task
Object for infographic extraction task
- class nv_ingest_client.primitives.tasks.SplitTask(
- tokenizer: str = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = None,
Bases:
Task
Object for document splitting task
- class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
- class nv_ingest_client.primitives.tasks.StoreTask(
- structured: bool = True,
- images: bool = False,
- store_method: Literal['minio'] = None,
- params: dict = None,
- **extra_params,
Bases:
Task
Object for image storage task.
- class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#
Bases:
Task
Object for table extraction tasks
- class nv_ingest_client.primitives.tasks.TaskType(*values)[source]#
Bases:
Enum
- CAPTION = 1#
- CHART_DATA_EXTRACT = 2#
- DEDUP = 3#
- EMBED = 4#
- EXTRACT = 5#
- FILTER = 6#
- INFOGRAPHIC_DATA_EXTRACT = 7#
- SPLIT = 8#
- STORE = 9#
- STORE_EMBEDDING = 10#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 12#
- UDF = 13#
- VDB_UPLOAD = 14#
- class nv_ingest_client.primitives.tasks.UDFTask(
- udf_function: str | None = None,
- udf_function_name: str | None = None,
- phase: PipelinePhase | int | str | None = PipelinePhase.RESPONSE,
- target_stage: str | None = None,
- run_before: bool = False,
- run_after: bool = False,
Bases:
Task
User-Defined Function (UDF) task for custom processing logic.
This task allows users to provide custom Python functions that will be executed during the ingestion pipeline. The UDF function must accept a control_message parameter and return an IngestControlMessage.
Supports four UDF function specification formats:
1. Inline function string: 'def my_func(control_message): ...'
2. Module path with colon: 'my_module.my_submodule:my_function' (preserves imports)
3. File path: '/path/to/file.py:my_function'
4. Legacy import path: 'my_module.my_function' (function name only, no imports)
- property phase: PipelinePhase#
Returns the pipeline phase for this UDF task.
- property udf_function: str | None#
Returns the UDF function string or specification.
- property udf_function_name: str | None#
Returns the UDF function name.
- nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) → bool [source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool