Source code for nv_ingest_api.internal.schemas.meta.metadata_schema

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging
from datetime import datetime
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

from pydantic import field_validator, model_validator, Field

from nv_ingest_api.internal.enums.common import (
    AccessLevelEnum,
    ContentTypeEnum,
    TextTypeEnum,
    LanguageEnum,
    TableFormatEnum,
    StatusEnum,
    DocumentTypeEnum,
    TaskTypeEnum,
)
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
from nv_ingest_api.util.converters import datetools

logger = logging.getLogger(__name__)


# Sub schemas
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content and metadata is extracted.

    ``date_created`` and ``last_modified`` default to the current (naive, local)
    time in ISO 8601 format, computed when each instance is created. Explicitly
    supplied values are passed through ``datetools.validate_iso8601``.
    """

    source_name: str
    """The name of the source file."""

    source_id: str
    """The ID of the source file."""

    source_location: str = ""
    """The URL, URI, or pointer to the storage location of the source file."""

    source_type: Union[DocumentTypeEnum, str]
    """The type of the source file, such as pdf, docx, pptx, or txt."""

    collection_id: str = ""
    """The ID of the collection in which the source is contained."""

    # default_factory defers the clock read to instantiation time. A plain
    # ``= datetime.now().isoformat()`` default is evaluated once, when the class
    # body runs at import, and every instance would then share that stale value.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was created."""

    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was last modified."""

    summary: str = ""
    """A summary of the source."""

    partition_id: int = -1
    """The offset of this data fragment within a larger set of fragments."""

    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
    """The role-based access control for the source."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """
        Check an explicitly supplied ``date_created`` / ``last_modified`` value.

        The value is handed to ``datetools.validate_iso8601`` (presumably raising
        on malformed input -- confirm against that helper) and then returned
        unchanged.
        """
        datetools.validate_iso8601(field_value)
        return field_value
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Holds one group of extracted objects related to a piece of content.

    Every field is a list and defaults to a new empty list per instance.
    NOTE(review): the three lists look like parallel arrays (one entry per
    related object) -- confirm against the code that populates them.
    """

    # String payloads of the related objects.
    content: List[str] = Field(default_factory=list)

    # Tuples for the related objects; presumably bounding boxes -- confirm.
    bbox: List[tuple] = Field(default_factory=list)

    # String type labels for the related objects.
    type: List[str] = Field(default_factory=list)
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Groups related extracted objects by kind: text, images, and structured.

    Each field defaults to an empty ``NearbyObjectsSubSchema``.
    """

    # Related text objects.
    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()

    # Related image objects.
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()

    # Related structured objects.
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
class ContentHierarchySchema(BaseModelNoExt):
    """
    Schema for the extracted content hierarchy.

    All integer fields default to -1 (presumably meaning "not set" -- confirm
    with the producers of this schema).
    """

    # Integer position/count fields, each defaulting to -1.
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1

    # Related objects; defaults to an empty NearbyObjectsSchema.
    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
class ContentMetadataSchema(BaseModelNoExt):
    """
    Metadata describing a single piece of content extracted from a source.

    Only ``type`` is required; every other field has a default.
    """

    type: ContentTypeEnum
    """The type of the content. Text, Image, Structured, Table, or Chart."""

    description: str = ""
    """A text description of the content object."""

    page_number: int = -1
    """The page number of the content in the source."""

    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    """The location or order of the content within the source."""

    subtype: Union[ContentTypeEnum, str] = ""
    """The type of the content for structured data types, such as table or chart."""

    start_time: int = -1
    """The timestamp of the start of a piece of audio content."""

    end_time: int = -1
    """The timestamp of the end of a piece of audio content."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
class TextMetadataSchema(BaseModelNoExt):
    """
    Schema describing extracted text content.

    Only ``text_type`` is required; every other field has a default.
    """

    text_type: TextTypeEnum
    """The type of the text, such as header or body."""

    summary: str = ""
    """An abbreviated summary of the content."""

    keywords: Union[str, List[str], Dict] = ""
    """Keywords, named entities, or other phrases."""

    # NOTE(review): the default is the plain string "en", not a LanguageEnum
    # member; pydantic does not validate defaults unless configured to do so.
    language: LanguageEnum = "en"  # default to Unknown? Maybe do some kind of heuristic check
    """The language of the content."""

    text_location: tuple = (0, 0, 0, 0)
    """The bounding box of the text, in the format (x1,y1,x2,y2)."""

    text_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
class ImageMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted image content.

    Only ``image_type`` is required. Negative ``width`` / ``height`` values are
    not rejected; they are clamped to 0 and a warning is logged.
    """

    image_type: Union[DocumentTypeEnum, str]
    """The type of the image, such as structured, natural, hybrid, and others."""

    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    """The type of the content for structured data types, such as bar chart, pie chart, and others."""

    caption: str = ""
    """A caption or subheading associated with the image."""

    text: str = ""
    """Extracted text from a structured chart."""

    image_location: tuple = (0, 0, 0, 0)
    """The bounding box of the image, in the format (x1,y1,x2,y2)."""

    image_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""

    uploaded_image_url: str = ""
    """A mirror of source_metadata.source_location."""

    width: int = 0
    """The width of the image."""

    height: int = 0
    """The height of the image."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Ensure ``image_type`` is a ``str`` or a ``DocumentTypeEnum`` member.

        Returns the value unchanged; raises ``ValueError`` for any other type.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, field):
        """
        Clamp a negative ``width`` / ``height`` to 0 instead of failing validation.

        ``field`` is the validation-info object pydantic passes as the third
        argument; only its ``field_name`` is used, for the warning message.
        """
        if v < 0:
            logger.warning(f"{field.field_name} is negative; clamping to 0. Original value: {v}")
            return 0
        return v
class TableMetadataSchema(BaseModelNoExt):
    """
    Schema describing extracted table content.

    Only ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    """The caption for the table."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to table_metadata.table_format."""

    # Either a TableFormatEnum member or a free-form string; defaults to "".
    table_content_format: Union[TableFormatEnum, str] = ""

    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the table, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
class ChartMetadataSchema(BaseModelNoExt):
    """
    Schema describing extracted chart content.

    The field names follow the ``table_*`` naming used for tables. Only
    ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    """The caption for the chart."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to chart_metadata.table_format."""

    # Either a TableFormatEnum member or a free-form string; defaults to "".
    table_content_format: Union[TableFormatEnum, str] = ""

    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the chart, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
class AudioMetadataSchema(BaseModelNoExt):
    """
    Schema describing extracted audio content.

    Every field has a default, so the schema can be built with no arguments.
    """

    audio_transcript: str = ""
    """A transcript of the audio content."""

    audio_type: str = ""
    """The type or format of the audio, such as mp3, wav."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
# TODO: consider deprecating ErrorMetadataSchema in favor of InfoMessageMetadataSchema.
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Schema for an error record attached to extraction results.

    ``task``, ``status`` and ``error_msg`` are required.
    """

    # Task associated with the error (a TaskTypeEnum member).
    task: TaskTypeEnum

    # Status associated with the error (a StatusEnum member).
    status: StatusEnum

    # Identifier of the source; defaults to the empty string.
    source_id: str = ""

    # Error message text.
    error_msg: str

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Schema for an informational message attached to extraction results.

    ``task``, ``status``, ``message`` and ``filter`` are all required.
    """

    # Task associated with the message (a TaskTypeEnum member).
    task: TaskTypeEnum

    # Status associated with the message (a StatusEnum member).
    status: StatusEnum

    # Message text.
    message: str

    # Boolean flag; presumably marks the item for filtering downstream -- confirm.
    filter: bool

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None
# Main metadata schema
class MetadataSchema(BaseModelNoExt):
    """
    The primary container schema for extraction results.

    Before field validation, ``check_metadata_type`` clears the type-specific
    sub-metadata (audio, image, text, table) that does not match
    ``content_metadata.type``.
    """

    content: str = ""
    """The actual textual content extracted from the source."""

    content_url: str = ""
    """A URL that points to the location of the content, if applicable."""

    embedding: Optional[List[float]] = None
    """An optional numerical vector representation (embedding) of the content."""

    source_metadata: Optional[SourceMetadataSchema] = None
    """Metadata about the original source of the content."""

    content_metadata: Optional[ContentMetadataSchema] = None
    """General metadata about the extracted content itself."""

    audio_metadata: Optional[AudioMetadataSchema] = None
    """Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""

    text_metadata: Optional[TextMetadataSchema] = None
    """Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""

    image_metadata: Optional[ImageMetadataSchema] = None
    """Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""

    table_metadata: Optional[TableMetadataSchema] = None
    """Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""

    chart_metadata: Optional[ChartMetadataSchema] = None
    """Specific metadata for chart content. Unlike table_metadata, this field is not cleared by check_metadata_type."""

    error_metadata: Optional[ErrorMetadataSchema] = None
    """Metadata that describes any errors encountered during processing."""

    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    """Informational messages related to the processing."""

    debug_metadata: Optional[Dict[str, Any]] = None
    """A dictionary for storing any arbitrary debug information."""

    raise_on_failure: bool = False
    """If True, indicates that processing should halt on failure."""

    # Optional free-form dictionary; defaults to None.
    custom_content: Optional[Dict[str, Any]] = None

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Clear type-specific metadata that does not match ``content_metadata.type``.

        Parameters
        ----------
        values : Any
            Raw input handed to the model before field validation. Only ``dict``
            input is processed; anything else is returned untouched so that
            pydantic's regular validation can deal with it.

        Returns
        -------
        Any
            A shallow copy of ``values`` in which ``audio_metadata``,
            ``image_metadata``, ``text_metadata`` and ``table_metadata`` are set
            to ``None`` unless the content type is AUDIO, IMAGE, TEXT or
            STRUCTURED respectively. ``chart_metadata`` is left as supplied.
        """
        # A "before" validator sees the raw input, which is not guaranteed to be a dict.
        if not isinstance(values, dict):
            return values

        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type")
        else:
            # Handles an explicit None (a valid value for this Optional field) as
            # well as an already-constructed ContentMetadataSchema instance;
            # neither supports ``.get``.
            content_type = getattr(content_metadata, "type", None)

        # Work on a shallow copy so the caller's dictionary is never mutated.
        values = dict(values)

        type_specific_fields = (
            ("audio_metadata", ContentTypeEnum.AUDIO),
            ("image_metadata", ContentTypeEnum.IMAGE),
            ("text_metadata", ContentTypeEnum.TEXT),
            ("table_metadata", ContentTypeEnum.STRUCTURED),
        )
        for field_name, required_type in type_specific_fields:
            if content_type != required_type:
                values[field_name] = None

        return values
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a ``MetadataSchema`` from a metadata dictionary.

    Parameters
    ----------
    metadata : Dict[str, Any]
        Metadata to validate; its items are passed to ``MetadataSchema`` as
        keyword arguments.

    Returns
    -------
    MetadataSchema
        The validated model instance.

    Raises
    ------
    ValidationError
        If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated