# Source code for nv_ingest_api.internal.schemas.meta.metadata_schema

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging
from datetime import datetime
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

from pydantic import field_validator, model_validator, Field

from nv_ingest_api.internal.enums.common import (
    AccessLevelEnum,
    ContentTypeEnum,
    TextTypeEnum,
    LanguageEnum,
    TableFormatEnum,
    StatusEnum,
    DocumentTypeEnum,
    TaskTypeEnum,
)
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
from nv_ingest_api.util.converters import datetools

logger = logging.getLogger(__name__)


# Sub schemas

[docs]
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content
    and metadata is extracted.

    ``date_created`` and ``last_modified`` default to the moment the instance
    is created. When supplied explicitly they are passed through
    ``datetools.validate_iso8601``.
    """

    source_name: str
    """The name of the source file."""

    source_id: str
    """The ID of the source file."""

    source_location: str = ""
    """The URL, URI, or pointer to the storage location of the source file."""

    source_type: Union[DocumentTypeEnum, str]
    """The type of the source file, such as pdf, docx, pptx, or txt."""

    collection_id: str = ""
    """The ID of the collection in which the source is contained."""

    # A plain ``= datetime.now().isoformat()`` default is evaluated once, when the
    # class body runs, so every instance would share the import-time timestamp.
    # A default factory computes the timestamp per instance instead.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was created. Defaults to the time of instantiation."""

    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was last modified. Defaults to the time of instantiation."""

    summary: str = ""
    """A summary of the source."""

    partition_id: int = -1
    """The offset of this data fragment within a larger set of fragments."""

    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
    """The role-based access control for the source."""

    custom_content: Optional[Dict[str, Any]] = None
    """An optional free-form dictionary; its keys are not defined by this schema."""

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """
        Check an explicitly supplied ``date_created`` / ``last_modified`` value.

        Parameters:
        - field_value: The value provided for the field.

        Returns:
        - The value unchanged.

        Raises:
        - Whatever ``datetools.validate_iso8601`` raises (presumably for values
          that are not valid ISO 8601 -- its implementation is not visible here).
        """
        datetools.validate_iso8601(field_value)
        return field_value





[docs]
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Schema to hold related extracted object.

    Consists of three list fields, each of which defaults to a new empty list.
    """

    # String content entries of the related objects.
    content: List[str] = Field(default_factory=list)
    # Bounding boxes stored as tuples.
    # NOTE(review): presumably one entry per item in ``content`` -- confirm.
    bbox: List[tuple] = Field(default_factory=list)
    # Type labels stored as strings.
    # NOTE(review): presumably parallel to ``content`` as well -- confirm.
    type: List[str] = Field(default_factory=list)




[docs]
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Schema to hold types of related extracted objects.

    Groups three ``NearbyObjectsSubSchema`` containers under the names
    ``text``, ``images`` and ``structured``.
    """

    # Each default below is an empty NearbyObjectsSubSchema constructed when this
    # class body is executed.
    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()




[docs]
class ContentHierarchySchema(BaseModelNoExt):
    """
    Schema for the extracted content hierarchy.

    All integer fields default to -1.
    NOTE(review): -1 presumably means "unknown / not set" -- confirm with producers.
    """

    # Integer position fields, ordered from coarsest (page_count, page) to finest (span).
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1
    # Related objects; the default is an empty NearbyObjectsSchema constructed
    # when this class body is executed.
    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()




[docs]
class ContentMetadataSchema(BaseModelNoExt):
    """
    Data extracted from a source; generally Text or Image.

    Only ``type`` is required; every other field has a default.
    """

    type: ContentTypeEnum
    """The type of the content. Text, Image, Structured, Table, or Chart."""

    description: str = ""
    """A text description of the content object."""

    page_number: int = -1
    """The page number of the content in the source."""

    # The default is an empty ContentHierarchySchema constructed when this class
    # body is executed.
    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    """The location or order of the content within the source."""

    subtype: Union[ContentTypeEnum, str] = ""
    """The type of the content for structured data types, such as table or chart."""

    # NOTE(review): the unit of start_time / end_time is not visible here -- confirm.
    start_time: int = -1
    """The timestamp of the start of a piece of audio content."""

    end_time: int = -1
    """The timestamp of the end of a piece of audio content."""

    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None




[docs]
class TextMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted text content.

    Only ``text_type`` is required; every other field has a default.
    """

    text_type: TextTypeEnum
    """The type of the text, such as header or body."""

    summary: str = ""
    """An abbreviated summary of the content."""

    keywords: Union[str, List[str], Dict] = ""
    """Keywords, named entities, or other phrases."""

    # pydantic does not validate defaults unless asked to, so a bare ``= "en"``
    # would leave the raw string on the instance while an explicitly supplied
    # "en" goes through LanguageEnum validation. ``validate_default=True`` makes
    # the defaulted and the supplied value take the same path.
    # default to Unknown? Maybe do some kind of heuristic check
    language: LanguageEnum = Field(default="en", validate_default=True)
    """The language of the content."""

    text_location: tuple = (0, 0, 0, 0)
    """The bounding box of the text, in the format (x1,y1,x2,y2)."""

    text_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""

    custom_content: Optional[Dict[str, Any]] = None
    """An optional free-form dictionary; its keys are not defined by this schema."""




[docs]
class ImageMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted image content.

    Only ``image_type`` is required. Negative ``width`` / ``height`` values are
    clamped to 0 (with a warning) rather than rejected.
    """

    image_type: Union[DocumentTypeEnum, str]
    """The type of the image, such as structured, natural, hybrid, and others."""

    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    """The type of the content for structured data types, such as bar chart, pie chart, and others."""

    caption: str = ""
    """A caption or subheading associated with the image."""

    text: str = ""
    """Extracted text from a structured chart."""

    image_location: tuple = (0, 0, 0, 0)
    """The bounding box of the image, in the format (x1,y1,x2,y2)."""

    image_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""

    uploaded_image_url: str = ""
    """A mirror of source_metadata.source_location."""

    width: int = 0
    """The width of the image."""

    height: int = 0
    """The height of the image."""

    custom_content: Optional[Dict[str, Any]] = None
    """An optional free-form dictionary; its keys are not defined by this schema."""

    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Ensure ``image_type`` is a string or a ``DocumentTypeEnum`` member.

        Parameters:
        - v: The value of ``image_type`` being validated.

        Returns:
        - The value unchanged.

        Raises:
        - ValueError: If the value is neither a ``str`` nor a ``DocumentTypeEnum``.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, field):
        """
        Replace a negative ``width`` / ``height`` with 0 instead of failing.

        Parameters:
        - v: The dimension value being validated.
        - field: The validation info object supplied by pydantic; only its
          ``field_name`` attribute is read, for the log message.

        Returns:
        - 0 if the value is negative, otherwise the value unchanged.
        """
        if v < 0:
            # Lazy %-style arguments: the message is only formatted if the
            # warning is actually emitted.
            logger.warning("%s is negative; clamping to 0. Original value: %s", field.field_name, v)
            return 0
        return v





[docs]
class TableMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted table content.

    Only ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    """The caption for the table."""

    table_format: TableFormatEnum
    """
    The format of the table.  One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to table_metadata.table_format."""

    # Either a TableFormatEnum member or a free-form string; defaults to "".
    # NOTE(review): how this differs from ``table_format`` is not visible here -- confirm.
    table_content_format: Union[TableFormatEnum, str] = ""

    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the table, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None




[docs]
class ChartMetadataSchema(BaseModelNoExt):
    """
    The schema for extracted chart content.

    Only ``table_format`` is required; every other field has a default.
    The field names (``table_*``) are the same as in the table schema even
    though this schema describes charts.
    """

    caption: str = ""
    """The caption for the chart."""

    # NOTE(review): the wording below says "table"; here it applies to the chart's content.
    table_format: TableFormatEnum
    """
    The format of the table.  One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to chart_metadata.table_format."""

    # Either a TableFormatEnum member or a free-form string; defaults to "".
    # NOTE(review): how this differs from ``table_format`` is not visible here -- confirm.
    table_content_format: Union[TableFormatEnum, str] = ""

    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the chart, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None




[docs]
class AudioMetadataSchema(BaseModelNoExt):
    """
    The schema for extracted audio content.

    Every field is optional; the string fields default to "".
    """

    audio_transcript: str = ""
    """A transcript of the audio content."""

    audio_type: str = ""
    """The type or format of the audio, such as mp3, wav."""

    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None



# TODO: consider deprecating ErrorMetadataSchema in favor of InfoMessageMetadataSchema.

[docs]
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Schema describing an error associated with a task.

    ``task``, ``status`` and ``error_msg`` are required.
    """

    # The task the error is associated with.
    task: TaskTypeEnum
    # The status recorded for that task.
    status: StatusEnum
    # Identifier of the source; defaults to "" when not supplied.
    source_id: str = ""
    # The error message text.
    error_msg: str
    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None




[docs]
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Schema describing an informational message associated with a task.

    ``task``, ``status``, ``message`` and ``filter`` are all required.
    """

    # The task the message is associated with.
    task: TaskTypeEnum
    # The status recorded for that task.
    status: StatusEnum
    # The message text.
    message: str
    # NOTE(review): presumably flags whether the associated item should be
    # filtered out downstream -- the consumer is not visible here; confirm.
    filter: bool
    # Optional free-form dictionary; its keys are not defined by this schema.
    custom_content: Optional[Dict[str, Any]] = None



# Main metadata schema

[docs]
class MetadataSchema(BaseModelNoExt):
    """
    The primary container schema for extraction results.

    Before field validation, ``check_metadata_type`` clears the audio, image,
    text and table metadata entries that do not correspond to
    ``content_metadata.type``.
    """

    content: str = ""
    """The actual textual content extracted from the source."""

    content_url: str = ""
    """A URL that points to the location of the content, if applicable."""

    embedding: Optional[List[float]] = None
    """An optional numerical vector representation (embedding) of the content."""

    source_metadata: Optional[SourceMetadataSchema] = None
    """Metadata about the original source of the content."""

    content_metadata: Optional[ContentMetadataSchema] = None
    """General metadata about the extracted content itself."""

    audio_metadata: Optional[AudioMetadataSchema] = None
    """Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""

    text_metadata: Optional[TextMetadataSchema] = None
    """Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""

    image_metadata: Optional[ImageMetadataSchema] = None
    """Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""

    table_metadata: Optional[TableMetadataSchema] = None
    """Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""

    chart_metadata: Optional[ChartMetadataSchema] = None
    """Specific metadata for chart content. Unlike table_metadata, it is not cleared by check_metadata_type."""

    error_metadata: Optional[ErrorMetadataSchema] = None
    """Metadata that describes any errors encountered during processing."""

    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    """Informational messages related to the processing."""

    debug_metadata: Optional[Dict[str, Any]] = None
    """A dictionary for storing any arbitrary debug information."""

    raise_on_failure: bool = False
    """If True, indicates that processing should halt on failure."""

    custom_content: Optional[Dict[str, Any]] = None
    """An optional free-form dictionary; its keys are not defined by this schema."""

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Clear type-specific metadata that does not match ``content_metadata.type``.

        Parameters:
        - values: The raw input handed to the model before field validation.

        Returns:
        - A shallow copy of ``values`` in which ``audio_metadata``,
          ``image_metadata``, ``text_metadata`` and ``table_metadata`` are set to
          None unless the content type is AUDIO, IMAGE, TEXT or STRUCTURED
          respectively. Non-dict input is returned untouched.
        """
        # A "before" validator may receive something other than a dict (for
        # example an existing model instance); leave such input alone.
        if not isinstance(values, dict):
            return values

        # ``content_metadata`` may be absent, explicitly None, a plain dict, or an
        # already-built schema object; read its type without assuming a dict.
        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type")
        else:
            content_type = getattr(content_metadata, "type", None)

        # Work on a copy so a dict passed by the caller is not modified in place.
        values = dict(values)

        type_gated_fields = (
            ("audio_metadata", ContentTypeEnum.AUDIO),
            ("image_metadata", ContentTypeEnum.IMAGE),
            ("text_metadata", ContentTypeEnum.TEXT),
            ("table_metadata", ContentTypeEnum.STRUCTURED),
        )
        for field_name, required_type in type_gated_fields:
            if content_type != required_type:
                values[field_name] = None
        return values





[docs]
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a MetadataSchema from a metadata dictionary.

    The dictionary's items are passed to the MetadataSchema constructor as
    keyword arguments.

    Parameters:
    - metadata: A dictionary representing metadata to be validated.

    Returns:
    - The MetadataSchema instance built from ``metadata``.

    Raises:
    - ValidationError: If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated