# Source code for nv_ingest_api.internal.extract.html.html_extractor

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging
import uuid
from typing import Optional, Dict, Any, Union, Tuple, List

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
from nv_ingest_api.util.schema.schema_validator import validate_schema
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

from markitdown.converters import HtmlConverter

logger = logging.getLogger(__name__)


@unified_exception_handler
def _convert_html(
    row: pd.Series,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> List[List[Any]]:
    """
    Convert the HTML held in one ledger row to markdown and package it as an extraction record.

    Parameters
    ----------
    row : pd.Series
        A single ledger row. Its "content" entry is read as the raw HTML and its
        "metadata" entry as the metadata mapping to validate.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Trace container forwarded by the caller. It is accepted for signature
        compatibility and is not read or modified here.

    Returns
    -------
    List[List[Any]]
        A one-element list whose single entry is
        ``[ContentTypeEnum.TEXT, <validated metadata dump>, <uuid4 string>]``.

    Notes
    -----
    When "content" is falsy (missing, ``None`` or empty) no conversion is done and
    the metadata is validated exactly as received. Any exception raised here passes
    through ``unified_exception_handler``.
    """
    metadata = row.get("metadata")
    html_content = row.get("content")

    if html_content:
        markdown = HtmlConverter().convert_string(html_content=html_content).text_content
        # Build a new mapping instead of assigning into ``metadata``: the dict is the
        # same object stored in the caller's DataFrame, and writing to it would
        # silently overwrite the ledger's metadata (and make a second pass convert
        # already-converted markdown).
        metadata = {**metadata, "content": markdown}

    validated_metadata = validate_schema(metadata, MetadataSchema).model_dump()

    return [[ContentTypeEnum.TEXT, validated_metadata, str(uuid.uuid4())]]


def extract_markdown_from_html_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: HtmlExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Convert the HTML content of every row in the extraction ledger to markdown.

    Each row is handed to ``_convert_html``; the per-row results are flattened into
    a single DataFrame of extraction records.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing html files as raw text. Each row is expected
        to provide a 'content' entry (the HTML) and a 'metadata' entry.
    task_config : Dict[str, Any]
        Configuration instructions for the document processing task. Not used by
        this function at present.
    extraction_config : HtmlExtractorSchema
        Configuration object for HTML extraction. Not used by this function at
        present.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional trace information, forwarded unchanged to the per-row converter.

    Returns
    -------
    Tuple[pd.DataFrame, Union[Dict, None]]
        A pair ``(extracted_df, trace)``. ``extracted_df`` has the columns
        "document_type", "metadata" and "uuid", one row per extracted record (zero
        rows when nothing was extracted). ``trace`` is always an empty dict.

    Raises
    ------
    Exception
        Any exception raised while converting a row propagates to the caller.
    """
    output_columns = ["document_type", "metadata", "uuid"]

    # Short-circuit on an empty ledger. For an empty frame, DataFrame.apply probes the
    # function with a synthetic all-NaN row and may hand back a DataFrame rather than a
    # Series, on which the argument-less .explode() below would raise.
    if df_extraction_ledger.empty:
        return pd.DataFrame({name: [] for name in output_columns}), {}

    # Run the HTML -> markdown conversion on each row of the ledger.
    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)

    # Each row yields a list of records: flatten to one record per entry and drop
    # missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Convert the extraction results to a DataFrame if available.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=output_columns)
    else:
        extracted_df = pd.DataFrame({name: [] for name in output_columns})

    return extracted_df, {}