Source code for nv_ingest_api.internal.extract.html.html_extractor

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging
import uuid
from typing import Optional, Dict, Any, Union, Tuple, List

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
from nv_ingest_api.util.schema.schema_validator import validate_schema
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

from markitdown.converters import HtmlConverter

logger = logging.getLogger(__name__)


@unified_exception_handler
def _convert_html(
    row: pd.Series,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> List[List[Any]]:
    """
    Convert the HTML held in one ledger row to markdown and package it as an extraction record.

    Parameters
    ----------
    row : pd.Series
        A single row of the extraction ledger. The "content" entry is read as the HTML
        text and the "metadata" entry as the metadata mapping to validate.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Trace information forwarded by the caller. It is accepted for interface
        compatibility and is not read by this function.

    Returns
    -------
    List[List[Any]]
        A one-element list whose single entry is
        ``[ContentTypeEnum.TEXT, validated_metadata_dict, uuid_string]``. When the row has
        no (truthy) "content", the metadata is validated and returned without conversion.
    """
    metadata = row.get("metadata")
    html_content = row.get("content")

    if html_content:
        md_content = HtmlConverter().convert_string(html_content=html_content).text_content
        # Build a new mapping instead of assigning into ``metadata``: the row may reference
        # the very same metadata object that is stored in the caller's DataFrame, so an
        # in-place write would silently overwrite the caller's data as a side effect.
        metadata = {**metadata, "content": md_content}

    validated_metadata = validate_schema(metadata, MetadataSchema).model_dump()

    # A fresh UUID is generated for every record produced.
    return [[ContentTypeEnum.TEXT, validated_metadata, str(uuid.uuid4())]]



[docs]
def extract_markdown_from_html_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: HtmlExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Convert the HTML content of every row in an extraction ledger to markdown.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing html documents as raw text. Each row's "content"
        entry is read as the HTML to convert and its "metadata" entry as the metadata
        mapping that receives the converted text.
    task_config : Dict[str, Any]
        Configuration instructions for the document processing task. Accepted for
        interface compatibility; not read by this function.
    extraction_config : HtmlExtractorSchema
        Configuration object for the HTML extractor. Accepted for interface
        compatibility; not read by this function.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional trace information; it is forwarded unchanged to the per-row converter.

    Returns
    -------
    Tuple[pd.DataFrame, Union[Dict, None]]
        A pair ``(extracted_df, trace_info)``. ``extracted_df`` has the columns
        "document_type", "metadata", and "uuid", one row per extracted record, and is
        empty (with those columns) when there is nothing to extract. ``trace_info`` is
        always an empty dict.

    Raises
    ------
    Exception
        Any exception raised by the per-row conversion propagates to the caller. The
        converter is wrapped in ``unified_exception_handler``, whose exact handling is
        defined outside this module.
    """
    result_columns = ["document_type", "metadata", "uuid"]

    # Short-circuit on an empty ledger. ``DataFrame.apply`` on an empty frame does not
    # reliably yield a Series: it can hand back an empty DataFrame copy, on which the
    # argument-less ``explode()`` below raises a TypeError. Build the empty result directly.
    if df_extraction_ledger.empty:
        return pd.DataFrame({column: [] for column in result_columns}), {}

    # Convert each row's HTML to markdown; every row yields a list of extraction records.
    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)

    # Flatten to one record per entry and drop rows that produced no result.
    sr_extraction = sr_extraction.explode().dropna()

    if sr_extraction.empty:
        return pd.DataFrame({column: [] for column in result_columns}), {}

    extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=result_columns)
    return extracted_df, {}