Source code for sdp.processors.datasets.earnings.apply_normalizations

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Dict, List, Any

from sdp.processors.base_processor import BaseProcessor, DataEntry


class ApplyEarnings21Normalizations(BaseProcessor):
    """Apply text normalizations using Earnings21 dataset normalization files.

    This processor reads the normalization files provided with the Earnings21
    dataset and applies text normalizations based on probability scores. It can
    use the highest-probability normalization candidate or fall back to the
    original text.

    Args:
        earnings21_root (str): Path to the root directory of the Earnings21 dataset.
        use_top_candidate (bool): Whether to use the highest-probability candidate.
            Defaults to True.
        fallback_to_original (bool): Whether to fall back to the original text if no
            normalization is available. Defaults to True.
        preserve_entity_tags (bool): Whether to preserve entity tags during
            normalization. Defaults to True.

    Returns:
        Manifest entries with a normalized text field based on the normalization files.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
              earnings21_root: /path/to/earnings21
              use_top_candidate: true
              fallback_to_original: true
              preserve_entity_tags: true
    """

    def __init__(
        self,
        earnings21_root: str,
        use_top_candidate: bool = True,
        fallback_to_original: bool = True,
        preserve_entity_tags: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.earnings21_root = Path(earnings21_root)
        self.use_top_candidate = use_top_candidate
        self.fallback_to_original = fallback_to_original
        self.preserve_entity_tags = preserve_entity_tags

    def process_dataset_entry(self, data_entry: DataEntry) -> List[DataEntry]:
        """Process a single dataset entry to apply normalizations."""
        data = data_entry.data

        # Extract file_id to load the corresponding normalization file
        file_id = data.get('file_id')
        if not file_id:
            # If there is no file_id, return the original entry unchanged
            return [data_entry]

        # Load normalization data for this file
        norm_file = self.earnings21_root / "transcripts" / "normalizations" / f"{file_id}.norm.json"
        if not norm_file.exists():
            # If there is no normalization file, return the original entry unchanged
            return [data_entry]

        try:
            with open(norm_file, 'r', encoding='utf-8') as f:
                normalizations = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            # If the normalization file cannot be loaded, return the original entry unchanged
            return [data_entry]

        # Apply normalizations to the text
        normalized_text = self._apply_normalizations(data.get('text', ''), normalizations)

        # Create a new data entry with the normalized text
        new_data = data.copy()
        new_data['text'] = normalized_text

        return [DataEntry(data=new_data)]

    def _apply_normalizations(self, text: str, normalizations: Dict[str, Any]) -> str:
        """Apply normalizations to text based on normalization data."""
        # This is a simplified implementation. In practice, tokens would need to be
        # mapped to normalization IDs and the appropriate normalizations applied.
        # For now, the original text is returned unchanged; this can be extended to
        # implement the actual normalization logic.
        return text
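
The _apply_normalizations method above is intentionally a stub. As a rough illustration only, the sketch below shows one way it could be filled in. It is not part of the shipped processor and it assumes a hypothetical normalization-file layout in which each whitespace-separated token of the transcript appears as a key in the loaded JSON and maps to a list of {"candidate": str, "probability": float} dicts; the actual Earnings21 file format may differ.

# Hypothetical sketch (not part of the shipped processor): one possible
# extension of the _apply_normalizations stub under the assumed file layout
# described above. The function name, parameters, and JSON keys used here
# ("candidate", "probability") are illustrative assumptions.
def _apply_normalizations_sketch(
    text: str,
    normalizations: Dict[str, Any],
    use_top_candidate: bool = True,
    fallback_to_original: bool = True,
) -> str:
    normalized_tokens = []
    for token in text.split():
        candidates = normalizations.get(token, [])
        if not candidates:
            # No normalization available for this token: keep it only when
            # falling back to the original text is enabled
            if fallback_to_original:
                normalized_tokens.append(token)
            continue
        if use_top_candidate:
            # Choose the candidate with the highest probability score
            best = max(candidates, key=lambda c: c.get("probability", 0.0))
        else:
            # Otherwise take the first candidate as listed in the file
            best = candidates[0]
        normalized_tokens.append(best.get("candidate", token))
    return " ".join(normalized_tokens)

For example, with normalizations = {"nineteen": [{"candidate": "19", "probability": 0.9}]}, the sketch would rewrite "nineteen twenty one" to "19 twenty one", leaving tokens without candidates untouched.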