Source code for sdp.processors.datasets.earnings.apply_normalizations

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Dict, List, Any

from sdp.processors.base_processor import BaseProcessor, DataEntry


class ApplyEarnings21Normalizations(BaseProcessor):
    """Apply text normalizations using Earnings21 dataset normalization files.

    This processor reads the normalization files provided with the Earnings21
    dataset and applies text normalizations based on probability scores. It can
    use the highest-probability normalization candidate or fall back to the
    original text.

    Args:
        earnings21_root (str): Path to the root directory of the Earnings21 dataset.
        use_top_candidate (bool): Whether to use the highest-probability candidate.
            Defaults to True.
        fallback_to_original (bool): Whether to fall back to the original text if no
            normalization is available. Defaults to True.
        preserve_entity_tags (bool): Whether to preserve entity tags during
            normalization. Defaults to True.

    Returns:
        Manifest entries with a normalized text field based on the normalization files.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
              earnings21_root: /path/to/earnings21
              use_top_candidate: true
              fallback_to_original: true
              preserve_entity_tags: true
    """

    def __init__(
        self,
        earnings21_root: str,
        use_top_candidate: bool = True,
        fallback_to_original: bool = True,
        preserve_entity_tags: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.earnings21_root = Path(earnings21_root)
        self.use_top_candidate = use_top_candidate
        self.fallback_to_original = fallback_to_original
        self.preserve_entity_tags = preserve_entity_tags

    def process_dataset_entry(self, data_entry: DataEntry) -> List[DataEntry]:
        """Process a single dataset entry to apply normalizations."""
        data = data_entry.data

        # Extract file_id to load the corresponding normalization file
        file_id = data.get('file_id')
        if not file_id:
            # If there is no file_id, return the original entry unchanged
            return [data_entry]

        # Load normalization data for this file
        norm_file = self.earnings21_root / "transcripts" / "normalizations" / f"{file_id}.norm.json"
        if not norm_file.exists():
            # If there is no normalization file, return the original entry unchanged
            return [data_entry]

        try:
            with open(norm_file, 'r', encoding='utf-8') as f:
                normalizations = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            # If the normalization file cannot be loaded, return the original entry unchanged
            return [data_entry]

        # Apply normalizations to the text
        normalized_text = self._apply_normalizations(data.get('text', ''), normalizations)

        # Create a new data entry with the normalized text
        new_data = data.copy()
        new_data['text'] = normalized_text

        return [DataEntry(data=new_data)]

    def _apply_normalizations(self, text: str, normalizations: Dict[str, Any]) -> str:
        """Apply normalizations to text based on normalization data."""
        # This is a simplified implementation. In practice, tokens would need to be
        # mapped to normalization IDs and the appropriate normalizations applied.
        # For now, the original text is returned unchanged; this can be extended to
        # implement the actual normalization logic.
        return text
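
The _apply_normalizations method above is intentionally a stub. As a rough illustration only, the sketch below shows one way it could be filled in. It is not part of the shipped processor and it assumes a hypothetical normalization-file layout in which each whitespace-separated token of the transcript appears as a key in the loaded JSON and maps to a list of {"candidate": str, "probability": float} dicts; the actual Earnings21 file format may differ.

# Hypothetical sketch (not part of the shipped processor): one possible
# extension of the _apply_normalizations stub under the assumed file layout
# described above. The function name, parameters, and JSON keys used here
# ("candidate", "probability") are illustrative assumptions.
def _apply_normalizations_sketch(
    text: str,
    normalizations: Dict[str, Any],
    use_top_candidate: bool = True,
    fallback_to_original: bool = True,
) -> str:
    normalized_tokens = []
    for token in text.split():
        candidates = normalizations.get(token, [])
        if not candidates:
            # No normalization available for this token: keep it only when
            # falling back to the original text is enabled
            if fallback_to_original:
                normalized_tokens.append(token)
            continue
        if use_top_candidate:
            # Choose the candidate with the highest probability score
            best = max(candidates, key=lambda c: c.get("probability", 0.0))
        else:
            # Otherwise take the first candidate as listed in the file
            best = candidates[0]
        normalized_tokens.append(best.get("candidate", token))
    return " ".join(normalized_tokens)

For example, with normalizations = {"nineteen": [{"candidate": "19", "probability": 0.9}]}, the sketch would rewrite "nineteen twenty one" to "19 twenty one", leaving tokens without candidates untouched.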