# megatron/energon/av/utils.py
# Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: BSD-3-Clause
from typing import List, Literal, Optional, Tuple, Union, overload

import numpy as np
import torch

from megatron.energon.av.av_decoder import AVData, AVDecoder


def get_clips_uniform(
    av_decoder: AVDecoder,
    clip_duration_seconds: float,
    num_clips: int,
    request_video: bool = False,
    request_audio: bool = False,
    video_out_frame_size: Optional[tuple[int, int]] = None,
) -> AVData:
"""Extracts a sequence of clips, such that each clip is of
equal duration and the clips are equidistant from each other.
Args:
av_decoder: An AVDecoder instance.
clip_duration_seconds: The duration of each clip in seconds.
num_clips: The number of clips to extract.
request_video: Whether to request video clips.
request_audio: Whether to request audio clips.
video_out_frame_size: The size of the video frames to output, or None to use the original size.
Returns:
An AVData object containing the extracted video and audio clips.
"""
    if not request_video and not request_audio:
        raise ValueError("You must request at least one of video or audio")

    video_duration = float("inf")
    audio_duration = float("inf")

    if request_video:
        video_duration, _ = av_decoder.get_video_duration()
        if video_duration is None:
            raise ValueError("No video duration found")

    if request_audio:
        audio_duration = av_decoder.get_audio_duration()
        if audio_duration is None:
            raise ValueError("No audio duration found")

    # Typically, audio and video don't have the exact same duration, so we take the minimum
    # so that we can safely extract clips of equal duration.
    total_duration = min(video_duration, audio_duration)
    assert total_duration != float("inf")

    if clip_duration_seconds == 0:
        # Special case of single frames: place the start times at frame centers,
        # from the center of the first frame to the center of the last frame.
        video_fps = av_decoder.get_video_fps()
        video_spf = 1 / video_fps
        first_start_time = video_spf * 0.5
        last_start_time = total_duration - video_spf * 0.5
    else:
        first_start_time = 0
        last_start_time = total_duration - clip_duration_seconds
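
    # For example, with total_duration=10.0, clip_duration_seconds=2.0 and num_clips=3,
    # the start times are np.linspace(0.0, 8.0, 3) == [0.0, 4.0, 8.0], yielding the
    # clip ranges (0.0, 2.0), (4.0, 6.0) and (8.0, 10.0).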
    clips = [
        (float(start_time), float(start_time + clip_duration_seconds))
        for start_time in np.linspace(first_start_time, last_start_time, num_clips)
    ]

    return av_decoder.get_clips(
        video_clip_ranges=clips if request_video else None,
        audio_clip_ranges=clips if request_audio else None,
        video_unit="seconds",
        audio_unit="seconds",
        video_out_frame_size=video_out_frame_size,
    )
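

# A minimal usage sketch (illustrative only): the decoder construction below is a
# hypothetical placeholder; consult AVDecoder for the actual way to open media.
#
#     decoder = AVDecoder(...)  # hypothetical: open some media source
#     av_data = get_clips_uniform(
#         decoder,
#         clip_duration_seconds=2.0,
#         num_clips=4,
#         request_video=True,
#         request_audio=True,
#         video_out_frame_size=(224, 224),
#     )
#     # av_data.video_clips then holds 4 equidistant 2-second video clip tensors
#     # (the matching audio clips are returned alongside in the same AVData object).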


@overload
def get_single_frames_uniform(
    av_decoder: "AVDecoder",
    num_frames: int,
    *,
    video_out_frame_size: Optional[Tuple[int, int]] = None,
    return_timestamps: Literal[False] = False,
) -> torch.Tensor: ...


@overload
def get_single_frames_uniform(
    av_decoder: "AVDecoder",
    num_frames: int,
    *,
    video_out_frame_size: Optional[Tuple[int, int]] = None,
    return_timestamps: Literal[True],
) -> Tuple[torch.Tensor, List[float]]: ...
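
# The overloads above only refine the return type for type checkers: a bare tensor by
# default, or a (frames, timestamps) tuple when return_timestamps=True.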
def get_single_frames_uniform(
    av_decoder: AVDecoder,
    num_frames: int,
    *,
    video_out_frame_size: Optional[tuple[int, int]] = None,
    return_timestamps: bool = False,
) -> Union[torch.Tensor, tuple[torch.Tensor, list[float]]]:
"""Extracts a sequence of clips, such that each clip contains
only a single frame and the frames are equidistant from each other.
Args:
av_decoder: An AVDecoder instance.
num_frames: The number of frames to extract.
video_out_frame_size: The size of the video frames to output, or None to use the original size.
Returns:
A tensor of shape (num_frames, channels, height, width) containing the extracted frames.
"""
    av_data = get_clips_uniform(
        av_decoder=av_decoder,
        clip_duration_seconds=0,
        num_clips=num_frames,
        request_video=True,
        request_audio=False,
        video_out_frame_size=video_out_frame_size,
    )
    if len(av_data.video_clips) == 0:
        raise ValueError("No video frames found")

    # Concatenate all video single-frame clips to form a single tensor
    video_tensor = torch.cat(av_data.video_clips, dim=0)

    if return_timestamps:
        return video_tensor, [t for t, _ in av_data.video_timestamps]
    else:
        return video_tensor
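

# A minimal usage sketch (illustrative only; `decoder` is a hypothetical AVDecoder
# instance, constructed as appropriate for your media source):
#
#     frames = get_single_frames_uniform(decoder, 8, video_out_frame_size=(224, 224))
#     # frames: tensor of shape (8, channels, height, width)
#
#     frames, timestamps = get_single_frames_uniform(decoder, 8, return_timestamps=True)
#     # timestamps: list of 8 floats, the start time in seconds of each extracted frame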