nvidia-resiliency-ext
Documentation contents:
Fault Tolerance
Inprocess Restart
Async Checkpointing
Local Checkpointing
Straggler Detection
nvidia-resiliency-ext
Index
Index
_ | A | B | C | D | E | F | G | H | I | L | M | N | O | P | R | S | T | V | W
_
__str__() (nvidia_resiliency_ext.straggler.straggler.CallableId method)
A
add_finalize_fn() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest method)
async_fn (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
async_fn (nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt.TorchAsyncCheckpoint attribute)
async_fn_args (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
async_fn_kwargs (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
async_loop() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller static method)
async_save() (nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt.TorchAsyncCheckpoint method)
AsyncCaller (class in nvidia_resiliency_ext.checkpointing.async_ckpt.core)
AsyncCallsQueue (class in nvidia_resiliency_ext.checkpointing.async_ckpt.core)
AsyncRequest (class in nvidia_resiliency_ext.checkpointing.async_ckpt.core)
B
BaseCheckpointManager (class in nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager)
BasicTensorAwareStateDict (class in nvidia_resiliency_ext.checkpointing.local.basic_state_dict)
C
calculate_and_set_hb_timeouts() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
calculate_and_set_section_timeouts() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
call_idx (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
CallableId (class in nvidia_resiliency_ext.straggler.straggler)
checkpoint_id (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync property)
CheckpointingException
CheckpointMetadataCache (class in nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
CliqueReplicationStrategy (class in nvidia_resiliency_ext.checkpointing.local.replication.strategies)
close() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCaller method)
close() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue method)
close() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller method)
close() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller method)
copy_tensors_to_cpu() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
copy_tensors_to_cpu() (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict method)
CudaHealthCheck (class in nvidia_resiliency_ext.inprocess.health_check)
cupti_manager (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
custom_sections (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
CustomSection (class in nvidia_resiliency_ext.straggler.straggler)
D
detection_section() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
Detector (class in nvidia_resiliency_ext.straggler.straggler)
E
end_all_sections() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
end_section() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
execute_sync() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest method)
F
FaultCounter (class in nvidia_resiliency_ext.inprocess.health_check)
FaultCounterExceeded
FaultToleranceConfig (class in nvidia_resiliency_ext.fault_tolerance.config)
FileSystemWriterAsync (class in nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async)
finalize_async_save() (nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt.TorchAsyncCheckpoint method)
finalize_fns (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
find_latest() (nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager.BaseCheckpointManager method)
finish() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
freeze() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest method)
from_args() (nvidia_resiliency_ext.fault_tolerance.config.FaultToleranceConfig static method)
from_kwargs() (nvidia_resiliency_ext.fault_tolerance.config.FaultToleranceConfig static method)
from_replication_params() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.CliqueReplicationStrategy class method)
from_yaml_file() (nvidia_resiliency_ext.fault_tolerance.config.FaultToleranceConfig static method)
G
gather_on_rank0 (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
generate_report() (nvidia_resiliency_ext.straggler.reporting.ReportGenerator method)
generate_report() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
generate_report_if_interval_elapsed() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
get_cache_metadata() (nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.CheckpointMetadataCache method)
get_metadata_caching_status() (in module nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
get_metadata_caching_status() (nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.CheckpointMetadataCache method)
get_num_unfinalized_calls() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue method)
get_save_function_and_args() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
H
HealthCheckError
I
identify_stragglers() (nvidia_resiliency_ext.straggler.reporting.Report method)
init_checkpoint_metadata_cache() (in module nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
init_tensors() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
init_tensors() (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict method)
init_workload_monitoring() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
initialize() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
initialized (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
insert_tensors() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
insert_tensors() (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict method)
InternalError
is_current_async_call_done() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCaller method)
is_current_async_call_done() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller method)
is_current_async_call_done() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller method)
is_frozen (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
is_hollow (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict property)
is_hollow (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict property)
is_interval_elapsed() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
L
LazyCliqueReplicationStrategy (class in nvidia_resiliency_ext.checkpointing.local.replication.strategies)
LazyReplicationStrategyBuilder (class in nvidia_resiliency_ext.checkpointing.local.replication.strategies)
load() (nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager.BaseCheckpointManager method)
load_state_dict() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
local_ckpt_dir (nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager.LocalCheckpointManager property)
LocalCheckpointManager (class in nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager)
M
maybe_finalize_async_calls() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue method)
module
nvidia_resiliency_ext.checkpointing.async_ckpt.core
nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async
nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver
nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt
nvidia_resiliency_ext.checkpointing.local.base_state_dict
nvidia_resiliency_ext.checkpointing.local.basic_state_dict
nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager
nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager
nvidia_resiliency_ext.checkpointing.local.replication.strategies
nvidia_resiliency_ext.fault_tolerance.config
nvidia_resiliency_ext.fault_tolerance.rank_monitor_client
nvidia_resiliency_ext.inprocess.exception
nvidia_resiliency_ext.inprocess.finalize
nvidia_resiliency_ext.inprocess.health_check
nvidia_resiliency_ext.inprocess.initialize
nvidia_resiliency_ext.straggler.reporting
nvidia_resiliency_ext.straggler.statistics
nvidia_resiliency_ext.straggler.straggler
N
name (nvidia_resiliency_ext.straggler.straggler.CallableId attribute)
NoReplicasAvailableError
nvidia_resiliency_ext.checkpointing.async_ckpt.core
module
nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async
module
nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver
module
nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt
module
nvidia_resiliency_ext.checkpointing.local.base_state_dict
module
nvidia_resiliency_ext.checkpointing.local.basic_state_dict
module
nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager
module
nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager
module
nvidia_resiliency_ext.checkpointing.local.replication.strategies
module
nvidia_resiliency_ext.fault_tolerance.config
module
nvidia_resiliency_ext.fault_tolerance.rank_monitor_client
module
nvidia_resiliency_ext.inprocess.exception
module
nvidia_resiliency_ext.inprocess.finalize
module
nvidia_resiliency_ext.inprocess.health_check
module
nvidia_resiliency_ext.inprocess.initialize
module
nvidia_resiliency_ext.straggler.reporting
module
nvidia_resiliency_ext.straggler.statistics
module
nvidia_resiliency_ext.straggler.straggler
module
O
obj (nvidia_resiliency_ext.straggler.straggler.CallableId attribute)
original_callables (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
P
PersistentAsyncCaller (class in nvidia_resiliency_ext.checkpointing.async_ckpt.core)
pop_tensors() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
pop_tensors() (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict method)
preload_fn (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest attribute)
preload_tensors() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync static method)
prepare_decentralized_global_plan() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
prepare_local_plan() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
prepare_save_state_dict_ret() (nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.CheckpointMetadataCache method)
prepare_write_data() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
profiling_interval (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
R
rank (nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager.BaseCheckpointManager property)
RankMonitorClient (class in nvidia_resiliency_ext.fault_tolerance.rank_monitor_client)
RankMonitorClientError
replicate() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.CliqueReplicationStrategy method)
replicate() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.LazyReplicationStrategyBuilder method)
replicate() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.ReplicationStrategy method)
replication_strategy (nvidia_resiliency_ext.checkpointing.local.replication.strategies.LazyReplicationStrategyBuilder property)
ReplicationStrategy (class in nvidia_resiliency_ext.checkpointing.local.replication.strategies)
Report (class in nvidia_resiliency_ext.straggler.reporting)
report_interval_tracker (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
report_time_interval (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
reporter (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
ReportGenerator (class in nvidia_resiliency_ext.straggler.reporting)
RestartAbort
RestartError
restore_original_callables() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
restore_tensor_device() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
restore_tensor_device() (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict method)
retrieve_execute() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.CliqueReplicationStrategy method)
retrieve_execute() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.LazyReplicationStrategyBuilder method)
retrieve_execute() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.ReplicationStrategy method)
retrieve_plan() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.CliqueReplicationStrategy method)
retrieve_plan() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.LazyReplicationStrategyBuilder method)
retrieve_plan() (nvidia_resiliency_ext.checkpointing.local.replication.strategies.ReplicationStrategy method)
retrieve_write_results() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
RetryController (class in nvidia_resiliency_ext.inprocess.initialize)
S
SameMachineReplicationException
save() (nvidia_resiliency_ext.checkpointing.local.ckpt_managers.base_manager.BaseCheckpointManager method)
save_state_dict_async_finalize() (in module nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
save_state_dict_async_plan() (in module nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
schedule_async_call() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCaller method)
schedule_async_call() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller method)
schedule_async_call() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller method)
schedule_async_request() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue method)
scores_to_compute (nvidia_resiliency_ext.straggler.straggler.Detector attribute)
send_heartbeat() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
send_workload_control_request() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
set_cache_metadata() (nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.CheckpointMetadataCache method)
set_cached_global_metadata() (nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.CheckpointMetadataCache method)
shutdown() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
shutdown_workload_monitoring() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
start_section() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
state_dict() (nvidia_resiliency_ext.fault_tolerance.rank_monitor_client.RankMonitorClient method)
Statistic (class in nvidia_resiliency_ext.straggler.statistics)
StragglerId (class in nvidia_resiliency_ext.straggler.reporting)
sync_all_async_calls() (nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCaller method)
T
TemporalAsyncCaller (class in nvidia_resiliency_ext.checkpointing.async_ckpt.core)
TensorAwareStateDict (class in nvidia_resiliency_ext.checkpointing.local.base_state_dict)
tensors (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict property)
tensors (nvidia_resiliency_ext.checkpointing.local.basic_state_dict.BasicTensorAwareStateDict property)
ThreadedFinalize (class in nvidia_resiliency_ext.inprocess.finalize)
TimeoutError
to_yaml_file() (nvidia_resiliency_ext.fault_tolerance.config.FaultToleranceConfig method)
TorchAsyncCheckpoint (class in nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt)
V
validate_checkpoint_id() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync class method)
values() (nvidia_resiliency_ext.checkpointing.local.base_state_dict.TensorAwareStateDict method)
verify_global_md_reuse() (in module nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver)
W
wrap_callables() (nvidia_resiliency_ext.straggler.straggler.Detector class method)
write_data() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync method)
write_preloaded_data() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync static method)
write_preloaded_data_multiproc() (nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync static method)