# Systems

This section lists all systems supported by CloudAI. The fields shown for each system can be set in that system's TOML configuration file. The scheduler value listed for each system below is the value of its `scheduler` field.

| System     | Scheduler Value |
|------------|-----------------|
| Slurm      | `slurm`         |
| Kubernetes | `kubernetes`    |
| RunAI      | `runai`         |
| LSF        | `lsf`           |
| Standalone | `standalone`    |

## Slurm

pydantic model cloudai.systems.slurm.slurm_system.SlurmSystem[source]#

Represents a Slurm system.

field default_partition: str [Required]#
field partitions: List[SlurmPartition] [Required]#
field account: str | None = None#
field distribution: str | None = None#
field mpi: str = 'pmix'#
field gpus_per_node: int | None = None#
field ntasks_per_node: int | None = None#
field cache_docker_images_locally: bool = False#
field scheduler: str = 'slurm'#
field monitor_interval: int = 60#
field extra_srun_args: str | None = None#
field extra_sbatch_args: list[str] [Optional]#
field container_mount_home: bool = False#
field data_repository: DataRepositoryConfig | None = None#
field reports: dict[str, ReportConfig] | None = None#
field name: str [Required]#
field install_path: Path [Required]#
field output_path: Path [Required]#
field hf_home_path: Path [Optional]#
field global_env_vars: dict[str, Any] [Optional]#
pydantic model cloudai.systems.slurm.slurm_system.SlurmPartition[source]#

Represents a partition within a Slurm system.

field name: str [Required]#
field groups: List[SlurmGroup] = []#
pydantic model cloudai.systems.slurm.slurm_system.SlurmGroup[source]#

Represents a group of nodes within a partition.

field name: str [Required]#
field nodes: List[str] [Required]#
pydantic model cloudai.systems.slurm.slurm_system.DataRepositoryConfig[source]#

Configuration for a data repository.

field endpoint: str [Required]#
field verify_certs: bool = True#

## Kubernetes

pydantic model cloudai.systems.kubernetes.kubernetes_system.KubernetesSystem[source]#

Represents a Kubernetes system.

field kube_config_path: Path [Required]#
field default_namespace: str [Required]#
field scheduler: str = 'kubernetes'#
field monitor_interval: int = 1#
field gpus_per_node: int = 1#
field name: str [Required]#
field install_path: Path [Required]#
field output_path: Path [Required]#
field hf_home_path: Path [Optional]#
field global_env_vars: dict[str, Any] [Optional]#

## RunAI

pydantic model cloudai.systems.runai.runai_system.RunAISystem[source]#

RunAISystem integrates with the RunAI platform to manage and monitor jobs and nodes.

field scheduler: str = 'runai'#
field monitor_interval: int = 60#
field base_url: str [Required]#
field user_email: str [Required]#
field app_id: str [Required]#
field app_secret: str [Required]#
field project_id: str [Required]#
field cluster_id: str [Required]#
field name: str [Required]#
field install_path: Path [Required]#
field output_path: Path [Required]#
field hf_home_path: Path [Optional]#
field global_env_vars: dict[str, Any] [Optional]#
pydantic model cloudai.systems.runai.runai_node.RunAINode[source]#

Represents a node in the RunAI cluster.

field status: NodeStatus = NodeStatus.UNKNOWN#
field conditions: List[Dict[str, Any]] [Optional]#
field taints: List[Dict[str, Any]] [Optional]#
field node_pool: str = '' (alias 'nodePool')#
field created_at: str = '' (alias 'createdAt')#
field gpu_type: str | None = None (alias 'gpuType')#
field gpu_count: int | None = None (alias 'gpuCount')#
field name: str = ''#
field id: str = ''#
field cluster_uuid: str = '' (alias 'clusterUuid')#
field updated_at: str = '' (alias 'updatedAt')#

## LSF

pydantic model cloudai.systems.lsf.lsf_system.LSFSystem[source]#

Represents an LSF system.

field queues: List[LSFQueue] [Optional]#

A list of queues in the LSF system; it is filled in automatically.

field account: str | None = None#
field scheduler: str = 'lsf'#
field project_name: str | None = None#
field default_queue: str | None = None#
field monitor_interval: int = 60#
field app: str | None = None#
field os_version: str | None = None#
field name: str [Required]#
field install_path: Path [Required]#
field output_path: Path [Required]#
field hf_home_path: Path [Optional]#
field global_env_vars: dict[str, Any] [Optional]#
pydantic model cloudai.systems.lsf.lsf_system.LSFQueue[source]#

Represents a queue within the LSF system.

field name: str [Required]#
field groups: List[LSFGroup] = []#
pydantic model cloudai.systems.lsf.lsf_system.LSFGroup[source]#

Represents a group of nodes within a queue.

field name: str [Required]#
field nodes: List[str] [Required]#

## Standalone

pydantic model cloudai.systems.standalone.standalone_system.StandaloneSystem[source]#

Class representing a Standalone system.

This class is used for systems that execute commands directly without a job scheduler.

field scheduler: str = 'standalone'#
field monitor_interval: int = 1#
field name: str [Required]#
field install_path: Path [Required]#
field output_path: Path [Required]#
field hf_home_path: Path [Optional]#
field global_env_vars: dict[str, Any] [Optional]#