|
|
|
|
|
|
|
|
|
|
|
from enum import Enum |
|
import os |
|
from pathlib import Path |
|
from typing import Any, Dict, Optional |
|
|
|
|
|
class ClusterType(Enum):
    """Identifiers for the compute clusters this module knows how to configure.

    Each member's value is the lowercase string form of its name, so a member
    can be recovered from that string with ``ClusterType("aws")`` and so on.
    """

    # Selected when the Linux kernel release string ends with "-aws".
    AWS = "aws"
    # Fallback used when no other cluster is detected.
    FAIR = "fair"
    # Selected when the Linux host name starts with "rsc".
    RSC = "rsc"
|
|
|
|
|
def _guess_cluster_type() -> ClusterType:
    """Infer the cluster from the ``os.uname()`` fields of the current host.

    Returns:
        ``ClusterType.AWS`` on Linux when the kernel release ends with
        ``"-aws"``; ``ClusterType.RSC`` on Linux when the node name starts
        with ``"rsc"``; ``ClusterType.FAIR`` in every other case, including
        any non-Linux system.
    """
    host = os.uname()

    # Only Linux hosts are inspected further; anything else gets the default.
    if host.sysname != "Linux":
        return ClusterType.FAIR

    # The kernel release check takes precedence over the host name check.
    if host.release.endswith("-aws"):
        return ClusterType.AWS
    if host.nodename.startswith("rsc"):
        return ClusterType.RSC

    return ClusterType.FAIR
|
|
|
|
|
def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
    """Return ``cluster_type`` if given, otherwise guess it from the host.

    Args:
        cluster_type: Explicit cluster to use, or ``None`` to auto-detect
            through ``_guess_cluster_type()``.

    Returns:
        The explicit value passed in, or the detected cluster type.
    """
    # An explicit choice always wins; detection only runs for ``None``.
    return _guess_cluster_type() if cluster_type is None else cluster_type
|
|
|
|
|
def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the root checkpoint directory for a cluster.

    Args:
        cluster_type: Cluster to look up, or ``None`` to auto-detect it via
            ``get_cluster_type``.

    Returns:
        An absolute path (rooted at ``/``) to the cluster's checkpoint
        directory, or ``None`` when no cluster type could be resolved.

    Raises:
        KeyError: If the resolved value is not one of the known cluster types.
    """
    resolved = get_cluster_type(cluster_type)
    if resolved is None:
        return None

    # Directory name, relative to the filesystem root, for each known cluster.
    dirname_by_cluster = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }
    dirname = dirname_by_cluster[resolved]
    return Path("/", dirname)
|
|
|
|
|
def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the current user's directory under the cluster checkpoint root.

    Args:
        cluster_type: Cluster to look up, or ``None`` to auto-detect it.

    Returns:
        ``get_checkpoint_path(cluster_type) / $USER``, or ``None`` when no
        checkpoint root is available for the cluster.

    Raises:
        AssertionError: If the ``USER`` environment variable is not set.
    """
    checkpoint_path = get_checkpoint_path(cluster_type)
    if checkpoint_path is None:
        return None

    username = os.environ.get("USER")
    if username is None:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O`` (which would otherwise surface as an
        # unrelated TypeError from ``Path / None``). The exception type is
        # kept as AssertionError for callers that already handle it.
        raise AssertionError(
            "USER environment variable is not set; cannot build the user checkpoint path"
        )
    return checkpoint_path / username
|
|
|
|
|
def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Return the Slurm partition name associated with a cluster.

    Args:
        cluster_type: Cluster to look up, or ``None`` to auto-detect it via
            ``get_cluster_type``.

    Returns:
        The partition name for the resolved cluster, or ``None`` when no
        cluster type could be resolved.

    Raises:
        KeyError: If the resolved value is not one of the known cluster types.
    """
    resolved = get_cluster_type(cluster_type)
    if resolved is None:
        return None

    # Partition name for each known cluster.
    partition_by_cluster = {
        ClusterType.AWS: "learnlab",
        ClusterType.FAIR: "learnlab",
        ClusterType.RSC: "learn",
    }
    partition = partition_by_cluster[resolved]
    return partition
|
|
|
|
|
def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build a dictionary of Slurm executor parameters for a job.

    Args:
        nodes: Number of nodes to request.
        num_gpus_per_node: GPUs per node; also used as the task count per node.
        cluster_type: Cluster to configure for, or ``None`` to auto-detect it.
        **kwargs: Extra entries merged in last; they override any default or
            cluster-specific value with the same key.

    Returns:
        The parameter dictionary. On AWS the ``"mem_gb"`` default is removed
        and ``"cpus_per_task"`` is 12; on RSC ``"cpus_per_task"`` is 12;
        otherwise the defaults (``"mem_gb": 0``, ``"cpus_per_task": 10``)
        are kept.
    """
    # Baseline values shared by all clusters.
    # NOTE(review): mem_gb=0 presumably asks Slurm for all of a node's memory — confirm.
    settings: Dict[str, Any] = {
        "mem_gb": 0,
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,
        "cpus_per_task": 10,
        "nodes": nodes,
        "slurm_partition": get_slurm_partition(cluster_type),
    }

    # Per-cluster adjustments on top of the baseline.
    resolved = get_cluster_type(cluster_type)
    if resolved in (ClusterType.AWS, ClusterType.RSC):
        settings["cpus_per_task"] = 12
    if resolved == ClusterType.AWS:
        settings.pop("mem_gb")

    # Caller-supplied entries take precedence over everything above.
    return {**settings, **kwargs}
|
|