diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-29 11:13:53.873023 +slurm submission log: 2024-05-30 23:53:12.890916 created following sbatch script: ############################### @@ -7,24 +7,23 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 -#SBATCH --dependency=afterok:7667689 #SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-3223007 +#SBATCH --job-name=tthrush-job-4396652 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/llms/pythia-70m_xnli_es_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment -. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29526 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' ############################### @@ -34,507 +33,485 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7667690 +Submitted batch job 7673212 ############################### -slurm submission log: 2024-05-30 08:40:47.591270 -created following sbatch script: - -############################### - -#!/bin/bash - -#SBATCH --account=nlp -#SBATCH --cpus-per-task=16 -#SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-2184404 -#SBATCH --mem=100G -#SBATCH --nodelist=sphinx2 -#SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1/train_job_output.txt -#SBATCH --partition=sphinx -#SBATCH --time=14-0 +/var/lib/slurm/slurmd/job7673212/slurm_script: line 15: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory -# activate your desired anaconda environment -. 
/nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection - -# cd to working directory -cd . - -# launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' - -############################### +CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. +To initialize your shell, run -submission to slurm complete! + $ conda init +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell -############################### -slurm submission output +See 'conda init --help' for more information and options. -Submitted batch job 7670601 +IMPORTANT: You may need to close and restart your shell after running 'conda init'. - -############################### - ############################### -start time: 2024-05-30 16:57:11.163102 +start time: 2024-05-31 02:56:42.451619 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 + torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 ############################### command outputs: -[2024-05-30 16:57:29,546] torch.distributed.run: [WARNING] -[2024-05-30 16:57:29,546] torch.distributed.run: [WARNING] ***************************************** -[2024-05-30 16:57:29,546] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-[2024-05-30 16:57:29,546] torch.distributed.run: [WARNING] ***************************************** -05/30/2024 16:58:08 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/30/2024 16:58:10 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_constrained/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +[2024-05-31 02:56:44,158] torch.distributed.run: [WARNING] +[2024-05-31 02:56:44,158] torch.distributed.run: [WARNING] ***************************************** +[2024-05-31 02:56:44,158] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+[2024-05-31 02:56:44,158] torch.distributed.run: [WARNING] ***************************************** +05/31/2024 02:56:52 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +05/31/2024 02:56:53 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/paper_writeup_tests/ordinal_ph_proj/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( - 0%| | 0/10714 [00:00
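
note on the conda failure recorded above: the `+` version of the sbatch script sources conda.sh from inside the environment directory (miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh). conda typically only creates etc/profile.d/conda.sh under the base install prefix, not under envs/, which is why the shell reports "No such file or directory" and `conda activate` then fails with CommandNotFoundError. a minimal sketch of the usual batch-script pattern, reusing the base-install path from the `-` (earlier, working) version of the script — verify the path on your own machine:

    # activate your desired anaconda environment
    # source the *base* install's shell hook, then activate the env by name
    . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh
    conda activate pretraining-coreset-selection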
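the torch.distributed.run warning in the command outputs means torchrun defaulted OMP_NUM_THREADS to 1 for every worker and suggests tuning it. one possible tuning — an assumption for illustration, not something the original script does — is to export the variable before the launch line, splitting the job's 16 CPUs (--cpus-per-task=16) across the 2 ranks (--nproc_per_node=2):

    # hypothetical addition: give each of the 2 ranks 8 OpenMP threads (16 CPUs / 2 ranks)
    export OMP_NUM_THREADS=8
    srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py ...'  # arguments as in the script above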