Using the Accelerate API to train models on multiple GPUs
I have installed flash-attn and the rotary embedding kernels:
pip install flash-attn==2.1.1 --no-build-isolation
pip install git+https://github.com/HazyResearch/[email protected]#subdirectory=csrc/rotary
The machine has 4 A100 GPUs. When I launch training through the Accelerate API I get the following error:
Traceback (most recent call last):
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 38, in <module>
    from flash_attn.flash_attn_interface import (
  File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/__init__.py", line 3, in <module>
    from flash_attn.flash_attn_interface import (
  File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 7, in <module>
    import flash_attn_2_cuda as flash_attn_cuda
ImportError: libtorch_cuda_cpp.so: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 76, in <module>
    run_model()
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 66, in run_model
    model = rv_model.AmznCausalLMTrainingMultiGPU(
  File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 24, in __init__
    model = AutoModelForCausalLM.from_pretrained(
  File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 480, in from_pretrained
    model_class = get_class_from_dynamic_module(
  File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 443, in get_class_from_dynamic_module
    return get_class_in_module(class_name, final_module.replace(".py", ""))
  File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 164, in get_class_in_module
    module = importlib.import_module(module_path)
  File "/usr/lib/python3.9/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 49, in <module>
    raise ImportError('Please install Flash Attention: pip install flash-attn --no-build-isolation\n')
ImportError: Please install Flash Attention: pip install flash-attn --no-build-isolation
(The same traceback is repeated by each of the four GPU worker processes.)
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 11162) of binary: /usr/bin/python3.9
Traceback (most recent call last):
  File "/home/paperspace/.local/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 970, in launch_command
    multi_gpu_launcher(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
    distrib_run.run(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Hi @ajash, it looks like this is an issue arising from incompatible versions of PyTorch / CUDA / flash-attn. Can you provide more details about your setup? What versions do you have installed?
nvcc --version (cuda version)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0
python3 -c "import torch; print(torch.__version__)" (PyTorch version)
2.0.1+cu117
pip freeze | grep flash-attn
flash-attn==2.1.1
flash attention was installed with: pip install flash-attn --no-build-isolation
OK, the versions seem to be compatible. It might be that installing flash-attn with pip install ... points to a different Python environment, since according to the stack trace it can't find the library. Can you try installing flash attention using python3 -m pip install flash-attn --no-build-isolation?
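As a quick sanity check, something along these lines (just a sketch; the script name and prints are for illustration), run with the same interpreter you train with, will tell you which environment each package comes from:

# check_env.py -- hypothetical helper script, run with the interpreter used for training
import sys

import torch

print("interpreter:", sys.executable)
print("torch:", torch.__version__, "built for CUDA", torch.version.cuda)

try:
    import flash_attn
    print("flash-attn:", flash_attn.__version__, "from", flash_attn.__file__)
except ImportError as exc:
    print("flash-attn import failed:", exc)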
I uninstalled flash-attn and then installed it back... that seems to have worked. Now I am getting another error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
I have pasted the entire stack trace here:
Traceback (most recent call last):
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in <module>
    run_model()
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
    model.train_model(gradient_accum_steps=args.batch_size,
  File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
    outputs = model(**batch)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
    return self.base_model(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
    outputs = self.model(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 656, in forward
    inputs_embeds = self.embed_tokens(input_ids)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
    return F.embedding(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
(The other ranks fail at the same point, reporting cuda:2 and cuda:3 versus cuda:0; their tracebacks are interleaved with this one in the raw log.)
/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py:350: UserWarning: operator() profile_node %34 : int[] = prim::profile_ivalue(%32)
does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
kv = repeat_kv(kv, self.num_key_value_groups)
Traceback (most recent call last):
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in <module>
    run_model()
  File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
    model.train_model(gradient_accum_steps=args.batch_size,
  File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
    outputs = model(**batch)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
    return self.base_model(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
    outputs = self.model(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 687, in forward
    layer_outputs = torch.utils.checkpoint.checkpoint(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 683, in custom_forward
    return module(*inputs, output_attentions, None)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 444, in forward
    hidden_states = self.input_layernorm(hidden_states)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 88, in forward
    return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
  File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 70, in rmsnorm_func
    hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 5654) of binary: /usr/bin/python3.9
Traceback (most recent call last):
  File "/home/paperspace/.local/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 970, in launch_command
    multi_gpu_launcher(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
    distrib_run.run(args)
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
run_models.py FAILED
Failures:
[1]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 5655)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 5656)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 5657)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 5654)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
After looking at the stack trace a bit more, it feels like the layernorm is complaining: https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/08639a72e17836184096ae6a7e2766f2a34c3e36/modeling_flash_llama.py#L444
Is it because of model sharding, i.e. the output of the same layer is sharded across devices, and that is causing the problem?
Great that the installation now works! Regarding the new error: it's hard to say without seeing the code, but I think your hunch is correct. What does your setup look like, and how are you distributing the model across devices?
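For context, one common way to end up with tensors on mixed devices is loading the model with device_map="auto" (which shards it across the GPUs via Accelerate's dispatch hooks) while also training it under the DDP setup that accelerate launch creates. A minimal sketch of the DDP-friendly loading pattern, assuming AutoModelForCausalLM and that the dataloader/optimizer are prepared as usual, would be:

# Sketch only -- not the code from this thread. Each DDP process loads a full
# copy of the model (no device_map="auto") and lets Accelerate place it.
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()

model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/LLaMA-2-7B-32K",
    trust_remote_code=True,
)

# accelerator.prepare() moves this replica onto the GPU owned by this process
# and wraps it in DistributedDataParallel.
model = accelerator.prepare(model)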
My code is very basic:
Start of code
def train_model(self, gradient_accum_steps, model_hub_loc):
    # Multi-GPU implementation uses accelerate's implementation
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accum_steps)
    device = accelerator.device
    model = self.model
    model.train().to(device)
    dataset = self.dataset.with_format("torch")
    dataloader = DataLoader(dataset, collate_fn=DataCollatorForLanguageModeling(
        self.data_processor.tokenizer, mlm=False), batch_size=1)
    # optimizer = self.set_optimizer()
    # Do better
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=10000,
    )
    # Accelerator specific code
    model, optimizer, dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, dataloader, lr_scheduler
    )
    # ######################################
    if self.is_debug_mode:
        self.print_model_device_placement(model)
        # There is no model training.
        return
    # ######################################
    for i, batch in enumerate(dataloader):
        with accelerator.accumulate(model):
            # batch = {k: v for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs[0]
            # Gradient accumulation need not be done manually.
            # Instead of loss.backward():
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if i % 100 == 0:  # Poor man's logging
                print(f"loss: {loss}, steps: {i}")
    if model_hub_loc:
        model.push_to_hub(model_hub_loc)
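print_model_device_placement is just a small debug helper; roughly, it walks the parameters and prints where each one lives (a sketch, not the exact code):

# Hypothetical sketch of the debug helper used above.
def print_model_device_placement(self, model):
    for name, param in model.named_parameters():
        print(f"{name} -> {param.device}")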
I have the sharding info as well. Pasting it below:
module.base_model.model.model.embed_tokens.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.gate_proj.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.up_proj.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.down_proj.weight -> cuda:0
module.base_model.model.model.layers.0.input_layernorm.weight -> cuda:0
module.base_model.model.model.layers.0.post_attention_layernorm.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.mlp.gate_proj.weight -> cuda:0
module.base_model.model.model.layers.1.mlp.up_proj.weight -> cuda:0
module.base_model.model.model.layers.1.mlp.down_proj.weight -> cuda:0
module.base_model.model.model.layers.1.input_layernorm.weight -> cuda:0
module.base_model.model.model.layers.1.post_attention_layernorm.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.q_proj.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.k_proj.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight -> cuda:0
(Ranks 1-3 print the same parameter names mapped to cuda:1, cuda:2, and cuda:3 respectively; their output is interleaved with rank 0's in the raw log, and the listing breaks off at this point.)
would love some help...