We need an `offload_dir` to dispatch this model according to this `device_map`
#3 opened by littleevillin
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/baichuan-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("baichuan-inc/baichuan-7B", device_map="auto", trust_remote_code=True)
inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt')
inputs = inputs.to('cuda:0')
pred = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
result:
登鹳雀楼->王之涣
夜雨寄北->李商隐
过零丁洋->文天祥
己亥杂诗(其五)->龚自珍
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/baichuan-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("baichuan-inc/baichuan-7B", device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, "hiyouga/baichuan-7b-sft")
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
query = "晚上睡不着怎么办"
inputs = tokenizer(["<human>:{}\n<bot>:".format(query)], return_tensors="pt")
inputs = inputs.to("cuda")
generate_ids = model.generate(**inputs, max_new_tokens=256, streamer=streamer)
result:
Traceback (most recent call last):
File "model-sft.py", line 10, in <module>
model = PeftModel.from_pretrained(model, "/root/.cache/huggingface/hub/models--hiyouga--baichuan-7b-sft/snapshots/64cf906a964bc94bf754cef8aa0d8c05107c7745")
File "/usr/local/lib/python3.8/dist-packages/peft/peft_model.py", line 181, in from_pretrained
model.load_adapter(model_id, adapter_name, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/peft/peft_model.py", line 406, in load_adapter
dispatch_model(
File "/usr/local/lib/python3.8/dist-packages/accelerate/big_modeling.py", line 345, in dispatch_model
raise ValueError(
ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: base_model.model.model.layers.18, base_model.model.model.layers.19, base_model.model.model.layers.20, base_model.model.model.layers.21, base_model.model.model.layers.22, base_model.model.model.layers.23, base_model.model.model.layers.24, base_model.model.model.layers.25, base_model.model.model.layers.26, base_model.model.model.layers.27, base_model.model.model.layers.28, base_model.model.model.layers.29, base_model.model.model.layers.30, base_model.model.model.layers.31, base_model.model.model.norm, base_model.model.lm_head.
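For reference, the error comes from accelerate's dispatch check: with device_map="auto", any submodules that do not fit into GPU (and then CPU) memory are assigned to "disk", and dispatching those requires an offload directory. A quick way to confirm which layers were sent to disk is a minimal check on the already loaded model, using the hf_device_map attribute that transformers sets whenever a device_map is used:

# Inspect where device_map="auto" placed each submodule of the loaded base model.
# Any entries mapped to "disk" are the ones that trigger the `offload_dir` error.
disk_modules = [name for name, device in model.hf_device_map.items() if device == "disk"]
print(model.hf_device_map)
print("offloaded to disk:", disk_modules)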
Update: fixed by passing an offload folder when loading the adapter: model = PeftModel.from_pretrained(model, "hiyouga/baichuan-7b-sft", offload_folder='./')
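Applied to the full snippet, the fix looks roughly like this (a minimal sketch, assuming the same model IDs as above; the "./offload" path is just an example, any writable directory works):

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/baichuan-7B", trust_remote_code=True)
# offload_folder gives accelerate a place to write the layers that device_map="auto" assigns to "disk"
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/baichuan-7B",
    device_map="auto",
    offload_folder="./offload",
    trust_remote_code=True,
)
# the same folder is passed again so the PEFT adapter can be dispatched over the offloaded layers
model = PeftModel.from_pretrained(model, "hiyouga/baichuan-7b-sft", offload_folder="./offload")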
littleevillin changed discussion status to closed
I encountered the same problem, did you solve it?
I'm also facing the same issue. Here is the code:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
MODEL_NAME = "antony-pk/llama-3-8b-Instruct-bnb-4bit-e10-emp-gold-jul16"
offload_dir = "offload_directory"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME,
# max_seq_length = max_seq_length,
# dtype = dtype,
dtype=torch.float32,
load_in_4bit = False,
# offload_dir = offload_dir, # Add the offload directory here
device_map = "auto" # Adjust this according to your needs
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
Error
==((====))== Unsloth: Fast Llama patching release 2024.7
\\ /| GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \ Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\ / Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
"-____-" Free Apache license: http://github.com/unslothai/unsloth
Loading checkpoint shards: 100% 4/4 [01:06<00:00, 14.34s/it]
generation_config.json: 100% 131/131 [00:00<00:00, 13.3kB/s]
tokenizer_config.json: 100% 51.1k/51.1k [00:00<00:00, 4.49MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 50.2MB/s]
special_tokens_map.json: 100% 459/459 [00:00<00:00, 39.9kB/s]
adapter_model.safetensors: 100% 168M/168M [00:03<00:00, 40.9MB/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[4], line 10
7 MODEL_NAME = "antony-pk/llama-3-8b-Instruct-bnb-4bit-e10-emp-gold-jul16"
8 offload_dir = "offload_directory"
---> 10 model, tokenizer = FastLanguageModel.from_pretrained(
11 model_name = MODEL_NAME,
12 # max_seq_length = max_seq_length,
13 # dtype = dtype,
14 dtype=torch.float32,
15 load_in_4bit = False,
16 # offload_dir = offload_dir, # Add the offload directory here
17 device_map = "auto" # Adjust this according to your needs
18 # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
19 )
File /opt/conda/lib/python3.10/site-packages/unsloth/models/loader.py:223, in FastLanguageModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, *args, **kwargs)
219 if is_peft:
220 # From https://github.com/huggingface/peft/issues/184
221 # Now add PEFT adapters
222 model.enable_input_require_grads()
--> 223 model = PeftModel.from_pretrained(
224 model,
225 old_model_name,
226 token = token,
227 revision = revision,
228 is_trainable = True,
229 )
230 # Patch it as well!
231 model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
File /opt/conda/lib/python3.10/site-packages/peft/peft_model.py:430, in PeftModel.from_pretrained(cls, model, model_id, adapter_name, is_trainable, config, **kwargs)
428 else:
429 model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config, adapter_name)
--> 430 model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs)
431 return model
File /opt/conda/lib/python3.10/site-packages/peft/peft_model.py:1025, in PeftModel.load_adapter(self, model_id, adapter_name, is_trainable, torch_device, **kwargs)
1022 self._update_offload(offload_index, adapters_weights)
1023 dispatch_model_kwargs["offload_index"] = offload_index
-> 1025 dispatch_model(
1026 self,
1027 device_map=device_map,
1028 offload_dir=offload_dir,
1029 **dispatch_model_kwargs,
1030 )
1032 hook = AlignDevicesHook(io_same_device=True)
1033 if self.peft_config[adapter_name].is_prompt_learning:
File /opt/conda/lib/python3.10/site-packages/accelerate/big_modeling.py:376, in dispatch_model(model, device_map, main_device, state_dict, offload_dir, offload_index, offload_buffers, skip_keys, preload_module_classes, force_hooks)
374 disk_modules = [name for name, device in device_map.items() if device == "disk"]
375 if offload_dir is None and offload_index is None and len(disk_modules) > 0:
--> 376 raise ValueError(
377 "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
378 f"need to be offloaded: {', '.join(disk_modules)}."
379 )
380 if (
381 len(disk_modules) > 0
382 and offload_index is None
383 and (not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")))
384 ):
385 disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: base_model.model.model.layers.9, base_model.model.model.layers.10, base_model.model.model.layers.11, base_model.model.model.layers.12, base_model.model.model.layers.13, base_model.model.model.layers.14, base_model.model.model.layers.15, base_model.model.model.layers.16, base_model.model.model.layers.17, base_model.model.model.layers.18, base_model.model.model.layers.19, base_model.model.model.layers.20, base_model.model.model.layers.21, base_model.model.model.layers.22, base_model.model.model.layers.23, base_model.model.model.layers.24, base_model.model.model.layers.25, base_model.model.model.layers.26, base_model.model.model.layers.27, base_model.model.model.layers.28, base_model.model.model.layers.29, base_model.model.model.layers.30, base_model.model.model.layers.31, base_model.model.model.norm, base_model.model.lm_head.
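This is the same dispatch check as in the original post: with load_in_4bit = False and dtype=torch.float32 the 8B model is far larger than the T4's 14.7 GB, so device_map="auto" maps some layers to "disk", and the traceback shows that unsloth then calls PeftModel.from_pretrained without an offload folder. Two things that may help: keep load_in_4bit = True so the model fits on the GPU and nothing is mapped to disk, or load the base model and adapter manually with transformers/peft and pass offload_folder yourself. Below is a minimal sketch of the manual route, assuming the checkpoint is a PEFT adapter on top of a Llama 3 8B Instruct base (the base model name is an assumption, substitute whatever the adapter was actually trained on):

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "unsloth/llama-3-8b-Instruct-bnb-4bit"   # assumed base; replace with the adapter's actual base model
ADAPTER = "antony-pk/llama-3-8b-Instruct-bnb-4bit-e10-emp-gold-jul16"
OFFLOAD = "offload_directory"                    # any writable directory

tokenizer = AutoTokenizer.from_pretrained(BASE)
# offload_folder lets accelerate write the layers that device_map="auto" assigns to "disk"
model = AutoModelForCausalLM.from_pretrained(BASE, device_map="auto", offload_folder=OFFLOAD)
# pass the same folder when attaching the adapter, mirroring the baichuan fix above
model = PeftModel.from_pretrained(model, ADAPTER, offload_folder=OFFLOAD)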