Run it on Colab.
#4 opened by girrajjangid
!pip -q install transformers==4.34.0
!pip -q install accelerate==0.23.0
!pip -q install flash-attn==2.3.3 --no-build-isolation
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import torch

model_id = "amazon/MistralLite"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    offload_folder="offload",
    device_map="auto",
)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
prompt = "<|prompter|>What are the main challenges to support a long context for LLM? Explain in details 1000-2000 words.</s><|assistant|>"
sequences = pipeline(
    prompt,
    max_new_tokens=5000,
    do_sample=False,
    return_full_text=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for seq in sequences:
    print(f"{seq['generated_text']}")
flash_attn v2 is not supported on the T4 GPU.
Yes, in this case, we can run the model without flash_attn v2. Thank you!
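For reference, a quick way to check whether the GPU in a Colab runtime can run flash-attn v2 (it requires compute capability 8.0 or newer, i.e. Ampere and later; the T4 is Turing, 7.5). This sketch assumes a CUDA device is visible in the runtime:

import torch

# flash-attn v2 needs compute capability 8.0+ (Ampere or newer).
# The T4 reports 7.5, so the flash-attn v2 kernels are unavailable on it.
major, minor = torch.cuda.get_device_capability()
print(f"Compute capability: {major}.{minor}")
print("flash-attn v2 supported:", (major, minor) >= (8, 0))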
Also, the T4 doesn't support bfloat16.
You can try float16 instead; it should work as well. Cheers!
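Putting the two suggestions together, a minimal sketch of a T4-friendly loading step (same model_id as above; float16 instead of bfloat16, and no flash-attn flag passed, so the default attention implementation is used):

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "amazon/MistralLite"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# float16 runs natively on the T4; bfloat16 does not.
# flash-attn v2 is simply not enabled, so the stock attention is used.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

The pipeline construction and generation code above work unchanged with this model object.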
yinsong1986 changed discussion status to closed