namespace-Pt committed
Commit cc68800
1 Parent(s): 0a05167

Upload folder using huggingface_hub
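The commit message indicates the files were pushed with the huggingface_hub library. A minimal sketch of that workflow, under the assumption that `HfApi.upload_folder` was the call used (the local folder path is illustrative, not taken from this commit):

```python
# Sketch: pushing a local folder to the Hub, matching the commit message above.
# Assumes you are already authenticated (e.g. via `huggingface-cli login`).
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    repo_id="namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA",  # repo shown on this page
    folder_path="./Llama-3-8B-Instruct-80K-QLoRA",         # illustrative local path
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```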

Files changed (1)
README.md +7 -4
README.md CHANGED
@@ -72,15 +72,19 @@ from peft import PeftModel
 
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 peft_id = "namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA"
+
 torch_dtype = torch.bfloat16
 # place the model on GPU
 device_map = {"": "cuda"}
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+
 base_model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
     device_map=device_map,
+    attn_implementation="flash_attention_2",
+
     # NOTE: expand rope base
     rope_theta=200e6,
     max_position_embeddings=81920,
@@ -92,7 +96,6 @@ model = PeftModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map=device_map,
 )
-
 # NOTE: merge LoRA weights
 model = model.merge_and_unload().eval()
 
@@ -100,9 +103,9 @@ with torch.no_grad():
     # short context
     messages = [{"role": "user", "content": "Tell me about yourself."}]
     inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda")
-    outputs = model.generate(**inputs, max_new_tokens=50)
+    outputs = model.generate(**inputs, max_new_tokens=50)[:, inputs["input_ids"].shape[1]:]
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
-    print(f"Output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+    print(f"Output: {tokenizer.decode(outputs[0])}")
 
     # long context
     with open("data/narrativeqa.json", encoding="utf-8") as f:
@@ -113,5 +116,5 @@ with torch.no_grad():
     print("*"*20)
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
     print(f"Answers: {example['answer']}")
-    print(f"Prediction: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+    print(f"Prediction: {tokenizer.decode(outputs[0])}")
 ```
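Beyond the added `attn_implementation="flash_attention_2"` argument (which selects FlashAttention-2 kernels and requires the flash-attn package), the main change in this commit is slicing the `model.generate` output: for a decoder-only model, `generate` returns the prompt tokens followed by the newly generated ones, and the updated snippet keeps only the positions after `input_ids.shape[1]` before decoding. A minimal, self-contained sketch of that slicing pattern (the small model id and prompt below are illustrative, not taken from this repository):

```python
# Sketch of the prompt-stripping pattern introduced by this commit.
# NOTE: "gpt2" is used only to keep the example small and runnable on CPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

inputs = tokenizer("Tell me about yourself.", return_tensors="pt")
with torch.no_grad():
    # generate() returns prompt tokens + completion tokens for decoder-only models
    outputs = model.generate(**inputs, max_new_tokens=20)

# keep only the newly generated tokens, as the updated README does
new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens[0]))
```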